In [1]:
%pip install fastparquet

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import LabelEncoder
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_parquet("../data/processed/train_final_dataset.snappy.parquet", engine="fastparquet")

In [5]:
df['image_similarity'] = df['image_similarity'].fillna(0)

df[['is_same_location', 'is_same_region']] = df[['is_same_location', 'is_same_region']].astype(int)

In [6]:
# Совпадение категорий и параметров
df['same_category'] = (df['base_category_name'] == df['cand_category_name']).astype(int)
df['same_subcategory'] = (df['base_subcategory_name'] == df['cand_subcategory_name']).astype(int)

In [7]:
df['price_diff_pct'] = 2 * abs(df['base_price'] - df['cand_price']) / (df['base_price'] + df['cand_price'] + 1e-6)

df['images_diff'] = df['base_count_images'] - df['cand_count_images']

In [8]:
numeric_cols = [
    'base_price', 
    'cand_price',
    'price_diff_pct',
    'base_count_images',
    'cand_count_images',
    'images_diff',
    'common_params_count',
    'same_values_count'
]

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [9]:
df['base_param1'] = df['base_param1'].replace('', 'Не указано')
df['cand_param1'] = df['cand_param1'].replace('', 'Не указано')
df['same_param1'] = (df['base_param1'] == df['cand_param1']).astype(int)
le_param1 = LabelEncoder()
all_values = pd.concat([df['base_param1'], df['cand_param1']]).unique()
le_param1.fit(all_values)

df['base_param1_encoded'] = le_param1.transform(df['base_param1'])
df['cand_param1_encoded'] = le_param1.transform(df['cand_param1'])


In [10]:
df['base_param2'] = df['base_param2'].replace('', 'Не указано')
df['cand_param2'] = df['cand_param2'].replace('', 'Не указано')
df['same_param2'] = (df['base_param2'] == df['cand_param2']).astype(int)
le_param2 = LabelEncoder()
all_values = pd.concat([df['base_param2'], df['cand_param2']]).unique()
le_param2.fit(all_values)

df['base_param2_encoded'] = le_param2.transform(df['base_param2'])
df['cand_param2_encoded'] = le_param2.transform(df['cand_param2'])

In [11]:
columns_to_drop = [
    'base_item_id', 'cand_item_id', 'group_id', 'action_date',
    'base_title', 'cand_title', 'base_description', 'cand_description',
    'base_json_params', 'cand_json_params',
    'base_title_image', 'cand_title_image',
    'base_category_name', 'cand_category_name',
    'base_subcategory_name', 'cand_subcategory_name',
    'base_param1', 'cand_param1',
    'base_param2', 'cand_param2',
]

df = df.drop(columns=columns_to_drop)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 500000 entries, 0 to 499999
Data columns (total 23 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   base_price              500000 non-null  float64
 1   cand_price              500000 non-null  float64
 2   base_count_images       500000 non-null  float64
 3   cand_count_images       500000 non-null  float64
 4   is_same_location        500000 non-null  int32  
 5   is_same_region          500000 non-null  int32  
 6   is_double               500000 non-null  int64  
 7   common_params_count     500000 non-null  float64
 8   same_values_count       500000 non-null  float64
 9   image_similarity        500000 non-null  float64
 10  basic_image_similarity  500000 non-null  float64
 11  title_similarity        500000 non-null  float64
 12  description_similarity  500000 non-null  float64
 13  same_category           500000 non-null  int32  
 14  same_subcategory        5

In [12]:
features = [
    'base_price', 'cand_price', 'price_diff_pct',
    'base_count_images', 'cand_count_images', 'images_diff',
    
    'title_similarity', 'description_similarity',
    'image_similarity', 'basic_image_similarity',
    
    'same_category', 'same_subcategory',
    'common_params_count', 'same_values_count',
    
    'is_same_location', 'is_same_region',

    'same_param1', 'base_param1_encoded', 'cand_param1_encoded',
    'same_param2', 'base_param2_encoded', 'cand_param2_encoded'
]

#балансировку нужно еще добавить

X = df[features]
y = df['is_double']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

## Model training

In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (roc_auc_score, average_precision_score, 
                            precision_recall_curve, f1_score, 
                            classification_report)
import lightgbm as lgb
from sklearn.isotonic import IsotonicRegression

In [14]:
# параметр для LightGBM, учитывающий дисбаланс данных
scale_pos_weight = (len(y_train) - sum(y_train)) / sum(y_train) 

params = {
    'objective': 'binary', # тип таргета
    'boosting_type': 'gbdt', # cтандартный градиент бустинг - для последовательной тренировки
    'metric': 'average_precision', # как в сореве сказано
    'scale_pos_weight': scale_pos_weight, # добавляем веса положит классу
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1, # нет ограничения глубины дерева
    'min_child_samples': 20,
    # умная двойная регуляризация: 
    'reg_alpha': 0.1,  # L1-регуляризация
    'reg_lambda': 0.1, # L2-регуляризация
    'subsample': 0.8,  # Доля случайных строк для каждого дерева
    'colsample_bytree': 0.8, # Доля случайных колонок для каждого дерева
    'random_state': 42,
    'n_jobs': -1, # использовать все cpu
    'verbose': -1
}


In [16]:
# базовая 5 кратная кросвалидация
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
models = []
oof_preds = np.zeros(len(X_train)) # для калибровки вероятностей класса (на нем изучается IsotonicRegression)

for train_idx, val_idx in cv.split(X_train, y_train):
    X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    # Create LightGBM dataset
    train_data = lgb.Dataset(X_fold_train, label=y_fold_train)
    val_data = lgb.Dataset(X_fold_val, label=y_fold_val)
    
    # Train model
    model = lgb.train(
        params,
        train_data,
        valid_sets=[val_data],
        num_boost_round=1000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),  # Правильный способ
            lgb.log_evaluation(100)  # Логи каждые 100 итераций
        ]
    )
    
    # Predict on validation set
    y_pred_prob = model.predict(X_fold_val)
    oof_preds[val_idx] = y_pred_prob # собираем честные предикты модели
    ap_score = average_precision_score(y_fold_val, y_pred_prob) # смотрим по average precision
    cv_scores.append(ap_score)
    models.append(model)

print(f"CV MAP scores: {cv_scores}")
print(f"Mean CV MAP: {np.mean(cv_scores):.4f}")

Training until validation scores don't improve for 50 rounds
[100]	valid_0's average_precision: 0.554398
[200]	valid_0's average_precision: 0.595199
[300]	valid_0's average_precision: 0.611123
[400]	valid_0's average_precision: 0.623362
[500]	valid_0's average_precision: 0.633141
[600]	valid_0's average_precision: 0.638804
[700]	valid_0's average_precision: 0.644114
[800]	valid_0's average_precision: 0.649036
[900]	valid_0's average_precision: 0.653731
[1000]	valid_0's average_precision: 0.657551
Did not meet early stopping. Best iteration is:
[1000]	valid_0's average_precision: 0.657551
Training until validation scores don't improve for 50 rounds
[100]	valid_0's average_precision: 0.545471
[200]	valid_0's average_precision: 0.581776
[300]	valid_0's average_precision: 0.59807
[400]	valid_0's average_precision: 0.608685
[500]	valid_0's average_precision: 0.617177
[600]	valid_0's average_precision: 0.626892
[700]	valid_0's average_precision: 0.632354
[800]	valid_0's average_precision: 0.

In [17]:
# Калибровка вероятностей
calibrator = IsotonicRegression(out_of_bounds='clip')
calibrator.fit(oof_preds, y_train)

In [18]:
# Оптимизация порога по F1 - учитываем и precision и recall
precisions, recalls, thresholds = precision_recall_curve(y_train, calibrator.predict(oof_preds))

f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]

print(f"Optimal threshold (F1-max): {optimal_threshold:.4f}")

Optimal threshold (F1-max): 0.3090


In [19]:
# Оценка всех моделей кросвалидации и усреднение
test_preds = np.mean([model.predict(X_test) for model in models], axis=0)
test_calibrated = calibrator.predict(test_preds)
test_pred = (test_calibrated >= optimal_threshold).astype(int)

In [20]:
# Финалные метрики
print("\nTest Evaluation across all folds:")
print(f"MAP: {average_precision_score(y_test, test_calibrated):.4f}")
print("\nClassification Report (at optimal threshold):")
print(classification_report(y_test, test_pred))
print("\nConfusion Matrix:")
print(pd.crosstab(y_test, test_pred, rownames=['Actual'], colnames=['Predicted']))


Test Evaluation across all folds:
MAP: 0.6528

Classification Report (at optimal threshold):
              precision    recall  f1-score   support

           0       0.97      0.98      0.98     93882
           1       0.63      0.58      0.61      6118

    accuracy                           0.95    100000
   macro avg       0.80      0.78      0.79    100000
weighted avg       0.95      0.95      0.95    100000


Confusion Matrix:
Predicted      0     1
Actual                
0          91833  2049
1           2565  3553


In [21]:
# Feature Importance
importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': np.mean([model.feature_importance() for model in models], axis=0)
}).sort_values('importance', ascending=False)

print("\nTop 10 Features:")
print(importance_df.head(10))


Top 10 Features:
                   feature  importance
7   description_similarity      3312.4
0               base_price      2826.8
1               cand_price      2652.0
6         title_similarity      2270.6
17     base_param1_encoded      2091.6
2           price_diff_pct      2028.0
18     cand_param1_encoded      1954.8
20     base_param2_encoded      1778.4
21     cand_param2_encoded      1627.8
8         image_similarity      1618.2
