In [None]:
%pip install fastparquet
%pip install lightgbm

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_parquet("../data/train_final_dataset.snappy.parquet", engine="fastparquet")

In [3]:
df['image_similarity'] = df['image_similarity'].fillna(0)

df[['is_same_location', 'is_same_region']] = df[['is_same_location', 'is_same_region']].astype(int)

In [4]:
# Совпадение категорий и параметров
df['same_category'] = (df['base_category_name'] == df['cand_category_name']).astype(int)
df['same_subcategory'] = (df['base_subcategory_name'] == df['cand_subcategory_name']).astype(int)

In [5]:
df['price_diff_pct'] = 2 * abs(df['base_price'] - df['cand_price']) / (df['base_price'] + df['cand_price'] + 1e-6)

df['images_diff'] = df['base_count_images'] - df['cand_count_images']

In [6]:
numeric_cols = [
    'base_price', 
    'cand_price',
    'price_diff_pct',
    'base_count_images',
    'cand_count_images',
    'images_diff',
    'common_params_count',
    'same_values_count'
]

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [7]:
df['base_param1'] = df['base_param1'].replace('', 'Не указано')
df['cand_param1'] = df['cand_param1'].replace('', 'Не указано')
df['same_param1'] = (df['base_param1'] == df['cand_param1']).astype(int)
le_param1 = LabelEncoder()
all_values = pd.concat([df['base_param1'], df['cand_param1']]).unique()
le_param1.fit(all_values)

df['base_param1_encoded'] = le_param1.transform(df['base_param1'])
df['cand_param1_encoded'] = le_param1.transform(df['cand_param1'])


In [8]:
df['base_param2'] = df['base_param2'].replace('', 'Не указано')
df['cand_param2'] = df['cand_param2'].replace('', 'Не указано')
df['same_param2'] = (df['base_param2'] == df['cand_param2']).astype(int)
le_param2 = LabelEncoder()
all_values = pd.concat([df['base_param2'], df['cand_param2']]).unique()
le_param2.fit(all_values)

df['base_param2_encoded'] = le_param2.transform(df['base_param2'])
df['cand_param2_encoded'] = le_param2.transform(df['cand_param2'])

In [9]:
columns_to_drop = [
    'base_item_id', 'cand_item_id', 'group_id', 'action_date',
    'base_title', 'cand_title', 'base_description', 'cand_description',
    'base_json_params', 'cand_json_params',
    'base_title_image', 'cand_title_image',
    'base_category_name', 'cand_category_name',
    'base_subcategory_name', 'cand_subcategory_name',
    'base_param1', 'cand_param1',
    'base_param2', 'cand_param2',
]

df = df.drop(columns=columns_to_drop)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 500000 entries, 0 to 499999
Data columns (total 23 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   base_price              500000 non-null  float64
 1   cand_price              500000 non-null  float64
 2   base_count_images       500000 non-null  float64
 3   cand_count_images       500000 non-null  float64
 4   is_same_location        500000 non-null  int32  
 5   is_same_region          500000 non-null  int32  
 6   is_double               500000 non-null  int64  
 7   common_params_count     500000 non-null  float64
 8   same_values_count       500000 non-null  float64
 9   image_similarity        500000 non-null  float64
 10  basic_image_similarity  500000 non-null  float64
 11  title_similarity        500000 non-null  float64
 12  description_similarity  500000 non-null  float64
 13  same_category           500000 non-null  int32  
 14  same_subcategory        5

In [10]:
features = [
    'base_price', 'cand_price', 'price_diff_pct',
    'base_count_images', 'cand_count_images', 'images_diff',
    
    'title_similarity', 'description_similarity',
    'image_similarity', 'basic_image_similarity',
    
    'same_category', 'same_subcategory',
    'common_params_count', 'same_values_count',
    
    'is_same_location', 'is_same_region',

    'same_param1', 'base_param1_encoded', 'cand_param1_encoded',
    'same_param2', 'base_param2_encoded', 'cand_param2_encoded'
]

X = df[features]
y = df['is_double']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

## Model training

In [11]:
import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (roc_auc_score, average_precision_score, 
                            precision_recall_curve, f1_score, 
                            classification_report)
import lightgbm as lgb
from sklearn.isotonic import IsotonicRegression
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from scipy.stats import randint, uniform

In [13]:
# параметр для LightGBM, учитывающий дисбаланс данных
scale_pos_weight = (len(y_train) - sum(y_train)) / sum(y_train) 

base_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'average_precision',
    'scale_pos_weight': scale_pos_weight,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1
}

param_dist = {
    'learning_rate': uniform(0.01, 0.3),
    'num_leaves': randint(20, 100),
    'max_depth': randint(3, 15),
    'min_child_samples': randint(10, 100),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(0, 1),
    'subsample': uniform(0.6, 0.4),  # от 0.6 до 1.0
    'colsample_bytree': uniform(0.6, 0.4),
    'feature_fraction': uniform(0.6, 0.4),
    'bagging_freq': randint(0, 10)
}

In [14]:
lgbm = lgb.LGBMClassifier(**base_params)

search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_dist,
    n_iter=50,  # Количество итераций
    scoring='average_precision',
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    verbose=3,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [15]:
best_params = search.best_params_
print(f"Best parameters: {best_params}")
print(f"Best CV score: {search.best_score_:.4f}")

final_params = {**base_params, **best_params}
print(f"Final parameters: {final_params}")

Best parameters: {'bagging_freq': 6, 'colsample_bytree': 0.9932923543227152, 'feature_fraction': 0.786705157299192, 'learning_rate': 0.26798212202089616, 'max_depth': 9, 'min_child_samples': 30, 'num_leaves': 92, 'reg_alpha': 0.06505159298527952, 'reg_lambda': 0.9488855372533332, 'subsample': 0.9862528132298237}
Best CV score: 0.6418
Final parameters: {'objective': 'binary', 'boosting_type': 'gbdt', 'metric': 'average_precision', 'scale_pos_weight': 15.345878795308733, 'random_state': 42, 'n_jobs': -1, 'verbose': -1, 'bagging_freq': 6, 'colsample_bytree': 0.9932923543227152, 'feature_fraction': 0.786705157299192, 'learning_rate': 0.26798212202089616, 'max_depth': 9, 'min_child_samples': 30, 'num_leaves': 92, 'reg_alpha': 0.06505159298527952, 'reg_lambda': 0.9488855372533332, 'subsample': 0.9862528132298237}


In [17]:
# Разделяем данные на обучение и валидацию
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# Создаем Dataset объекты
train_data = lgb.Dataset(X_train_final, label=y_train_final)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# Добавляем метрику в параметры, если ее нет
if 'metric' not in final_params:
    final_params['metric'] = 'average_precision'  # Или другая метрика

# Финальное обучение с указанием валидационного набора
final_model = lgb.train(
    final_params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, val_data],  # Важно: передаем оба набора
    valid_names=['train', 'valid'],    # Опционально: имена для логов
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(100)
    ]
)

Training until validation scores don't improve for 50 rounds
[100]	train's average_precision: 0.760479	valid's average_precision: 0.647638
[200]	train's average_precision: 0.862945	valid's average_precision: 0.670575
[300]	train's average_precision: 0.922391	valid's average_precision: 0.682205
[400]	train's average_precision: 0.955808	valid's average_precision: 0.68863
[500]	train's average_precision: 0.97459	valid's average_precision: 0.691626
[600]	train's average_precision: 0.985634	valid's average_precision: 0.693156
[700]	train's average_precision: 0.99248	valid's average_precision: 0.695283
Early stopping, best iteration is:
[746]	train's average_precision: 0.994102	valid's average_precision: 0.696522


In [23]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X_train))

for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
    X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    train_data = lgb.Dataset(X_fold_train, label=y_fold_train)
    val_data = lgb.Dataset(X_fold_val, label=y_fold_val, reference=train_data)
    
    fold_model = lgb.train(
        final_params,
        train_data,
        num_boost_round=1000,
        valid_names=['train', 'valid'],
        valid_sets=[train_data, val_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(100)
        ]
    )
    
    oof_preds[val_idx] = fold_model.predict(X_fold_val)
    fold_score = average_precision_score(y_fold_val, oof_preds[val_idx])
    print(f"Fold {fold + 1} AP: {fold_score:.4f}")

[100]	train's average_precision: 0.758569	valid's average_precision: 0.646538
[200]	train's average_precision: 0.85887	valid's average_precision: 0.66761
[300]	train's average_precision: 0.918736	valid's average_precision: 0.678024
[400]	train's average_precision: 0.953519	valid's average_precision: 0.682097
[500]	train's average_precision: 0.974602	valid's average_precision: 0.68844
[600]	train's average_precision: 0.985492	valid's average_precision: 0.69076
[700]	train's average_precision: 0.991577	valid's average_precision: 0.693771
Fold 1 AP: 0.6938
[100]	train's average_precision: 0.762773	valid's average_precision: 0.63267
[200]	train's average_precision: 0.860236	valid's average_precision: 0.650807
[300]	train's average_precision: 0.920036	valid's average_precision: 0.659814
[400]	train's average_precision: 0.954644	valid's average_precision: 0.668919
[500]	train's average_precision: 0.97474	valid's average_precision: 0.673425
[600]	train's average_precision: 0.985665	valid's av

In [24]:
# Калибровка вероятностей
calibrator = IsotonicRegression(out_of_bounds='clip')
calibrator.fit(oof_preds, y_train)

In [25]:
# Оптимизация порога по F1 - учитываем и precision и recall
precisions, recalls, thresholds = precision_recall_curve(y_train, calibrator.predict(oof_preds))

f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]

print(f"Optimal threshold (F1-max): {optimal_threshold:.4f}")

Optimal threshold (F1-max): 0.3497


In [None]:
# Оценка всех моделей кросвалидации и усреднение
# test_preds = np.mean([model.predict(X_test) for model in models], axis=0)
test_preds = final_model.predict(X_test)
test_calibrated = calibrator.predict(test_preds)
test_pred = (test_calibrated >= optimal_threshold).astype(int)

In [27]:
# Финалные метрики
print("\nTest Evaluation across all folds:")
print(f"MAP: {average_precision_score(y_test, test_calibrated):.4f}")
print("\nClassification Report (at optimal threshold):")
print(classification_report(y_test, test_pred))
print("\nConfusion Matrix:")
print(pd.crosstab(y_test, test_pred, rownames=['Actual'], colnames=['Predicted']))


Test Evaluation across all folds:
MAP: 0.6916

Classification Report (at optimal threshold):
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     93882
           1       0.67      0.62      0.64      6118

    accuracy                           0.96    100000
   macro avg       0.82      0.80      0.81    100000
weighted avg       0.96      0.96      0.96    100000


Confusion Matrix:
Predicted      0     1
Actual                
0          92007  1875
1           2333  3785


In [28]:
import joblib

model_assets = {
    'model': final_model,
    'calibrator': calibrator,
    'optimal_threshold': optimal_threshold,
    'features': features,
    'params': final_params,
    'performance': {
        'test_map': average_precision_score(y_test, test_calibrated),
        'test_f1': f1_score(y_test, test_pred)
    }
}

joblib.dump(model_assets, 'best_model_assets.joblib')

print("\nModel and all assets saved to 'best_model_assets.joblib'")


Model and all assets saved to 'best_model_assets.joblib'


In [None]:
# # Feature Importance
# importance_df = pd.DataFrame({
#     'feature': X.columns,
#     'importance': np.mean([model.feature_importance() for model in models], axis=0)
# }).sort_values('importance', ascending=False)

# print("\nTop 10 Features:")
# print(importance_df.head(10))