In [None]:
%pip install fastparquet
%pip install lightgbm

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_parquet("../data/train_final_dataset.snappy.parquet", engine="fastparquet")

In [3]:
df['image_similarity'] = df['image_similarity'].fillna(0)

df[['is_same_location', 'is_same_region']] = df[['is_same_location', 'is_same_region']].astype(int)

In [4]:
# Совпадение категорий и параметров
df['same_category'] = (df['base_category_name'] == df['cand_category_name']).astype(int)
df['same_subcategory'] = (df['base_subcategory_name'] == df['cand_subcategory_name']).astype(int)

In [5]:
df['price_diff_pct'] = 2 * abs(df['base_price'] - df['cand_price']) / (df['base_price'] + df['cand_price'] + 1e-6)

df['images_diff'] = df['base_count_images'] - df['cand_count_images']

In [6]:
numeric_cols = [
    'base_price', 
    'cand_price',
    'price_diff_pct',
    'base_count_images',
    'cand_count_images',
    'images_diff',
    'common_params_count',
    'same_values_count'
]

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [7]:
df['base_param1'] = df['base_param1'].replace('', 'Не указано')
df['cand_param1'] = df['cand_param1'].replace('', 'Не указано')
df['same_param1'] = (df['base_param1'] == df['cand_param1']).astype(int)
le_param1 = LabelEncoder()
all_values = pd.concat([df['base_param1'], df['cand_param1']]).unique()
le_param1.fit(all_values)

df['base_param1_encoded'] = le_param1.transform(df['base_param1'])
df['cand_param1_encoded'] = le_param1.transform(df['cand_param1'])


In [8]:
df['base_param2'] = df['base_param2'].replace('', 'Не указано')
df['cand_param2'] = df['cand_param2'].replace('', 'Не указано')
df['same_param2'] = (df['base_param2'] == df['cand_param2']).astype(int)
le_param2 = LabelEncoder()
all_values = pd.concat([df['base_param2'], df['cand_param2']]).unique()
le_param2.fit(all_values)

df['base_param2_encoded'] = le_param2.transform(df['base_param2'])
df['cand_param2_encoded'] = le_param2.transform(df['cand_param2'])

In [9]:
columns_to_drop = [
    'base_item_id', 'cand_item_id', 'group_id', 'action_date',
    'base_title', 'cand_title', 'base_description', 'cand_description',
    'base_json_params', 'cand_json_params',
    'base_title_image', 'cand_title_image',
    'base_category_name', 'cand_category_name',
    'base_subcategory_name', 'cand_subcategory_name',
    'base_param1', 'cand_param1',
    'base_param2', 'cand_param2',
]

df = df.drop(columns=columns_to_drop)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 500000 entries, 0 to 499999
Data columns (total 23 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   base_price              500000 non-null  float64
 1   cand_price              500000 non-null  float64
 2   base_count_images       500000 non-null  float64
 3   cand_count_images       500000 non-null  float64
 4   is_same_location        500000 non-null  int32  
 5   is_same_region          500000 non-null  int32  
 6   is_double               500000 non-null  int64  
 7   common_params_count     500000 non-null  float64
 8   same_values_count       500000 non-null  float64
 9   image_similarity        500000 non-null  float64
 10  basic_image_similarity  500000 non-null  float64
 11  title_similarity        500000 non-null  float64
 12  description_similarity  500000 non-null  float64
 13  same_category           500000 non-null  int32  
 14  same_subcategory        5

In [14]:
features = [
    'base_price', 'cand_price', 'price_diff_pct',
    'base_count_images', 'cand_count_images', 'images_diff',
    
    'title_similarity', 'description_similarity',
    'image_similarity', 'basic_image_similarity',
    
    'same_category', 'same_subcategory',
    'common_params_count', 'same_values_count',
    
    'is_same_location', 'is_same_region',

    'same_param1', 'base_param1_encoded', 'cand_param1_encoded',
    'same_param2', 'base_param2_encoded', 'cand_param2_encoded'
]

X = df[features]
y = df['is_double']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

## Model training

In [15]:
import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (roc_auc_score, average_precision_score, 
                            precision_recall_curve, f1_score, 
                            classification_report)
import lightgbm as lgb
from sklearn.isotonic import IsotonicRegression
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from scipy.stats import randint, uniform
import joblib

In [16]:
# параметр для LightGBM, учитывающий дисбаланс данных
scale_pos_weight = (len(y_train) - sum(y_train)) / sum(y_train) 

base_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'average_precision',
    'scale_pos_weight': scale_pos_weight,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1,
    'min_child_weight': 0.01,
    'min_split_gain': 0.0,
    'path_smooth': 0.1
}

param_dist = {
    'learning_rate': uniform(0.01, 0.1),
    'num_leaves': randint(20, 50),
    'max_depth': randint(3, 8),
    'min_child_samples': randint(30, 100),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(0, 1),
    'subsample': uniform(0.7, 0.3),  # от 0.6 до 1.0
    'colsample_bytree': uniform(0.7, 0.3),
    'feature_fraction': uniform(0.7, 0.3),
    'bagging_freq': randint(3, 10),
    'extra_trees': [True, False]
}

In [17]:
# Внешняя кросс-валидация для оценки модели
cv_outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Внутренняя кросс-валидация для подбора параметров
cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Для хранения калиброванных предсказаний
test_preds = np.zeros(len(X_test))
oof_preds = np.zeros(len(X_train))
models = []
calibrators = []
thresholds = []

for fold, (train_idx, val_idx) in enumerate(cv_outer.split(X_train, y_train)):
    X_train_cross, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_cross, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    # Подбор параметров на внутренней кросс-валидации
    lgbm = lgb.LGBMClassifier(**base_params)
    search = RandomizedSearchCV(
        estimator=lgbm,
        param_distributions=param_dist,
        n_iter=30,
        scoring='average_precision',
        cv=cv_inner,
        verbose=0,
        random_state=42,
        n_jobs=-1
    )
    
    search.fit(X_train_cross, y_train_cross)
    best_params = search.best_params_
    final_params = {**base_params, **best_params}
    
    # Обучение модели с лучшими параметрами
    train_data = lgb.Dataset(X_train_cross, label=y_train_cross)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
    
    model = lgb.train(
        final_params,
        train_data,
        num_boost_round=1000,
        valid_sets=[train_data, val_data],
        valid_names=['train', 'valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(100)
        ]
    )
    
    # Получение предсказаний для валидационного набора
    val_preds = model.predict(X_val)
    oof_preds[val_idx] = val_preds
    
    # Калибровка на валидационном наборе
    calibrator = IsotonicRegression(out_of_bounds='clip')
    calibrator.fit(val_preds, y_val)
    
    # Оптимизация порога по F1
    precisions, recalls, threshs = precision_recall_curve(y_val, calibrator.predict(val_preds))
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
    optimal_idx = np.argmax(f1_scores)
    optimal_threshold = threshs[optimal_idx]
    
    # Сохранение модели и калибратора
    models.append(model)
    calibrators.append(calibrator)
    thresholds.append(optimal_threshold)
    
    # Предсказание на тестовом наборе
    test_preds += model.predict(X_test) / cv_outer.get_n_splits()
    
    print(f"Fold {fold + 1} AP: {average_precision_score(y_val, val_preds):.4f}")
    print(f"Fold {fold + 1} Calibrated AP: {average_precision_score(y_val, calibrator.predict(val_preds)):.4f}")
    print(f"Fold {fold + 1} Optimal threshold: {optimal_threshold:.4f}")

[100]	train's average_precision: 0.601922	valid's average_precision: 0.584793
[200]	train's average_precision: 0.648811	valid's average_precision: 0.617029
[300]	train's average_precision: 0.680852	valid's average_precision: 0.632404
[400]	train's average_precision: 0.706711	valid's average_precision: 0.64239
[500]	train's average_precision: 0.730659	valid's average_precision: 0.651872
[600]	train's average_precision: 0.748827	valid's average_precision: 0.657182
[700]	train's average_precision: 0.76716	valid's average_precision: 0.662164
[800]	train's average_precision: 0.783674	valid's average_precision: 0.666262
[900]	train's average_precision: 0.799028	valid's average_precision: 0.669834
[1000]	train's average_precision: 0.812409	valid's average_precision: 0.672854
Fold 1 AP: 0.6729
Fold 1 Calibrated AP: 0.6642
Fold 1 Optimal threshold: 0.3696
[100]	train's average_precision: 0.606289	valid's average_precision: 0.569553
[200]	train's average_precision: 0.650959	valid's average_preci

In [18]:
# Калибровка на всех OOF предсказаниях (финальный калибратор)
final_calibrator = IsotonicRegression(out_of_bounds='clip')
final_calibrator.fit(oof_preds, y_train)

# Оптимизация порога на всех OOF предсказаниях
precisions, recalls, threshs = precision_recall_curve(y_train, final_calibrator.predict(oof_preds))
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
optimal_idx = np.argmax(f1_scores)
final_threshold = threshs[optimal_idx]

# Калибровка тестовых предсказаний
test_calibrated = final_calibrator.predict(test_preds)
test_pred = (test_calibrated >= final_threshold).astype(int)

# Оценка модели
print("\nFinal Evaluation:")
print(f"OOF MAP: {average_precision_score(y_train, oof_preds):.4f}")
print(f"OOF Calibrated MAP: {average_precision_score(y_train, final_calibrator.predict(oof_preds)):.4f}")
print(f"Test MAP: {average_precision_score(y_test, test_calibrated):.4f}")
print("\nClassification Report (at optimal threshold):")
print(classification_report(y_test, test_pred))
print("\nConfusion Matrix:")
print(pd.crosstab(y_test, test_pred, rownames=['Actual'], colnames=['Predicted']))


Final Evaluation:
OOF MAP: 0.6717
OOF Calibrated MAP: 0.6666
Test MAP: 0.6758

Classification Report (at optimal threshold):
              precision    recall  f1-score   support

           0       0.97      0.98      0.98     93882
           1       0.67      0.59      0.63      6118

    accuracy                           0.96    100000
   macro avg       0.82      0.78      0.80    100000
weighted avg       0.95      0.96      0.96    100000


Confusion Matrix:
Predicted      0     1
Actual                
0          92095  1787
1           2519  3599


In [None]:
# Сохранение модели
model_assets = {
    'models': models,
    'final_calibrator': final_calibrator,
    'optimal_threshold': final_threshold,
    'features': features,
    'calibrators': calibrators,
    'params': final_params,
    'performance': {
        'test_map': average_precision_score(y_test, test_calibrated),
        'test_f1': f1_score(y_test, test_pred),
        'oof_map': average_precision_score(y_train, oof_preds),
        'oof_calibrated_map': average_precision_score(y_train, final_calibrator.predict(oof_preds))
    }
}

joblib.dump(model_assets, '../data/models/best_model_assets_2.joblib')
print("\nModel and all assets saved to 'best_model_assets_2.joblib'")


Model and all assets saved to 'best_model_assets_2.joblib'
