In [2]:
%reload_ext autoreload
%autoreload 2

In [1]:
%cd ..

/home/den/dev/git/ozon-e-cup-2025


In [3]:
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import sklearn
from sklearn.ensemble import (
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from tqdm.auto import tqdm

from src.optuna_utils import print_study_results
from src.preprocessing import CATEGORIAL_PREP_PIPELINE, NUMERIC_PREP_PIPELINE
from src.splits import create_final_splits, create_train_splits

In [4]:
train = pd.read_csv("data/1__train_with_flags.csv")
test = pd.read_csv("data/1__test_with_flags.csv")

# Preparation

In [17]:
train_splits = create_train_splits(data=train, n_splits=5)
final_splits = create_final_splits(data=train, n_splits=2)

In [18]:
for i, j in train_splits:
    total = len(i) + len(j)
    print(
        i.shape, j.shape,
        f"({len(i)/total:.2%} train, {len(j)/total:.2%} val)"
    )

(179662,) (17536,) (91.11% train, 8.89% val)
(50129,) (147069,) (25.42% train, 74.58% val)
(197192,) (6,) (100.00% train, 0.00% val)
(178530,) (18668,) (90.53% train, 9.47% val)
(183279,) (13919,) (92.94% train, 7.06% val)


In [16]:
for i, j in final_splits:
    total = len(i) + len(j)
    print(
        i.shape, j.shape,
        f"({len(i)/total:.2%} train, {len(j)/total:.2%} val)"
    )

(157758,) (39440,) (80.00% train, 20.00% val)
(157758,) (39440,) (80.00% train, 20.00% val)


In [11]:
train["SellerID"] = train["SellerID"].astype("string")
test["SellerID"] = test["SellerID"].astype("string")

In [12]:
train.columns

Index(['id', 'resolution', 'brand_name', 'description', 'name_rus',
       'CommercialTypeName4', 'rating_1_count', 'rating_2_count',
       'rating_3_count', 'rating_4_count', 'rating_5_count',
       'comments_published_count', 'photos_published_count',
       'videos_published_count', 'PriceDiscounted', 'item_time_alive',
       'item_count_fake_returns7', 'item_count_fake_returns30',
       'item_count_fake_returns90', 'item_count_sales7', 'item_count_sales30',
       'item_count_sales90', 'item_count_returns7', 'item_count_returns30',
       'item_count_returns90', 'GmvTotal7', 'GmvTotal30', 'GmvTotal90',
       'ExemplarAcceptedCountTotal7', 'ExemplarAcceptedCountTotal30',
       'ExemplarAcceptedCountTotal90', 'OrderAcceptedCountTotal7',
       'OrderAcceptedCountTotal30', 'OrderAcceptedCountTotal90',
       'ExemplarReturnedCountTotal7', 'ExemplarReturnedCountTotal30',
       'ExemplarReturnedCountTotal90', 'ExemplarReturnedValueTotal7',
       'ExemplarReturnedValueTotal30', '

In [13]:
cols_to_drop = ["id", "description", "name_rus", "ItemID"]
train = train.drop(columns=cols_to_drop)

test_ids = test["id"]  # нужно для сабмита
test = test.drop(columns=cols_to_drop)

In [14]:
prep_pipe = Pipeline(
    steps=[
        ("num", NUMERIC_PREP_PIPELINE),
        ("cat", CATEGORIAL_PREP_PIPELINE),
    ],
    verbose=False,
)

In [None]:
# Для ускорения - заранее применим prep_pipe ко всем сплитам
prep_splits = []
for train_ids, val_ids in tqdm(train_splits):
    X_train, y_train = train.loc[train_ids].drop("resolution", axis=1), train["resolution"].loc[train_ids]
    X_val, y_val = train.loc[val_ids].drop("resolution", axis=1), train["resolution"].loc[val_ids]

    prep_pipe.fit(X_train)

    X_train = prep_pipe.transform(X_train, is_train=True)
    X_val = prep_pipe.transform(X_val, is_train=False)

    prep_splits.append(((X_train, y_train), (X_val, y_val)))

## LogisticRegression

In [None]:
def objective(trial):
    scores = []
    for (X_train, y_train), (X_val, y_val) in tqdm(prep_splits, leave=False, desc="Cross-validation"):

        model = LogisticRegression(
            penalty="l2",
            C=trial.suggest_float("C", 1e-4, 1e4, log=True),
            class_weight="balanced",
            random_state=42,
            max_iter=trial.suggest_int("max_iter", 100, 1000),
            n_jobs=-1,
        )

        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        score = f1_score(y_val, preds, average="macro")

        scores.append(score)

    return np.mean(scores)

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=25, n_jobs=-1, show_progress_bar=True)

In [14]:
print_study_results(study)

Study statistics: 
 Number of finished trials: 25
 Number of pruned trials: 0
 Number of complete trials: 25
Best trial:
 Value: 0.6173788773197413
 Params:
  C: 0.0027661044640621083
  max_iter: 693


In [16]:
study.best_params

{'C': 0.0027661044640621083, 'max_iter': 693}

In [15]:
best_params = {}

In [18]:
split2f1score = {}

for i, ((X_train, y_train), (X_val, y_val)) in tqdm(enumerate(prep_splits), leave=False, desc="Cross-validation"):

    model = LogisticRegression(
            penalty="l2",
            class_weight="balanced",
            random_state=42,
            n_jobs=-1,
            **study.best_params
        )
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    score = f1_score(y_val, preds, average="macro")

    split2f1score[i] = score

Cross-validation: 0it [00:00, ?it/s]

In [19]:
split2f1score

{0: 0.6029279551665013,
 1: 0.5196838733852344,
 2: 1.0,
 3: 0.5732028765751973,
 4: 0.5785862927022623,
 5: 0.583605971727378,
 6: 0.6821460587142945,
 7: 0.5863974510456219,
 8: 0.48179986963472254,
 9: 0.5654384242462004}

In [30]:
for i, j in prep_splits:
    print(i[1].shape, j[1].shape)

(64486,) (132712,)
(186996,) (10202,)
(197195,) (3,)
(190013,) (7185,)
(189332,) (7866,)
(190151,) (7047,)
(189943,) (7255,)
(190180,) (7018,)
(191691,) (5507,)
(184795,) (12403,)


In [None]:
submission = pd.DataFrame({"id": test_ids, "prediction": final_model.predict(test)})
submission.to_csv("submission.csv", index=False)

## GradientBoostingClassifier

In [None]:
def objective(trial):
    scores = []
    for train_ids, val_ids in tqdm(splits, leave=False):
        X_train, X_val = train.iloc[train_ids].drop(columns=["resolution"]), train.iloc[val_ids].drop(
            columns=["resolution"]
        )
        y_train, y_val = train["resolution"].iloc[train_ids], train["resolution"].iloc[val_ids]

        model = GradientBoostingClassifier(
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3),
            n_estimators=trial.suggest_int("n_estimators", 100, 1000),
            subsample=trial.suggest_float("subsample", 0.5, 1),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
            min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 50),
            min_weight_fraction_leaf=trial.suggest_float("min_weight_fraction_leaf", 0.0, 0.5),
            max_depth=trial.suggest_int("max_depth", 3, 25),
            min_impurity_decrease=trial.suggest_float("min_impurity_decrease", 0.0, 0.5),
            random_state=42,
            max_features=trial.suggest_float("max_features", 0.5, 1.0),
            validation_fraction=0.1,
            n_iter_no_change=25,
        )

        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        score = f1_score(y_val, preds, average="macro")

        scores.append(score)

    return np.mean(scores)

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10, n_jobs=-1, show_progress_bar=True)

[I 2025-08-27 17:17:29,079] A new study created in memory with name: no-name-5fa2cc51-7979-48ad-a00a-54be7db5ec25


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-08-27 17:27:48,151] Trial 0 finished with value: 0.4828897338403042 and parameters: {'learning_rate': 0.08611754378703265, 'n_estimators': 891, 'subsample': 0.5492454381887486, 'min_samples_split': 7, 'min_samples_leaf': 38, 'min_weight_fraction_leaf': 0.47232869163041713, 'max_depth': 6, 'min_impurity_decrease': 0.36334000689386386, 'max_features': 0.7176446969354893}. Best is trial 0 with value: 0.4828897338403042.


  0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-08-27 17:41:36,108] Trial 1 finished with value: 0.7436471439198398 and parameters: {'learning_rate': 0.28599737124164804, 'n_estimators': 902, 'subsample': 0.9314376911670305, 'min_samples_split': 4, 'min_samples_leaf': 17, 'min_weight_fraction_leaf': 0.10844978622460372, 'max_depth': 7, 'min_impurity_decrease': 0.22133121495889352, 'max_features': 0.8710606174191504}. Best is trial 1 with value: 0.7436471439198398.


  0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-08-27 17:47:20,743] Trial 3 finished with value: 0.5707357298031969 and parameters: {'learning_rate': 0.28245032558032657, 'n_estimators': 652, 'subsample': 0.8020944262719305, 'min_samples_split': 2, 'min_samples_leaf': 7, 'min_weight_fraction_leaf': 0.38367232549776714, 'max_depth': 19, 'min_impurity_decrease': 0.07299788335107882, 'max_features': 0.9215213039104746}. Best is trial 1 with value: 0.7436471439198398.


  0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-08-27 17:55:10,420] Trial 4 finished with value: 0.6573900573703279 and parameters: {'learning_rate': 0.20792702696654936, 'n_estimators': 551, 'subsample': 0.8066006881876278, 'min_samples_split': 2, 'min_samples_leaf': 18, 'min_weight_fraction_leaf': 0.23757121231070077, 'max_depth': 7, 'min_impurity_decrease': 0.030069252369418542, 'max_features': 0.600874851105353}. Best is trial 1 with value: 0.7436471439198398.


  0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-08-27 17:56:15,829] Trial 6 finished with value: 0.7646218788411101 and parameters: {'learning_rate': 0.280500728857195, 'n_estimators': 603, 'subsample': 0.982649399026074, 'min_samples_split': 8, 'min_samples_leaf': 48, 'min_weight_fraction_leaf': 0.024803530472957358, 'max_depth': 21, 'min_impurity_decrease': 0.3800344244422298, 'max_features': 0.6302701714411555}. Best is trial 6 with value: 0.7646218788411101.


  0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-08-27 17:58:38,460] Trial 5 finished with value: 0.4828897338403042 and parameters: {'learning_rate': 0.02282597114692983, 'n_estimators': 890, 'subsample': 0.5656175805898258, 'min_samples_split': 6, 'min_samples_leaf': 42, 'min_weight_fraction_leaf': 0.48236612485945074, 'max_depth': 22, 'min_impurity_decrease': 0.18492751400817625, 'max_features': 0.800795330928231}. Best is trial 6 with value: 0.7646218788411101.


  0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-08-27 18:10:43,880] Trial 8 finished with value: 0.7975256812491976 and parameters: {'learning_rate': 0.21642920630412593, 'n_estimators': 395, 'subsample': 0.5798151502345368, 'min_samples_split': 2, 'min_samples_leaf': 4, 'min_weight_fraction_leaf': 0.009099225776470043, 'max_depth': 23, 'min_impurity_decrease': 0.363135995170654, 'max_features': 0.9105899704729185}. Best is trial 8 with value: 0.7975256812491976.
[I 2025-08-27 18:25:37,559] Trial 2 finished with value: 0.7932285555048356 and parameters: {'learning_rate': 0.09543870713707266, 'n_estimators': 822, 'subsample': 0.7111549920621226, 'min_samples_split': 8, 'min_samples_leaf': 20, 'min_weight_fraction_leaf': 0.04672474781249436, 'max_depth': 15, 'min_impurity_decrease': 0.15013062712114322, 'max_features': 0.7739045304080521}. Best is trial 8 with value: 0.7975256812491976.
[I 2025-08-27 18:27:14,345] Trial 9 finished with value: 0.5308121635395173 and parameters: {'learning_rate': 0.1334438056721644, 'n_estimator

In [None]:
pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

In [None]:
print("Study statistics: ")
print(" Number of finished trials:", len(study.trials))
print(" Number of pruned trials:", len(pruned_trials))
print(" Number of complete trials:", len(complete_trials))

print("Best trial:")
trial = study.best_trial
print(" Value:", trial.value)
print(" Params:")
for key, value in trial.params.items():
    print("  {}: {}".format(key, value))

Study statistics: 
 Number of finished trials: 10
 Number of pruned trials: 0
 Number of complete trials: 10
Best trial:
 Value: 0.8471947683130238
 Params:
  learning_rate: 0.18083201022490278
  n_estimators: 836
  subsample: 0.5288785002220615
  min_samples_split: 5
  min_samples_leaf: 39
  min_weight_fraction_leaf: 0.04000615012124609
  max_depth: 23
  min_impurity_decrease: 0.018732856779112794
  max_features: 0.5571432552599271


In [None]:
# Fit final model with best hyperparameters on whole train
final_model = GradientBoostingClassifier(
    validation_fraction=0.1, n_iter_no_change=25, random_state=42, **study.best_params
)
final_model.fit(train.drop(columns=["resolution"]), train["resolution"])

0,1,2
,loss,'log_loss'
,learning_rate,0.18083201022490278
,n_estimators,836
,subsample,0.5288785002220615
,criterion,'friedman_mse'
,min_samples_split,5
,min_samples_leaf,39
,min_weight_fraction_leaf,0.04000615012124609
,max_depth,23
,min_impurity_decrease,0.018732856779112794


In [None]:
submission = pd.DataFrame({"id": test_ids, "prediction": final_model.predict(test)})
submission.to_csv("submission.csv", index=False)