In [1]:
%reload_ext autoreload
%autoreload 2

%cd ../..

/home/den/dev/git/ozon-e-cup-2025


In [None]:
from multiprocessing import cpu_count

cpu_count()

4

In [None]:
import optuna
import pandas as pd
import xgboost as xgb
from IPython.display import clear_output
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.pipeline import Pipeline
from tqdm.auto import tqdm

import src.preprocessing as prep
from src.optuna_utils import print_study_results

In [None]:
train = pd.read_csv("data/1__train_with_flags.csv")
test = pd.read_csv("data/1__test_with_flags.csv")

In [None]:
ratio = (train["resolution"] == 0).sum() / (train["resolution"] == 1).sum()
ratio = float(ratio)
ratio

14.108642353662274

# Preparation

In [None]:
# Splitting
sgkf = StratifiedGroupKFold(n_splits=10, shuffle=True, random_state=42)
splits = []
for i, (train_index, test_index) in enumerate(sgkf.split(X=train, y=train["resolution"], groups=train["SellerID"])):
    splits.append((train_index, test_index))

In [None]:
for i, j in splits:
    total = len(i) + len(j)
    print(i.shape, j.shape, f"({len(i)/total:.2%} train, {len(j)/total:.2%} val)")

(175745,) (21453,) (89.12% train, 10.88% val)
(174918,) (22280,) (88.70% train, 11.30% val)
(176593,) (20605,) (89.55% train, 10.45% val)
(182218,) (14980,) (92.40% train, 7.60% val)
(178868,) (18330,) (90.70% train, 9.30% val)
(177635,) (19563,) (90.08% train, 9.92% val)
(180594,) (16604,) (91.58% train, 8.42% val)
(177695,) (19503,) (90.11% train, 9.89% val)
(170263,) (26935,) (86.34% train, 13.66% val)
(180253,) (16945,) (91.41% train, 8.59% val)


In [None]:
# Preprocessing
train["SellerID"] = train["SellerID"].astype("string")
test["SellerID"] = test["SellerID"].astype("string")

cols_to_drop = ["id", "description", "name_rus", "ItemID"]
train = train.drop(columns=cols_to_drop)

test_ids = test["id"]
test = test.drop(columns=cols_to_drop)

In [None]:
# Для ускорения - заранее применим prep_pipe ко всем сплитам
prep_splits = []
prep_pipes = []
for train_ids, val_ids in tqdm(splits, total=len(splits)):
    X_train, y_train = train.loc[train_ids].drop("resolution", axis=1), train["resolution"].loc[train_ids]
    X_val, y_val = train.loc[val_ids].drop("resolution", axis=1), train["resolution"].loc[val_ids]

    prep_pipe = Pipeline(
        steps=[
            ("num", prep.NUMERIC_PREP_PIPELINE),
            ("cat", prep.CATEGORIAL_PREP_PIPELINE),
        ],
        verbose=False,
    )
    prep_pipe.fit(X_train)
    prep_pipes.append(prep_pipe)

    X_train = prep_pipe.transform(X_train, is_train=True)
    X_val = prep_pipe.transform(X_val, is_train=False)

    prep_splits.append(((X_train, y_train), (X_val, y_val)))

clear_output(wait=False)

In [None]:
# Common params
param = {
    "device": "cpu",  # "cpu" or "cuda"
    "validate_parameters": True,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "seed": 42,
    "verbosity": 0,
}

# Find optimal hyper-parameters

In [None]:
def objective(trial):
    study_params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.75),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "max_depth": trial.suggest_int("max_depth", 3, 32),
        "min_child_weight": trial.suggest_float("min_child_weight", 0, 5),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True),
        "scale_pos_weight": trial.suggest_categorical("scale_pos_weight", [1, ratio / 2, ratio, ratio * 2]),
    }
    scores = []

    for (X_train, y_train), (X_val, y_val) in tqdm(prep_splits, leave=False, desc="Cross-validation"):
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)

        model = xgb.train(
            params={**param, **study_params}, dtrain=dtrain, evals=[(dval, "val")], early_stopping_rounds=100
        )

        pred_probas = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
        pred_labels = (pred_probas > 0.5).astype(int)

        score = f1_score(y_val, pred_labels, average="macro")
        scores.append(score)

    return sum(scores) / len(scores)

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5, n_jobs=-1, show_progress_bar=True)

In [None]:
print_study_results(study)

Study statistics: 
 Number of finished trials: 5
 Number of pruned trials: 0
 Number of complete trials: 5
Best trial:
 Value: 0.7293424925252011
 Params:
  learning_rate: 0.25722217273296694
  gamma: 2.386996117330141
  max_depth: 11
  min_child_weight: 1.6538608904056773
  subsample: 0.9904351766097355
  lambda: 0.0018662189518120168
  alpha: 0.0029180230425823327
  scale_pos_weight: 7.054321176831137


# Train model with best hyper-parameters

Обучу на каждом сплите модель с early stopping-ом и выберу ту, у которой больше всего f1-score на вал сплите - и сниму с неё предсказания для теста.

In [None]:
f1_score__best_iteration = []

for (X_train, y_train), (X_val, y_val) in tqdm(prep_splits, leave=False, desc="Cross-validation"):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    model = xgb.train(
        params={**param, **study.best_params}, dtrain=dtrain, evals=[(dval, "val")], early_stopping_rounds=100
    )

    pred_probas = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
    pred_labels = (pred_probas > 0.5).astype(int)

    score = f1_score(y_val, pred_labels, average="macro")

    f1_score__best_iteration.append((score, model.best_iteration))

Cross-validation:   0%|          | 0/10 [00:00<?, ?it/s]

[0]	val-logloss:0.39534
[1]	val-logloss:0.35043
[2]	val-logloss:0.31497
[3]	val-logloss:0.28811
[4]	val-logloss:0.26788
[5]	val-logloss:0.25431
[6]	val-logloss:0.24449
[7]	val-logloss:0.23982
[8]	val-logloss:0.23697
[9]	val-logloss:0.23150
[0]	val-logloss:0.39865
[1]	val-logloss:0.34727
[2]	val-logloss:0.31674
[3]	val-logloss:0.29222
[4]	val-logloss:0.27334
[5]	val-logloss:0.26324
[6]	val-logloss:0.25138
[7]	val-logloss:0.24501
[8]	val-logloss:0.23185
[9]	val-logloss:0.23047
[0]	val-logloss:0.39615
[1]	val-logloss:0.34894
[2]	val-logloss:0.31733
[3]	val-logloss:0.28768
[4]	val-logloss:0.26677
[5]	val-logloss:0.24879
[6]	val-logloss:0.23204
[7]	val-logloss:0.22538
[8]	val-logloss:0.22142
[9]	val-logloss:0.21794
[0]	val-logloss:0.39578
[1]	val-logloss:0.34746
[2]	val-logloss:0.31588
[3]	val-logloss:0.29571
[4]	val-logloss:0.28017
[5]	val-logloss:0.26518
[6]	val-logloss:0.24928
[7]	val-logloss:0.23717
[8]	val-logloss:0.23001
[9]	val-logloss:0.22087
[0]	val-logloss:0.38747
[1]	val-logloss:

In [None]:
for i, (f1, best_iter) in enumerate(f1_score__best_iteration):
    print(f"Split: {i}, F1 Score: {f1}, Best Iteration: {best_iter}")

Split: 0, F1 Score: 0.7109489697681257, Best Iteration: 9
Split: 1, F1 Score: 0.6715208910245607, Best Iteration: 9
Split: 2, F1 Score: 0.6960800998671216, Best Iteration: 9
Split: 3, F1 Score: 0.7173717783760244, Best Iteration: 9
Split: 4, F1 Score: 0.7592724950355308, Best Iteration: 9
Split: 5, F1 Score: 0.7633646280917197, Best Iteration: 9
Split: 6, F1 Score: 0.7472494753582115, Best Iteration: 9
Split: 7, F1 Score: 0.7326642285739351, Best Iteration: 9
Split: 8, F1 Score: 0.7608696855592645, Best Iteration: 8
Split: 9, F1 Score: 0.7340826735975168, Best Iteration: 9


In [None]:
(X_train, y_train), (X_val, y_val) = prep_splits[3]
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

model = xgb.train(
    params={**param, **study.best_params}, dtrain=dtrain, evals=[(dval, "val")], early_stopping_rounds=100
)

X_test = prep_pipes[3].transform(test, is_train=False)
dtest = xgb.DMatrix(X_test)
pred_probas = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))
pred_labels = (pred_probas > 0.5).astype(int)

[0]	val-logloss:0.39578
[1]	val-logloss:0.34746
[2]	val-logloss:0.31588
[3]	val-logloss:0.29571
[4]	val-logloss:0.28017
[5]	val-logloss:0.26518
[6]	val-logloss:0.24928
[7]	val-logloss:0.23717
[8]	val-logloss:0.23001
[9]	val-logloss:0.22087




In [None]:
submission = pd.DataFrame({"id": test_ids, "prediction": (pred_labels)})
submission.to_csv("submission.csv", index=False)