In [None]:
# Auto-reload edited project modules so changes are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Move two directories up so the relative "data/..." paths and "src" imports used below resolve.
# NOTE: not idempotent - re-running this cell climbs two more directories; restart the kernel instead.
%cd ../..

/home/den/dev/git/ozon-e-cup-2025


In [None]:
from multiprocessing import cpu_count

# Display the number of CPUs on this machine.
# NOTE(review): presumably checked to gauge how many parallel Optuna workers n_jobs=-1 will use - confirm.
cpu_count()

4

In [None]:
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score
from tqdm.auto import tqdm

from src.optuna_utils import print_study_results
from src.splits import create_stratified_splits

In [None]:
# Load the prepared train/test tables; paths are relative to the working directory.
TRAIN_PATH = "data/2__train_with_flags_num_cat.csv"
TEST_PATH = "data/2__test_with_flags_num_cat.csv"
train, test = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)

In [None]:
# Class imbalance: how many rows have resolution == 0 per row with resolution == 1.
# Used further down as the base value for XGBoost's scale_pos_weight candidates.
n_negative = (train["resolution"] == 0).sum()
n_positive = (train["resolution"] == 1).sum()
ratio = float(n_negative / n_positive)
ratio

14.108642353662274

In [None]:
# Preprocessing and splitting
# NOTE: this cell reassigns `train`/`test`, so it can only be run once per kernel session.

# First pass: remove the columns listed below from both tables.
cols_to_drop = ["description", "name_rus", "ItemID", "SellerID", "id"]
test_ids = test["id"]  # kept aside: needed for the submission file
train, test = train.drop(columns=cols_to_drop), test.drop(columns=cols_to_drop)

# The folds are built while "brand_name" / "CommercialTypeName4" are still present
# (presumably create_stratified_splits relies on them - confirm); they are dropped right after.
splits = create_stratified_splits(data=train, n_splits=10)

cols_to_drop = ["brand_name", "CommercialTypeName4"]
train, test = train.drop(columns=cols_to_drop), test.drop(columns=cols_to_drop)

In [None]:
# Common params: shared by every XGBoost run in this notebook (tuning and final fit).
param = {
    # Runtime settings
    "device": "cpu",  # "cpu" or "cuda"
    "validate_parameters": True,
    "verbosity": 0,
    "seed": 42,
    # Learning task: binary classification, evaluated with log-loss
    "objective": "binary:logistic",
    "eval_metric": "logloss",
}

# Find optimal hyper-parameters

In [None]:
def objective(trial, num_boost_round=2000, early_stopping_rounds=100):
    """Optuna objective: mean macro-F1 of an XGBoost model over the CV splits.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        num_boost_round: Upper bound on boosting rounds per fold. It must be passed
            explicitly: ``xgb.train`` defaults to 10 rounds, in which case early
            stopping with a patience of 100 can never trigger and every model is
            cut off at iteration 9.
        early_stopping_rounds: Stop a fold once the validation metric has not
            improved for this many rounds.

    Returns:
        Average macro-F1 (threshold 0.5) across all folds in ``splits``.
    """
    study_params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.1, 0.5),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "max_depth": trial.suggest_int("max_depth", 3, 32),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True),
        "scale_pos_weight": trial.suggest_categorical("scale_pos_weight", [1, ratio / 2, ratio, ratio * 2]),
    }
    scores = []

    for train_idx, val_idx in tqdm(splits, leave=False, desc="Cross-validation"):
        # Index positionally everywhere (features AND labels) so the validation labels
        # are guaranteed to line up with the rows the model predicts on.
        fold_train = train.iloc[train_idx]
        fold_val = train.iloc[val_idx]
        y_val = fold_val["resolution"]

        dtrain = xgb.DMatrix(fold_train.drop(columns=["resolution"]), label=fold_train["resolution"])
        dval = xgb.DMatrix(fold_val.drop(columns=["resolution"]), label=y_val)

        model = xgb.train(
            params={**param, **study_params},
            dtrain=dtrain,
            num_boost_round=num_boost_round,
            evals=[(dval, "val")],
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False,  # per-round logs from parallel trials would flood the output
        )

        # Predict with the trees up to (and including) the best early-stopping iteration.
        pred_probas = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
        pred_labels = (pred_probas > 0.5).astype(int)

        scores.append(f1_score(y_val.to_numpy(), pred_labels, average="macro"))

    return sum(scores) / len(scores)

In [None]:
# Search for the hyper-parameters that maximize the value returned by `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(
    objective,
    n_trials=100,
    n_jobs=-1,  # -1: Optuna uses as many parallel workers as there are CPUs
    show_progress_bar=True,
)

In [None]:
# Report the outcome of the study via the project helper from src.optuna_utils.
# NOTE(review): presumably prints the best value and parameters - confirm against the helper.
print_study_results(study)

In [None]:
# Hyper-parameters hard-coded here so the final fit can be run without re-running the study.
best_params = {
    # Boosting / tree shape
    "learning_rate": 0.4147330101591056,
    "max_depth": 28,
    "min_child_weight": 2,
    "subsample": 0.9273606284420032,
    # Regularization
    "gamma": 0.18627164220069048,
    "lambda": 0.0025309914844835178,
    "alpha": 0.00896253701574146,
    # Class weighting: equals the displayed class ratio divided by two
    "scale_pos_weight": 7.054321176831137,
}

# Train model with best hyper-parameters

На каждом сплите обучу модель с early stopping и выберу ту, у которой наибольший F1-score на валидационном сплите, — с неё и сниму предсказания для теста.

In [None]:
# Train one model per CV split with the best hyper-parameters and keep, for each split,
# a tuple of (validation macro-F1, best iteration, test-set probabilities).
f1_score__best_iteration__test_preds = []

# xgb.train defaults to only 10 boosting rounds; without an explicit, larger cap the
# early-stopping patience below can never trigger and every model stops at iteration 9.
NUM_BOOST_ROUND = 2000
EARLY_STOPPING_ROUNDS = 100

# The test matrix does not depend on the split, so build it once.
dtest = xgb.DMatrix(test)

for train_idx, val_idx in tqdm(splits, leave=False, desc="Cross-validation"):
    # Index positionally everywhere (features AND labels) so the validation labels
    # are guaranteed to line up with the rows the model predicts on.
    fold_train = train.iloc[train_idx]
    fold_val = train.iloc[val_idx]
    y_val = fold_val["resolution"]

    dtrain = xgb.DMatrix(fold_train.drop(columns=["resolution"]), label=fold_train["resolution"])
    dval = xgb.DMatrix(fold_val.drop(columns=["resolution"]), label=y_val)

    model = xgb.train(
        params={**param, **best_params},
        dtrain=dtrain,
        num_boost_round=NUM_BOOST_ROUND,
        evals=[(dval, "val")],
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose_eval=False,  # the tqdm bar already shows progress; avoid a wall of per-round logs
    )

    # Use only the trees up to (and including) the best early-stopping iteration.
    best_range = (0, model.best_iteration + 1)
    pred_probas = model.predict(dval, iteration_range=best_range)
    pred_labels = (pred_probas > 0.5).astype(int)

    score = f1_score(y_val.to_numpy(), pred_labels, average="macro")

    f1_score__best_iteration__test_preds.append(
        (score, model.best_iteration, model.predict(dtest, iteration_range=best_range))
    )

Cross-validation:   0%|          | 0/10 [00:00<?, ?it/s]

[0]	val-logloss:0.30154
[1]	val-logloss:0.21869
[2]	val-logloss:0.16953
[3]	val-logloss:0.13879
[4]	val-logloss:0.12002
[5]	val-logloss:0.10800
[6]	val-logloss:0.10036
[7]	val-logloss:0.09582
[8]	val-logloss:0.09291
[9]	val-logloss:0.09136
[0]	val-logloss:0.30210
[1]	val-logloss:0.22136
[2]	val-logloss:0.17142
[3]	val-logloss:0.14089
[4]	val-logloss:0.12158
[5]	val-logloss:0.10930
[6]	val-logloss:0.10186
[7]	val-logloss:0.09707
[8]	val-logloss:0.09389
[9]	val-logloss:0.09236
[0]	val-logloss:0.30080
[1]	val-logloss:0.21863
[2]	val-logloss:0.16953
[3]	val-logloss:0.13943
[4]	val-logloss:0.12016
[5]	val-logloss:0.10811
[6]	val-logloss:0.10023
[7]	val-logloss:0.09546
[8]	val-logloss:0.09240
[9]	val-logloss:0.09054
[0]	val-logloss:0.30087
[1]	val-logloss:0.21702
[2]	val-logloss:0.16712
[3]	val-logloss:0.13651
[4]	val-logloss:0.11741
[5]	val-logloss:0.10582
[6]	val-logloss:0.09787
[7]	val-logloss:0.09312
[8]	val-logloss:0.08997
[9]	val-logloss:0.08838
[0]	val-logloss:0.30171
[1]	val-logloss:

In [None]:
# Summarize every split: validation F1 and the iteration chosen by early stopping.
for i, record in enumerate(f1_score__best_iteration__test_preds):
    f1, best_iter = record[0], record[1]  # record[2] holds the test predictions, not shown here
    print(f"Split: {i}, F1 Score: {f1}, Best Iteration: {best_iter}")

Split: 0, F1 Score: 0.8762750136747048, Best Iteration: 9
Split: 1, F1 Score: 0.8748249657994542, Best Iteration: 9
Split: 2, F1 Score: 0.8735877063668296, Best Iteration: 9
Split: 3, F1 Score: 0.8817348242680088, Best Iteration: 9
Split: 4, F1 Score: 0.8792397925266568, Best Iteration: 9
Split: 5, F1 Score: 0.8785231084399906, Best Iteration: 9
Split: 6, F1 Score: 0.8743908840663209, Best Iteration: 9
Split: 7, F1 Score: 0.8809947515228599, Best Iteration: 9
Split: 8, F1 Score: 0.8767075073867432, Best Iteration: 9
Split: 9, F1 Score: 0.8756518859262952, Best Iteration: 9


In [None]:
# Pick the split whose model scored the highest validation F1 instead of hard-coding
# its index, so the choice stays correct whenever the models are retrained.
best_split_record = max(f1_score__best_iteration__test_preds, key=lambda record: record[0])
best_test_probas = best_split_record[2]

# Binarize the probabilities with the same 0.5 threshold used during validation.
submission = pd.DataFrame({"id": test_ids, "prediction": (best_test_probas > 0.5).astype(int)})
submission.to_csv("submission.csv", index=False)