In [1]:
%reload_ext autoreload
%autoreload 2

%cd ..

/home/den/dev/git/ozon-e-cup-2025


In [None]:
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score
from tqdm.auto import tqdm

from src.optuna_utils import print_study_results
from src.splits import create_stratified_splits

In [None]:
train = pd.read_csv("data/2__train_with_flags_num_cat.csv")
test = pd.read_csv("data/2__test_with_flags_num_cat.csv")

In [None]:
ratio = (train["resolution"] == 0).sum() / (train["resolution"] == 1).sum()
ratio = float(ratio)
ratio

14.108642353662274

In [None]:
# Preprocessing and splitting

cols_to_drop = ["description", "name_rus", "ItemID", "SellerID", "id"]
train = train.drop(columns=cols_to_drop)

test_ids = test["id"]  # нужно для сабмита
test = test.drop(columns=cols_to_drop)

splits = create_stratified_splits(data=train, n_splits=5)

cols_to_drop = ["brand_name", "CommercialTypeName4"]
train = train.drop(columns=cols_to_drop)
test = test.drop(columns=cols_to_drop)

In [None]:
# Common params
param = {
    "device": "cpu",
    "validate_parameters": True,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "seed": 42,
    "verbosity": 0,
}

In [None]:
def objective(trial):
    study_params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.1, 0.5),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "max_depth": trial.suggest_int("max_depth", 3, 32),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True),
        "scale_pos_weight": trial.suggest_categorical("scale_pos_weight", [1, ratio / 2, ratio, ratio * 2]),
    }
    scores = []

    for train_idx, val_idx in tqdm(splits, leave=False, desc="Cross-validation"):
        dtrain = xgb.DMatrix(
            train.iloc[train_idx].drop(columns=["resolution"]), label=train.iloc[train_idx]["resolution"]
        )
        dval = xgb.DMatrix(train.iloc[val_idx].drop(columns=["resolution"]), label=train.iloc[val_idx]["resolution"])

        model = xgb.train(
            params={**param, **study_params}, dtrain=dtrain, evals=[(dval, "val")], early_stopping_rounds=100
        )

        pred_probas = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
        pred_labels = (pred_probas > 0.5).astype(int)

        score = f1_score(train.loc[val_idx, "resolution"].to_numpy(), pred_labels, average="macro")
        scores.append(score)

    return sum(scores) / len(scores)

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, n_jobs=-1, show_progress_bar=True)

[I 2025-08-28 14:51:14,947] A new study created in memory with name: no-name-4a505c03-6c8e-4503-9042-cf6ab0bae258


  0%|          | 0/100 [00:00<?, ?it/s]

[0]	val-logloss:0.53453
[0]	val-logloss:0.76814
[1]	val-logloss:0.44872
[1]	val-logloss:0.62179
[2]	val-logloss:0.39967
[2]	val-logloss:0.53190
[3]	val-logloss:0.36159
[3]	val-logloss:0.46708
[4]	val-logloss:0.41627
[4]	val-logloss:0.33213
[5]	val-logloss:0.30854
[0]	val-logloss:0.53824
[6]	val-logloss:0.29299
[5]	val-logloss:0.37811
[7]	val-logloss:0.27996
[8]	val-logloss:0.27220
[9]	val-logloss:0.26900
[6]	val-logloss:0.35129
[1]	val-logloss:0.36975
[0]	val-logloss:0.61320
[7]	val-logloss:0.32677
[2]	val-logloss:0.27547
[8]	val-logloss:0.30509
[1]	val-logloss:0.43330
[3]	val-logloss:0.22039
[9]	val-logloss:0.29078
[0]	val-logloss:0.53302
[1]	val-logloss:0.44941
[4]	val-logloss:0.18806
[2]	val-logloss:0.39072
[2]	val-logloss:0.32605
[3]	val-logloss:0.35630
[4]	val-logloss:0.33175


In [None]:
print_study_results(study)

Study statistics: 
 Number of finished trials: 100
 Number of pruned trials: 0
 Number of complete trials: 100
Best trial:
 Value: 0.8673045154860317
 Params:
  learning_rate: 0.3047198717390245
  gamma: 0.3655997787871119
  max_depth: 16
