In [1]:
%reload_ext autoreload
%autoreload 2

%cd ../..

/home/den/dev/git/ozon-e-cup-2025


In [None]:
from multiprocessing import cpu_count

cpu_count()

4

In [None]:
import json

import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score
from tqdm.auto import tqdm

from src.optuna_utils import print_study_results
from src.splits import create_stratified_splits

# Data preparation

## Image embeddings

In [None]:
# CLIP (ViT-16, per the file names) image embeddings for the train and test sets.
train_img_emb = np.load("data/clip_image_embs_vit16.npy")
test_img_emb = np.load("data/clip_image_embs_vit16_test.npy")

# Ids stored alongside the train embeddings. The file is opened in a context
# manager so the handle is closed deterministically instead of being leaked.
with open("data/clip_image_ids_vit16_ids.json", encoding="utf-8") as ids_file:
    train_img_ids = json.load(ids_file)

In [None]:
# Numeric / categorical feature tables: train first, then test.
train_num, test_num = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/2__train_with_flags_num_cat.csv",
        "data/2__test_with_flags_num_cat.csv",
    )
)

In [None]:
# Basic text-feature tables: train first, then test.
train_basic_text, test_basic_text = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/X_train_text_features_basic.csv",
        "data/X_test_text_features_basic.csv",
    )
)

In [None]:
# Columns shared by the two train tables (these would collide in a merge).
set(train_num.columns).intersection(train_basic_text.columns)

{'id'}

In [None]:
# Columns shared by the two test tables (these would collide in a merge).
set(test_num.columns).intersection(test_basic_text.columns)

{'id'}

In [None]:
# Attach the text features to the numeric/categorical tables, keeping every
# row of the left-hand table (left join on "id").
train = train_num.merge(train_basic_text, how="left", on="id")
test = test_num.merge(test_basic_text, how="left", on="id")

In [None]:
# The left joins must neither add nor drop rows.
assert len(train) == len(train_num)
assert len(test) == len(test_num)

In [None]:
# Class imbalance: number of rows with resolution == 0 per row with resolution == 1.
n_negative = (train["resolution"] == 0).sum()
n_positive = (train["resolution"] == 1).sum()
ratio = float(n_negative / n_positive)
ratio

14.108642353662274

In [None]:
# First pass: remove raw text and identifier columns from both frames.
cols_to_drop = ["description", "name_rus", "ItemID", "SellerID", "id"]

test_ids = test["id"]  # kept aside: needed for the submission file
train, test = (frame.drop(cols_to_drop, axis=1) for frame in (train, test))

# The splits are built while brand_name / CommercialTypeName4 are still present
# (presumably the splitter uses them -- TODO confirm in src.splits).
splits = create_stratified_splits(data=train, n_splits=10)

# Second pass: remove the remaining two columns from both frames.
cols_to_drop = ["brand_name", "CommercialTypeName4"]
train, test = (frame.drop(cols_to_drop, axis=1) for frame in (train, test))

# Find optimal hyper-parameters

In [None]:
# XGBoost settings shared by the hyper-parameter search and the final training.
param = dict(
    device="cpu",  # either "cpu" or "cuda"
    validate_parameters=True,
    objective="binary:logistic",
    eval_metric="logloss",
    seed=42,
    verbosity=0,
)

In [None]:
def objective(trial, num_boost_round=1000):
    """Optuna objective: mean macro-F1 of an XGBoost model over the CV splits.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        num_boost_round: Upper bound on boosting rounds per fold. ``xgb.train``
            runs only 10 rounds by default, which would make
            ``early_stopping_rounds=100`` unreachable, so an explicit and
            generous cap is passed and early stopping picks the real length.

    Returns:
        The arithmetic mean of the per-fold macro-F1 scores (threshold 0.5).
    """
    study_params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.1, 0.5),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "max_depth": trial.suggest_int("max_depth", 3, 32),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True),
        "scale_pos_weight": trial.suggest_categorical("scale_pos_weight", [1, ratio / 2, ratio, ratio * 2]),
    }

    # Loop-invariant: split the frame into features / target once per trial.
    features = train.drop(columns=["resolution"])
    target = train["resolution"]

    scores = []

    for train_idx, val_idx in tqdm(splits, leave=False, desc="Cross-validation"):
        # The split indices are used positionally everywhere (.iloc), so the
        # result does not depend on the frame carrying a default RangeIndex.
        dtrain = xgb.DMatrix(features.iloc[train_idx], label=target.iloc[train_idx])
        dval = xgb.DMatrix(features.iloc[val_idx], label=target.iloc[val_idx])

        model = xgb.train(
            params={**param, **study_params},
            dtrain=dtrain,
            num_boost_round=num_boost_round,
            evals=[(dval, "val")],
            early_stopping_rounds=100,
            verbose_eval=False,  # do not print one line per boosting round
        )

        pred_probas = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
        pred_labels = (pred_probas > 0.5).astype(int)

        score = f1_score(target.iloc[val_idx].to_numpy(), pred_labels, average="macro")
        scores.append(score)

    return sum(scores) / len(scores)

In [None]:
# Seed the sampler (same seed as the XGBoost params) so the sequence of
# suggested hyper-parameters is repeatable across notebook runs.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=param["seed"]),
)

# Trials run one at a time: parallel trials make even a seeded study
# non-deterministic, and concurrent xgb.train calls would compete for the
# same CPU cores.
study.optimize(objective, n_trials=100, n_jobs=1, show_progress_bar=True)

In [None]:
# Report the outcome of the finished study via the project helper imported
# from src.optuna_utils.
print_study_results(study)

In [None]:
# Take the winning hyper-parameters straight from the study instead of an
# empty placeholder; their keys are the names suggested inside `objective`,
# so they can be merged directly into the XGBoost params.
best_params = dict(study.best_params)
best_params

# Train model with best hyper-parameters

In [None]:
# xgb.train runs only 10 boosting rounds unless told otherwise, which would
# make early_stopping_rounds=100 unreachable; give it a generous upper bound
# and let early stopping choose the actual number of rounds.
num_boost_round = 1000

# Loop-invariant inputs, built once instead of once per fold.
train_features = train.drop(columns=["resolution"])
train_target = train["resolution"]
dtest = xgb.DMatrix(test)

# One (macro-F1, best_iteration, test-set probabilities) tuple per CV split.
f1_score__best_iteration__test_preds = []

for train_idx, val_idx in tqdm(splits, leave=False, desc="Cross-validation"):
    # Split indices are used positionally throughout (.iloc), so correctness
    # does not rely on `train` having a default RangeIndex.
    dtrain = xgb.DMatrix(train_features.iloc[train_idx], label=train_target.iloc[train_idx])
    dval = xgb.DMatrix(train_features.iloc[val_idx], label=train_target.iloc[val_idx])

    model = xgb.train(
        params={**param, **best_params},
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        evals=[(dval, "val")],
        early_stopping_rounds=100,
        verbose_eval=False,  # do not print one line per boosting round
    )

    # Predict only with the trees up to the best validation iteration.
    best_range = (0, model.best_iteration + 1)

    pred_probas = model.predict(dval, iteration_range=best_range)
    pred_labels = (pred_probas > 0.5).astype(int)

    score = f1_score(train_target.iloc[val_idx].to_numpy(), pred_labels, average="macro")

    f1_score__best_iteration__test_preds.append(
        (score, model.best_iteration, model.predict(dtest, iteration_range=best_range))
    )

In [None]:
# Per-split summary: validation macro-F1 and the best boosting iteration.
for i, fold_result in enumerate(f1_score__best_iteration__test_preds):
    f1, best_iter = fold_result[:2]
    print(f"Split: {i}, F1 Score: {f1}, Best Iteration: {best_iter}")

In [None]:
# Index of the CV split whose model provides the submitted predictions
# (hand-picked value -- NOTE(review): confirm it is still the intended split).
chosen_split = 3

# Third tuple element holds that split's test-set probabilities.
test_probas = f1_score__best_iteration__test_preds[chosen_split][2]
test_labels = (test_probas > 0.5).astype(int)

submission = pd.DataFrame({"id": test_ids, "prediction": test_labels})
submission.to_csv("submission.csv", index=False)