## Import the libraries

In [6]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import KFold, train_test_split
import xgboost as xgb
import optuna
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

## Load the data

In [7]:
# Read the three competition files (training set, test set, sample submission)
# in one pass; the generator is consumed in order by the tuple unpacking.
train, test, sample_sub = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "testSubmissionFile.csv")
)

# Preview the first rows of the training set
train.head()

Unnamed: 0,auctionId,timeStamp,placementId,websiteId,hashedRefererDeepThree,country,opeartingSystem,browser,browserVersion,device,environmentType,integrationType,articleSafenessCategorization,isSold
0,001ed16b-dd08-4599-b8ef-4f56a373c454_6e5f1087-...,1603815466,120706,68203,1ae7c2d3c28b711c072d8e2eb3869fa59090669bdc153e...,US,Windows,Chrome,86_0,PC,js-web,2,safe,False
1,0024b36a-4fb5-4070-88fb-fc0bfb1909ed,1603974586,69454,42543,df1108bf6ae49dbccf5eab60ff9d04a6a09dda60ec7290...,RO,Android,Facebook App,293_0,Phone,js-fbwv,1,unsafe,False
2,003630fa-ad63-4283-be1b-141670132d70_f37c2b23-...,1604229969,100170,57703,cc6957e8aec85a4d920991c53874c5d0780bbfbd469802...,UK,Android,Facebook App,294_0,Phone,js-web,2,safe,True
3,0048c65a-ce76-43ba-98d2-8e87607468f8,1604156610,100446,57797,7fc0bb7a65d074e003cce786cda2b070f80dd47179c4b9...,ES,Android,Chrome Mobile,86_0,Phone,js-ampsf,1,safe,True
4,0056b8a7-54f9-4ac8-8d50-f725bf377872,1604004493,119517,67613,3a6552ccbf66ad166aa9005c3e08f70716abd676cfd87b...,FR,Android,Facebook App,293_0,Phone,js-fbwv,1,unsafe,False


## Preprocessing

In [8]:
# --------------------------------------------------------------------
# Helpers
# --------------------------------------------------------------------
def add_cyclic_features(df, col, period):
    """Add sine/cosine encodings of a periodic column to ``df`` in place.

    Two columns, ``<col>_sin`` and ``<col>_cos``, are written from the angle
    ``2 * pi * df[col] / period``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    col : str
        Name of the periodic column (e.g. an hour of day).
    period : number
        Length of one full cycle of ``col``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    angle = 2 * np.pi * df[col] / period
    for suffix, trig in (("sin", np.sin), ("cos", np.cos)):
        df[f"{col}_{suffix}"] = trig(angle)
    return df

def frequency_encode(train_col, test_col, min_count=10):
    """Encode categories by their (floored) relative frequency in the training column.

    Counts are taken from ``train_col`` only. Any count below ``min_count`` is
    raised to ``min_count`` before the counts are normalised by their sum.
    Values with no entry in the training counts (including missing values)
    are encoded as 0.

    Parameters
    ----------
    train_col, test_col : pandas.Series
        Columns to encode; the mapping is learned from ``train_col``.
    min_count : int, default 10
        Floor applied to every category count before normalisation.

    Returns
    -------
    tuple of pandas.Series
        ``(encoded_train, encoded_test)``.
    """
    floored_counts = train_col.value_counts().clip(lower=min_count)
    shares = floored_counts / floored_counts.sum()
    return tuple(
        series.map(shares).fillna(0) for series in (train_col, test_col)
    )

def add_count_feature(train_df, test_df, col):
    """Add ``<col>_cnt``: how often each value of ``col`` occurs in the training frame.

    Counts are learned from ``train_df`` and applied to both frames, which are
    modified in place. Values with no training count are given a count of 1.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames to extend in place.
    col : str
        Column whose values are counted.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``, the same objects that were passed in.
    """
    train_counts = train_df[col].value_counts()
    count_col = f"{col}_cnt"
    for frame in (train_df, test_df):
        frame[count_col] = frame[col].map(train_counts).fillna(1)
    return train_df, test_df

def add_target_mean_encoding(train_df, test_df, cols, target="isSold",
                             n_splits=5, smoothing=20):
    """Add smoothed target-mean encodings ``<col>_te`` to both frames in place.

    For every column in ``cols`` the encoding of a category is
    ``(count * mean + smoothing * prior) / (count + smoothing)``, where
    ``prior`` is the mean of ``target`` over the whole training frame.

    * Training rows receive out-of-fold values: each fold is encoded with
      statistics computed on the other folds only.
    * Test rows are encoded with statistics from the full training frame.
    * Categories without statistics fall back to ``prior``.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames to extend in place; ``train_df`` must contain ``target``.
    cols : iterable of str
        Columns to encode.
    target : str, default "isSold"
        Name of the target column in ``train_df``.
    n_splits : int, default 5
        Number of folds used for the out-of-fold training encoding.
    smoothing : number, default 20
        Weight of the prior in the smoothed mean.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``, the same objects that were passed in.
    """
    prior = train_df[target].mean()
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    def _smoothed_means(frame, key):
        # Per-category target mean shrunk towards the prior.
        agg = frame.groupby(key)[target].agg(["mean", "count"])
        numerator = agg["count"] * agg["mean"] + smoothing * prior
        return numerator / (agg["count"] + smoothing)

    for col in cols:
        encoded_col = f"{col}_te"

        # Out-of-fold encoding for the training rows.
        oof = pd.Series(index=train_df.index, dtype=float)
        for fit_pos, hold_pos in folds.split(train_df):
            fit_part = train_df.iloc[fit_pos]
            hold_part = train_df.iloc[hold_pos]
            fold_means = _smoothed_means(fit_part, col)
            oof.iloc[hold_pos] = hold_part[col].map(fold_means).fillna(prior)
        train_df[encoded_col] = oof

        # Test rows use statistics from the complete training frame.
        full_means = _smoothed_means(train_df, col)
        test_df[encoded_col] = test_df[col].map(full_means).fillna(prior)

    return train_df, test_df

# --------------------------------------------------------------------
# Main preprocessing
# --------------------------------------------------------------------
def preprocess(train_path: str | Path,
               test_path: str | Path,
               val_size: float = 0.2,
               random_state: int = 42):
    """Load the train/test CSVs and build model-ready feature matrices.

    Steps, in order (the order fixes the column order of the output):

    1. Read both CSVs and keep a copy of the test ``auctionId`` column.
    2. Derive calendar features from ``timeStamp`` (read as seconds since the
       epoch), cyclic sin/cos encodings, night/evening flags and the number of
       days elapsed since the earliest training timestamp.
    3. Frequency- and count-encode the high-cardinality columns.
    4. Target-mean-encode ``placementId`` and ``websiteId``, then drop the raw
       columns.
    5. Drop ``auctionId``.
    6. Separate the target ``isSold`` from the features.
    7. One-hot encode the small categorical columns on train and test stacked
       together, so both end up with the same dummy columns.
    8. Split the training rows into train/validation parts, stratified on the
       target.

    Parameters
    ----------
    train_path, test_path : str or pathlib.Path
        Locations of the training and test CSV files.
    val_size : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int, default 42
        Seed for the train/validation split.

    Returns
    -------
    tuple
        ``(X_tr, X_val, y_tr, y_val, test_enc, test_ids)``.

    Notes
    -----
    The three shapes are printed before returning.

    NOTE(review): only the columns named below are dropped. The ``train.head()``
    output earlier in this notebook shows an ``Unnamed: 0`` column in
    ``train.csv``; nothing here removes it, so it stays in the feature matrix.
    Confirm that this is intended.
    """
    # 1. Load
    train = pd.read_csv(train_path)
    test  = pd.read_csv(test_path)

    # Keep the identifiers for the submission file; the column is dropped in step 5.
    test_ids = test["auctionId"].copy()

    # 2. Time features (both frames are modified in place)
    for df in (train, test):
        df["datetime"] = pd.to_datetime(df["timeStamp"], unit="s")

        df["year"]       = df["datetime"].dt.year
        df["month"]      = df["datetime"].dt.month
        df["day"]        = df["datetime"].dt.day
        df["hour"]       = df["datetime"].dt.hour
        df["dayofweek"]  = df["datetime"].dt.dayofweek
        df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype("int8")

        add_cyclic_features(df, "hour",      24)
        add_cyclic_features(df, "dayofweek",  7)
        add_cyclic_features(df, "month",     12)

        df["is_night"]   = df["hour"].between(0, 5).astype("int8")
        df["is_evening"] = df["hour"].between(18, 23).astype("int8")

    # Elapsed days are measured from the earliest *training* timestamp for both frames.
    t0 = train["datetime"].min()
    for df in (train, test):
        df["days_since_start"] = (df["datetime"] - t0).dt.total_seconds() / 86_400

    train.drop(["timeStamp", "datetime"], axis=1, inplace=True)
    test.drop(["timeStamp", "datetime"],   axis=1, inplace=True)

    # 3. High-cardinality encodings
    hi_card_cols = ["hashedRefererDeepThree", "browserVersion"]
    for col in hi_card_cols:
        tr_freq, te_freq = frequency_encode(train[col], test[col], min_count=20)
        train[f"{col}_freq"] = tr_freq
        test[f"{col}_freq"]  = te_freq

        train, test = add_count_feature(train, test, col)

        # The raw column is removed only when it has more than 1,000 distinct
        # training values; otherwise it is kept with its original values
        # alongside the encodings.
        if train[col].nunique() > 1_000:
            train.drop(columns=[col], inplace=True)
            test.drop(columns=[col], inplace=True)

    # 4. Target mean encoding (placementId, websiteId)
    train, test = add_target_mean_encoding(
        train, test,
        cols=["placementId", "websiteId"],
        target="isSold",
        n_splits=5,
        smoothing=20
    )

    # Optional: the raw ID columns are dropped once their encodings exist
    train.drop(columns=["placementId", "websiteId"], inplace=True)
    test.drop(columns=["placementId", "websiteId"],  inplace=True)

    # 5. Identifiers
    train.drop(["auctionId"], axis=1, inplace=True)
    test.drop(["auctionId"],  axis=1, inplace=True)

    # 6. Target / features
    y = train["isSold"].copy()
    X = train.drop("isSold", axis=1)

    # 7. One-hot encoding of the small categorical columns
    small_cat = [
        "country", "opeartingSystem", "browser", "device",
        "environmentType", "articleSafenessCategorization",
        "dayofweek", "month"
    ]
    # Skip any listed column that is not present in the feature frame.
    small_cat = [c for c in small_cat if c in X.columns]

    # Stack train features on top of test so get_dummies sees every category
    # once and produces the same dummy columns for both.
    X_test_merge = pd.concat([X, test], axis=0, sort=False)
    X_test_merge = pd.get_dummies(
        X_test_merge,
        columns=small_cat,
        dummy_na=False
    )

    # Split the stacked frame back by position: the first len(X) rows are train.
    X_enc    = X_test_merge.iloc[: len(X), :].reset_index(drop=True)
    test_enc = X_test_merge.iloc[len(X):, :].reset_index(drop=True)

    # 8. Train / validation split (stratified on the target)
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_enc, y,
        test_size=val_size,
        random_state=random_state,
        stratify=y
    )

    print(f"🏷️  Train shape : {X_tr.shape}")
    print(f"🏷️  Val   shape : {X_val.shape}")
    print(f"🏷️  Test  shape : {test_enc.shape}")

    return X_tr, X_val, y_tr, y_val, test_enc, test_ids

## XGBoost with Optuna

In [None]:
##############################################################################
# XGBoost + Optuna – 1 M lignes  • RTX 4060  • objectif F1
#  • échantillon 30 % (≈300 k) pour le tuning
#  • 50 essais  • Successive-Halving  • 3-fold CV
#  • borne haute 8 000 arbres  • early-stop 150
##############################################################################
import numpy as np, pandas as pd, optuna, xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from optuna.integration import XGBoostPruningCallback
from optuna.pruners import SuccessiveHalvingPruner

# ------------------------------------------------------------------#
# 1. Pré-traitement                                                 #
# ------------------------------------------------------------------#
X_tr, X_val, y_tr, y_val, X_test, test_ids = preprocess(
    train_path="train.csv",
    test_path="test.csv",
    val_size=0.2,
    random_state=42,
)

# Stack the validation rows back under the training rows: from here on
# X_tr / y_tr hold the full training set, re-indexed from 0.
X_tr = pd.concat((X_tr, X_val), axis=0, ignore_index=True)
y_tr = pd.concat((y_tr, y_val), axis=0, ignore_index=True)

# ------------------------------------------------------------------#
# 2. Conversion numérique                                           #
# ------------------------------------------------------------------#
def make_numeric(df, categories=None):
    """Return a float32 copy of ``df`` with categorical and boolean columns as numbers.

    Object/category columns are replaced by integer category codes and boolean
    columns by 0/1; the whole frame is then cast to float32. The input frame
    is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert.
    categories : mapping of column name -> sequence of levels, optional
        Category levels to use for each object/category column. Passing the
        same mapping for several frames guarantees that a given value gets the
        same code in all of them; values absent from the levels (and missing
        values) are coded -1. Columns without an entry, or all columns when
        this is ``None``, are coded from the levels found in ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        float32 copy of ``df``.
    """
    df = df.copy()

    for col in df.select_dtypes(["object", "category"]).columns:
        levels = None if categories is None else categories.get(col)
        if levels is None:
            codes = df[col].astype("category").cat.codes
        else:
            codes = pd.Categorical(df[col], categories=levels).codes
        # Assign positionally so the result does not depend on index alignment.
        df[col] = np.asarray(codes, dtype="int32")

    bool_cols = df.select_dtypes("bool").columns
    if len(bool_cols):
        df[bool_cols] = df[bool_cols].astype("uint8")

    return df.astype("float32")


# Learn the category levels from the training frame only and reuse them for the
# test frame. Coding each frame independently would give the same string a
# different integer in train and test whenever their sets of values differ.
cat_levels = {
    col: X_tr[col].astype("category").cat.categories
    for col in X_tr.select_dtypes(["object", "category"]).columns
}
X_tr_n, X_test_n = (make_numeric(frame, cat_levels) for frame in (X_tr, X_test))

# ------------------------------------------------------------------#
# 3. Sous-échantillon 30 % pour Optuna                              #
# ------------------------------------------------------------------#
# Share of the training rows used for hyper-parameter tuning. The cell header
# and this section both document a 30 % subsample; the previous frac=1 merely
# shuffled the complete training set, so every trial ran on all rows.
TUNING_FRACTION = 0.30

sub_idx = y_tr.sample(frac=TUNING_FRACTION, random_state=42).index
X_sub, y_sub = X_tr_n.loc[sub_idx], y_tr.loc[sub_idx]
# Negative-to-positive ratio of the subsample, used as scale_pos_weight
# to compensate for class imbalance.
spw = (len(y_sub) - y_sub.sum()) / y_sub.sum()

# ------------------------------------------------------------------#
# 4. Optuna objective: 3-fold CV + pruning                          #
# ------------------------------------------------------------------#
pruner = SuccessiveHalvingPruner(min_resource=200, reduction_factor=3)

def objective(trial):
    """Optuna objective: 3-fold cross-validated F1 of an XGBoost classifier.

    Each fold trains on ``X_sub`` / ``y_sub`` with early stopping on the
    validation AUC. The out-of-fold probabilities of all folds are pooled and
    the decision threshold is chosen on a fixed grid to maximise F1.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters and to report intermediate
        AUC values for pruning.

    Returns
    -------
    float
        Best pooled out-of-fold F1 over the threshold grid.

    Side effects
    ------------
    Stores two user attributes on the trial:

    * ``best_iter``   - mean number of boosting rounds kept by early stopping,
      i.e. a value that can be passed directly as ``num_boost_round``.
    * ``best_thresh`` - threshold that produced the returned F1.
    """
    # NOTE(review): XGBoost 2.x deprecates "gpu_hist" / "gpu_predictor" in
    # favour of tree_method="hist" with device="cuda" - confirm against the
    # installed version before changing.
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "gpu_hist",
        "predictor": "gpu_predictor",
        "eta":   trial.suggest_float("eta", 0.005, 0.15, log=True),
        "max_depth":        trial.suggest_int("max_depth", 4, 12),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "gamma":            trial.suggest_float("gamma", 1e-8, 5.0, log=True),
        "subsample":        trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "lambda":           trial.suggest_float("lambda", 1e-3, 100.0, log=True),
        "alpha":            trial.suggest_float("alpha",  1e-3, 30.0,  log=True),
        "max_bin":          trial.suggest_categorical("max_bin", [64, 128, 256]),
        "grow_policy":      trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        "scale_pos_weight": spw,
    }
    # The decision threshold is deliberately NOT an Optuna parameter: it is
    # optimised on a grid below, so sampling it would only add a dimension
    # with no effect on the score and leak a stray key into study.best_params.

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    preds, truths, n_rounds = [], [], []

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_sub, y_sub)):
        dtr = xgb.DMatrix(X_sub.iloc[tr_idx], label=y_sub.iloc[tr_idx])
        dva = xgb.DMatrix(X_sub.iloc[va_idx], label=y_sub.iloc[va_idx])

        callbacks = [xgb.callback.EarlyStopping(rounds=150)]
        if fold == 0:
            # Reporting to the pruner from the first fold only is enough.
            callbacks.append(XGBoostPruningCallback(trial, "val-auc"))

        bst = xgb.train(
            params, dtr,
            num_boost_round=8000,
            evals=[(dva, "val")],
            callbacks=callbacks,
            verbose_eval=False
        )

        # best_iteration is a 0-based index, so the best model consists of
        # best_iteration + 1 rounds. iteration_range is half-open, hence the
        # same +1 is needed to include the best round when predicting.
        best_rounds = bst.best_iteration + 1
        n_rounds.append(best_rounds)
        preds.append(bst.predict(dva, iteration_range=(0, best_rounds)))
        truths.append(y_sub.iloc[va_idx].values)

    preds  = np.concatenate(preds)
    truths = np.concatenate(truths)

    # Brute-force F1 threshold search on 21 evenly spaced values,
    # evaluated once per threshold.
    th_grid = np.linspace(0.25, 0.75, 21)
    f1_by_thresh = [f1_score(truths, preds >= t) for t in th_grid]
    best_pos = int(np.argmax(f1_by_thresh))

    trial.set_user_attr("best_iter",   int(np.mean(n_rounds)))
    trial.set_user_attr("best_thresh", float(th_grid[best_pos]))
    return f1_by_thresh[best_pos]

# Seed the sampler so that the sequence of suggested hyper-parameters is
# repeatable across runs.
study = optuna.create_study(
    direction="maximize",
    pruner=pruner,
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

# At least one boosting round, whatever the trial stored.
best_iter   = max(1, int(study.best_trial.user_attrs["best_iter"]))
best_thresh = study.best_trial.user_attrs["best_thresh"]

# study.best_params holds ONLY the values sampled through trial.suggest_*.
# The fixed settings used inside the objective must be added back, otherwise
# the final model is trained with XGBoost's default objective and without the
# class-imbalance weight, i.e. it is not the model that was tuned.
best_params = study.best_params.copy()
best_params.pop("threshold", None)   # not an XGBoost parameter
best_params.update({
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "scale_pos_weight": spw,
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
})

print(f"🏆  F1 CV = {study.best_value:.4f} | iter ≈ {best_iter} | thr = {best_thresh:.3f}")

# ------------------------------------------------------------------#
# 5. Entraînement final sur 100 % du train                          #
# ------------------------------------------------------------------#
# Fit one booster on every training row, for the number of rounds kept by the
# best trial.
dtrain_full = xgb.DMatrix(data=X_tr_n, label=y_tr)
final = xgb.train(
    params=best_params,
    dtrain=dtrain_full,
    num_boost_round=best_iter,
)

# ------------------------------------------------------------------#
# 6. Test prediction + submission                                   #
# ------------------------------------------------------------------#
# Score the test rows, then turn probabilities into 0/1 labels with the
# threshold selected during tuning.
proba = final.predict(
    xgb.DMatrix(data=X_test_n),
    iteration_range=(0, best_iter),
)
labels = np.greater_equal(proba, best_thresh).astype(int)

# One row per test auction, written without the DataFrame index.
(
    pd.DataFrame({"auctionId": test_ids, "isSold": labels})
    .to_csv("submission_xgb_optuna_labels_goat.csv", index=False)
)
print("✅  submission_xgb_optuna_labels_goat.csv enregistré")


🏷️  Train shape : (773865, 403)
🏷️  Val   shape : (193467, 403)
🏷️  Test  shape : (242171, 403)


[I 2025-06-04 17:18:03,184] A new study created in memory with name: no-name-f661a25b-4484-4542-b28e-72cc2f99cb8f


  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# # ------------------------------------------------------------------
# # 1. Pré-traitement
# # ------------------------------------------------------------------
# X_tr, X_val, y_tr, y_val, X_test, test_ids = preprocess(
#     "train.csv", "test.csv", val_size=0.2, random_state=42
# )

# # ------------------------------------------------------------------
# # 2. Conversion numérique
# # ------------------------------------------------------------------
# def make_numeric(df):
#     df = df.copy()
#     df[df.select_dtypes("category").columns] = (
#         df.select_dtypes("category").apply(lambda s: s.cat.codes).astype("int32")
#     )
#     df[df.select_dtypes("object").columns] = (
#         df.select_dtypes("object").apply(lambda s: s.astype("category").cat.codes).astype("int32")
#     )
#     df[df.select_dtypes("bool").columns] = df.select_dtypes("bool").astype("uint8")
#     return df.astype("float32")

# X_tr_n, X_val_n, X_test_n = map(make_numeric, [X_tr, X_val, X_test])
# dtrain = xgb.DMatrix(X_tr_n, label=y_tr)
# dval   = xgb.DMatrix(X_val_n, label=y_val)

# spw = (len(y_tr) - y_tr.sum()) / y_tr.sum()

# # ------------------------------------------------------------------
# # 3. Optuna (15 essais) – objectif F1
# # ------------------------------------------------------------------
# def objective(trial):
#     params = {
#         "objective": "binary:logistic",
#         "eval_metric": "auc",
#         "tree_method": "gpu_hist",     # RTX 4060
#         "eta": trial.suggest_float("eta", 0.01, 0.2, log=True),
#         "max_depth": trial.suggest_int("max_depth", 4, 10),
#         "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
#         "gamma": trial.suggest_float("gamma", 1e-8, 5.0, log=True),
#         "subsample": trial.suggest_float("subsample", 0.6, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
#         "lambda": trial.suggest_float("lambda", 1e-3, 30.0, log=True),
#         "alpha":  trial.suggest_float("alpha", 1e-3, 10.0, log=True),
#         "max_bin": trial.suggest_categorical("max_bin", [128, 256]),
#         "scale_pos_weight": spw,
#     }
#     thresh = trial.suggest_float("threshold", 0.3, 0.7)

#     booster = xgb.train(
#         params,
#         dtrain,
#         num_boost_round=4000,
#         evals=[(dval, "val")],
#         callbacks=[
#             xgb.callback.EarlyStopping(rounds=200),
#         ],
#         verbose_eval=False,
#     )

#     val_proba = booster.predict(dval, iteration_range=(0, booster.best_iteration + 1))
#     f1 = f1_score(y_val, (val_proba >= thresh).astype(int))

#     trial.set_user_attr("best_iter", booster.best_iteration)
#     return f1

# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=15, show_progress_bar=True)    # ≃2 min GPU

# best_params = study.best_params.copy()
# best_thresh = best_params.pop("threshold")
# best_iter   = study.best_trial.user_attrs["best_iter"]

# # 🔧  force à nouveau les paramètres GPU
# best_params.update({
#     "tree_method": "gpu_hist",
#     "predictor":   "gpu_predictor"   # optionnel ≥ 2.0, se règle sinon auto
# })

# # ------------------------------------------------------------------
# # 4. Ré-entraîne sur train+val
# # ------------------------------------------------------------------
# X_full_n = make_numeric(pd.concat([X_tr, X_val]))
# y_full   = pd.concat([y_tr, y_val])
# dtrain_full = xgb.DMatrix(X_full_n, label=y_full)

# final = xgb.train(best_params, dtrain_full, num_boost_round=best_iter)

# # ------------------------------------------------------------------
# # 5. Prédiction test + soumission (labels 0/1)
# # ------------------------------------------------------------------

# # --------------------------- ré-entraîne --------------------------
# final = xgb.train(best_params, dtrain_full, num_boost_round=best_iter)

# # --------------------------- prédiction test ---------------------
# dtest = xgb.DMatrix(X_test_n)
# proba = final.predict(dtest, iteration_range=(0, best_iter))   # ✅

# labels = (proba >= best_thresh).astype(int)

# pd.DataFrame({"auctionId": test_ids, "isSold": labels}).to_csv(
#     "submission_xgb_optuna_labels.csv", index=False
# )
# print("✅ submission_xgb_optuna_labels.csv enregistré")