## Import the libraries

In [9]:
import os, json, subprocess, warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix

import xgboost as xgb
import optuna
from optuna.integration import XGBoostPruningCallback
from optuna.pruners import SuccessiveHalvingPruner


## Load the data

In [10]:
# Load datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_sub = pd.read_csv('testSubmissionFile.csv')

# Display first few rows
train.head()

Unnamed: 0,auctionId,timeStamp,placementId,websiteId,hashedRefererDeepThree,country,opeartingSystem,browser,browserVersion,device,environmentType,integrationType,articleSafenessCategorization,isSold
0,001ed16b-dd08-4599-b8ef-4f56a373c454_6e5f1087-...,1603815466,120706,68203,1ae7c2d3c28b711c072d8e2eb3869fa59090669bdc153e...,US,Windows,Chrome,86_0,PC,js-web,2,safe,False
1,0024b36a-4fb5-4070-88fb-fc0bfb1909ed,1603974586,69454,42543,df1108bf6ae49dbccf5eab60ff9d04a6a09dda60ec7290...,RO,Android,Facebook App,293_0,Phone,js-fbwv,1,unsafe,False
2,003630fa-ad63-4283-be1b-141670132d70_f37c2b23-...,1604229969,100170,57703,cc6957e8aec85a4d920991c53874c5d0780bbfbd469802...,UK,Android,Facebook App,294_0,Phone,js-web,2,safe,True
3,0048c65a-ce76-43ba-98d2-8e87607468f8,1604156610,100446,57797,7fc0bb7a65d074e003cce786cda2b070f80dd47179c4b9...,ES,Android,Chrome Mobile,86_0,Phone,js-ampsf,1,safe,True
4,0056b8a7-54f9-4ac8-8d50-f725bf377872,1604004493,119517,67613,3a6552ccbf66ad166aa9005c3e08f70716abd676cfd87b...,FR,Android,Facebook App,293_0,Phone,js-fbwv,1,unsafe,False


## Preprocessing

In [11]:
# --------------------------------------------------------------------
# Helpers
# --------------------------------------------------------------------
def add_cyclic_features(df, col, period):
    df[f"{col}_sin"] = np.sin(2 * np.pi * df[col] / period)
    df[f"{col}_cos"] = np.cos(2 * np.pi * df[col] / period)
    return df

def frequency_encode(train_col, test_col, min_count=10):
    freq = train_col.value_counts()
    freq[freq < min_count] = min_count
    freq = freq / freq.sum()
    return train_col.map(freq).fillna(0), test_col.map(freq).fillna(0)

def add_count_feature(train_df, test_df, col):
    cnt = train_df[col].value_counts()
    train_df[f"{col}_cnt"] = train_df[col].map(cnt).fillna(1)
    test_df[f"{col}_cnt"]  = test_df[col].map(cnt).fillna(1)
    return train_df, test_df

def add_target_mean_encoding(train_df, test_df, cols, target="isSold", n_splits=5, smoothing=20):
    global_mean = train_df[target].mean()
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for col in cols:
        oof = pd.Series(index=train_df.index, dtype=float)

        for tr_idx, val_idx in kf.split(train_df):
            tr, val = train_df.iloc[tr_idx], train_df.iloc[val_idx]
            stats = tr.groupby(col)[target].agg(["mean", "count"])
            smooth = (stats["count"]*stats["mean"] + smoothing*global_mean) / \
                     (stats["count"] + smoothing)
            oof.iloc[val_idx] = val[col].map(smooth).fillna(global_mean)

        train_df[f"{col}_te"] = oof

        full_stats = train_df.groupby(col)[target].agg(["mean", "count"])
        smooth_full = (full_stats["count"]*full_stats["mean"] + smoothing*global_mean) / \
                      (full_stats["count"] + smoothing)
        test_df[f"{col}_te"] = test_df[col].map(smooth_full).fillna(global_mean)

    return train_df, test_df

# --------------------------------------------------------------------
# Main preprocessing
# --------------------------------------------------------------------
def preprocess(train_path: str,
               test_path: str,
               val_size: float = 0.2,
               random_state: int = 42):
    # 1. Load
    train = pd.read_csv(train_path)
    test  = pd.read_csv(test_path)

    test_ids = test["auctionId"].copy()

    # 2. Time features
    for df in (train, test):
        df["datetime"] = pd.to_datetime(df["timeStamp"], unit="s")

        df["year"]       = df["datetime"].dt.year
        df["month"]      = df["datetime"].dt.month
        df["day"]        = df["datetime"].dt.day
        df["hour"]       = df["datetime"].dt.hour
        df["dayofweek"]  = df["datetime"].dt.dayofweek
        df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype("int8")

        add_cyclic_features(df, "hour",      24)
        add_cyclic_features(df, "dayofweek",  7)
        add_cyclic_features(df, "month",     12)

        df["is_night"]   = df["hour"].between(0, 5).astype("int8")
        df["is_evening"] = df["hour"].between(18, 23).astype("int8")

    t0 = train["datetime"].min()
    for df in (train, test):
        df["days_since_start"] = (df["datetime"] - t0).dt.total_seconds() / 86_400

    train.drop(["timeStamp", "datetime"], axis=1, inplace=True)
    test.drop(["timeStamp", "datetime"],   axis=1, inplace=True)

    # 3. High-cardinality encodings
    hi_card_cols = ["hashedRefererDeepThree", "browserVersion"]
    for col in hi_card_cols:
        tr_freq, te_freq = frequency_encode(train[col], test[col], min_count=20)
        train[f"{col}_freq"] = tr_freq
        test[f"{col}_freq"]  = te_freq

        train, test = add_count_feature(train, test, col)

        if train[col].nunique() > 1_000:
            train.drop(columns=[col], inplace=True)
            test.drop(columns=[col], inplace=True)

    # 4. Target mean encoding (placementId, websiteId)
    train, test = add_target_mean_encoding(
        train, test,
        cols=["placementId", "websiteId"],
        target="isSold",
        n_splits=5,
        smoothing=20
    )

    # Facultatif : on peut aussi dropper les brutes si trop volumineuses
    train.drop(columns=["placementId", "websiteId"], inplace=True)
    test.drop(columns=["placementId", "websiteId"],  inplace=True)

    # 5. Identifiants
    train.drop(["auctionId"], axis=1, inplace=True)
    test.drop(["auctionId"],  axis=1, inplace=True)

    # 6. Target / features
    y = train["isSold"].copy()
    X = train.drop("isSold", axis=1)

    # 7. One-hot des petites catégorielles
    small_cat = [
        "country", "opeartingSystem", "browser", "device",
        "environmentType", "articleSafenessCategorization",
        "dayofweek", "month"
    ]
    small_cat = [c for c in small_cat if c in X.columns]

    X_test_merge = pd.concat([X, test], axis=0, sort=False)
    X_test_merge = pd.get_dummies(
        X_test_merge,
        columns=small_cat,
        dummy_na=False
    )

    X_enc    = X_test_merge.iloc[: len(X), :].reset_index(drop=True)
    test_enc = X_test_merge.iloc[len(X):, :].reset_index(drop=True)

    # 8. Train / validation split
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_enc, y,
        test_size=val_size,
        random_state=random_state,
        stratify=y
    )

    print(f"🏷️  Train shape : {X_tr.shape}")
    print(f"🏷️  Val   shape : {X_val.shape}")
    print(f"🏷️  Test  shape : {test_enc.shape}")

    return X_tr, X_val, y_tr, y_val, test_enc, test_ids

## XGBoost with optuna

In [13]:
##############################################################################
# XGBoost + Optuna – 1 M lignes  • RTX 4060  • objectif F1
#  • échantillon 30 % (≈300 k) pour le tuning
#  • 50 essais  • Successive-Halving  • 5-fold CV
#  • borne haute 8 000 arbres  • early-stop 150
##############################################################################

# ------------------------------------------------------------------#
# 1. Pré-traitement                                                 #
# ------------------------------------------------------------------#
print("📦 Chargement des données...")
X_tr, X_val, y_tr, y_val, X_test, test_ids = preprocess("train.csv", "test.csv", val_size=0.2, random_state=42)
X_tr  = pd.concat([X_tr,  X_val], ignore_index=True)   # full train
y_tr  = pd.concat([y_tr,  y_val], ignore_index=True)
print("✅ Données chargées et concaténées.")

# ------------------------------------------------------------------#
# 2. Conversion numérique                                           #
# ------------------------------------------------------------------#
print("🔢 Conversion numérique...")
def make_numeric(df):
    df = df.copy()
    cats = df.select_dtypes(["object", "category"]).columns
    df[cats] = df[cats].astype("category").apply(lambda s: s.cat.codes).astype("int32")
    df[df.select_dtypes("bool").columns] = df.select_dtypes("bool").astype("uint8")
    return df.astype("float32")

X_tr_n, X_test_n = map(make_numeric, [X_tr, X_test])
print("✅ Conversion terminée.")

# ------------------------------------------------------------------#
# 3. Sous-échantillon 30 % pour Optuna                              #
# ------------------------------------------------------------------#
print("🎯 Création sous-échantillon pour tuning...")
sub_idx = y_tr.sample(frac=0.3, random_state=42).index
X_sub, y_sub = X_tr_n.loc[sub_idx], y_tr.loc[sub_idx]
spw = (len(y_sub) - y_sub.sum()) / y_sub.sum()
print(f"✅ Sous-échantillon prêt — Taille : {len(X_sub)}")

# ------------------------------------------------------------------#
# 4. Objective Optuna 3-fold CV + pruning                           #
# ------------------------------------------------------------------#
pruner = SuccessiveHalvingPruner(min_resource=200, reduction_factor=3)

print("🕵️ Détection GPU...")
def detect_gpu_available():
    try:
        result = subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return result.returncode == 0
    except FileNotFoundError:
        return False

def save_best_trial_if_needed(best_f1, trial, iters, best_t, path="best_xgb_params.json"):
    if os.path.exists(path):
        with open(path, "r") as f:
            best_so_far = json.load(f).get("f1", 0.0)
    else:
        best_so_far = 0.0

    if best_f1 > best_so_far:
        with open(path, "w") as f:
            json.dump({
                "f1": float(best_f1),
                "params": trial.params,
                "best_iter": int(np.max(iters)),
                "best_thresh": float(best_t)
            }, f, indent=2)
        print(f"💾 Nouveau meilleur F1 = {best_f1:.4f} — sauvegardé.")


tree_method = "gpu_hist" if detect_gpu_available() else "hist"
predictor = "gpu_predictor" if tree_method == "gpu_hist" else "cpu_predictor"
print("⚙️ Méthode d'arbre sélectionnée :", tree_method)
print("⚙️ Prédicteur sélectionné        :", predictor)

def objective(trial):
    print(f"\n🔬 Début essai Optuna n°{trial.number}")
    params = {
        "objective": trial.suggest_categorical("objective", ["binary:logitraw", "binary:logistic"]),
        "eval_metric": "auc",
        "tree_method": tree_method,
        "predictor": predictor,
        "eta": trial.suggest_float("eta", 0.005, 0.15, log=True),
        "max_depth": trial.suggest_int("max_depth", 4, 12),
        "max_leaves": trial.suggest_int("max_leaves", 64, 512),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        "max_bin": trial.suggest_categorical("max_bin", [64, 128, 256, 512]),
        "min_child_weight": trial.suggest_float("min_child_weight", 0.0, 20.0),
        "max_delta_step": trial.suggest_int("max_delta_step", 0, 2),
        "lambda": trial.suggest_float("lambda", 1e-3, 100.0, log=True),
        "alpha":  trial.suggest_float("alpha",  1e-3, 30.0,  log=True),
        "subsample": trial.suggest_float("subsample", 0.3, 0.8),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 0.8),
        "gamma": trial.suggest_float("gamma", 1e-8, 5.0, log=True),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 0.5, 20.0, log=True),
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    preds, truths, iters = [], [], []

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_sub, y_sub)):
        print(f"  ➤ Fold {fold + 1}/5 — entraînement en cours...")
        dtr = xgb.DMatrix(X_sub.iloc[tr_idx], label=y_sub.iloc[tr_idx])
        dva = xgb.DMatrix(X_sub.iloc[va_idx], label=y_sub.iloc[va_idx])

        cbs = [xgb.callback.EarlyStopping(rounds=150)]
        if fold == 0:
            cbs.append(XGBoostPruningCallback(trial, "val-auc"))

        bst = xgb.train(
            params, dtr,
            num_boost_round=8000,
            evals=[(dva, "val")],
            callbacks=cbs,
            verbose_eval=False
        )

        print(f"     ✅ Fold {fold + 1} terminé — Best iteration: {bst.best_iteration}")

        preds.append(bst.predict(dva, iteration_range=(0, bst.best_iteration)))
        truths.append(y_sub.iloc[va_idx].values)
        iters.append(bst.best_iteration)

    preds = np.concatenate(preds)
    truths = np.concatenate(truths)

    th_grid = np.linspace(0.25, 0.75, 21)
    f1_scores = [f1_score(truths, preds >= t) for t in th_grid]
    best_f1 = max(f1_scores)
    best_t = th_grid[np.argmax(f1_scores)]

    trial.set_user_attr("best_iter", int(np.max(iters)))
    trial.set_user_attr("best_thresh", float(best_t))

    save_best_trial_if_needed(best_f1, trial, iters, best_t)
    print(f"🏁 Fin essai Optuna {trial.number} — F1 = {best_f1:.4f}")
    
    return best_f1



study = optuna.create_study(
    direction="maximize",
    pruner=SuccessiveHalvingPruner(min_resource=200, reduction_factor=3),
    study_name="xgb_tuning",
    storage="sqlite:///optuna_xgb.db",  # fichier local
    load_if_exists=True
)
optuna.visualization.plot_optimization_history(study).show()

TOTAL_TRIALS = 20
news = 0
done = len(study.trials)
remaining = TOTAL_TRIALS - done + news

if remaining <= 0:
    print(f"✅ Tous les {TOTAL_TRIALS} essais ont déjà été faits.")
else:
    print(f"⏳ Reprise : {done} essais faits, encore {remaining} à faire.")
    study.optimize(objective, n_trials=remaining, show_progress_bar=True)

best_params = study.best_params.copy()
best_iter   = int(study.best_trial.user_attrs["best_iter"])
best_thresh = study.best_trial.user_attrs["best_thresh"]
best_params.update({"tree_method": tree_method, "predictor": predictor})

print(f"🏆  F1 CV = {study.best_value:.4f} | iter ≈ {best_iter} | thr = {best_thresh:.3f}")

# ------------------------------------------------------------------#
# 5. Entraînement final par CV + analyse avancée                #
# ------------------------------------------------------------------#

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1s, thresholds, all_cv_models = [], [], []

for fold, (tr_idx, val_idx) in enumerate(cv.split(X_tr_n, y_tr)):
    print(f"\n🔁 Fold {fold + 1}/5")
    dtr = xgb.DMatrix(X_tr_n.iloc[tr_idx], label=y_tr.iloc[tr_idx])
    dva = xgb.DMatrix(X_tr_n.iloc[val_idx], label=y_tr.iloc[val_idx])

    bst = xgb.train(
        best_params,
        dtr,
        num_boost_round=best_iter,
        verbose_eval=False
    )

    all_cv_models.append(bst)

    # Importance features (affichée sur le 1er fold uniquement)
    if fold == 0:
        xgb.plot_importance(bst, max_num_features=20)
        plt.title("Feature importance (Fold 1)")
        plt.show()

    proba = bst.predict(dva, iteration_range=(0, best_iter))
    if best_params.get("objective") == "binary:logitraw":
        proba = 1 / (1 + np.exp(-proba))

    # Seuil optimal par fold
    th_grid = np.linspace(0.25, 0.75, 21)
    f1_grid = [f1_score(y_tr.iloc[val_idx], proba >= t) for t in th_grid]
    best_t = th_grid[np.argmax(f1_grid)]
    thresholds.append(best_t)

    labels = (proba >= best_t).astype(int)
    score = f1_score(y_tr.iloc[val_idx], labels)
    f1s.append(score)

    # Analyse des erreurs (matrice de confusion)
    cm = confusion_matrix(y_tr.iloc[val_idx], labels)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"Confusion Matrix Fold {fold + 1}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

    print(classification_report(y_tr.iloc[val_idx], labels))

mean_f1 = np.mean(f1s)
std_f1 = np.std(f1s)
print(f"\n📊 F1 moyen sur 5 folds = {mean_f1:.4f} ± {std_f1:.4f}")

# ------------------------------------------------------------------#
# 6. Prédiction test : moyennage sur les folds                     #
# ------------------------------------------------------------------#

# Prédiction test moyennée sur tous les folds pondérés par F1
weights = np.array(f1s) / np.sum(f1s)
cv_preds = sum(
    model.predict(xgb.DMatrix(X_test_n), iteration_range=(0, best_iter)) * w
    for model, w in zip(all_cv_models, weights)
)

# Seuil moyen (ou tu peux prendre la médiane)
best_thresh_cv = np.mean(thresholds)

if best_params.get("objective") == "binary:logitraw":
    cv_preds = 1 / (1 + np.exp(-cv_preds))

labels = (cv_preds >= best_thresh_cv).astype(int)

pd.DataFrame({"auctionId": test_ids, "isSold": labels}).to_csv("submission_xgb_optuna_cv_avg.csv", index=False)
print("✅ submission_xgb_optuna_cv_avg.csv enregistré")


📦 Chargement des données...
🏷️  Train shape : (773865, 403)
🏷️  Val   shape : (193467, 403)
🏷️  Test  shape : (242171, 403)
✅ Données chargées et concaténées.
🔢 Conversion numérique...
✅ Conversion terminée.
🎯 Création sous-échantillon pour tuning...
✅ Sous-échantillon prêt — Taille : 290200
🕵️ Détection GPU...
⚙️ Méthode d'arbre sélectionnée : hist
⚙️ Prédicteur sélectionné        : cpu_predictor


[I 2025-06-05 01:38:36,672] Using an existing study with name 'xgb_tuning' instead of creating a new one.
[W 2025-06-05 01:38:37,861] There are no complete trials.


⏳ Reprise : 1 essais faits, encore 19 à faire.


  0%|          | 0/19 [00:00<?, ?it/s]


🔬 Début essai Optuna n°1
  ➤ Fold 1/5 — entraînement en cours...
[W 2025-06-05 01:48:09,504] Trial 1 failed with parameters: {'objective': 'binary:logitraw', 'eta': 0.03155463103012829, 'max_depth': 10, 'max_leaves': 332, 'grow_policy': 'lossguide', 'max_bin': 256, 'min_child_weight': 4.236568067545283, 'max_delta_step': 2, 'lambda': 0.00111185336801772, 'alpha': 0.19776678488313137, 'subsample': 0.35272562550916, 'colsample_bytree': 0.33361321897636215, 'gamma': 3.795709855695428e-07, 'scale_pos_weight': 0.8660075758935225} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\lucas\AppData\Local\Programs\Python\Python39\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\lucas\AppData\Local\Temp\ipykernel_13848\4116761847.py", line 110, in objective
    bst = xgb.train(
  File "c:\Users\lucas\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\core.py"

KeyboardInterrupt: 