In [13]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [14]:
# Load datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_sub = pd.read_csv('testSubmissionFile.csv')

# Display first few rows
train.head()

Unnamed: 0,auctionId,timeStamp,placementId,websiteId,hashedRefererDeepThree,country,opeartingSystem,browser,browserVersion,device,environmentType,integrationType,articleSafenessCategorization,isSold
0,001ed16b-dd08-4599-b8ef-4f56a373c454_6e5f1087-...,1603815466,120706,68203,1ae7c2d3c28b711c072d8e2eb3869fa59090669bdc153e...,US,Windows,Chrome,86_0,PC,js-web,2,safe,False
1,0024b36a-4fb5-4070-88fb-fc0bfb1909ed,1603974586,69454,42543,df1108bf6ae49dbccf5eab60ff9d04a6a09dda60ec7290...,RO,Android,Facebook App,293_0,Phone,js-fbwv,1,unsafe,False
2,003630fa-ad63-4283-be1b-141670132d70_f37c2b23-...,1604229969,100170,57703,cc6957e8aec85a4d920991c53874c5d0780bbfbd469802...,UK,Android,Facebook App,294_0,Phone,js-web,2,safe,True
3,0048c65a-ce76-43ba-98d2-8e87607468f8,1604156610,100446,57797,7fc0bb7a65d074e003cce786cda2b070f80dd47179c4b9...,ES,Android,Chrome Mobile,86_0,Phone,js-ampsf,1,safe,True
4,0056b8a7-54f9-4ac8-8d50-f725bf377872,1604004493,119517,67613,3a6552ccbf66ad166aa9005c3e08f70716abd676cfd87b...,FR,Android,Facebook App,293_0,Phone,js-fbwv,1,unsafe,False


In [15]:
# ==============================================================
# preprocessing_optimized.py
# ==============================================================
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import KFold, train_test_split

# --------------------------------------------------------------------
# Helpers
# --------------------------------------------------------------------
def add_cyclic_features(df, col, period):
    df[f"{col}_sin"] = np.sin(2 * np.pi * df[col] / period)
    df[f"{col}_cos"] = np.cos(2 * np.pi * df[col] / period)
    return df

def frequency_encode(train_col, test_col, min_count=10):
    freq = train_col.value_counts()
    freq[freq < min_count] = min_count
    freq = freq / freq.sum()
    return train_col.map(freq).fillna(0), test_col.map(freq).fillna(0)

def add_count_feature(train_df, test_df, col):
    cnt = train_df[col].value_counts()
    train_df[f"{col}_cnt"] = train_df[col].map(cnt).fillna(1)
    test_df[f"{col}_cnt"]  = test_df[col].map(cnt).fillna(1)
    return train_df, test_df

def add_target_mean_encoding(train_df, test_df, cols, target="isSold",
                             n_splits=5, smoothing=20):
    global_mean = train_df[target].mean()
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for col in cols:
        oof = pd.Series(index=train_df.index, dtype=float)

        for tr_idx, val_idx in kf.split(train_df):
            tr, val = train_df.iloc[tr_idx], train_df.iloc[val_idx]
            stats = tr.groupby(col)[target].agg(["mean", "count"])
            smooth = (stats["count"]*stats["mean"] + smoothing*global_mean) / \
                     (stats["count"] + smoothing)
            oof.iloc[val_idx] = val[col].map(smooth).fillna(global_mean)

        train_df[f"{col}_te"] = oof

        full_stats = train_df.groupby(col)[target].agg(["mean", "count"])
        smooth_full = (full_stats["count"]*full_stats["mean"] + smoothing*global_mean) / \
                      (full_stats["count"] + smoothing)
        test_df[f"{col}_te"] = test_df[col].map(smooth_full).fillna(global_mean)

    return train_df, test_df

# --------------------------------------------------------------------
# Main preprocessing
# --------------------------------------------------------------------
def preprocess(train_path: str | Path,
               test_path: str | Path,
               val_size: float = 0.2,
               random_state: int = 42):
    # 1. Load
    train = pd.read_csv(train_path)
    test  = pd.read_csv(test_path)

    test_ids = test["auctionId"].copy()

    # 2. Time features
    for df in (train, test):
        df["datetime"] = pd.to_datetime(df["timeStamp"], unit="s")

        df["year"]       = df["datetime"].dt.year
        df["month"]      = df["datetime"].dt.month
        df["day"]        = df["datetime"].dt.day
        df["hour"]       = df["datetime"].dt.hour
        df["dayofweek"]  = df["datetime"].dt.dayofweek
        df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype("int8")

        add_cyclic_features(df, "hour",      24)
        add_cyclic_features(df, "dayofweek",  7)
        add_cyclic_features(df, "month",     12)

        df["is_night"]   = df["hour"].between(0, 5).astype("int8")
        df["is_evening"] = df["hour"].between(18, 23).astype("int8")

    t0 = train["datetime"].min()
    for df in (train, test):
        df["days_since_start"] = (df["datetime"] - t0).dt.total_seconds() / 86_400

    train.drop(["timeStamp", "datetime"], axis=1, inplace=True)
    test.drop(["timeStamp", "datetime"],   axis=1, inplace=True)

    # 3. High-cardinality encodings
    hi_card_cols = ["hashedRefererDeepThree", "browserVersion"]
    for col in hi_card_cols:
        tr_freq, te_freq = frequency_encode(train[col], test[col], min_count=20)
        train[f"{col}_freq"] = tr_freq
        test[f"{col}_freq"]  = te_freq

        train, test = add_count_feature(train, test, col)

        if train[col].nunique() > 1_000:
            train.drop(columns=[col], inplace=True)
            test.drop(columns=[col], inplace=True)

    # 4. Target mean encoding (placementId, websiteId)
    train, test = add_target_mean_encoding(
        train, test,
        cols=["placementId", "websiteId"],
        target="isSold",
        n_splits=5,
        smoothing=20
    )

    # Facultatif : on peut aussi dropper les brutes si trop volumineuses
    train.drop(columns=["placementId", "websiteId"], inplace=True)
    test.drop(columns=["placementId", "websiteId"],  inplace=True)

    # 5. Identifiants
    train.drop(["auctionId"], axis=1, inplace=True)
    test.drop(["auctionId"],  axis=1, inplace=True)

    # 6. Target / features
    y = train["isSold"].copy()
    X = train.drop("isSold", axis=1)

    # 7. One-hot des petites catégorielles
    small_cat = [
        "country", "opeartingSystem", "browser", "device",
        "environmentType", "articleSafenessCategorization",
        "dayofweek", "month"
    ]
    small_cat = [c for c in small_cat if c in X.columns]

    X_test_merge = pd.concat([X, test], axis=0, sort=False)
    X_test_merge = pd.get_dummies(
        X_test_merge,
        columns=small_cat,
        dummy_na=False
    )

    X_enc    = X_test_merge.iloc[: len(X), :].reset_index(drop=True)
    test_enc = X_test_merge.iloc[len(X):, :].reset_index(drop=True)

    # 8. Train / validation split
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_enc, y,
        test_size=val_size,
        random_state=random_state,
        stratify=y
    )

    print(f"🏷️  Train shape : {X_tr.shape}")
    print(f"🏷️  Val   shape : {X_val.shape}")
    print(f"🏷️  Test  shape : {test_enc.shape}")

    return X_tr, X_val, y_tr, y_val, test_enc, test_ids

# --------------------------------------------------------------------
# Example usage
# --------------------------------------------------------------------
if __name__ == "__main__":
    X_train, X_val, y_train, y_val, X_test, test_ids = preprocess(
        "train.csv",
        "test.csv",
        val_size=0.2,
        random_state=42
    )

    # Vous pouvez ensuite entraîner votre modèle :
    #  from lightgbm import LGBMClassifier
    #  model = LGBMClassifier(...)
    #  model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric="auc", ...)

X_train, X_val, y_train, y_val, X_test, test_ids = preprocess(
    "train.csv",
    "test.csv",
    val_size=0.2,
    random_state=42
)

🏷️  Train shape : (773865, 403)
🏷️  Val   shape : (193467, 403)
🏷️  Test  shape : (242171, 403)
🏷️  Train shape : (773865, 403)
🏷️  Val   shape : (193467, 403)
🏷️  Test  shape : (242171, 403)


In [None]:
import xgboost as xgb
import pandas as pd

# -----------------------------------------------------------------
# 👇 Conversion des dtypes pour XGBoost « legacy »
# -----------------------------------------------------------------
def make_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Remplace category/object/bool par des int/float consommables par XGB."""
    df = df.copy()

    # 1. Category / object  →  int32 (codes)
    cat_cols = df.select_dtypes(include=["category", "object"]).columns
    for col in cat_cols:
        df[col] = df[col].astype("category").cat.codes.astype("int32")

    # 2. Bool → uint8
    bool_cols = df.select_dtypes(include=["bool"]).columns
    if len(bool_cols):
        df[bool_cols] = df[bool_cols].astype("uint8")

    # 3. Tout le reste en float32 (plus compact)
    return df.astype("float32")

X_train_num = make_numeric(X_train)
X_val_num   = make_numeric(X_val)
X_test_num  = make_numeric(X_test)

# -----------------------------------------------------------------
#  1. DMatrix 
# -----------------------------------------------------------------
dtrain = xgb.DMatrix(X_train_num.values, label=y_train.values)
dval   = xgb.DMatrix(X_val_num.values,   label=y_val.values)
dtest  = xgb.DMatrix(X_test_num.values)

# -----------------------------------------------------------------
# 2. Paramètres
# -----------------------------------------------------------------
pos = y_train.sum()
neg = len(y_train) - pos
scale_pos_weight = neg / pos

params = {
    "objective":       "binary:logistic",
    "eval_metric":     "auc",
    "eta":             0.02,     # learning_rate
    "max_depth":       8,
    "min_child_weight":3,
    "gamma":           0.1,
    "subsample":       0.8,
    "colsample_bytree":0.8,
    "lambda":          10,       # L2
    "alpha":           1,        # L1
    "scale_pos_weight":scale_pos_weight,
    "max_bin":         256,
    # CPU hist; si vous avez un GPU mais ancienne version, remplacez par:
    # "tree_method": "gpu_hist"
}

# -----------------------------------------------------------------
# 3. Entraînement avec early stopping
# -----------------------------------------------------------------
evals = [(dtrain, "train"), (dval, "val")]
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=8000,          # borne haute
    evals=evals,
    early_stopping_rounds=300,
    verbose_eval=200               # AUC affichée toutes les 200 it.
)

print(f"Best iteration : {bst.best_iteration}")
print(f"Best AUC (val) : {bst.best_score:.5f}")

[0]	train-auc:0.82723	val-auc:0.82293
[200]	train-auc:0.85744	val-auc:0.85285
[400]	train-auc:0.86197	val-auc:0.85699
[600]	train-auc:0.86483	val-auc:0.85951
[800]	train-auc:0.86692	val-auc:0.86128
[1000]	train-auc:0.86885	val-auc:0.86279
[1200]	train-auc:0.87044	val-auc:0.86401
[1400]	train-auc:0.87202	val-auc:0.86513
[1600]	train-auc:0.87349	val-auc:0.86613
[1800]	train-auc:0.87486	val-auc:0.86702
[2000]	train-auc:0.87624	val-auc:0.86788
[2200]	train-auc:0.87740	val-auc:0.86859
[2400]	train-auc:0.87863	val-auc:0.86926
[2600]	train-auc:0.87982	val-auc:0.86992
[2800]	train-auc:0.88093	val-auc:0.87048
[3000]	train-auc:0.88209	val-auc:0.87106
[3200]	train-auc:0.88321	val-auc:0.87157
[3400]	train-auc:0.88444	val-auc:0.87210
[3600]	train-auc:0.88553	val-auc:0.87256
[3800]	train-auc:0.88663	val-auc:0.87297
[4000]	train-auc:0.88767	val-auc:0.87336
[4200]	train-auc:0.88868	val-auc:0.87370
[4400]	train-auc:0.88973	val-auc:0.87402
[4600]	train-auc:0.89079	val-auc:0.87431
[4800]	train-auc:0.8918

AttributeError: 'Booster' object has no attribute 'best_ntree_limit'

In [21]:
X_full_num  = make_numeric(pd.concat([X_train, X_val]))
y_full      = pd.concat([y_train, y_val])

dtrain_full = xgb.DMatrix(X_full_num.values, label=y_full.values)

final_model = xgb.train(
    params=params,              # les mêmes qu'avant
    dtrain=dtrain_full,
    num_boost_round=bst.best_iteration   # 7967
)

final_proba = final_model.predict(dtest)   # pas besoin d’ntree_limit

pd.DataFrame({
    "auctionId": test_ids,
    "isSold": final_proba
}).to_csv("submission_xgb_full.csv", index=False)
print("✅ submission_xgb_full.csv enregistré")


✅ submission_xgb_full.csv enregistré


In [22]:
# --- binarisation simple à 0.5 ----------------------------------
final_pred = (final_proba >= 0.5).astype(int)

submission_lbl = pd.DataFrame({
    "auctionId": test_ids,
    "isSold": final_pred
})
submission_lbl.to_csv("submission_xgb_labels.csv", index=False)
print("✅ submission_xgb_labels.csv enregistré (0/1)")


✅ submission_xgb_labels.csv enregistré (0/1)


In [19]:
import optuna
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# ---------- DMatrix prêts (numériques) ----------
dtrain = xgb.DMatrix(X_train_num, label=y_train)
dval   = xgb.DMatrix(X_val_num,   label=y_val)

def objective(trial):
    # --- espace de recherche ---
    param = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "gpu_hist",          # ou 'hist'
        "eta": trial.suggest_float("eta", 0.01, 0.2, log=True),
        "max_depth": trial.suggest_int("max_depth", 4, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        # 🔧 gamma : borne basse > 0
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        # --- le reste inchangé ---
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "lambda": trial.suggest_float("lambda", 1e-3, 100.0, log=True),
        "alpha":  trial.suggest_float("alpha", 1e-3, 10.0,  log=True),
        "max_bin": trial.suggest_categorical("max_bin", [128, 256, 512]),
        "scale_pos_weight": (len(y_train)-y_train.sum())/y_train.sum(),
    }

    # --- entraînement avec early-stopping ---
    bst = xgb.train(
        params=param,
        dtrain=dtrain,
        num_boost_round=6_000,
        evals=[(dval, "val")],
        early_stopping_rounds=300,
        verbose_eval=False      # silence pendant l’optim
    )

    return bst.best_score       # Optuna minimise, donc on retournera -AUC

study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=60, timeout=3_600)   # 60 essais ou 1 h

print("Best AUC :", study.best_value)
print("Best params :")
for k, v in study.best_params.items():
    print(f"  {k} = {v}")


[I 2025-06-04 13:02:28,508] A new study created in memory with name: no-name-ffe268f9-b977-48c0-8e8b-cc6d6b02eb99

    E.g. tree_method = "hist", device = "cuda"

  self.starting_round = model.num_boosted_rounds()
[I 2025-06-04 13:03:44,064] Trial 0 finished with value: 0.8727766750378203 and parameters: {'eta': 0.16433679256475983, 'max_depth': 6, 'min_child_weight': 6, 'gamma': 1.9288060215496997e-08, 'subsample': 0.5372422973343957, 'colsample_bytree': 0.8080561814592002, 'lambda': 0.6811936303317784, 'alpha': 0.0015762625805149618, 'max_bin': 256}. Best is trial 0 with value: 0.8727766750378203.

    E.g. tree_method = "hist", device = "cuda"

  self.starting_round = model.num_boosted_rounds()
[I 2025-06-04 13:08:26,356] Trial 1 finished with value: 0.8771113891010199 and parameters: {'eta': 0.020335185302104927, 'max_depth': 10, 'min_child_weight': 4, 'gamma': 0.10491327899071787, 'subsample': 0.9223307554707358, 'colsample_bytree': 0.9265845752307738, 'lambda': 23.49996780029883,

KeyboardInterrupt: 