In [None]:
# ===========================================
# kaggle_pipeline_lgb_optuna.py
# ===========================================
# ▶ Avant de lancer :
#   pip install lightgbm optuna category_encoders
#   (GPU : pip install lightgbm-gpu)
# ===========================================

import os
import warnings
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

import optuna.integration.lightgbm as lgb_optuna
import lightgbm as lgb
# import category_encoders as ce   # pour le frequency encoding

warnings.filterwarnings("ignore")

# ------------------------------------------------------------------------------
# 1. Data loading
# ------------------------------------------------------------------------------
# All input/output files are resolved relative to DATA_DIR; adjust if needed.
DATA_DIR = Path(".")
TRAIN_CSV = DATA_DIR.joinpath("train.csv")
TEST_CSV = DATA_DIR.joinpath("test.csv")
SUB_CSV = DATA_DIR.joinpath("testSubmissionFile.csv")

# Load the raw training and test tables.
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

# ------------------------------------------------------------------------------
# 2. Feature engineering
# ------------------------------------------------------------------------------
def add_datetime_features(df: pd.DataFrame, ts_col: str = "timeStamp") -> pd.DataFrame:
    """Return a new frame where *ts_col* is replaced by calendar and cyclic features.

    The timestamp column is parsed with ``unit="s"`` (numeric seconds since the
    Unix epoch). The caller's frame is NOT modified: the features are attached
    to the copy produced by dropping *ts_col*.

    Args:
        df: Input frame containing the column *ts_col*.
        ts_col: Name of the raw timestamp column to expand and drop.

    Returns:
        A new DataFrame without *ts_col* and with the columns ``year``,
        ``month``, ``day``, ``hour``, ``dayofweek`` (0 = Monday),
        ``is_weekend`` (int8) and the sin/cos pairs ``hour_*``, ``dow_*``
        and ``month_*`` appended.

    Raises:
        KeyError: If *ts_col* is not a column of *df*.
    """
    dt = pd.to_datetime(df[ts_col], unit="s").dt
    hour, dayofweek, month = dt.hour, dt.dayofweek, dt.month
    two_pi = 2 * np.pi

    features = {
        "year": dt.year,
        "month": month,
        "day": dt.day,
        "hour": hour,
        "dayofweek": dayofweek,  # 0 = Monday
        "is_weekend": dayofweek.isin([5, 6]).astype("int8"),
        # Cyclic (sin/cos) encoding so that e.g. hour 23 sits next to hour 0.
        "hour_sin": np.sin(two_pi * hour / 24),
        "hour_cos": np.cos(two_pi * hour / 24),
        "dow_sin": np.sin(two_pi * dayofweek / 7),
        "dow_cos": np.cos(two_pi * dayofweek / 7),
        "month_sin": np.sin(two_pi * month / 12),
        "month_cos": np.cos(two_pi * month / 12),
    }

    # drop() returns a new frame, so assigning onto it never touches `df`.
    return df.drop(columns=[ts_col]).assign(**features)


def frequency_encode(train_series: pd.Series,
                     test_series: pd.Series,
                     min_count: int = 10) -> tuple[pd.Series, pd.Series]:
    """Replace each category by its relative frequency measured on the train series.

    Counts below *min_count* are raised to *min_count* before normalising, so
    all rare categories share one encoded value. The table is built from
    *train_series* only and applied to both series; any value with no entry in
    that table is encoded as 0.

    Args:
        train_series: Categorical values used to build the frequency table.
        test_series: Values encoded with the table learned on *train_series*.
        min_count: Floor applied to the per-category counts.

    Returns:
        ``(encoded_train, encoded_test)``.
    """
    counts = train_series.value_counts()
    # Simple smoothing: every count under the floor is lifted to the floor.
    floored = counts.where(counts >= min_count, min_count)
    shares = floored / floored.sum()

    encoded_train = train_series.map(shares).fillna(0)
    encoded_test = test_series.map(shares).fillna(0)
    return encoded_train, encoded_test


def preprocess(train_df: pd.DataFrame, test_df: pd.DataFrame):
    """Clean both frames, build features and encode the categorical columns.

    Steps, in order:
      1. Pull the target ``isSold`` out of the train frame and ``auctionId``
         out of the test frame, then drop ``auctionId`` from both.
      2. Expand the timestamp column via ``add_datetime_features``.
      3. Frequency-encode the high-cardinality columns (``<col>_freq``) and
         drop their raw versions.
      4. Cast the remaining categorical columns to pandas ``category`` dtype,
         using the train categories for both frames.

    Args:
        train_df: Raw training frame; must contain ``isSold`` and ``auctionId``.
        test_df: Raw test frame; must contain ``auctionId``.

    Returns:
        A 5-tuple ``(X_train, X_test, y, test_ids, cat_cols)`` where
        ``cat_cols`` lists the columns cast to ``category``.
    """
    # --- target and identifiers -------------------------------------------------
    y = train_df["isSold"].copy()
    train_df = train_df.drop(columns=["isSold"])

    test_ids = test_df["auctionId"].copy()

    # auctionId is kept (as test_ids) for the submission only, never as a feature.
    train_df = train_df.drop(columns=["auctionId"])
    test_df = test_df.drop(columns=["auctionId"])

    # ----------------------------------------------------------------------------
    # 2.1. Datetime features
    train_df = add_datetime_features(train_df)
    test_df = add_datetime_features(test_df)

    # ----------------------------------------------------------------------------
    # 2.2. High cardinality: frequency encoding
    hi_card_cols = [
        "hashedRefererDeepThree",
        "placementId",
        "websiteId",
        "browserVersion",
    ]

    for col in hi_card_cols:
        if col in train_df.columns:
            tr_enc, te_enc = frequency_encode(train_df[col], test_df[col])
            train_df[f"{col}_freq"] = tr_enc
            test_df[f"{col}_freq"] = te_enc

            # Drop the raw column (it is not passed on as a categorical feature).
            train_df = train_df.drop(columns=[col])
            test_df = test_df.drop(columns=[col])

    # ----------------------------------------------------------------------------
    # 2.3. "Clean" categorical columns (low / medium cardinality)
    cat_cols = [
        "country",
        "opeartingSystem",
        "browser",
        "device",
        "environmentType",
        "articleSafenessCategorization",
        "year",        # possibly a single year; could be left numeric or categorical
        # month/day/hour are kept as numeric + cyclic features instead
    ]
    cat_cols = [c for c in cat_cols if c in train_df.columns]

    # LightGBM expects the pandas `category` dtype. The test column is cast with
    # the dtype learned on train so that both frames share the same category
    # list (and therefore the same integer codes). Values that occur only in
    # test are not in that list and become NaN.
    for col in cat_cols:
        train_df[col] = train_df[col].astype("category")
        test_df[col] = test_df[col].astype(train_df[col].dtype)

    # ----------------------------------------------------------------------------
    # 2.4. Optional scaling (not needed for LightGBM)
    # ----------------------------------------------------------------------------
    print(f"Features finales : {train_df.shape[1]}")

    return train_df, test_df, y, test_ids, cat_cols


# Run the feature pipeline on copies of the raw frames.
(
    X_train_full,
    X_test_full,
    y_full,
    test_ids,
    cat_features,
) = preprocess(train.copy(), test.copy())

# ------------------------------------------------------------------------------
# 3. Train/validation split
# ------------------------------------------------------------------------------
# 80/20 split, stratified on the target, with a fixed seed.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_full, y_full, test_size=0.2, random_state=42, stratify=y_full
)

# ------------------------------------------------------------------------------
# 4. Optuna: LightGBM hyper-parameter search
# ------------------------------------------------------------------------------
print("\n=== Optuna tuning LightGBM ===")

# LightGBMTunerCV runs its own cross-validation on `lgb_train` (see `folds`
# below); early stopping comes from the callbacks. `lgb_valid` is built here
# but is not passed to the tuner.
lgb_train = lgb.Dataset(
    X_tr,
    y_tr,
    categorical_feature=cat_features,
    free_raw_data=False,
)
lgb_valid = lgb.Dataset(
    X_val,
    y_val,
    reference=lgb_train,
    categorical_feature=cat_features,
    free_raw_data=False,
)

base_params = dict(
    objective="binary",
    metric="auc",
    boosting_type="gbdt",
    verbosity=-1,
    feature_pre_filter=False,     # keep every feature
    seed=42,
    n_jobs=-1,
    # GPU: uncomment if available
    # device="gpu",
    # gpu_platform_id=0,
    # gpu_device_id=0,
)

from lightgbm import early_stopping, log_evaluation

tuner = lgb_optuna.LightGBMTunerCV(
    params=base_params,
    train_set=lgb_train,
    folds=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    num_boost_round=10_000,
    callbacks=[early_stopping(stopping_rounds=200), log_evaluation(period=200)],
)

# Launch the hyper-parameter search.
tuner.run()

best_params = tuner.best_params
best_score = tuner.best_score
print(f"Best AUC (CV) : {best_score:.5f}")
print("Best params :", best_params)

# ------------------------------------------------------------------------------
# 5. Hold-out fit: validation AUC + number of boosting rounds
# ------------------------------------------------------------------------------
best_params.update({"metric": "auc", "objective": "binary", "verbosity": -1})

# Fit on the 80 % split only, so that X_val has not been used for training when
# it is scored below. (Scoring a model trained on X_train_full against X_val
# would measure training rows, since X_val is a subset of X_train_full.)
# Fresh Dataset objects are built instead of reusing the ones given to the tuner.
holdout_train = lgb.Dataset(X_tr, y_tr, categorical_feature=cat_features)
holdout_valid = lgb.Dataset(
    X_val,
    y_val,
    reference=holdout_train,
    categorical_feature=cat_features,
)

holdout_model = lgb.train(
    best_params,
    holdout_train,
    num_boost_round=10_000,
    valid_sets=[holdout_valid],
    valid_names=["valid"],
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=200),
    ],
)

# The round count for the final model is taken from this fit's early stopping.
# Fall back to the number of trained rounds if no best iteration was recorded.
best_iteration = holdout_model.best_iteration or holdout_model.current_iteration()

# NOTE: early stopping was monitored on X_val too, so this AUC is still mildly
# optimistic, but the model has never been trained on these rows.
val_pred = holdout_model.predict(X_val, num_iteration=best_iteration)
offline_auc = roc_auc_score(y_val, val_pred)
print(f"Validation AUC (hold-out) : {offline_auc:.5f}")

# ------------------------------------------------------------------------------
# 6. Retrain on the full training set with the best parameters
# ------------------------------------------------------------------------------
full_dataset = lgb.Dataset(X_train_full, y_full, categorical_feature=cat_features)

final_model = lgb.train(
    best_params,
    full_dataset,
    num_boost_round=best_iteration,
)

# ------------------------------------------------------------------------------
# 7. Test prediction + submission file
# ------------------------------------------------------------------------------
SUB_PATH = DATA_DIR / "submission_lgb_optuna.csv"

# Score the test set with the final model and pair each score with its auction id.
test_pred = final_model.predict(X_test_full)
submission = test_ids.to_frame(name="auctionId").assign(isSold=test_pred)

submission.to_csv(SUB_PATH, index=False)
print(f"✅ Submission sauvegardée → {SUB_PATH.resolve()}")


Features finales : 23


[I 2025-06-04 11:42:34,288] A new study created in memory with name: no-name-b150637d-1a5e-4902-afd5-c69db42127b4



=== Optuna tuning LightGBM ===


feature_fraction, val_score: -inf:   0%|          | 0/7 [00:00<?, ?it/s]

Training until validation scores don't improve for 200 rounds
[200]	valid's auc: 0.843585 + 0.000480936
[400]	valid's auc: 0.85569 + 0.000954555
[600]	valid's auc: 0.861167 + 0.000993331
[800]	valid's auc: 0.864367 + 0.00107739
[1000]	valid's auc: 0.866463 + 0.00103248
[1200]	valid's auc: 0.868093 + 0.0011511
[1400]	valid's auc: 0.869222 + 0.00108625
[1600]	valid's auc: 0.870049 + 0.00104987
[1800]	valid's auc: 0.870748 + 0.000974653
[2000]	valid's auc: 0.871347 + 0.000944506
[2200]	valid's auc: 0.871804 + 0.000928164
[2400]	valid's auc: 0.872225 + 0.000900202
[2600]	valid's auc: 0.872535 + 0.000925801
[2800]	valid's auc: 0.872812 + 0.00086406
[3000]	valid's auc: 0.873074 + 0.000845381
[3200]	valid's auc: 0.873231 + 0.00084837
[3400]	valid's auc: 0.873409 + 0.000812699
[3600]	valid's auc: 0.873578 + 0.000767851
[3800]	valid's auc: 0.873701 + 0.000768022
[4000]	valid's auc: 0.873814 + 0.000784611
[4200]	valid's auc: 0.873865 + 0.000858777
[4400]	valid's auc: 0.873915 + 0.000878409
[4600

feature_fraction, val_score: 0.874107:  14%|#4        | 1/7 [04:31<27:09, 271.51s/it][I 2025-06-04 11:47:05,799] Trial 0 finished with value: 0.8741071732770184 and parameters: {'feature_fraction': 1.0}. Best is trial 0 with value: 0.8741071732770184.
feature_fraction, val_score: 0.874107:  14%|#4        | 1/7 [04:31<27:09, 271.51s/it]

Early stopping, best iteration is:
[5390]	valid's auc: 0.874107 + 0.000861722
Training until validation scores don't improve for 200 rounds
[200]	valid's auc: 0.842124 + 0.000979402
[400]	valid's auc: 0.854267 + 0.0013537
[600]	valid's auc: 0.859809 + 0.00114546
[800]	valid's auc: 0.863031 + 0.00103116
[1000]	valid's auc: 0.865198 + 0.000912375
[1200]	valid's auc: 0.86687 + 0.000905311
[1400]	valid's auc: 0.867955 + 0.000840021
[1600]	valid's auc: 0.86899 + 0.000808273
[1800]	valid's auc: 0.869695 + 0.00074907
[2000]	valid's auc: 0.870394 + 0.000758287
[2200]	valid's auc: 0.870947 + 0.000762677
[2400]	valid's auc: 0.871488 + 0.000862418
[2600]	valid's auc: 0.871869 + 0.000876528
[2800]	valid's auc: 0.872253 + 0.000892153
[3000]	valid's auc: 0.872501 + 0.000836589
[3200]	valid's auc: 0.872735 + 0.00080807
[3400]	valid's auc: 0.872929 + 0.000827841
[3600]	valid's auc: 0.873077 + 0.000794973
[3800]	valid's auc: 0.873228 + 0.000808616
[4000]	valid's auc: 0.873334 + 0.000818302
[4200]	valid

feature_fraction, val_score: 0.874107:  29%|##8       | 2/7 [10:33<27:02, 324.60s/it][I 2025-06-04 11:53:07,565] Trial 1 finished with value: 0.8739843015987298 and parameters: {'feature_fraction': 0.6}. Best is trial 0 with value: 0.8741071732770184.


Early stopping, best iteration is:
[6243]	valid's auc: 0.873984 + 0.000731532


feature_fraction, val_score: 0.874107:  29%|##8       | 2/7 [10:33<27:02, 324.60s/it]

Training until validation scores don't improve for 200 rounds
[200]	valid's auc: 0.842161 + 0.000831851
[400]	valid's auc: 0.85406 + 0.00127011
[600]	valid's auc: 0.859232 + 0.00103704
[800]	valid's auc: 0.862542 + 0.00116237
[1000]	valid's auc: 0.864745 + 0.00104094
[1200]	valid's auc: 0.866395 + 0.00088719
[1400]	valid's auc: 0.867597 + 0.000909247
[1600]	valid's auc: 0.868547 + 0.000798246
[1800]	valid's auc: 0.869314 + 0.000793793
[2000]	valid's auc: 0.869993 + 0.000778508
[2200]	valid's auc: 0.87058 + 0.000805785
[2400]	valid's auc: 0.87109 + 0.0007802
[2600]	valid's auc: 0.871536 + 0.000750728
[2800]	valid's auc: 0.871866 + 0.000689093
[3000]	valid's auc: 0.872171 + 0.000685924
[3200]	valid's auc: 0.872448 + 0.000712427
[3400]	valid's auc: 0.872736 + 0.000704434
[3600]	valid's auc: 0.8729 + 0.000657846
[3800]	valid's auc: 0.873067 + 0.000619138
[4000]	valid's auc: 0.873222 + 0.000629951
[4200]	valid's auc: 0.873332 + 0.000649273
[4400]	valid's auc: 0.873445 + 0.000656838
[4600]	v

feature_fraction, val_score: 0.874107:  43%|####2     | 3/7 [17:22<24:13, 363.33s/it][I 2025-06-04 11:59:56,979] Trial 2 finished with value: 0.8738887944105439 and parameters: {'feature_fraction': 0.5}. Best is trial 0 with value: 0.8741071732770184.
feature_fraction, val_score: 0.874107:  43%|####2     | 3/7 [17:22<24:13, 363.33s/it]

Early stopping, best iteration is:
[6301]	valid's auc: 0.873889 + 0.000671793
Training until validation scores don't improve for 200 rounds
[200]	valid's auc: 0.843146 + 0.00113934
[400]	valid's auc: 0.85501 + 0.00143863
[600]	valid's auc: 0.860494 + 0.00109961
[800]	valid's auc: 0.863609 + 0.000982443
[1000]	valid's auc: 0.865863 + 0.00098619
[1200]	valid's auc: 0.867411 + 0.00102219
[1400]	valid's auc: 0.868547 + 0.00109691
[1600]	valid's auc: 0.869508 + 0.0010216
[1800]	valid's auc: 0.870263 + 0.00093042
[2000]	valid's auc: 0.870932 + 0.000923264
[2200]	valid's auc: 0.871428 + 0.00088891
[2400]	valid's auc: 0.871837 + 0.00084634
[2600]	valid's auc: 0.872213 + 0.000816838
[2800]	valid's auc: 0.872461 + 0.000818362
[3000]	valid's auc: 0.872762 + 0.000792672
[3200]	valid's auc: 0.872974 + 0.000807293
[3400]	valid's auc: 0.873134 + 0.000810242
[3600]	valid's auc: 0.873314 + 0.000787942
[3800]	valid's auc: 0.873505 + 0.000831391
[4000]	valid's auc: 0.873685 + 0.000795085
[4200]	valid's a

feature_fraction, val_score: 0.874243:  57%|#####7    | 4/7 [22:38<17:13, 344.50s/it][I 2025-06-04 12:05:12,604] Trial 3 finished with value: 0.8742430723457704 and parameters: {'feature_fraction': 0.7}. Best is trial 3 with value: 0.8742430723457704.
feature_fraction, val_score: 0.874243:  57%|#####7    | 4/7 [22:38<17:13, 344.50s/it]

Early stopping, best iteration is:
[5951]	valid's auc: 0.874243 + 0.000821072
Training until validation scores don't improve for 200 rounds
[200]	valid's auc: 0.83955 + 0.000996415
[400]	valid's auc: 0.851256 + 0.00115059
[600]	valid's auc: 0.856942 + 0.00111254
[800]	valid's auc: 0.860357 + 0.00110288
