In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [81]:
# === 1. Load the data ===
# Both files are comma-separated CSVs read from the working directory.
train, test = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in ("train.csv", "test.csv")
)

# Print the column / dtype / non-null summary of the training frame.
train.info()

# Keep the test identifiers aside: they are dropped from the features below
# but are needed again to build the submission file.
test_ids = test['auctionId']

# Columns excluded from the feature matrices.
drop_cols = [
    'auctionId',
    'hashedRefererDeepThree',
]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 967332 entries, 0 to 967331
Data columns (total 14 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   auctionId                      967332 non-null  object
 1   timeStamp                      967332 non-null  int64 
 2   placementId                    967332 non-null  int64 
 3   websiteId                      967332 non-null  int64 
 4   hashedRefererDeepThree         951349 non-null  object
 5   country                        966900 non-null  object
 6   opeartingSystem                967332 non-null  object
 7   browser                        967332 non-null  object
 8   browserVersion                 837388 non-null  object
 9   device                         967332 non-null  object
 10  environmentType                967328 non-null  object
 11  integrationType                967332 non-null  int64 
 12  articleSafenessCategorization  967332 non-nu

In [77]:
# === 2. Prepare the data ===
# Features = every column except the ignored ones and the target.
X_train = train.drop(columns=drop_cols + ['isSold'])
y_train = train['isSold']

X_test = test.drop(columns=drop_cols)

# Convert object columns to 'category' for XGBoost.
# The test column is cast with the *train* column's categorical dtype instead of
# an independent astype('category'): two independent casts each build their own
# label -> integer-code table, so the same label could end up with a different
# code in X_test than in X_train. With a shared dtype the codes agree, and any
# label that never occurs in train becomes NaN in X_test (XGBoost treats NaN as
# a missing value).
for col in X_train.select_dtypes(include='object').columns:
    X_train[col] = X_train[col].astype('category')
    if col in X_test.columns:
        X_test[col] = X_test[col].astype(X_train[col].dtype)

In [78]:
# === 2. Hyper-parameters to try ===
# range(1, 201, 50) yields 1, 51, 101, 151.
n_estimators_list = range(1, 201, 50)

# === 3. Cross-validation ===
# 5 stratified, shuffled folds with a fixed seed.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# === 4. Score storage ===
# One (initially empty) list of mean CV scores per metric name.
scores_dict = {
    metric_name: []
    for metric_name in ('accuracy', 'precision', 'recall', 'f1', 'auc')
}

In [79]:
# NOTE(review): this entire cell is commented out (disabled manual sweep over
# n_estimators_list). As written it references `X` and `y`, which are not
# defined in the cells visible above, and it rebinds the names X_train / y_train
# to a single CV fold inside the loop — fix both before re-enabling it.
# # === 5. Main loop ===
# for n in n_estimators_list:
#     print("Estimators :", n)
#     accs, precs, recs, f1s, aucs = [], [], [], [], []

#     for train_idx, val_idx in cv.split(X, y):

#         X_clean = X.copy()
#         for col in X_clean.select_dtypes(include='object'):
#             X_clean[col] = X_clean[col].astype('category')

#         X_train, X_val = X_clean.iloc[train_idx], X_clean.iloc[val_idx]
#         y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

#         model = XGBClassifier(n_estimators=n, eval_metric='logloss', enable_categorical=True)
#         model.fit(X_train, y_train)

#         y_pred = model.predict(X_val)
#         y_prob = model.predict_proba(X_val)[:, 1]

#         accs.append(accuracy_score(y_val, y_pred))
#         precs.append(precision_score(y_val, y_pred, zero_division=0))
#         recs.append(recall_score(y_val, y_pred, zero_division=0))
#         f1s.append(f1_score(y_val, y_pred, zero_division=0))
#         aucs.append(roc_auc_score(y_val, y_prob))

#     # Mean of the fold scores for this n_estimators
#     scores_dict['accuracy'].append(np.mean(accs))
#     scores_dict['precision'].append(np.mean(precs))
#     scores_dict['recall'].append(np.mean(recs))
#     scores_dict['f1'].append(np.mean(f1s))
#     scores_dict['auc'].append(np.mean(aucs))

# # === 6. Plot the curves ===
# plt.figure(figsize=(10, 6))
# for metric, values in scores_dict.items():
#     print(metric, len(values))
#     plt.plot(n_estimators_list, values, label=metric)

# plt.xlabel("n_estimators")
# plt.ylabel("Score (CV 5 folds)")
# plt.title("Évolution des scores selon n_estimators (XGBoost)")
# plt.legend()
# plt.grid(True)
# plt.tight_layout()
# plt.show()

In [80]:
import optuna, numpy as np, pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score,
                             recall_score, f1_score, roc_auc_score)

# ---------- 1. Prepare X_train / y_train as before ----------
# (object columns -> category, etc.)

# Re-declares `cv` with the same arguments as the earlier cross-validation cell.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# ---------- 2. Define the Optuna objective ----------
def objective(trial):
    """Score one Optuna trial: mean F1 of an XGBClassifier over the `cv` folds.

    Hyper-parameters are sampled from `trial`, the classifier is fitted on each
    training split of the notebook-level `X_train` / `y_train` and evaluated on
    the matching validation split.

    Args:
        trial: the Optuna trial used to sample hyper-parameters and to store
            the secondary metrics as user attributes.

    Returns:
        The mean F1 score across folds (the value the study optimises).
        Mean accuracy, precision, recall and ROC-AUC are attached to the trial
        under the keys "accuracy", "precision", "recall" and "roc_auc".
    """
    # Search space (sampled in this order).
    sampled = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 400, step=25),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0, step=0.1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0, step=0.1),
    }
    # Settings that are identical for every trial.
    fixed = {
        "eval_metric": "logloss",
        "enable_categorical": True,
        "use_label_encoder": False,
        "tree_method": "hist",  # fast on CPU
    }
    model = XGBClassifier(**sampled, **fixed)

    # Per-fold scores, keyed by the name later used on the trial.
    fold_scores = {"accuracy": [], "precision": [], "recall": [], "f1": [], "roc_auc": []}

    for fit_rows, holdout_rows in cv.split(X_train, y_train):
        model.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])

        X_holdout = X_train.iloc[holdout_rows]
        y_holdout = y_train.iloc[holdout_rows]

        hard_labels = model.predict(X_holdout)
        positive_proba = model.predict_proba(X_holdout)[:, 1]

        fold_scores["accuracy"].append(accuracy_score(y_holdout, hard_labels))
        fold_scores["precision"].append(precision_score(y_holdout, hard_labels, zero_division=0))
        fold_scores["recall"].append(recall_score(y_holdout, hard_labels, zero_division=0))
        fold_scores["f1"].append(f1_score(y_holdout, hard_labels, zero_division=0))
        fold_scores["roc_auc"].append(roc_auc_score(y_holdout, positive_proba))

    # Keep the secondary metrics on the trial for later inspection.
    for metric_name in ("accuracy", "precision", "recall", "roc_auc"):
        trial.set_user_attr(metric_name, np.mean(fold_scores[metric_name]))

    # Primary optimisation metric: the study is expected to maximise this value.
    return np.mean(fold_scores["f1"])

# ---------- 3. Run the study ----------
# Seed the sampler so the same sequence of hyper-parameter suggestions is
# produced on every Restart & Run All (42 mirrors the random_state of `cv`).
# TPESampler is the sampler create_study uses by default, so only the seeding
# changes here.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, timeout=None)  # raise n_trials if needed

# ---------- 4. Results ----------
# Best trial = highest mean CV F1; the other metrics were stored on the trial
# as user attributes by `objective`.
best = study.best_trial
print("🎯 Best F1        :", best.value)
print("✅ Best params    :", best.params)
print("• Accuracy  (CV) :", best.user_attrs['accuracy'])
print("• Precision (CV) :", best.user_attrs['precision'])
print("• Recall    (CV) :", best.user_attrs['recall'])
print("• ROC-AUC   (CV) :", best.user_attrs['roc_auc'])

Fitting 5 folds for each of 108 candidates, totalling 540 fits


KeyboardInterrupt: 

In [None]:
# NOTE(review): `sample_sub` is not used in the cells visible here — confirm
# whether a later cell needs it before removing this read.
sample_sub = pd.read_csv('testSubmissionFile.csv')

# ---------- 5. Train the best model on the full training set ----------
# `best.params` only holds the values Optuna sampled, so every setting that was
# fixed inside `objective` has to be repeated here. tree_method='hist' was
# missing, which meant the final model was not the configuration that had been
# cross-validated.
# NOTE(review): `use_label_encoder` is kept for parity with `objective`; it is
# reported as deprecated by recent XGBoost releases — confirm against the
# installed version.
best_model = XGBClassifier(**best.params,
                           eval_metric='logloss',
                           enable_categorical=True,
                           use_label_encoder=False,
                           tree_method='hist')
best_model.fit(X_train, y_train)

# ---------- 6. Final predictions ----------
# Hard class labels for every test row.
y_test_pred = best_model.predict(X_test)


# === 6. Export (optional, for submission) ===
# One row per test auction: its identifier and the predicted label.
submission = pd.DataFrame({
    "auctionId": test_ids,
    "isSold": y_test_pred
})

submission.to_csv("submission.csv", index=False)