# 05 - Hyperparameter Tuning (classification)

Objectif : optimiser les modèles de classification (priorité rappel de la classe "fail") via GridSearchCV et RandomizedSearchCV, comparer aux baselines et sauvegarder le meilleur modèle.

## 1. Charger et préparer les données
- Chargement `data/processed/train.csv` et `test.csv`
- Drop des colonnes d'index sauvegardées
- Définition des tâches (language, math, exam) avec features catégorielles
- Préprocessing : OneHotEncoder pour toutes les features cat (éviter l'ordinalité)

In [1]:
# Imports & data loading
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, recall_score, f1_score, accuracy_score, roc_auc_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.base import clone
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from scipy.stats import randint, uniform
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

sns.set_style("whitegrid")
RANDOM_STATE = 42
CV_SPLITS = 5

# Paths
train_path = Path('..') / 'data' / 'processed' / 'train.csv'
test_path = Path('..') / 'data' / 'processed' / 'test.csv'

for p in [train_path, test_path]:
    assert p.exists(), f"Missing file: {p}"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Drop stray index columns
cols_to_drop = [c for c in train_df.columns if c.startswith('Unnamed') or c == '']
train_df = train_df.drop(columns=cols_to_drop)
test_df = test_df.drop(columns=cols_to_drop)

base_cols = [
    "gender",
    "race/ethnicity",
    "parental level of education",
    "lunch",
    "test preparation course",
]

tasks = {
    "language": {"target": "language passed", "features": base_cols},
    "math": {"target": "math passed", "features": base_cols},
    "exam": {"target": "exam passed", "features": base_cols},
}

# Build splits per task (no split train/val here because CV handles it)
data_splits = {}
for name, spec in tasks.items():
    X_train = train_df[spec["features"]]
    y_train = train_df[spec["target"]]
    X_test = test_df[spec["features"]]
    y_test = test_df[spec["target"]]
    data_splits[name] = {"X_train": X_train, "y_train": y_train, "X_test": X_test, "y_test": y_test}

categorical_cols = base_cols
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
], remainder="drop")

recall_fail_scorer = make_scorer(recall_score, pos_label=0)
scoring = {
    "recall_fail": recall_fail_scorer,
    "f1": "f1",
    "accuracy": "accuracy",
    "roc_auc": "roc_auc",
}

cv = StratifiedKFold(n_splits=CV_SPLITS, shuffle=True, random_state=RANDOM_STATE)


## 2. Définir le modèle de base
Baseline : pipelines simples (LogReg, RF, GBoost, XGB, LGBM, Linear SVM calibré) avec pondération de classe. Mesure principale : recall_fail (classe 0).

In [2]:
# Baselines (option: skip exam if pretrained models are available)
USE_PRETRAINED_EXAM = True


def make_models_for_task(y_train):
    n_fail = max((y_train == 0).sum(), 1)
    n_pass = max((y_train == 1).sum(), 1)
    fail_weight = n_pass / n_fail
    scale_pos_weight = n_fail / n_pass

    return {
        "log_reg": Pipeline([
            ("prep", preprocessor),
            ("clf", LogisticRegression(max_iter=1000, n_jobs=-1, class_weight="balanced", solver="lbfgs")),
        ]),
        "rf": Pipeline([
            ("prep", preprocessor),
            ("clf", RandomForestClassifier(n_estimators=400, random_state=RANDOM_STATE, n_jobs=-1, class_weight="balanced")),
        ]),
        "gboost": Pipeline([
            ("prep", preprocessor),
            ("clf", GradientBoostingClassifier(random_state=RANDOM_STATE)),
        ]),
        "xgb": Pipeline([
            ("prep", preprocessor),
            ("clf", XGBClassifier(
                n_estimators=400,
                max_depth=4,
                learning_rate=0.05,
                subsample=0.8,
                colsample_bytree=0.8,
                eval_metric="logloss",
                random_state=RANDOM_STATE,
                n_jobs=-1,
                scale_pos_weight=scale_pos_weight,
            )),
        ]),
        "lgbm": Pipeline([
            ("prep", preprocessor),
            ("clf", LGBMClassifier(
                n_estimators=400,
                max_depth=-1,
                num_leaves=31,
                learning_rate=0.05,
                subsample=0.9,
                colsample_bytree=0.9,
                random_state=RANDOM_STATE,
                class_weight={0: fail_weight, 1: 1.0},
            )),
        ]),
        "linear_svm": Pipeline([
            ("prep", preprocessor),
            ("clf", CalibratedClassifierCV(
                estimator=LinearSVC(class_weight="balanced"),
                cv=3,
                n_jobs=-1,
            )),
        ]),
    }


def cv_eval(model, X, y):
    cv_res = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False,
    )
    return {k.replace("test_", ""): np.mean(v) for k, v in cv_res.items() if k.startswith("test_")}

# Baseline CV per task (skip exam if using pretrained)
baseline_rows = []
baseline_results = {}
models_by_task = {}
for task, split in data_splits.items():
    if USE_PRETRAINED_EXAM and task == "exam":
        continue
    models = make_models_for_task(split["y_train"])
    models_by_task[task] = models
    baseline_results[task] = {}
    for name, model in models.items():
        res = cv_eval(model, split["X_train"], split["y_train"])
        baseline_results[task][name] = res
        baseline_rows.append({"task": task, "model": name, **res})

baseline_df = pd.DataFrame(baseline_rows).sort_values(["task", "recall_fail"], ascending=[True, False])
baseline_df.head()

Unnamed: 0,task,model,recall_fail,f1,accuracy,roc_auc
0,language,log_reg,0.634662,0.77999,0.7,0.75534
3,language,xgb,0.601772,0.759311,0.67375,0.713811
4,language,lgbm,0.592691,0.740367,0.6525,0.693787
1,language,rf,0.516611,0.756828,0.6625,0.671325
2,language,gboost,0.307752,0.835977,0.74125,0.720681


## 3. GridSearchCV (espace restreint, explicite)
- Modèle ciblé : XGBClassifier
- Justification : profondeur modérée, lr faible, subsample/colsample pour limiter l’overfit, scale_pos_weight dérivé du ratio classes.
- Param_grid (exemple) :
  - max_depth: [3, 4, 5]
  - learning_rate: [0.02, 0.05, 0.1]
  - n_estimators: [200, 400, 600]
  - subsample: [0.7, 0.9]
  - colsample_bytree: [0.7, 0.9]
  - scale_pos_weight: [ratio, ratio*0.7, ratio*1.3] (ratio = n_fail/n_pass)

## 3.1 Utiliser les modèles exam déjà entraînés (04b)
Si disponibles dans `models/exam_*.joblib`, on les charge pour éviter de réentraîner la baseline exam.

In [3]:
# Charger les modèles exam déjà entraînés (si présents dans models/exam_*.joblib)
exam_split = data_splits["exam"]
pretrained_exam_rows = []
models_dir = Path("..") / "models" / "classification"
if USE_PRETRAINED_EXAM and models_dir.exists():
    for path in models_dir.glob("exam_*.joblib"):
        model_name = path.stem.replace("exam_", "")
        model = joblib.load(path)
        preds = model.predict(exam_split["X_test"])
        if hasattr(model, "predict_proba"):
            proba = model.predict_proba(exam_split["X_test"])[:, 1]
            auc = roc_auc_score(exam_split["y_test"], proba)
        elif hasattr(model, "decision_function"):
            proba = model.decision_function(exam_split["X_test"])
            auc = roc_auc_score(exam_split["y_test"], proba)
        else:
            proba = None
            auc = np.nan
        pretrained_exam_rows.append(
            {
                "task": "exam",
                "model": model_name,
                "accuracy": accuracy_score(exam_split["y_test"], preds),
                "f1": f1_score(exam_split["y_test"], preds),
                "recall_fail": recall_score(exam_split["y_test"], preds, pos_label=0),
                "roc_auc": auc,
            }
        )

pretrained_exam_df = pd.DataFrame(pretrained_exam_rows).sort_values("recall_fail", ascending=False)
pretrained_exam_df.head()

[WinError 2] Le fichier spécifié est introuvable
  File "c:\Users\nicol\anaconda3\envs\Data\Lib\site-packages\joblib\externals\loky\backend\context.py", line 247, in _count_physical_cores
    cpu_count_physical = _count_physical_cores_win32()
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nicol\anaconda3\envs\Data\Lib\site-packages\joblib\externals\loky\backend\context.py", line 299, in _count_physical_cores_win32
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\nicol\anaconda3\envs\Data\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nicol\anaconda3\envs\Data\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\nicol\anaconda3\envs\Data\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       

Unnamed: 0,task,model,accuracy,f1,recall_fail,roc_auc
1,exam,log_reg,0.595,0.672065,0.654545,0.682821
2,exam,xgb,0.615,0.698039,0.618182,0.646583
0,exam,lgbm,0.59,0.674603,0.6,0.629655


In [4]:
# Grid search XGB (task exam)
exam_split = data_splits["exam"]
X_exam, y_exam = exam_split["X_train"], exam_split["y_train"]
n_fail = max((y_exam == 0).sum(), 1)
n_pass = max((y_exam == 1).sum(), 1)
fail_pass_ratio = n_fail / n_pass

xgb_base = Pipeline([
    ("prep", preprocessor),
    (
        "clf",
        XGBClassifier(
            random_state=RANDOM_STATE,
            eval_metric="logloss",
            n_jobs=-1,
            use_label_encoder=False,
            scale_pos_weight=fail_pass_ratio,
        ),
    ),
])

xgb_param_grid = {
    "clf__max_depth": [3, 4, 5],
    "clf__learning_rate": [0.02, 0.05, 0.1],
    "clf__n_estimators": [200, 400, 600],
    "clf__subsample": [0.7, 0.9],
    "clf__colsample_bytree": [0.7, 0.9],
    "clf__scale_pos_weight": [fail_pass_ratio * f for f in [0.7, 1.0, 1.3]],
}

grid_xgb = GridSearchCV(
    xgb_base,
    xgb_param_grid,
    scoring=scoring,
    refit="recall_fail",
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

grid_xgb.fit(X_exam, y_exam)

def extract_best_scores(search):
    idx = search.best_index_
    res = search.cv_results_
    return {
        "recall_fail": res["mean_test_recall_fail"][idx],
        "f1": res["mean_test_f1"][idx],
        "accuracy": res["mean_test_accuracy"][idx],
        "roc_auc": res["mean_test_roc_auc"][idx],
    }

xgb_grid_scores = extract_best_scores(grid_xgb)
pd.DataFrame([
    {"model": "xgb_grid", **xgb_grid_scores, **{k.replace("clf__", "param_"): v for k, v in grid_xgb.best_params_.items()}},
])

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Unnamed: 0,model,recall_fail,f1,accuracy,roc_auc,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_scale_pos_weight,param_subsample
0,xgb_grid,0.807879,0.616588,0.57125,0.710984,0.9,0.02,3,200,0.272222,0.9


## 4. RandomizedSearchCV (espace plus large)
- Modèle ciblé : LGBMClassifier
- Justification : balayage plus large via distributions continues pour lr/subsample/colsample et discrètes pour num_leaves/n_estimators.
- n_iter modéré pour rester raisonnable en temps.

In [5]:
# Randomized search LGBM (task exam)
fail_weight = n_pass / n_fail

lgbm_base = Pipeline([
    ("prep", preprocessor),
    (
        "clf",
        LGBMClassifier(
            random_state=RANDOM_STATE,
            class_weight={0: fail_weight, 1: 1.0},
            n_jobs=-1,
        ),
    ),
])

lgbm_param_dist = {
    "clf__num_leaves": randint(15, 64),
    "clf__max_depth": [-1, 4, 6, 8],
    "clf__learning_rate": uniform(0.02, 0.15),
    "clf__n_estimators": randint(200, 601),
    "clf__subsample": uniform(0.6, 0.4),
    "clf__colsample_bytree": uniform(0.6, 0.4),
}

rand_lgbm = RandomizedSearchCV(
    lgbm_base,
    lgbm_param_dist,
    n_iter=25,
    scoring=scoring,
    refit="recall_fail",
    cv=cv,
    n_jobs=-1,
    random_state=RANDOM_STATE,
    verbose=1,
)

rand_lgbm.fit(X_exam, y_exam)

lgbm_rand_scores = extract_best_scores(rand_lgbm)
pd.DataFrame([
    {"model": "lgbm_random", **lgbm_rand_scores, **{k.replace("clf__", "param_"): v for k, v in rand_lgbm.best_params_.items()}},
])

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[LightGBM] [Info] Number of positive: 576, number of negative: 224
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


Unnamed: 0,model,recall_fail,f1,accuracy,roc_auc,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_num_leaves,param_subsample
0,lgbm_random,0.598485,0.729212,0.64625,0.688556,0.920879,0.031183,6,208,38,0.757953


## 5. Comparaison baselines vs modèles tunés (CV)
- Agrégation des scores CV (recall_fail prioritaire).
- Sélection du meilleur modèle tuned sur la tâche exam.

In [6]:
def summarize_search(name, search):
    scores = extract_best_scores(search)
    return {"task": "exam", "model": name, **scores}

exam_baseline = (
    pretrained_exam_df.copy()
    if USE_PRETRAINED_EXAM and not pretrained_exam_df.empty
    else baseline_df[baseline_df["task"] == "exam"].copy()
)

# Si on a sauté l'exam baseline et qu'aucun modèle pré-entraîné n'est présent, prévenir
if exam_baseline.empty:
    print("Aucun baseline exam disponible : vérifiez models/exam_*.joblib ou désactivez USE_PRETRAINED_EXAM.")


tuned_rows = [
    summarize_search("xgb_grid", grid_xgb),
    summarize_search("lgbm_random", rand_lgbm),
]
tuned_df = pd.DataFrame(tuned_rows)

comparison_df = pd.concat(
    [exam_baseline.assign(type="baseline"), tuned_df.assign(type="tuned")],
    ignore_index=True,
)
comparison_df.sort_values("recall_fail", ascending=False).head(10)

Unnamed: 0,task,model,accuracy,f1,recall_fail,roc_auc,type
3,exam,xgb_grid,0.57125,0.616588,0.807879,0.710984,tuned
0,exam,log_reg,0.595,0.672065,0.654545,0.682821,baseline
1,exam,xgb,0.615,0.698039,0.618182,0.646583,baseline
2,exam,lgbm,0.59,0.674603,0.6,0.629655,baseline
4,exam,lgbm_random,0.64625,0.729212,0.598485,0.688556,tuned


## 6. Évaluation sur test + sauvegarde du meilleur modèle
- Sélection du meilleur tuned (rappel_fail CV).
- Refit sur tout le train exam, évaluation sur test, sauvegarde `models/exam_tuned.joblib`.

In [7]:
search_map = {
    "xgb_grid": grid_xgb,
    "lgbm_random": rand_lgbm,
}

best_row = (
    comparison_df[comparison_df["type"] == "tuned"]
    .sort_values("recall_fail", ascending=False)
    .iloc[0]
)
best_name = best_row["model"]
best_search = search_map[best_name]

best_estimator = best_search.best_estimator_
best_estimator.fit(X_exam, y_exam)

X_test_exam, y_test_exam = exam_split["X_test"], exam_split["y_test"]
test_pred = best_estimator.predict(X_test_exam)
if hasattr(best_estimator, "predict_proba"):
    test_proba = best_estimator.predict_proba(X_test_exam)[:, 1]
    test_auc = roc_auc_score(y_test_exam, test_proba)
else:
    test_proba = None
    test_auc = np.nan

test_metrics = {
    "recall_fail": recall_score(y_test_exam, test_pred, pos_label=0),
    "f1": f1_score(y_test_exam, test_pred),
    "accuracy": accuracy_score(y_test_exam, test_pred),
    "roc_auc": test_auc,
}

report = classification_report(y_test_exam, test_pred, target_names=["fail", "pass"], output_dict=True)

models_dir = Path("..") / "models"
models_dir.mkdir(parents=True, exist_ok=True)
model_path = models_dir / "exam_tuned.joblib"
joblib.dump(best_estimator, model_path)

print(f"Best tuned model: {best_name}")
pd.DataFrame([test_metrics])
pd.DataFrame(report).T
print(f"Model saved to {model_path}")

Best tuned model: xgb_grid
Model saved to ..\models\exam_tuned.joblib


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
