# Etape 3 - Optimisation des Hyperparam√®tres

Objectif: Optimiser les hyperparam√®tres des 2 meilleurs mod√®les (LightGBM et XGBoost) en utilisant GridSearchCV et Optuna avec MLflow tracking.

## 1. Import et configuration

In [2]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from pathlib import Path
import warnings
import time
from sklearn.model_selection import StratifiedKFold, cross_validate, cross_val_predict, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import xgboost as xgb
import lightgbm as lgb

warnings.filterwarnings('ignore')

# Paths
ROOT_DIR = Path('.').resolve().parent
DATA_DIR = ROOT_DIR / 'outputs'
MODELS_DIR = ROOT_DIR / 'models'
MODELS_DIR.mkdir(exist_ok=True)

print(f"Root: {ROOT_DIR}")
print(f"Data: {DATA_DIR}")
print(f"Models: {MODELS_DIR}")

Root: C:\Users\daniel.guedj_arondor\Downloads\perso\openclassrooms\MLOps
Data: C:\Users\daniel.guedj_arondor\Downloads\perso\openclassrooms\MLOps\outputs
Models: C:\Users\daniel.guedj_arondor\Downloads\perso\openclassrooms\MLOps\models


## 2. Charger les donn√©es

In [3]:
print("Chargement des donnees...")
train = pd.read_csv(DATA_DIR / 'train_processed.csv')
test = pd.read_csv(DATA_DIR / 'test_processed.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Verifier TARGET
print(f"\nDistribution TARGET:")
print(train['TARGET'].value_counts())

Chargement des donnees...
Train shape: (307511, 148)
Test shape: (48744, 121)

Distribution TARGET:
TARGET
0    282686
1     24825
Name: count, dtype: int64


## 3. Pr√©parer les donn√©es

In [4]:
# Separer X et y
X_train = train.drop('TARGET', axis=1)
y_train = train['TARGET']

# Test
if 'TARGET' in test.columns:
    X_test = test.drop('TARGET', axis=1)
    y_test = test['TARGET']
else:
    X_test = test.copy()
    y_test = None

# Garder les ids
if 'SK_ID_CURR' in X_test.columns:
    test_ids = X_test['SK_ID_CURR'].copy()
    X_train = X_train.drop('SK_ID_CURR', axis=1, errors='ignore')
    X_test = X_test.drop('SK_ID_CURR', axis=1)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (307511, 146)
y_train shape: (307511,)
X_test shape: (48744, 120)


## 4. Configuration MLflow

In [5]:
# Configurer MLflow
mlflow.set_tracking_uri('http://localhost:5000')
experiment_name = 'credit_scoring_v1'

# Creer experiment si elle n'existe pas
try:
    experiment_id = mlflow.create_experiment(experiment_name)
except:
    experiment = mlflow.get_experiment_by_name(experiment_name)
    experiment_id = experiment.experiment_id

mlflow.set_experiment(experiment_name)
print(f"Experiment: {experiment_name}")
print(f"Experiment ID: {experiment_id}")

Experiment: credit_scoring_v1
Experiment ID: 1


## 5. Fonction de m√©triques

In [6]:
def calculate_metrics(y_true, y_pred, y_pred_proba=None, cost_fn=10, cost_fp=1):
    """
    Calcule les metriques de classification.
    """
    metrics = {}
    
    # Metriques standards
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['precision'] = precision_score(y_true, y_pred, zero_division=0)
    metrics['recall'] = recall_score(y_true, y_pred, zero_division=0)
    metrics['f1'] = f1_score(y_true, y_pred, zero_division=0)
    
    # AUC
    if y_pred_proba is not None:
        metrics['auc'] = roc_auc_score(y_true, y_pred_proba[:, 1])
    
    # Co√ªt m√©tier
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics['business_cost'] = fn * cost_fn + fp * cost_fp
    metrics['tn'] = int(tn)
    metrics['fp'] = int(fp)
    metrics['fn'] = int(fn)
    metrics['tp'] = int(tp)
    
    return metrics

print("Fonction de metriques: OK")

Fonction de metriques: OK


## 6. GridSearchCV - LightGBM

In [7]:
print("\n=== GRIDSEARCHCV - LIGHTGBM ===")

# Hyperparametres a optimiser
param_grid_lgb = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [31, 50, 100]
}

scale_pos = (y_train == 0).sum() / (y_train == 1).sum()

base_model_lgb = lgb.LGBMClassifier(
    scale_pos_weight=scale_pos,
    random_state=42,
    verbosity=-1,
    n_jobs=-1
)

grid_search_lgb = GridSearchCV(
    base_model_lgb,
    param_grid_lgb,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

print(f"Total combinations: {len(param_grid_lgb['n_estimators']) * len(param_grid_lgb['max_depth']) * len(param_grid_lgb['learning_rate']) * len(param_grid_lgb['num_leaves'])}")
print("Starting GridSearchCV...")

with mlflow.start_run(run_name="lightgbm_gridsearch"):
    start_time = time.time()
    grid_search_lgb.fit(X_train, y_train)
    training_time = time.time() - start_time
    
    best_params_lgb = grid_search_lgb.best_params_
    best_model_lgb = grid_search_lgb.best_estimator_
    
    # Predictions avec meilleur modele
    y_pred_proba = cross_val_predict(best_model_lgb, X_train, y_train, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), method='predict_proba')
    y_pred = cross_val_predict(best_model_lgb, X_train, y_train, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42))
    
    metrics = calculate_metrics(y_train, y_pred, y_pred_proba)
    
    # Log
    mlflow.log_params(best_params_lgb)
    mlflow.log_metrics(metrics)
    mlflow.log_metric('gridsearch_best_cv_score', grid_search_lgb.best_score_)
    mlflow.log_metric('training_time', training_time)
    mlflow.sklearn.log_model(best_model_lgb, 'model')
    
    print(f"\nBest Params: {best_params_lgb}")
    print(f"Best CV Score (AUC): {grid_search_lgb.best_score_:.4f}")
    print(f"AUC: {metrics['auc']:.4f}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Business Cost: {metrics['business_cost']:.0f}")
    print(f"Training time: {training_time:.2f}s")


=== GRIDSEARCHCV - LIGHTGBM ===
Total combinations: 81
Starting GridSearchCV...
Fitting 5 folds for each of 81 candidates, totalling 405 fits





Best Params: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150, 'num_leaves': 31}
Best CV Score (AUC): 0.7699
AUC: 0.7699
Accuracy: 0.7230
Business Cost: 157859
Training time: 681.39s
üèÉ View run lightgbm_gridsearch at: http://localhost:5000/#/experiments/1/runs/0e8fbdf54a8d45fc9083e0ec9579d281
üß™ View experiment at: http://localhost:5000/#/experiments/1


## 7. GridSearchCV - XGBoost

In [8]:
print("\n=== GRIDSEARCHCV - XGBOOST ===")

# Hyperparametres a optimiser
param_grid_xgb = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.9, 1.0]
}

base_model_xgb = xgb.XGBClassifier(
    scale_pos_weight=scale_pos,
    random_state=42,
    verbosity=0,
    use_label_encoder=False
)

grid_search_xgb = GridSearchCV(
    base_model_xgb,
    param_grid_xgb,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

print(f"Total combinations: {len(param_grid_xgb['n_estimators']) * len(param_grid_xgb['max_depth']) * len(param_grid_xgb['learning_rate']) * len(param_grid_xgb['subsample'])}")
print("Starting GridSearchCV...")

with mlflow.start_run(run_name="xgboost_gridsearch"):
    start_time = time.time()
    grid_search_xgb.fit(X_train, y_train)
    training_time = time.time() - start_time
    
    best_params_xgb = grid_search_xgb.best_params_
    best_model_xgb = grid_search_xgb.best_estimator_
    
    # Predictions avec meilleur modele
    y_pred_proba = cross_val_predict(best_model_xgb, X_train, y_train, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), method='predict_proba')
    y_pred = cross_val_predict(best_model_xgb, X_train, y_train, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42))
    
    metrics = calculate_metrics(y_train, y_pred, y_pred_proba)
    
    # Log
    mlflow.log_params(best_params_xgb)
    mlflow.log_metrics(metrics)
    mlflow.log_metric('gridsearch_best_cv_score', grid_search_xgb.best_score_)
    mlflow.log_metric('training_time', training_time)
    mlflow.sklearn.log_model(best_model_xgb, 'model')
    
    print(f"\nBest Params: {best_params_xgb}")
    print(f"Best CV Score (AUC): {grid_search_xgb.best_score_:.4f}")
    print(f"AUC: {metrics['auc']:.4f}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Business Cost: {metrics['business_cost']:.0f}")
    print(f"Training time: {training_time:.2f}s")


=== GRIDSEARCHCV - XGBOOST ===
Total combinations: 81
Starting GridSearchCV...
Fitting 5 folds for each of 81 candidates, totalling 405 fits





Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV Score (AUC): 0.7688
AUC: 0.7686
Accuracy: 0.7241
Business Cost: 158555
Training time: 441.03s
üèÉ View run xgboost_gridsearch at: http://localhost:5000/#/experiments/1/runs/6b3af238e0fe4a3699bcc8d9f5f7d923
üß™ View experiment at: http://localhost:5000/#/experiments/1


## 8. Comparaison avec baselines

In [9]:
print("\n=== COMPARAISON: BASELINE vs OPTIMIZED ===")

experiment = mlflow.get_experiment_by_name(experiment_name)
all_runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

# Filtrer baseline et optimized
baseline_runs = all_runs[all_runs['tags.mlflow.runName'].str.contains('baseline', case=False, na=False)]
optimized_runs = all_runs[all_runs['tags.mlflow.runName'].str.contains('gridsearch', case=False, na=False)]

print(f"\nBaseline runs: {len(baseline_runs)}")
print(f"Optimized runs: {len(optimized_runs)}")

# Baseline
if len(baseline_runs) > 0:
    print("\n--- BASELINE MODELS ---")
    baseline_comparison = baseline_runs[['tags.mlflow.runName', 'metrics.auc', 'metrics.accuracy', 'metrics.business_cost', 'metrics.cv_auc_mean']].copy()
    baseline_comparison.columns = ['Model', 'AUC', 'Accuracy', 'Business Cost', 'CV AUC Mean']
    baseline_comparison = baseline_comparison.sort_values('CV AUC Mean', ascending=False)
    print(baseline_comparison.to_string(index=False))

# Optimized
if len(optimized_runs) > 0:
    print("\n--- OPTIMIZED MODELS ---")
    optimized_comparison = optimized_runs[['tags.mlflow.runName', 'metrics.auc', 'metrics.accuracy', 'metrics.business_cost', 'metrics.gridsearch_best_cv_score']].copy()
    optimized_comparison.columns = ['Model', 'AUC', 'Accuracy', 'Business Cost', 'Best CV Score']
    optimized_comparison = optimized_comparison.sort_values('AUC', ascending=False)
    print(optimized_comparison.to_string(index=False))


=== COMPARAISON: BASELINE vs OPTIMIZED ===

Baseline runs: 4
Optimized runs: 2

--- BASELINE MODELS ---
                       Model      AUC  Accuracy  Business Cost  CV AUC Mean
           lightgbm_baseline 0.765671  0.713236       159859.0     0.765709
            xgboost_baseline 0.765458  0.715265       159658.0     0.765482
logistic_regression_baseline 0.757313  0.695026       163281.0     0.757339
      random_forest_baseline 0.741323  0.738178       169766.0     0.741351

--- OPTIMIZED MODELS ---
              Model      AUC  Accuracy  Business Cost  Best CV Score
lightgbm_gridsearch 0.769903  0.722989       157859.0       0.769922
 xgboost_gridsearch 0.768570  0.724091       158555.0       0.768817


## Prochaines √©tapes

- **√âtape 4**: Feature Importance (SHAP) et Optimisation du seuil de d√©cision
- S√©lection du meilleur mod√®le optimis√©
- Model Registry dans MLflow