# 03 - Modélisation Credit Risk Scoring
 
**Phase:** 4 - Modélisation ML

---

## Objectifs

1. Charger le dataset de features (`features_v1.csv`)
2. Encoder les variables catégorielles
3. Split train/validation/test (70/15/15)
4. Entraîner un baseline XGBoost
5. Tuner les hyperparamètres avec Optuna
6. Évaluer le modèle final
7. Analyser avec SHAP

**Objectif performance:** AUC-ROC > 0.75

---
## 1. Setup et Imports

In [None]:
# Core
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# ML
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    roc_auc_score, roc_curve, 
    precision_recall_curve, average_precision_score,
    confusion_matrix, classification_report,
    f1_score, precision_score, recall_score
)
import xgboost as xgb

# Tuning
import optuna
from optuna.samplers import TPESampler

# Explainability
import shap

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Utils
import joblib
from pathlib import Path
import json

# Notebook-wide settings
RANDOM_STATE = 42  # single seed reused by every seeded call in this notebook

pd.set_option('display.max_columns', 50)
plt.style.use('seaborn-v0_8-whitegrid')

print("Imports OK")

---
## 2. Chargement des données

In [None]:
# Load the engineered feature table
DATA_PATH = Path('../data/features/features_v1.csv')

print(f"Chargement de {DATA_PATH}...")
df = pd.read_csv(DATA_PATH)

# Deep memory footprint (object columns measured by content), converted to MB
memory_mb = df.memory_usage(deep=True).sum() / 1024**2

print(f"Shape: {df.shape}")
print(f"Mémoire: {memory_mb:.1f} MB")
df.head()

In [None]:
# Inspect the class balance of the target column
target_values = df['target']
n_negative = (target_values == 0).sum()
n_positive = (target_values == 1).sum()

print("Distribution TARGET:")
print(target_values.value_counts())
print(f"\nRatio défaut: {target_values.mean()*100:.2f}%")
print(f"Ratio classe (neg/pos): {n_negative / n_positive:.1f}")

---
## 3. Préparation des données

### 3.1 Identifier les types de colonnes

In [None]:
# Names of the identifier and label columns
id_col = 'sk_id_curr'
target_col = 'target'
non_feature_cols = (id_col, target_col)

# Categorical columns: everything stored with dtype object
cat_cols = df.select_dtypes(include=['object']).columns.tolist()

# Numeric columns, minus the identifier and the label
num_cols = [
    name
    for name in df.select_dtypes(include=[np.number]).columns.tolist()
    if name not in non_feature_cols
]

print(f"ID: {id_col}")
print(f"Target: {target_col}")
print(f"Colonnes catégorielles: {len(cat_cols)}")
print(f"Colonnes numériques: {len(num_cols)}")

# Show at most the first ten categorical names
if cat_cols:
    if len(cat_cols) > 10:
        print(f"\nCatégorielles: {cat_cols[:10]}...")
    else:
        print(f"\nCatégorielles: {cat_cols}")

### 3.2 Encodage des variables catégorielles

In [None]:
# Number of distinct values per categorical column, largest first
if cat_cols:
    cardinality = {col: df[col].nunique() for col in cat_cols}
    cardinality_df = (
        pd.Series(cardinality, name='unique_values')
        .to_frame()
        .sort_values('unique_values', ascending=False)
    )
    print("Cardinalité des variables catégorielles:")
    print(cardinality_df)

In [None]:
# Integer-encode every categorical column, keeping one fitted encoder per column
label_encoders = {}

for col in cat_cols:
    encoder = LabelEncoder()
    # Missing values become their own 'MISSING' category before encoding
    as_text = df[col].fillna('MISSING').astype(str)
    df[col] = encoder.fit_transform(as_text)
    label_encoders[col] = encoder

print(f"Encodées: {len(cat_cols)} colonnes")

# Sanity check: list any column still stored with dtype object
remaining_object = df.select_dtypes(include=['object']).columns.tolist()
print(f"Colonnes object restantes: {remaining_object}")

### 3.3 Préparation X et y

In [None]:
# Build the feature matrix and the label vector
excluded = {id_col, target_col}
feature_cols = [name for name in df.columns if name not in excluded]

X = df[feature_cols]
y = df[target_col]

n_features = len(feature_cols)
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Features: {n_features}")

### 3.4 Split Train / Validation / Test

In [None]:
# 70% train / 15% validation / 15% test.
# Both splits are stratified on the label so the class ratio is kept.

# Step 1: hold out 30% of the rows
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=RANDOM_STATE
)

# Step 2: cut the hold-out in half -> validation and test (15% of the total each)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=RANDOM_STATE
)

n_total = len(X)
n_train, n_val, n_test = X_train.shape[0], X_val.shape[0], X_test.shape[0]

print("Split effectué:")
print(f"  Train: {n_train:,} ({n_train/n_total*100:.1f}%)")
print(f"  Valid: {n_val:,} ({n_val/n_total*100:.1f}%)")
print(f"  Test:  {n_test:,} ({n_test/n_total*100:.1f}%)")

# Default rate of each subset
train_rate, val_rate, test_rate = y_train.mean(), y_val.mean(), y_test.mean()

print(f"\nRatio défaut par set:")
print(f"  Train: {train_rate*100:.2f}%")
print(f"  Valid: {val_rate*100:.2f}%")
print(f"  Test:  {test_rate*100:.2f}%")

---
## 4. Baseline Model

In [None]:
# Class-imbalance weight: negatives per positive in the training labels
n_train_negative = (y_train == 0).sum()
n_train_positive = (y_train == 1).sum()
scale_pos_weight = n_train_negative / n_train_positive
print(f"scale_pos_weight: {scale_pos_weight:.2f}")

In [None]:
# Baseline XGBoost classifier with a fixed, untuned configuration
baseline_params = dict(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    eval_metric='auc',
)
baseline_model = xgb.XGBClassifier(**baseline_params)

print("Entraînement du baseline...")
# The validation split is passed as eval_set; per-round logging is silenced
baseline_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

print("Baseline entraîné!")

In [None]:
# Score the baseline on the validation split
y_pred_proba_baseline = baseline_model.predict_proba(X_val)[:, 1]  # positive-class column
auc_baseline = roc_auc_score(y_val, y_pred_proba_baseline)
gini_baseline = 2 * auc_baseline - 1

banner = "=" * 40
print(banner)
print("BASELINE RESULTS (Validation Set)")
print(banner)
print(f"AUC-ROC: {auc_baseline:.4f}")
print(f"Gini:    {gini_baseline:.4f}")

---
## 5. Hyperparameter Tuning avec Optuna

In [None]:
def objective(trial):
    """Optuna objective: fit one XGBoost candidate and score it on validation.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC of the candidate on ``(X_val, y_val)``.
    """
    # Sampled hyperparameters (the order of the suggest calls is kept as-is)
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
    }
    # Settings shared by every candidate
    fixed_params = {
        'scale_pos_weight': scale_pos_weight,
        'random_state': RANDOM_STATE,
        'n_jobs': -1,
        'eval_metric': 'auc',
    }

    candidate = xgb.XGBClassifier(**search_space, **fixed_params)
    candidate.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    # Positive-class probabilities on the validation split
    val_scores = candidate.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

In [None]:
# Set up the Optuna search.
# Tune N_TRIALS to the time available (the author estimates 100 trials at ~30-60 min).

N_TRIALS = 100  # adjust as needed

print(f"Démarrage Optuna avec {N_TRIALS} trials...")
print("Cela peut prendre 30-60 minutes.\n")

# Seeded TPE sampler; the study maximises the value returned by the objective
sampler = TPESampler(seed=RANDOM_STATE)
study = optuna.create_study(direction='maximize', sampler=sampler)

# Progress callback passed to study.optimize
def print_callback(study, trial):
    """Print progress once every 10 trials.

    Args:
        study: object exposing ``best_value`` (best objective value so far).
        trial: object exposing ``number`` and ``value`` for the finished trial.

    A trial whose ``value`` is ``None`` is reported without a score:
    formatting ``None`` with ``:.4f`` would raise ``TypeError`` from inside
    the callback.
    """
    if trial.number % 10 != 0:
        return

    if trial.value is None:
        print(f"Trial {trial.number}: pas de valeur (trial non terminé)")
        return

    print(f"Trial {trial.number}: AUC = {trial.value:.4f} (Best: {study.best_value:.4f})")

# Run the search; print_callback reports progress every 10 trials
study.optimize(objective, n_trials=N_TRIALS, callbacks=[print_callback], show_progress_bar=True)

print("\n" + "=" * 40)
print("OPTUNA TERMINÉ")
print("=" * 40)
print(f"Meilleur AUC: {study.best_value:.4f}")

# Difference in AUC points (x100) between the best trial and the baseline.
# The "+" format flag prints the real sign, so a regression shows as
# "-0.12%" instead of "+-0.12%".
auc_gain = (study.best_value - auc_baseline) * 100
print(f"Amélioration vs baseline: {auc_gain:+.2f}%")

In [None]:
# Display the hyperparameters of the best trial
print("Meilleurs paramètres:")
for name, value in study.best_params.items():
    print(f"  {name}: {value}")

---
## 6. Modèle Final

In [None]:
# Refit with the best hyperparameters plus the fixed settings used during tuning
best_params = {
    **study.best_params,
    'scale_pos_weight': scale_pos_weight,
    'random_state': RANDOM_STATE,
    'n_jobs': -1,
    'eval_metric': 'auc',
}

final_model = xgb.XGBClassifier(**best_params)

print("Entraînement du modèle final...")
final_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

print("Modèle final entraîné!")

---
## 7. Évaluation Finale (Test Set)

In [None]:
# Evaluate the final model on the test split.
AUC_TARGET = 0.75          # performance objective: AUC-ROC strictly above this value
DECISION_THRESHOLD = 0.5   # probability cut-off used to derive class predictions

# Positive-class probabilities and the thresholded class predictions
y_pred_proba_test = final_model.predict_proba(X_test)[:, 1]
y_pred_test = (y_pred_proba_test >= DECISION_THRESHOLD).astype(int)

# Ranking metrics use the probabilities; the others use the thresholded classes
auc_test = roc_auc_score(y_test, y_pred_proba_test)
gini_test = 2 * auc_test - 1
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)

print("=" * 50)
print("RÉSULTATS FINAUX (Test Set - Données jamais vues)")
print("=" * 50)
print(f"AUC-ROC:   {auc_test:.4f}")
print(f"Gini:      {gini_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall:    {recall_test:.4f}")
print(f"F1-Score:  {f1_test:.4f}")
print("=" * 50)

# Strict comparison, matching both the stated objective and the printed message
if auc_test > AUC_TARGET:
    print(f"\n>>> OBJECTIF ATTEINT: AUC > {AUC_TARGET} <<<")
else:
    print(f"\n>>> Objectif non atteint. Manque: {AUC_TARGET - auc_test:.4f} <<<")

In [None]:
# Confusion matrix on the test split, followed by the per-class report
class_names = ['Pas de défaut', 'Défaut']
cm = confusion_matrix(y_test, y_pred_test)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Prédit')
plt.ylabel('Réel')
plt.title('Matrice de Confusion - Test Set')
plt.tight_layout()
plt.show()

print("\nClassification Report:")
print(classification_report(y_test, y_pred_test, target_names=class_names))

In [None]:
# ROC curve on the test split, with the chance diagonal for reference
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_test)
model_label = f'Modèle (AUC = {auc_test:.4f})'

plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, 'b-', linewidth=2, label=model_label)
plt.plot([0, 1], [0, 1], 'r--', linewidth=1, label='Random (AUC = 0.5)')
plt.title('Courbe ROC - Credit Risk Scoring')
plt.xlabel('Taux de Faux Positifs (FPR)')
plt.ylabel('Taux de Vrais Positifs (TPR)')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Precision-recall curve on the test split
precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_pred_proba_test)
ap = average_precision_score(y_test, y_pred_proba_test)

# Horizontal reference line drawn at the mean of the test labels
positive_rate = y_test.mean()

plt.figure(figsize=(10, 6))
plt.plot(recall_curve, precision_curve, 'g-', linewidth=2, label=f'Modèle (AP = {ap:.4f})')
plt.axhline(y=positive_rate, color='r', linestyle='--', label=f'Baseline ({positive_rate:.4f})')
plt.title('Courbe Precision-Recall')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc='upper right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

---
## 8. Explicabilité SHAP

In [None]:
# Compute SHAP values for the final model
print("Calcul des SHAP values...")

# Explain at most 1000 test rows to keep the run time down
n_explained = min(1000, len(X_test))
X_sample = X_test.sample(n=n_explained, random_state=RANDOM_STATE)

explainer = shap.TreeExplainer(final_model)
shap_values = explainer.shap_values(X_sample)

print("SHAP values calculés!")

In [None]:
# Global feature importance (SHAP bar plot), top 20 features
plt.figure(figsize=(12, 10))
# plot_size=None keeps the figure size set above; with the default
# plot_size="auto", summary_plot resizes the current figure itself.
shap.summary_plot(
    shap_values,
    X_sample,
    plot_type='bar',
    show=False,
    max_display=20,
    plot_size=None,
)
plt.title('Top 20 Features - Importance SHAP')
plt.tight_layout()
plt.show()

In [None]:
# SHAP summary plot (per-feature impact on the prediction), top 20 features
plt.figure(figsize=(12, 10))
# plot_size=None keeps the figure size set above; with the default
# plot_size="auto", summary_plot resizes the current figure itself.
shap.summary_plot(
    shap_values,
    X_sample,
    show=False,
    max_display=20,
    plot_size=None,
)
plt.title('Impact des Features sur la Prédiction')
plt.tight_layout()
plt.show()

In [None]:
# Rank features by mean absolute SHAP value over the explained sample
mean_abs_shap = np.abs(shap_values).mean(axis=0)
importance_table = pd.DataFrame({
    'feature': X_sample.columns,
    'importance': mean_abs_shap,
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("Top 10 Features:")
print(feature_importance.head(10).to_string(index=False))

---
## 9. Sauvegarde du Modèle

In [None]:
# Create the output directory if needed; parents=True also creates any
# missing intermediate directory instead of raising FileNotFoundError.
MODELS_PATH = Path('../models')
MODELS_PATH.mkdir(parents=True, exist_ok=True)

# Persist the trained model.
# NOTE(review): this is a pickle-based dump; reloading presumably requires
# compatible library versions, and such files should only be loaded from a
# trusted source - confirm the deployment constraints.
model_path = MODELS_PATH / 'xgboost_credit_risk_v1.pkl'
joblib.dump(final_model, model_path)
print(f"Modèle sauvegardé: {model_path}")

# Persist the feature names, in the column order used for training
features_path = MODELS_PATH / 'feature_names.json'
with open(features_path, 'w', encoding='utf-8') as f:
    json.dump(feature_cols, f)
print(f"Features sauvegardées: {features_path}")

# Persist the fitted label encoders (one per categorical column)
encoders_path = MODELS_PATH / 'label_encoders.pkl'
joblib.dump(label_encoders, encoders_path)
print(f"Encoders sauvegardés: {encoders_path}")

# Persist the test-set metrics together with the final hyperparameters
metrics = {
    'auc_roc': float(auc_test),
    'gini': float(gini_test),
    'precision': float(precision_test),
    'recall': float(recall_test),
    'f1_score': float(f1_test),
    'best_params': best_params
}
metrics_path = MODELS_PATH / 'metrics.json'
with open(metrics_path, 'w', encoding='utf-8') as f:
    # default=str is a best-effort fallback: any value json cannot encode
    # natively is written as its string representation.
    json.dump(metrics, f, indent=2, default=str)
print(f"Métriques sauvegardées: {metrics_path}")

---
## 10. Résumé

In [None]:
# Final recap of the modelling phase
print("=" * 60)
print("RÉSUMÉ - PHASE 4 MODÉLISATION")
print("=" * 60)
print(f"\nDataset: {df.shape[0]:,} lignes × {len(feature_cols)} features")
print(f"\nSplit:")
print(f"  - Train: {len(X_train):,}")
print(f"  - Valid: {len(X_val):,}")
print(f"  - Test:  {len(X_test):,}")
print(f"\nRésultats:")
print(f"  - Baseline AUC: {auc_baseline:.4f}")
print(f"  - Final AUC:    {auc_test:.4f}")
# NOTE(review): auc_baseline was measured on the validation split and
# auc_test on the test split, so this difference mixes two datasets.
# The "+" format flag prints the real sign ("-0.12%" rather than "+-0.12%").
auc_delta = (auc_test - auc_baseline) * 100
print(f"  - Amélioration: {auc_delta:+.2f}%")
print(f"\nTop 5 Features:")
for row in feature_importance.head(5).itertuples(index=False):
    print(f"  {row.feature}: {row.importance:.4f}")
print(f"\nFichiers sauvegardés:")
print(f"  - {model_path}")
print(f"  - {features_path}")
print(f"  - {encoders_path}")
print(f"  - {metrics_path}")
print("\n" + "=" * 60)
print("PHASE 4 TERMINÉE")
print("=" * 60)