# Etape 4 - Feature Importance & Optimisation du Seuil

Objectif: Analyser l'importance des features avec SHAP et optimiser le seuil de d√©cision bas√© sur le co√ªt m√©tier.

## 1. Import et configuration

In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from pathlib import Path
import warnings
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.inspection import permutation_importance
import lightgbm as lgb
import shap
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')
sns.set_style('whitegrid')

# Paths
ROOT_DIR = Path('.').resolve().parent
DATA_DIR = ROOT_DIR / 'outputs'
MODELS_DIR = ROOT_DIR / 'models'
MODELS_DIR.mkdir(exist_ok=True)

print(f"Root: {ROOT_DIR}")
print(f"Data: {DATA_DIR}")
print(f"Models: {MODELS_DIR}")

Root: C:\Users\daniel.guedj_arondor\Downloads\perso\openclassrooms\MLOps
Data: C:\Users\daniel.guedj_arondor\Downloads\perso\openclassrooms\MLOps\outputs
Models: C:\Users\daniel.guedj_arondor\Downloads\perso\openclassrooms\MLOps\models


## 2. Charger les donn√©es

In [2]:
print("Chargement des donnees...")

train = pd.read_parquet(DATA_DIR / "train_processed.parquet")
test = pd.read_parquet(DATA_DIR / "test_processed.parquet")

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Chargement des donnees...
Train shape: (307511, 158)
Test shape: (48744, 131)


## 3. Pr√©parer les donn√©es

In [3]:
X_train = train.drop('TARGET', axis=1)
y_train = train['TARGET']

if 'TARGET' in test.columns:
    X_test = test.drop('TARGET', axis=1)
    y_test = test['TARGET']
else:
    X_test = test.copy()
    y_test = None

if 'SK_ID_CURR' in X_test.columns:
    test_ids = X_test['SK_ID_CURR'].copy()
    X_train = X_train.drop('SK_ID_CURR', axis=1, errors='ignore')
    X_test = X_test.drop('SK_ID_CURR', axis=1)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

X_train shape: (307511, 156)
y_train shape: (307511,)


## 4. Configuration MLflow

In [4]:
mlflow.set_tracking_uri('http://localhost:5000')
experiment_name = 'credit_scoring_v1'

try:
    experiment_id = mlflow.create_experiment(experiment_name)
except:
    experiment = mlflow.get_experiment_by_name(experiment_name)
    experiment_id = experiment.experiment_id

mlflow.set_experiment(experiment_name)
print(f"Experiment: {experiment_name}")

Experiment: credit_scoring_v1


## 5. Charger le meilleur mod√®le optimis√©

In [5]:
experiment = mlflow.get_experiment_by_name(experiment_name)
all_runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

# Filtrer les runs LightGBM (baseline ou gridsearch)
lgb_runs = all_runs[all_runs['tags.mlflow.runName'].str.contains('lightgbm', case=False, na=False)]

if len(lgb_runs) > 0:
    # S√©lectionner le run avec le co√ªt m√©tier minimal
    best_idx = lgb_runs['metrics.business_cost'].astype(float).idxmin()
    best_run = lgb_runs.loc[best_idx]
    best_run_id = best_run['run_id']
    print(f"Best LightGBM run: {best_run_id} (business_cost={best_run['metrics.business_cost']})")
    
    model_uri = f"runs:/{best_run_id}/model"
    best_model = mlflow.sklearn.load_model(model_uri)
    print(f"Mod√®le charg√©: {type(best_model)}")
else:
    print("Aucun mod√®le LightGBM trouv√©")

Best LightGBM run: 7e387338cecd4f77b136c2e34aad7c61 (business_cost=157922.0)


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Mod√®le charg√©: <class 'sklearn.pipeline.Pipeline'>


## 6. Fonction de m√©triques avec seuil personnalis√©

## 7. Optimisation du seuil de d√©cision

In [6]:
def calculate_metrics_with_threshold(y_true, y_proba, threshold, cost_fn=10, cost_fp=1):
    y_pred = (y_proba >= threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "auc": roc_auc_score(y_true, y_proba),
        "business_cost": fn * cost_fn + fp * cost_fp,
        "tn": int(tn), "fp": int(fp), "fn": int(fn), "tp": int(tp)
    }

print("\n=== OPTIMISATION DU SEUIL ===")

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_proba_cv = cross_val_predict(best_model, X_train, y_train, cv=skf, method='predict_proba')[:, 1]

# Grille fine de seuils
thresholds = np.arange(0.01, 1.0, 0.01)
results = []

for threshold in thresholds:
    metrics = calculate_metrics_with_threshold(y_train, y_proba_cv, threshold)
    results.append({
        'threshold': threshold,
        'accuracy': metrics['accuracy'],
        'precision': metrics['precision'],
        'recall': metrics['recall'],
        'f1': metrics['f1'],
        'business_cost': metrics['business_cost'],
        'tn': metrics['tn'],
        'fp': metrics['fp'],
        'fn': metrics['fn'],
        'tp': metrics['tp']
    })

results_df = pd.DataFrame(results)

best_threshold = results_df.loc[results_df['business_cost'].idxmin(), 'threshold']
min_cost = results_df['business_cost'].min()

print(f"\nSeuil optimal (co√ªt minimal): {best_threshold:.2f}")
print(f"Co√ªt m√©tier minimal: {min_cost:.0f}‚Ç¨")
print(f"\nTop 5 thresholds par co√ªt:")
print(results_df.nsmallest(5, 'business_cost').to_string(index=False))

# Afficher la matrice de confusion pour le seuil optimal et le seuil baseline (0.5)
for t, label in zip([best_threshold, 0.5], ["optimal", "baseline"]):
    m = calculate_metrics_with_threshold(y_train, y_proba_cv, t)
    print(f"\nConfusion matrix ({label}, threshold={t:.2f}):")
    print(f"   TN={m['tn']}, FP={m['fp']}, FN={m['fn']}, TP={m['tp']}")


=== OPTIMISATION DU SEUIL ===

Seuil optimal (co√ªt minimal): 0.51
Co√ªt m√©tier minimal: 157764‚Ç¨

Top 5 thresholds par co√ªt:
 threshold  accuracy  precision   recall       f1  business_cost     tn    fp   fn    tp
      0.51  0.726985   0.179956 0.669648 0.283679         157764 206932 75754 8201 16624
      0.50  0.715935   0.176007 0.684149 0.279985         157922 203174 79512 7841 16984
      0.54  0.757576   0.192411 0.626465 0.294400         158005 217411 65275 9273 15552
      0.52  0.737092   0.183563 0.654542 0.286718         158031 210415 72271 8576 16249
      0.53  0.747333   0.187752 0.640322 0.290364         158059 213917 68769 8929 15896

Confusion matrix (optimal, threshold=0.51):
   TN=206932, FP=75754, FN=8201, TP=16624

Confusion matrix (baseline, threshold=0.50):
   TN=203174, FP=79512, FN=7841, TP=16984


## 8. Feature Importance - Permutation

In [7]:
print("\n=== FEATURE IMPORTANCE - PERMUTATION ===")

# S'assurer que le mod√®le est bien fit (cross_val_predict ne fit pas sur tout le train)
best_model.fit(X_train, y_train)

print("Calcul de l'importance par permutation...")
perm_importance = permutation_importance(
    best_model, X_train, y_train, 
    n_repeats=10, random_state=42, n_jobs=-1
)

perm_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': perm_importance.importances_mean,
    'std': perm_importance.importances_std
}).sort_values('importance', ascending=False)

print(f"\nTop 15 features par importance de permutation:")
print(perm_df.head(15).to_string(index=False))



=== FEATURE IMPORTANCE - PERMUTATION ===
Calcul de l'importance par permutation...

Top 15 features par importance de permutation:
                 feature  importance      std
               pos_count    0.010714 0.000194
         AMT_GOODS_PRICE    0.009259 0.000255
             AMT_ANNUITY    0.007746 0.000217
             CODE_GENDER    0.006308 0.000159
       installments_mean    0.005080 0.000122
  pos_future_installment    0.004771 0.000196
            EXT_SOURCE_2    0.004582 0.000444
            payment_mean    0.004124 0.000207
              DAYS_BIRTH    0.003397 0.000177
                     AGE    0.002890 0.000084
      NAME_FAMILY_STATUS    0.002672 0.000205
credit_card_balance_mean    0.002313 0.000257
      installments_count    0.002157 0.000040
            FLAG_OWN_CAR    0.001492 0.000195
            EXT_SOURCE_3    0.001266 0.000325


## 9. Feature Importance - SHAP

In [8]:
# (Optionnel) FEATURE IMPORTANCE - SHAP
try:
    import shap
    print("\n=== FEATURE IMPORTANCE - SHAP ===")
    print("Calcul des valeurs SHAP (cela peut prendre quelques minutes)...")
    sample_size = min(5000, len(X_train))
    sample_indices = np.random.choice(len(X_train), sample_size, replace=False)
    X_sample = X_train.iloc[sample_indices]
    if hasattr(best_model, 'named_steps') and 'preprocess' in best_model.named_steps and 'model' in best_model.named_steps:
        pre = best_model.named_steps['preprocess']
        lgb_model = best_model.named_steps['model']
        X_trans = pre.transform(X_sample)
        explainer = shap.TreeExplainer(lgb_model)
        shap_values = explainer.shap_values(X_trans)
    else:
        explainer = shap.TreeExplainer(best_model)
        shap_values = explainer.shap_values(X_sample)
    print(f"Valeurs SHAP calcul√©es sur {sample_size} √©chantillons")
    if isinstance(shap_values, list):
        shap_vals = shap_values[1]
    else:
        shap_vals = shap_values
    feature_importance_shap = np.abs(shap_vals).mean(axis=0)
    shap_df = pd.DataFrame({
        'feature_idx': np.arange(len(feature_importance_shap)),
        'shap_importance': feature_importance_shap
    }).sort_values('shap_importance', ascending=False)
    print(f"\nTop 15 features par SHAP importance:")
    print(shap_df.head(15).to_string(index=False))
except ImportError:
    print("SHAP n'est pas install√©. Importance SHAP non calcul√©e.")


=== FEATURE IMPORTANCE - SHAP ===
Calcul des valeurs SHAP (cela peut prendre quelques minutes)...
Valeurs SHAP calcul√©es sur 5000 √©chantillons

Top 15 features par SHAP importance:
 feature_idx  shap_importance
          28         0.333897
          29         0.313919
          27         0.147785
         108         0.120538
           4         0.095093
         166         0.077407
           3         0.077078
         143         0.072647
         119         0.066924
         123         0.061543
         136         0.057713
         134         0.056507
         132         0.055499
         171         0.052047
         120         0.046757


## 10. Sauvegarder le mod√®le final au Model Registry

In [9]:
print('\n=== FINAL RUN (MODEL + THRESHOLD + ARTIFACTS) ===')

perm_path = MODELS_DIR / 'feature_importance_permutation.csv'
perm_df.to_csv(perm_path, index=False)

with mlflow.start_run(run_name="lightgbm_final_threshold_optimized"):
    mlflow.sklearn.log_model(best_model, "model")
    mlflow.log_metric("min_business_cost", float(min_cost))
    mlflow.log_metric("optimal_threshold", float(best_threshold))
    mlflow.log_metric("cv_auc_oof", float(roc_auc_score(y_train, y_proba_cv)))
    mlflow.log_artifact(str(perm_path))

    mlflow.set_tag("candidate", "final")
    mlflow.set_tag("threshold_optimized", "true")
    mlflow.set_tag("feature_importance", "permutation")


=== FINAL RUN (MODEL + THRESHOLD + ARTIFACTS) ===




üèÉ View run lightgbm_final_threshold_optimized at: http://localhost:5000/#/experiments/1/runs/5968d7d15ff540b087d16b49a949980f
üß™ View experiment at: http://localhost:5000/#/experiments/1


## 11. R√©sum√© final

In [10]:
print('\n' + '='*70)
print('R√âSUM√â: √âTAPE 4 - FEATURE IMPORTANCE & OPTIMISATION DU SEUIL')
print('='*70)

print('\n‚úÖ ANALYSES IMPL√âMENT√âES:')
print(f'\n1. OPTIMISATION DU SEUIL DE D√âCISION')
print(f'   ‚Ä¢ Seuils test√©s: de 0.01 √† 0.99 par pas de 0.01')
print(f'   ‚Ä¢ Optimisation bas√©e sur le co√ªt m√©tier')
print(f'   ‚Ä¢ Co√ªt m√©tier: FN*10 + FP*1')
print(f'   ‚Ä¢ Seuil optimal: {best_threshold:.2f}')
print(f'   ‚Ä¢ Co√ªt minimal: {min_cost:.0f}‚Ç¨')

print(f'\n2. FEATURE IMPORTANCE - PERMUTATION')
print(f'   ‚Ä¢ M√©thode: Diminution de l\'importance suite √† permutation')
print(f'   ‚Ä¢ 10 r√©p√©titions pour stabilit√©')
print(f'   ‚Ä¢ Top 15 features extraites')
print(f'   ‚Ä¢ Fichier: models/feature_importance_permutation.csv')

print(f'\n3. FEATURE IMPORTANCE - SHAP (optionnel)')
print(f'   ‚Ä¢ M√©thode: TreeExplainer (optimis√©e pour LightGBM)')
print(f'   ‚Ä¢ Calcul sur {min(5000, len(X_train)):,} √©chantillons')
print(f'   ‚Ä¢ Interpr√©tabilit√© du mod√®le am√©lior√©e')
print(f'   ‚Ä¢ Top 15 features par importance SHAP')

print(f'\n4. MODEL REGISTRY')
print(f'   ‚Ä¢ Meilleur mod√®le LightGBM sauvegard√©')
print(f'   ‚Ä¢ Seuil optimal: {best_threshold:.2f}')
print(f'   ‚Ä¢ Co√ªt minimal log√©: {min_cost:.0f}‚Ç¨')
print(f'   ‚Ä¢ Tags: candidate="final", threshold_optimized="true"')

print('\nüìä M√âTRIQUES FINALES:')
try:
    final_metrics = calculate_metrics_with_threshold(y_train, y_proba_cv, best_threshold)
    print(f'   ‚Ä¢ Accuracy: {final_metrics["accuracy"]:.4f}')
    print(f'   ‚Ä¢ Precision: {final_metrics["precision"]:.4f}')
    print(f'   ‚Ä¢ Recall: {final_metrics["recall"]:.4f}')
    print(f'   ‚Ä¢ F1-Score: {final_metrics["f1"]:.4f}')
    print(f'   ‚Ä¢ Matrice confusion: TP={final_metrics["tp"]}, TN={final_metrics["tn"]}, FP={final_metrics["fp"]}, FN={final_metrics["fn"]}')
except:
    print('   ‚Ä¢ Calcul des m√©triques finales...')

print('\nüéØ TOP 5 FEATURES PAR IMPORTANCE (PERMUTATION):')
print(perm_df.head(5).to_string(index=False))
try:
    print('\nüéØ TOP 5 FEATURES PAR IMPORTANCE (SHAP):')
    print(shap_df.head(5).to_string(index=False))
except:
    pass

print('\n' + '='*70)
print('‚úÖ √âTAPE 4 COMPL√àT√âE - MOD√àLE FINAL PR√äT POUR DEPLOYMENT')
print('='*70)

print('\nüìÅ FICHIERS G√âN√âR√âS:')
print(f'   ‚Ä¢ models/feature_importance_permutation.csv')
print(f'   ‚Ä¢ MLFlow Experiment: {experiment_name}')

print('\nüöÄ NEXT STEPS - DEPLOYMENT:')
print(f'   1. Batch scoring sur test set')
print(f'   2. API REST pour pr√©dictions en temps r√©el')
print(f'   3. Monitoring et retraining p√©riodique')
print(f'   4. Model versioning dans MLFlow Model Registry')


R√âSUM√â: √âTAPE 4 - FEATURE IMPORTANCE & OPTIMISATION DU SEUIL

‚úÖ ANALYSES IMPL√âMENT√âES:

1. OPTIMISATION DU SEUIL DE D√âCISION
   ‚Ä¢ Seuils test√©s: de 0.01 √† 0.99 par pas de 0.01
   ‚Ä¢ Optimisation bas√©e sur le co√ªt m√©tier
   ‚Ä¢ Co√ªt m√©tier: FN*10 + FP*1
   ‚Ä¢ Seuil optimal: 0.51
   ‚Ä¢ Co√ªt minimal: 157764‚Ç¨

2. FEATURE IMPORTANCE - PERMUTATION
   ‚Ä¢ M√©thode: Diminution de l'importance suite √† permutation
   ‚Ä¢ 10 r√©p√©titions pour stabilit√©
   ‚Ä¢ Top 15 features extraites
   ‚Ä¢ Fichier: models/feature_importance_permutation.csv

3. FEATURE IMPORTANCE - SHAP (optionnel)
   ‚Ä¢ M√©thode: TreeExplainer (optimis√©e pour LightGBM)
   ‚Ä¢ Calcul sur 5,000 √©chantillons
   ‚Ä¢ Interpr√©tabilit√© du mod√®le am√©lior√©e
   ‚Ä¢ Top 15 features par importance SHAP

4. MODEL REGISTRY
   ‚Ä¢ Meilleur mod√®le LightGBM sauvegard√©
   ‚Ä¢ Seuil optimal: 0.51
   ‚Ä¢ Co√ªt minimal log√©: 157764‚Ç¨
   ‚Ä¢ Tags: candidate="final", threshold_optimized="true"

üìä M√âTRIQUES F

## Prochaines √©tapes

- Batch scoring sur dataset de test
- API REST pour predictions en temps r√©el
- Monitoring et retraining