# Etape 4 - Feature Importance & Optimisation du Seuil

Objectif: Analyser l'importance des features avec SHAP et optimiser le seuil de d√©cision bas√© sur le co√ªt m√©tier.

## 1. Import et configuration

In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from pathlib import Path
import warnings
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.inspection import permutation_importance
import lightgbm as lgb
import shap
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')
sns.set_style('whitegrid')

# Paths
ROOT_DIR = Path('.').resolve().parent
DATA_DIR = ROOT_DIR / 'outputs'
MODELS_DIR = ROOT_DIR / 'models'
MODELS_DIR.mkdir(exist_ok=True)

print(f"Root: {ROOT_DIR}")
print(f"Data: {DATA_DIR}")
print(f"Models: {MODELS_DIR}")

Root: C:\Users\daniel.guedj_arondor\Downloads\perso\openclassrooms\MLOps
Data: C:\Users\daniel.guedj_arondor\Downloads\perso\openclassrooms\MLOps\outputs
Models: C:\Users\daniel.guedj_arondor\Downloads\perso\openclassrooms\MLOps\models


## 2. Charger les donn√©es

In [2]:
print("Chargement des donnees...")
train = pd.read_csv(DATA_DIR / 'train_processed.csv')
test = pd.read_csv(DATA_DIR / 'test_processed.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Chargement des donnees...
Train shape: (307511, 148)
Test shape: (48744, 121)


## 3. Pr√©parer les donn√©es

In [3]:
X_train = train.drop('TARGET', axis=1)
y_train = train['TARGET']

if 'TARGET' in test.columns:
    X_test = test.drop('TARGET', axis=1)
    y_test = test['TARGET']
else:
    X_test = test.copy()
    y_test = None

if 'SK_ID_CURR' in X_test.columns:
    test_ids = X_test['SK_ID_CURR'].copy()
    X_train = X_train.drop('SK_ID_CURR', axis=1, errors='ignore')
    X_test = X_test.drop('SK_ID_CURR', axis=1)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

X_train shape: (307511, 146)
y_train shape: (307511,)


## 4. Configuration MLflow

In [4]:
mlflow.set_tracking_uri('http://localhost:5000')
experiment_name = 'credit_scoring_v1'

try:
    experiment_id = mlflow.create_experiment(experiment_name)
except:
    experiment = mlflow.get_experiment_by_name(experiment_name)
    experiment_id = experiment.experiment_id

mlflow.set_experiment(experiment_name)
print(f"Experiment: {experiment_name}")

Experiment: credit_scoring_v1


## 5. Charger le meilleur mod√®le optimis√©

In [5]:
experiment = mlflow.get_experiment_by_name(experiment_name)
all_runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

lgb_optimized = all_runs[all_runs['tags.mlflow.runName'] == 'lightgbm_gridsearch']

if len(lgb_optimized) > 0:
    best_run_id = lgb_optimized.iloc[0]['run_id']
    print(f"Best LightGBM run: {best_run_id}")
    
    model_uri = f"runs:/{best_run_id}/model"
    best_model = mlflow.sklearn.load_model(model_uri)
    print(f"Mod√®le charg√©: {type(best_model)}")
else:
    print("Aucun mod√®le LightGBM optimis√© trouv√©")

Best LightGBM run: 0e8fbdf54a8d45fc9083e0ec9579d281


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Mod√®le charg√©: <class 'lightgbm.sklearn.LGBMClassifier'>


## 6. Fonction de m√©triques avec seuil personnalis√©

In [6]:
def calculate_metrics_with_threshold(y_true, y_proba, threshold, cost_fn=10, cost_fp=1):
    y_pred = (y_proba >= threshold).astype(int)
    
    metrics = {}
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['precision'] = precision_score(y_true, y_pred, zero_division=0)
    metrics['recall'] = recall_score(y_true, y_pred, zero_division=0)
    metrics['f1'] = f1_score(y_true, y_pred, zero_division=0)
    metrics['auc'] = roc_auc_score(y_true, y_proba) if len(np.unique(y_true)) > 1 else 0
    
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics['business_cost'] = fn * cost_fn + fp * cost_fp
    metrics['tn'] = int(tn)
    metrics['fp'] = int(fp)
    metrics['fn'] = int(fn)
    metrics['tp'] = int(tp)
    
    return metrics

print("Fonction avec seuil personnalis√©: OK")

Fonction avec seuil personnalis√©: OK


## 7. Optimisation du seuil de d√©cision

In [7]:
print("\n=== OPTIMISATION DU SEUIL ===")

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_proba_cv = cross_val_predict(best_model, X_train, y_train, cv=skf, method='predict_proba')[:, 1]

thresholds = np.arange(0.1, 0.9, 0.05)
results = []

for threshold in thresholds:
    metrics = calculate_metrics_with_threshold(y_train, y_proba_cv, threshold)
    results.append({
        'threshold': threshold,
        'accuracy': metrics['accuracy'],
        'precision': metrics['precision'],
        'recall': metrics['recall'],
        'f1': metrics['f1'],
        'business_cost': metrics['business_cost']
    })

results_df = pd.DataFrame(results)

best_threshold = results_df.loc[results_df['business_cost'].idxmin(), 'threshold']
min_cost = results_df['business_cost'].min()

print(f"\nSeuil optimal (co√ªt minimal): {best_threshold:.2f}")
print(f"Co√ªt m√©tier minimal: {min_cost:.0f}‚Ç¨")
print(f"\nTop 5 thresholds par co√ªt:")
print(results_df.nsmallest(5, 'business_cost').to_string(index=False))


=== OPTIMISATION DU SEUIL ===

Seuil optimal (co√ªt minimal): 0.50
Co√ªt m√©tier minimal: 157859‚Ç¨

Top 5 thresholds par co√ªt:
 threshold  accuracy  precision   recall       f1  business_cost
      0.50  0.722989   0.178459 0.674723 0.282262         157859
      0.55  0.768763   0.197200 0.607090 0.297699         158894
      0.45  0.670743   0.161881 0.736959 0.265453         160020
      0.60  0.809834   0.220355 0.534099 0.311991         162572
      0.40  0.610967   0.146556 0.791782 0.247332         166153


## 8. Feature Importance - Permutation

In [8]:
print("\n=== FEATURE IMPORTANCE - PERMUTATION ===")

best_model.fit(X_train, y_train)

print("Calcul de l'importance par permutation...")
perm_importance = permutation_importance(
    best_model, X_train, y_train, 
    n_repeats=10, random_state=42, n_jobs=-1
)

perm_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': perm_importance.importances_mean,
    'std': perm_importance.importances_std
}).sort_values('importance', ascending=False)

print(f"\nTop 15 features par importance de permutation:")
print(perm_df.head(15).to_string(index=False))


=== FEATURE IMPORTANCE - PERMUTATION ===
Calcul de l'importance par permutation...

Top 15 features par importance de permutation:
               feature  importance      std
             pos_count    0.019698 0.000233
     installments_mean    0.012891 0.000160
            DAYS_BIRTH    0.011678 0.000146
pos_future_installment    0.011289 0.000219
          payment_mean    0.009213 0.000228
      installments_sum    0.008542 0.000146
       AMT_GOODS_PRICE    0.006712 0.000308
           CODE_GENDER    0.006560 0.000234
    installments_count    0.006519 0.000138
    previous_app_count    0.006100 0.000221
          EXT_SOURCE_1    0.005417 0.000374
          EXT_SOURCE_2    0.004530 0.000226
 credit_card_limit_sum    0.002761 0.000104
         DAYS_EMPLOYED    0.002031 0.000252
          EXT_SOURCE_3    0.002018 0.000450


## 9. Feature Importance - SHAP

In [9]:
print("\n=== FEATURE IMPORTANCE - SHAP ===")

print("Calcul des valeurs SHAP (cela peut prendre quelques minutes)...")

sample_size = min(5000, len(X_train))
sample_indices = np.random.choice(len(X_train), sample_size, replace=False)
X_sample = X_train.iloc[sample_indices]

explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_sample)

print(f"Valeurs SHAP calcul√©es sur {sample_size} √©chantillons")

if isinstance(shap_values, list):
    shap_vals = shap_values[1]
else:
    shap_vals = shap_values

feature_importance_shap = np.abs(shap_vals).mean(axis=0)
shap_df = pd.DataFrame({
    'feature': X_train.columns,
    'shap_importance': feature_importance_shap
}).sort_values('shap_importance', ascending=False)

print(f"\nTop 15 features par SHAP importance:")
print(shap_df.head(15).to_string(index=False))


=== FEATURE IMPORTANCE - SHAP ===
Calcul des valeurs SHAP (cela peut prendre quelques minutes)...
Valeurs SHAP calcul√©es sur 5000 √©chantillons

Top 15 features par SHAP importance:
               feature  shap_importance
          EXT_SOURCE_2         0.344688
          EXT_SOURCE_3         0.306874
          EXT_SOURCE_1         0.161831
       AMT_GOODS_PRICE         0.132442
           CODE_GENDER         0.125872
            AMT_CREDIT         0.118006
         DAYS_EMPLOYED         0.103186
             pos_count         0.103061
   NAME_EDUCATION_TYPE         0.100994
pos_future_installment         0.094159
           AMT_ANNUITY         0.090659
            DAYS_BIRTH         0.089482
          payment_mean         0.085007
      bureau_debt_mean         0.079684
           payment_sum         0.079625


## 10. Sauvegarder le mod√®le final au Model Registry

In [10]:
print('\n=== MODEL REGISTRY ===')

with mlflow.start_run(run_name='final_model_with_threshold'):
    mlflow.sklearn.log_model(best_model, 'model')
    
    mlflow.log_param('optimal_threshold', best_threshold)
    mlflow.log_metric('min_business_cost', min_cost)
    
    perm_df.to_csv(MODELS_DIR / 'feature_importance_permutation.csv', index=False)
    mlflow.log_artifact(str(MODELS_DIR / 'feature_importance_permutation.csv'))
    
    mlflow.set_tag('model_type', 'LightGBM')
    mlflow.set_tag('threshold_optimized', 'True')
    mlflow.set_tag('stage', 'production')
    
    print(f'Mod√®le final sauvegard√© avec seuil optimal: {best_threshold:.2f}')


=== MODEL REGISTRY ===




Mod√®le final sauvegard√© avec seuil optimal: 0.50
üèÉ View run final_model_with_threshold at: http://localhost:5000/#/experiments/1/runs/e3d6e8a07ba74c59bca07cf3e6c54bf0
üß™ View experiment at: http://localhost:5000/#/experiments/1


## 11. R√©sum√© final

In [11]:
print('\n=== R√âSUM√â FINAL ===')
print(f'‚úÖ Meilleur mod√®le: LightGBM')
print(f'‚úÖ Seuil optimal: {best_threshold:.2f}')
print(f'‚úÖ Co√ªt m√©tier minimal: {min_cost:.0f}‚Ç¨')
print(f'‚úÖ Top 5 features (SHAP):')
print(shap_df.head(5).to_string(index=False))
print(f'\n‚úÖ Tous les runs logg√©s dans MLflow (experiment: {experiment_name})')
print(f'‚úÖ Mod√®le pr√™t pour deployment')


=== R√âSUM√â FINAL ===
‚úÖ Meilleur mod√®le: LightGBM
‚úÖ Seuil optimal: 0.50
‚úÖ Co√ªt m√©tier minimal: 157859‚Ç¨
‚úÖ Top 5 features (SHAP):
        feature  shap_importance
   EXT_SOURCE_2         0.344688
   EXT_SOURCE_3         0.306874
   EXT_SOURCE_1         0.161831
AMT_GOODS_PRICE         0.132442
    CODE_GENDER         0.125872

‚úÖ Tous les runs logg√©s dans MLflow (experiment: credit_scoring_v1)
‚úÖ Mod√®le pr√™t pour deployment


## Prochaines √©tapes

- Batch scoring sur dataset de test
- API REST pour predictions en temps r√©el
- Monitoring et retraining