In [57]:
import pandas as pd
import numpy as np
import mlflow.sklearn
import mlflow.lightgbm
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import os
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from src.metrics import business_cost, find_optimal_threshold

mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment("Credit_Scoring_Optimization")

<Experiment: artifact_location='file:E:/mlruns/242745646389284014', creation_time=1765371971797, experiment_id='242745646389284014', last_update_time=1765371971797, lifecycle_stage='active', name='Credit_Scoring_Optimization', tags={}>

In [58]:
from src.model_utils import load_data, prepare_data

df = load_data("../data/processed/train_final.csv")
X_train, X_val, y_train, y_val, scaler = prepare_data(df)

1 - REGRESSION LOGISTIQUE

In [59]:
pipeline_lr = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(C=0.1, class_weight='balanced', max_iter=1000, random_state=42))
])

run_name = "Logistic_Regression_Baseline"

with mlflow.start_run(run_name=run_name):

    pipeline_lr.fit(X_train, y_train)

    y_proba_val = pipeline_lr.predict_proba(X_val)[:, 1]

    auc_score = roc_auc_score(y_val, y_proba_val)
    optimal_threshold, min_cost = find_optimal_threshold(y_val, y_proba_val, fn_cost=10, fp_cost=1)

    print(f"AUC : {auc_score:.4f}")
    print(f"Meilleur Seuil : {optimal_threshold:.2f}")
    print(f"Co√ªt M√©tier : {min_cost:.4f}")

    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("class_weight", "balanced")
    mlflow.log_param("imputer", "median")

    mlflow.log_metric("auc", auc_score)
    mlflow.log_metric("business_cost", min_cost)
    mlflow.log_metric("optimal_threshold", optimal_threshold)

    mlflow.sklearn.log_model(pipeline_lr, "model")



AUC : 0.7589
Meilleur Seuil : 0.54
Co√ªt M√©tier : 0.5250


2 - RANDOM FOREST

In [60]:
pipeline_rf = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('model', RandomForestClassifier(
        n_estimators=100,
        max_depth=10,       # Limit√© pour garder le mod√®le l√©ger (Docker)
        class_weight='balanced', # Gestion du d√©s√©quilibre
        n_jobs=-1,          # Utilise tous les coeurs du PC
        random_state=42
    ))
])

run_name = "Random_Forest_V1"

with mlflow.start_run(run_name=run_name):
    pipeline_rf.fit(X_train, y_train)

    y_proba_val = pipeline_rf.predict_proba(X_val)[:, 1]

    auc_score = roc_auc_score(y_val, y_proba_val)
    optimal_threshold, min_cost = find_optimal_threshold(y_val, y_proba_val, fn_cost=10, fp_cost=1)

    print(f"AUC : {auc_score:.4f}")
    print(f"Meilleur Seuil : {optimal_threshold:.2f}")
    print(f"Co√ªt M√©tier : {min_cost:.4f}")

    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_param("max_depth", 10)
    mlflow.log_param("class_weight", "balanced")

    mlflow.log_metric("auc", auc_score)
    mlflow.log_metric("business_cost", min_cost)
    mlflow.log_metric("optimal_threshold", optimal_threshold)

    mlflow.sklearn.log_model(pipeline_rf, "model")



AUC : 0.7460
Meilleur Seuil : 0.46
Co√ªt M√©tier : 0.5467


3 - XGBOOST

In [61]:
import xgboost as xgb
# Calcul du ratio pour g√©rer le d√©s√©quilibre (scale_pos_weight)
# Formule : (Nombre de N√©gatifs) / (Nombre de Positifs)
ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)

model_xgb = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=8,
    min_child_weight=1,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    scale_pos_weight=ratio,  # desequilibre
    n_jobs=-1,
    random_state=42,
    tree_method='hist' #opti pour aller plus vite
)

run_name = "XGBoost_V1"

with mlflow.start_run(run_name=run_name):
    model_xgb.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=100
    )

    y_proba_val = model_xgb.predict_proba(X_val)[:, 1]

    auc_score = roc_auc_score(y_val, y_proba_val)
    optimal_threshold, min_cost = find_optimal_threshold(y_val, y_proba_val, fn_cost=10, fp_cost=1)

    print(f"AUC : {auc_score:.4f}")
    print(f"Meilleur Seuil : {optimal_threshold:.2f}")
    print(f"Co√ªt M√©tier : {min_cost:.4f}")

    mlflow.log_param("model_type", "XGBoost")
    mlflow.log_param("scale_pos_weight", ratio)

    mlflow.log_metric("auc", auc_score)
    mlflow.log_metric("business_cost", min_cost)
    mlflow.log_metric("optimal_threshold", optimal_threshold)

    mlflow.xgboost.log_model(model_xgb, "model")

üöÄ D√©marrage du run : XGBoost_V1
[0]	validation_0-logloss:0.68536
[100]	validation_0-logloss:0.50666
[200]	validation_0-logloss:0.46894
[300]	validation_0-logloss:0.44208
[400]	validation_0-logloss:0.42050
[500]	validation_0-logloss:0.40091
[600]	validation_0-logloss:0.38315
[700]	validation_0-logloss:0.36862
[800]	validation_0-logloss:0.35492
[900]	validation_0-logloss:0.34257
[999]	validation_0-logloss:0.33218




AUC : 0.7670
Meilleur Seuil : 0.25
Co√ªt M√©tier : 0.5164


4 - MLP

In [62]:
from sklearn.neural_network import MLPClassifier


pipeline_mlp = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # Bouche les trous
    ('scaler', StandardScaler()),
    ('model', MLPClassifier(
        hidden_layer_sizes=(100, 50),
        activation='relu',
        solver='adam',
        alpha=0.0001,
        batch_size='auto',
        learning_rate='adaptive',
        max_iter=200,
        early_stopping=True,
        random_state=42,
        verbose=True
    ))
])

run_name = "MLP_NeuralNetwork"

with mlflow.start_run(run_name=run_name):

    pipeline_mlp.fit(X_train, y_train)

    y_proba_val = pipeline_mlp.predict_proba(X_val)[:, 1]

    auc_score = roc_auc_score(y_val, y_proba_val)
    optimal_threshold, min_cost = find_optimal_threshold(y_val, y_proba_val, fn_cost=10, fp_cost=1)

    print(f"AUC : {auc_score:.4f}")
    print(f"Meilleur Seuil : {optimal_threshold:.2f}")
    print(f"Co√ªt M√©tier : {min_cost:.4f}")

    mlflow.log_param("model_type", "MLP")
    mlflow.log_param("architecture", "(100, 50)")

    mlflow.log_metric("auc", auc_score)
    mlflow.log_metric("business_cost", min_cost)
    mlflow.log_metric("optimal_threshold", optimal_threshold)

    mlflow.sklearn.log_model(pipeline_mlp, "model")

Iteration 1, loss = 0.25922944
Validation score: 0.919272
Iteration 2, loss = 0.24628502
Validation score: 0.919353
Iteration 3, loss = 0.24201895
Validation score: 0.918865
Iteration 4, loss = 0.23783653
Validation score: 0.919109
Iteration 5, loss = 0.23423431
Validation score: 0.917930
Iteration 6, loss = 0.22985029
Validation score: 0.916873
Iteration 7, loss = 0.22543965
Validation score: 0.916345
Iteration 8, loss = 0.22098887
Validation score: 0.915694
Iteration 9, loss = 0.21638571
Validation score: 0.915247
Iteration 10, loss = 0.21224151
Validation score: 0.915003
Iteration 11, loss = 0.20815481
Validation score: 0.913134
Iteration 12, loss = 0.20427021
Validation score: 0.909719
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.




AUC : 0.7450
Meilleur Seuil : 0.09
Co√ªt M√©tier : 0.5437


5 - LIGHTGBM

In [63]:
import pickle

params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'is_unbalance': True,
    'random_state': 42,
    'n_jobs': -1
}

run_name = "LGBM_Full_Features"

with mlflow.start_run(run_name=run_name):
    print(f"üöÄ D√©marrage du run : {run_name}")

    model = lgb.LGBMClassifier(**params)

    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        eval_metric='auc',
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=100)
        ]
    )

    y_proba_val = model.predict_proba(X_val)[:, 1]

    auc_score = roc_auc_score(y_val, y_proba_val)
    optimal_threshold, min_cost = find_optimal_threshold(y_val, y_proba_val, fn_cost=10, fp_cost=1)

    print(f"AUC : {auc_score:.4f}")
    print(f"Meilleur Seuil : {optimal_threshold:.2f}")
    print(f"Co√ªt M√©tier Minimum : {min_cost:.4f}")

    mlflow.log_params(params)
    mlflow.log_param("class_weight", "is_unbalance=True")

    mlflow.log_metric("auc", auc_score)
    mlflow.log_metric("optimal_threshold", optimal_threshold)
    mlflow.log_metric("business_cost", min_cost)

    mlflow.sklearn.log_model(model, "model")

with open(os.path.join('../model', "model.pkl"), "wb") as f:
        pickle.dump(model, f)

with open(os.path.join('../model', "threshold.txt"), "w") as f:
        f.write(str(optimal_threshold))



üöÄ D√©marrage du run : LGBM_Full_Features
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.800058	valid_1's auc: 0.775954
[200]	training's auc: 0.823471	valid_1's auc: 0.782843
[300]	training's auc: 0.840879	valid_1's auc: 0.784492
[400]	training's auc: 0.85582	valid_1's auc: 0.785193
[500]	training's auc: 0.868851	valid_1's auc: 0.785647
Early stopping, best iteration is:
[493]	training's auc: 0.868081	valid_1's auc: 0.785766




AUC : 0.7858
Meilleur Seuil : 0.53
Co√ªt M√©tier Minimum : 0.4895


In [64]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import lightgbm as lgb
import mlflow

# On donne une plage de valeurs possibles pour chaque param√®tre
search_space = {
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'max_depth': scope.int(hp.quniform('max_depth', 4, 15, 1)),
    'num_leaves': scope.int(hp.quniform('num_leaves', 20, 150, 1)),
    'subsample': hp.uniform('subsample',    0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'min_child_weight': hp.uniform('min_child_weight', 0, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
}

# 2. Fonction Objectif (Ce que l'algo doit optimiser)
# On initialise le compteur hors de la fonction
trial_counter = 0

def objective(params):
    global trial_counter
    trial_counter += 1

    run_name = f"Optim_Trial_{trial_counter:02d}"

    params['max_depth'] = int(params['max_depth'])
    params['num_leaves'] = int(params['num_leaves'])
    params['objective'] = 'binary'
    params['metric'] = 'auc'
    params['is_unbalance'] = True
    params['n_jobs'] = -1
    params['random_state'] = 42
    params['verbose'] = -1

    with mlflow.start_run(nested=True, run_name=run_name):

        # 1. Entra√Ænement
        model = lgb.LGBMClassifier(**params)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(30, verbose=False)])

        # 2. Score
        y_proba = model.predict_proba(X_val)[:, 1]
        score = roc_auc_score(y_val, y_proba)

        # 3. Logging
        mlflow.log_params(params)
        mlflow.log_metric("auc", score)

        # On peut aussi ajouter le num√©ro d'essai en tag
        mlflow.set_tag("trial_number", trial_counter)

        return {'loss': -score, 'status': STATUS_OK, 'params': params}

trial_counter = 0
trials = Trials()

best_params = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=20,  # 20 essais (Mettez 50 si vous avez du temps, environ 15-20 min)
    trials=trials,
    rstate=np.random.default_rng(42)
)

print("Meilleurs hyperparam√®tres trouv√©s :")
print(best_params)

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]




  5%|‚ñå         | 1/20 [00:16<05:08, 16.24s/trial, best loss: -0.7774354615101351]




 10%|‚ñà         | 2/20 [00:38<05:53, 19.61s/trial, best loss: -0.7790838355310193]




 15%|‚ñà‚ñå        | 3/20 [00:58<05:37, 19.85s/trial, best loss: -0.7790838355310193]




 20%|‚ñà‚ñà        | 4/20 [01:26<06:07, 22.95s/trial, best loss: -0.7790838355310193]




 25%|‚ñà‚ñà‚ñå       | 5/20 [01:47<05:33, 22.25s/trial, best loss: -0.7790838355310193]




 30%|‚ñà‚ñà‚ñà       | 6/20 [02:21<06:08, 26.33s/trial, best loss: -0.7790838355310193]




 35%|‚ñà‚ñà‚ñà‚ñå      | 7/20 [02:50<05:52, 27.13s/trial, best loss: -0.7790838355310193]




 40%|‚ñà‚ñà‚ñà‚ñà      | 8/20 [03:13<05:12, 26.04s/trial, best loss: -0.7840292924574395]




 45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 9/20 [03:40<04:48, 26.21s/trial, best loss: -0.7840292924574395]




 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 10/20 [04:09<04:29, 26.97s/trial, best loss: -0.7840292924574395]




 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 11/20 [04:33<03:54, 26.05s/trial, best loss: -0.7840292924574395]




 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 12/20 [04:54<03:16, 24.60s/trial, best loss: -0.7840292924574395]




 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 13/20 [05:11<02:37, 22.51s/trial, best loss: -0.7840292924574395]




 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 14/20 [05:28<02:04, 20.71s/trial, best loss: -0.7840292924574395]




 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 15/20 [05:45<01:38, 19.62s/trial, best loss: -0.7840292924574395]




 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 16/20 [05:59<01:11, 17.86s/trial, best loss: -0.7840292924574395]




 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 17/20 [06:19<00:55, 18.39s/trial, best loss: -0.7840292924574395]




 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 18/20 [06:33<00:34, 17.32s/trial, best loss: -0.7840292924574395]




 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 19/20 [06:54<00:18, 18.44s/trial, best loss: -0.7840292924574395]




100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [07:12<00:00, 21.61s/trial, best loss: -0.7840292924574395]
Meilleurs hyperparam√®tres trouv√©s :
{'colsample_bytree': np.float64(0.6669925735866169), 'learning_rate': np.float64(0.08867395059676818), 'max_depth': np.float64(12.0), 'min_child_weight': np.float64(1.3981015914424932), 'num_leaves': np.float64(64.0), 'reg_alpha': np.float64(0.7480357729604008), 'reg_lambda': np.float64(0.8300896323724805), 'subsample': np.float64(0.5321434316547746)}


In [66]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

# 1. On pr√©pare le d√©coupage en 5 parts (Stratified = garde le % de d√©fauts constant)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

auc_scores = []
cost_scores = []

# On a besoin de X et y sous forme d'array numpy pour les index
X_cv = X_train  # On travaille sur le jeu d'entra√Ænement global
y_cv = y_train.values if isinstance(y_train, pd.Series) else y_train

# 2. La boucle d'entra√Ænement (5 tours)
for fold_id, (train_idx, val_idx) in enumerate(cv.split(X_cv, y_cv)):

    # Cr√©ation des jeux de donn√©es pour ce tour
    X_tr_fold, y_tr_fold = X_cv[train_idx], y_cv[train_idx]
    X_val_fold, y_val_fold = X_cv[val_idx], y_cv[val_idx]

    # On entra√Æne un nouveau mod√®le √† chaque fois (avec les meilleurs params trouv√©s)
    model_cv = lgb.LGBMClassifier(**final_params)

    model_cv.fit(
        X_tr_fold, y_tr_fold,
        eval_set=[(X_val_fold, y_val_fold)],
        callbacks=[lgb.early_stopping(30, verbose=False)]
    )

    # √âvaluation
    y_proba_fold = model_cv.predict_proba(X_val_fold)[:, 1]

    # Calcul AUC
    auc = roc_auc_score(y_val_fold, y_proba_fold)
    auc_scores.append(auc)

    # Calcul Co√ªt M√©tier (avec le seuil optimis√© qu'on a trouv√© avant)
    y_pred_fold = (y_proba_fold >= optimal_threshold).astype(int)
    cost = business_cost(y_val_fold, y_pred_fold, fn_cost=10, fp_cost=1)
    cost_scores.append(cost)

    print(f"   -> Fold {fold_id+1}/5 : AUC={auc:.4f} | Co√ªt={cost:.4f}")

mean_auc = np.mean(auc_scores)
std_auc = np.std(auc_scores)
mean_cost = np.mean(cost_scores)

print(f"   AUC Moyen  : {mean_auc:.4f} (+/- {std_auc:.4f})")
print(f"   Co√ªt Moyen : {mean_cost:.4f}")

if std_auc < 0.02:
    print("mod√®le stable, faible variance")
else:
    print("mod√®le instable, forte variance entre les folds")

üîÑ D√©marrage de la Validation Crois√©e (5 Folds)...




   -> Fold 1/5 : AUC=0.7742 | Co√ªt=0.5072




   -> Fold 2/5 : AUC=0.7740 | Co√ªt=0.5085




   -> Fold 3/5 : AUC=0.7791 | Co√ªt=0.4994




   -> Fold 4/5 : AUC=0.7799 | Co√ªt=0.5018
   -> Fold 5/5 : AUC=0.7759 | Co√ªt=0.5107
   AUC Moyen  : 0.7766 (+/- 0.0024)
   Co√ªt Moyen : 0.5055
mod√®le stable, faible variance




In [69]:
# 4. Entra√Ænement du Mod√®le ULTIME avec les param√®tres optimis√©s
final_params = best_params.copy()
# On remet les param√®tres fixes qui ne sont pas dans best_params
final_params['objective'] = 'binary'
final_params['metric'] = 'auc'
final_params['is_unbalance'] = True
final_params['n_jobs'] = -1
final_params['random_state'] = 42
# On force les types entiers
final_params['max_depth'] = int(final_params['max_depth'])
final_params['num_leaves'] = int(final_params['num_leaves'])

model_final = lgb.LGBMClassifier(**final_params)

model_final.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(50)])

y_proba_opt = model_final.predict_proba(X_val)[:, 1]

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.784029




In [70]:
import matplotlib.pyplot as plt
import numpy as np
import os

FIGURES_DIR = "../reports/figures"

thresholds = np.arange(0.0, 1.01, 0.01)
costs = []

for thresh in thresholds:
    y_pred = (y_proba_opt >= thresh).astype(int)
    # Rappel : business_cost doit √™tre import√© ou d√©fini
    costs.append(business_cost(y_val, y_pred, fn_cost=10, fp_cost=1))

plt.figure(figsize=(10, 6))
plt.plot(thresholds, costs, label="Co√ªt M√©tier Total", color='blue', linewidth=2)

# On marque le point optimal (le plus bas)
min_cost_idx = np.argmin(costs)
opt_thresh = thresholds[min_cost_idx]
min_cost = costs[min_cost_idx]

plt.axvline(opt_thresh, color='red', linestyle='--', label=f'Seuil Optimal : {opt_thresh:.2f}')
plt.scatter(opt_thresh, min_cost, color='red', s=100, zorder=5)

plt.xlabel('Seuil de Probabilit√© (Threshold)')
plt.ylabel('Co√ªt M√©tier (Normalis√©)')
plt.title('Minimisation du Co√ªt M√©tier (FN=10, FP=1)')
plt.legend()
plt.grid(True, alpha=0.3)

save_path = os.path.join(FIGURES_DIR, "courbe_cout_vs_seuil.png")
plt.savefig(save_path, bbox_inches='tight', dpi=300)
plt.close()
