In [1]:
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix, make_scorer

# Configuration de MLflow
# On donne un nom à l'expérience pour s'y retrouver
mlflow.set_experiment("Credit_Scoring_Exp")

print("MLflow initialisé !")

  return FileStore(store_uri, store_uri)
2025/12/05 08:53:47 INFO mlflow.tracking.fluent: Experiment with name 'Credit_Scoring_Exp' does not exist. Creating a new experiment.


MLflow initialisé !


In [2]:
# On charge les pickles qu'on a sauvegardés à la fin du notebook 01
X_train = pd.read_pickle('../data/X_train.pkl')
X_test = pd.read_pickle('../data/X_test.pkl')
y_train = pd.read_pickle('../data/y_train.pkl')
y_test = pd.read_pickle('../data/y_test.pkl')

print(f"Train : {X_train.shape}, Test : {X_test.shape}")

Train : (246008, 281), Test : (61503, 281)


In [3]:
def business_cost_metric(y_true, y_pred_proba, threshold=0.5):

    # Convertir probas en classes 0/1 selon le seuil
    y_pred = (y_pred_proba > threshold).astype(int)
    
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    # Formule du coût
    cost = 10 * fn + 1 * fp
    
    # On normalise pour avoir un score comparable (optionnel, mais pratique)
    # Plus c'est bas, mieux c'est.
    return cost

In [24]:
mlflow.autolog()
# 1. Création du modèle "Idiot" (Stratégie: prédit la classe majoritaire)
dummy = DummyClassifier(strategy='most_frequent')

# 2. Entraînement
dummy.fit(X_train, y_train)

# 3. Prédictions (Probas)
# Le dummy va mettre 0 partout ou des probas basiques
y_prob = dummy.predict_proba(X_test)[:, 1]

# 4. Calcul des Métriques
auc_score = roc_auc_score(y_test, y_prob)
cost_score = business_cost_metric(y_test, y_prob)

print(f"Dummy AUC: {auc_score:.4f}")     # Devrait être 0.5
print(f"Dummy Cost: {cost_score}")       # Sera élevé car il rate tous les défauts (FN)

# 5. LOGGING MLFLOW (C'est ça qu'on veut !)
# On enregistre les paramètres (quel algo ?)
mlflow.log_param("model_type", "DummyClassifier")
mlflow.log_param("strategy", "most_frequent")

# On enregistre les performances
mlflow.log_metric("auc", auc_score)
mlflow.log_metric("business_cost", cost_score)

# On enregistre le modèle lui-même (le fichier)
mlflow.sklearn.log_model(dummy, "model")

print("Run logged to MLflow!")

2025/12/05 10:46:04 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2025/12/05 10:46:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/12/05 10:46:12 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '1a5a4732b05d487798aa2473d612d962', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Dummy AUC: 0.5000
Dummy Cost: 49650




Run logged to MLflow!


In [5]:
# Gestion des caractères spéciaux dans les noms de colonnes (pour LightGBM)
X_train.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_train.columns]
X_test.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_test.columns]

In [6]:
# modele LightGBM avec gestion du déséquilibre sur le dataset v1
import lightgbm as lgb

start_time = pd.Timestamp.now()
# class_weight='balanced' gère automatiquement le déséquilibre 92/8
lgbm = lgb.LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

print("Entraînement LightGBM en cours...")
lgbm.fit(X_train, y_train)
end_time = pd.Timestamp.now()
duration = (end_time - start_time).total_seconds()
y_prob = lgbm.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_prob)
cost_score = business_cost_metric(y_test, y_prob)

print(f"LGBM AUC: {auc_score:.2f}")   
print(f"LGBM Cost: {cost_score}")    
print(f"Training Duration: {duration:.2f} seconds")

mlflow.end_run()

Entraînement LightGBM en cours...
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.127501 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16615
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 260
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
LGBM AUC: 0.78
LGBM Cost: 31090
Training Duration: 15.46 seconds


In [7]:
auc_score = roc_auc_score(y_test, y_prob)
cost_score = business_cost_metric(y_test, y_prob)
print(f"LGBM AUC: {auc_score:.4f}")
print(f"LGBM Cost: {cost_score}")

LGBM AUC: 0.7789
LGBM Cost: 31090


In [15]:
# on recoupe le train pour faire un jeu de validation
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)
X_train.shape


(196806, 281)

In [16]:
from lightgbm import log_evaluation, early_stopping
start_time = pd.Timestamp.now()

clf = lgb.LGBMClassifier(nthread=-1,
                            n_estimators=5000,
                            learning_rate=0.01,
                            max_depth=11,
                            num_leaves=58,
                            colsample_bytree=0.613,
                            subsample=0.708,
                            max_bin=407,
                            reg_alpha=3.564,
                            reg_lambda=4.930,
                            min_child_weight=6,
                            min_child_samples=165,
                            class_weight='balanced'
                            )

clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], eval_metric='auc', callbacks=[
        log_evaluation(500),    # Remplace verbose=500
        early_stopping(500)     # Remplace early_stopping_rounds=500
    ])
end_time = pd.Timestamp.now()
duration = (end_time - start_time).total_seconds()


[LightGBM] [Info] Number of positive: 15888, number of negative: 180918
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.081149 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24868
[LightGBM] [Info] Number of data points in the train set: 196806, number of used features: 245
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 500 rounds
[500]	training's auc: 0.820473	training's binary_logloss: 0.531943	valid_1's auc: 0.769292	valid_1's binary_logloss: 0.552079
[1000]	training's auc: 0.855057	training's binary_logloss: 0.490501	valid_1's auc: 0.7762	valid_1's binary_logloss: 0.524969
[1500]	training's auc: 0.879841	training's binary_logloss: 0.461497	valid_1's auc: 0.777314	valid_1's binary_logloss: 0.506896
[2000]	training's auc: 0.900139	training's binary_logloss: 0.436

In [17]:
y_prob = clf.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_prob)
cost_score = business_cost_metric(y_test, y_prob)

print(f"LGBM AUC: {auc_score:.4f}")   
print(f"LGBM Cost: {cost_score}")    
print(f"Training Duration: {duration:.2f} seconds")

LGBM AUC: 0.7816
LGBM Cost: 30720
Training Duration: 96.23 seconds


In [20]:
df_v2 = pd.read_pickle('../data/train_v2.pkl')
X2 = df_v2.drop(columns=['TARGET'])
y2 = df_v2['TARGET']
#split en 70/15/15
X2_train, X2_temp, y2_train, y2_temp = train_test_split(X2, y2, test_size=0.3, random_state=42, stratify=y2)
X2_val, X2_test, y2_val, y2_test = train_test_split(X2_temp, y2_temp, test_size=0.5, random_state=42, stratify=y2_temp)
X2_train.shape


(215257, 84)

In [21]:
# test avec le dataset v2
from lightgbm import log_evaluation, early_stopping
start_time = pd.Timestamp.now()

clf = lgb.LGBMClassifier(nthread=-1,
                            n_estimators=5000,
                            learning_rate=0.01,
                            max_depth=11,
                            num_leaves=58,
                            colsample_bytree=0.613,
                            subsample=0.708,
                            max_bin=407,
                            reg_alpha=3.564,
                            reg_lambda=4.930,
                            min_child_weight=6,
                            min_child_samples=165,
                            class_weight='balanced'
                            )

clf.fit(X2_train, y2_train, eval_set=[(X2_train, y2_train), (X2_val, y2_val)], eval_metric='auc', callbacks=[
        log_evaluation(500),    # Remplace verbose=500
        early_stopping(500)     # Remplace early_stopping_rounds=500
    ])
end_time = pd.Timestamp.now()
duration = (end_time - start_time).total_seconds()


[LightGBM] [Info] Number of positive: 17377, number of negative: 197880
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059260 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16810
[LightGBM] [Info] Number of data points in the train set: 215257, number of used features: 83
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 500 rounds
[500]	training's auc: 0.816299	training's binary_logloss: 0.533125	valid_1's auc: 0.774139	valid_1's binary_logloss: 0.547687
[1000]	training's auc: 0.849534	training's binary_logloss: 0.494404	valid_1's auc: 0.779361	valid_1's binary_logloss: 0.521827
[1500]	training's auc: 0.873485	training's binary_logloss: 0.466765	valid_1's auc: 0.780327	valid_1's binary_logloss: 0.504176
Early stopping, best iteration is:
[1473]	training's auc: 0.8

In [22]:
y2_prob = clf.predict_proba(X2_test)[:, 1]

auc_score = roc_auc_score(y2_test, y2_prob)
cost_score = business_cost_metric(y2_test, y2_prob)

print(f"LGBM AUC: {auc_score:.4f}")   
print(f"LGBM Cost: {cost_score}")    
print(f"Training Duration: {duration:.2f} seconds")

LGBM AUC: 0.7814
LGBM Cost: 23254
Training Duration: 60.04 seconds


In [23]:
# optimisation avec optuna sur le v2
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

def objective(trial):
    # 1. Définition de l'espace de recherche (Hyperparameters)
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'class_weight': 'balanced', # Gère le déséquilibre
        'n_jobs': -1,
        
        # Paramètres qu'Optuna va faire varier
        'n_estimators': trial.suggest_int('n_estimators', 500, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0)
    }

    model = lgb.LGBMClassifier(**param)
    
    callbacks = [lgb.early_stopping(stopping_rounds=100, verbose=False)]
    
    model.fit(
        X2_train, y2_train,
        eval_set=[(X2_val, y2_val)],
        eval_metric='auc',
        callbacks=callbacks
    )
    
    # On prédit sur le set de VALIDATION pour qu'Optuna juge la qualité
    preds = model.predict_proba(X2_val)[:, 1]

    auc = roc_auc_score(y2_val, preds)
    
    return auc

# 4. Lancement de l'étude
print("Recherche des meilleurs hyperparamètres...")
study = optuna.create_study(direction='maximize') 
study.optimize(objective, n_trials=30)

print("-" * 30)
print(f"Meilleure AUC trouvée : {study.best_value:.4f}")
print("Meilleurs paramètres :")
print(study.best_params)

[I 2025-12-05 09:45:28,956] A new study created in memory with name: no-name-37863346-0df6-4a2e-9d14-f35990440a0f


Recherche des meilleurs hyperparamètres...


[I 2025-12-05 09:45:35,588] Trial 0 finished with value: 0.7789117706254458 and parameters: {'n_estimators': 3957, 'learning_rate': 0.04676931605058948, 'num_leaves': 43, 'max_depth': 8, 'min_child_samples': 44, 'reg_alpha': 9.61608283355706, 'reg_lambda': 5.188404512324313, 'colsample_bytree': 0.7715562991397976, 'subsample': 0.9371417105583603}. Best is trial 0 with value: 0.7789117706254458.
[I 2025-12-05 09:45:44,445] Trial 1 finished with value: 0.7756934727497279 and parameters: {'n_estimators': 750, 'learning_rate': 0.01540322686298918, 'num_leaves': 34, 'max_depth': 5, 'min_child_samples': 43, 'reg_alpha': 5.962406285776504, 'reg_lambda': 3.8244617278626714, 'colsample_bytree': 0.7596775866992767, 'subsample': 0.6872535225080543}. Best is trial 0 with value: 0.7789117706254458.
[I 2025-12-05 09:46:19,233] Trial 2 finished with value: 0.7739015711446432 and parameters: {'n_estimators': 1151, 'learning_rate': 0.005107373790345892, 'num_leaves': 46, 'max_depth': 9, 'min_child_samp

------------------------------
Meilleure AUC trouvée : 0.7811
Meilleurs paramètres :
{'n_estimators': 1658, 'learning_rate': 0.012535077735683063, 'num_leaves': 70, 'max_depth': 10, 'min_child_samples': 59, 'reg_alpha': 4.623067356388232, 'reg_lambda': 5.412850454194109, 'colsample_bytree': 0.4521696666688364, 'subsample': 0.4000189134569926}
