# 02. Entraînement des Modèles

Ce notebook charge les données préparées, effectue un split Train/Validation/Test (70/15/15) et entraîne plusieurs modèles (Dummy, Logistic Regression, Random Forest, XGBoost, LightGBM) en enregistrant les résultats dans MLflow.

In [None]:
import pandas as pd
import numpy as np
import mlflow
import sys
import os

# Make the parent of the current working directory importable so that the
# `src` package imported below can be resolved.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.model_utils import (
    get_train_val_test_split,
    train_dummy,
    train_logistic_regression,
    train_random_forest,
    train_xgboost,
    train_lightgbm
)

# MLflow configuration: point the client at the "../mlruns" tracking location
# (a path relative to the current working directory) and select the
# experiment the runs of this notebook belong to.
TRACKING_URI = "../mlruns"
EXPERIMENT_NAME = "Credit_Scoring"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

In [None]:
def _make_safe_column_names(columns):
    """Return sanitised, unique column names for *columns*.

    Every character that is not alphanumeric (per ``str.isalnum``) is
    replaced by ``"_"``. Two different raw names can collapse to the same
    sanitised name (e.g. ``"a b"`` and ``"a-b"`` both become ``"a_b"``); in
    that case a numeric suffix (``_1``, ``_2``, ...) is appended to the later
    occurrences so the resulting labels stay unique. When no collision
    occurs the result is exactly the plain character-by-character
    replacement.

    Args:
        columns: iterable of column labels; each label is converted with
            ``str()`` before being sanitised.

    Returns:
        A list of strings, one per input label, in the same order.
    """
    taken = set()
    safe_names = []
    for raw in columns:
        base = "".join(ch if ch.isalnum() else "_" for ch in str(raw))
        candidate = base
        suffix = 1
        # Only loops when the sanitised name is already in use.
        while candidate in taken:
            candidate = f"{base}_{suffix}"
            suffix += 1
        taken.add(candidate)
        safe_names.append(candidate)
    return safe_names


# Load the prepared data (v1 to start with).
# NOTE: pandas.read_pickle can execute arbitrary code while unpickling; only
# load files produced by this project's own preparation step.
X = pd.read_pickle('../data/processed/X_prepared_v1.pkl')
y = pd.read_pickle('../data/processed/y_prepared_v1.pkl')

# Clean the column names for LightGBM/XGBoost, keeping them unique so that
# selecting a column by name never becomes ambiguous.
X.columns = _make_safe_column_names(X.columns)

print(f"Shape X: {X.shape}, Shape y: {y.shape}")

In [None]:
# Split the data into train / validation / test sets. The notebook describes
# this as a 70/15/15 split; the actual ratios are decided inside
# get_train_val_test_split (src.model_utils) — TODO confirm.
X_train, y_train, X_val, y_val, X_test, y_test = get_train_val_test_split(X, y)

## 1. Dummy Classifier (Baseline)

In [None]:
# Baseline: train_dummy returns a (model, metrics) pair; the metrics mapping
# is reused in the performance summary at the end of the notebook.
# NOTE(review): the test split is passed to the trainer as well — confirm that
# test metrics are not used to choose between models.
model_dummy, metrics_dummy = train_dummy(X_train, y_train, X_val, y_val, X_test, y_test)

## 2. Logistic Regression

In [None]:
# Logistic regression: returns a (model, metrics) pair; the metrics mapping is
# reused in the performance summary at the end of the notebook.
model_lr, metrics_lr = train_logistic_regression(X_train, y_train, X_val, y_val, X_test, y_test)

## 3. Random Forest

In [None]:
# Random forest: returns a (model, metrics) pair; the metrics mapping is
# reused in the performance summary at the end of the notebook.
model_rf, metrics_rf = train_random_forest(X_train, y_train, X_val, y_val, X_test, y_test)

## 4. XGBoost

In [None]:
# XGBoost: returns a (model, metrics) pair; the metrics mapping is reused in
# the performance summary at the end of the notebook.
model_xgb, metrics_xgb = train_xgboost(X_train, y_train, X_val, y_val, X_test, y_test)

## 5. LightGBM

In [None]:
# LightGBM: returns a (model, metrics) pair; the metrics mapping is reused in
# the performance summary at the end of the notebook.
model_lgbm, metrics_lgbm = train_lightgbm(X_train, y_train, X_val, y_val, X_test, y_test)

## Résumé des performances

In [None]:
# Gather the metrics of every trained model, keyed by its display name
# (insertion order fixes the row order of the summary table).
metrics_by_model = {
    "Dummy": metrics_dummy,
    "Logistic Regression": metrics_lr,
    "Random Forest": metrics_rf,
    "XGBoost": metrics_xgb,
    "LightGBM": metrics_lgbm,
}

# One row per model: the "Model" label first, then that model's metrics.
summary_rows = [
    {"Model": model_name, **model_metrics}
    for model_name, model_metrics in metrics_by_model.items()
]

# Index the table by model name and render it in the notebook.
results = pd.DataFrame(summary_rows).set_index("Model")
display(results)