# Etape 2 - MLflow Tracking & Entrainement des Modeles

Objectif: Entra√Æner et comparer plusieurs mod√®les avec tracking MLflow pour l'exp√©rimentation.

## 1. Import et configuration

In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from pathlib import Path
import warnings
import json
import time

from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score, 
    recall_score, f1_score, confusion_matrix
)
import xgboost as xgb
import lightgbm as lgb

# Import TensorFlow for MLP
try:
    import tensorflow as tf
    from tensorflow import keras
    TENSORFLOW_AVAILABLE = True
except ImportError:
    print("‚ö†Ô∏è  TensorFlow non disponible - le mod√®le MLP sera skipp√©")
    TENSORFLOW_AVAILABLE = False

warnings.filterwarnings('ignore')

# Paths
ROOT_DIR = Path('.').resolve().parent
DATA_DIR = ROOT_DIR / 'outputs'
MODELS_DIR = ROOT_DIR / 'models'
MODELS_DIR.mkdir(exist_ok=True)

print(f"Root: {ROOT_DIR}")
print(f"Data: {DATA_DIR}")
print(f"Models: {MODELS_DIR}")

‚ö†Ô∏è  TensorFlow non disponible - le mod√®le MLP sera skipp√©
Root: C:\Users\daniel.guedj_arondor\Downloads\perso\openclassrooms\MLOps
Data: C:\Users\daniel.guedj_arondor\Downloads\perso\openclassrooms\MLOps\outputs
Models: C:\Users\daniel.guedj_arondor\Downloads\perso\openclassrooms\MLOps\models


## 2. Charger les donnees

In [2]:
print("Chargement des donnees...")

train = pd.read_parquet(DATA_DIR / "train_processed.parquet")
test = pd.read_parquet(DATA_DIR / "test_processed.parquet")

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Verifier TARGET
print(f"\n=== DISTRIBUTION TARGET ===")
target_counts = train['TARGET'].value_counts()
print(target_counts)
print(f"\nPourcentages:")
print(train['TARGET'].value_counts(normalize=True) * 100)
print(f"\nRatio d√©s√©quilibre: {target_counts[0] / target_counts[1]:.2f}:1")

Chargement des donnees...
Train shape: (307511, 148)
Test shape: (48744, 121)

=== DISTRIBUTION TARGET ===
TARGET
0    282686
1     24825
Name: count, dtype: int64

Pourcentages:
TARGET
0    91.927118
1     8.072882
Name: proportion, dtype: float64

Ratio d√©s√©quilibre: 11.39:1


## 3. Preparer les donnees

In [3]:
y_train = train["TARGET"].copy()
X_train = train.drop(columns=["TARGET"]).copy()

test_ids = None
if "SK_ID_CURR" in X_train.columns:
    X_train = X_train.drop(columns=["SK_ID_CURR"])
if "SK_ID_CURR" in test.columns:
    test_ids = test["SK_ID_CURR"].copy()
    X_test = test.drop(columns=["SK_ID_CURR"]).copy()
else:
    X_test = test.copy()

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (307511, 146)
y_train shape: (307511,)
X_test shape: (48744, 120)


## 4. Configuration MLflow

In [4]:
# Configurer MLflow
mlflow.set_tracking_uri('http://localhost:5000')
experiment_name = 'credit_scoring_v1'

# Creer experiment si elle n'existe pas
try:
    experiment_id = mlflow.create_experiment(experiment_name)
except:
    experiment = mlflow.get_experiment_by_name(experiment_name)
    experiment_id = experiment.experiment_id

mlflow.set_experiment(experiment_name)
print(f"Experiment: {experiment_name}")
print(f"Experiment ID: {experiment_id}")

Experiment: credit_scoring_v1
Experiment ID: 1


## 5. Fonction de metriques

In [5]:
def calculate_metrics(y_true, y_pred, y_pred_proba=None, cost_fn=10, cost_fp=1):
    """
    Calcule les metriques de classification.
    
    Args:
        y_true: Labels vrais
        y_pred: Predictions (0/1)
        y_pred_proba: Probabilites (pour AUC)
        cost_fn: Cout d'un Faux Negatif (defaut: 10)
        cost_fp: Cout d'un Faux Positif (defaut: 1)
    """
    metrics = {}
    
    # Metriques standards
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['precision'] = precision_score(y_true, y_pred, zero_division=0)
    metrics['recall'] = recall_score(y_true, y_pred, zero_division=0)
    metrics['f1'] = f1_score(y_true, y_pred, zero_division=0)
    
    # AUC (si probabilites disponibles)
    if y_pred_proba is not None:
        metrics['auc'] = roc_auc_score(y_true, y_pred_proba[:, 1])
    
    # Co√ªt m√©tier
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics['business_cost'] = fn * cost_fn + fp * cost_fp
    metrics['tn'] = int(tn)
    metrics['fp'] = int(fp)
    metrics['fn'] = int(fn)
    metrics['tp'] = int(tp)
    
    return metrics

print("Fonction de metriques: OK")

Fonction de metriques: OK


## 6. Entrainement Logistic Regression

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_cols = X_train.select_dtypes(include=[np.number]).columns
cat_cols = X_train.select_dtypes(exclude=[np.number]).columns

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("imputer", SimpleImputer(strategy="median")),
                          ("scaler", StandardScaler())]), num_cols),
        ("cat", Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                          ("ohe", OneHotEncoder(handle_unknown="ignore"))]), cat_cols),
    ],
    remainder="drop"
 )

print("\n=== LOGISTIC REGRESSION ===")
with mlflow.start_run(run_name="logistic_regression_baseline"):
    start_time = time.time()

    params = {
        "max_iter": 1000,
        "random_state": 42,
        "solver": "lbfgs",
        "class_weight": "balanced"
    }

    clf = Pipeline([
        ("preprocess", preprocess),
        ("model", LogisticRegression(**params))
    ])

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    y_pred_proba = cross_val_predict(clf, X_train, y_train, cv=skf, method="predict_proba")
    y_pred = (y_pred_proba[:, 1] >= 0.5).astype(int)  # seuil 0.5 ici (optimis√© plus tard)

    metrics = calculate_metrics(y_train, y_pred, y_pred_proba)

    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    mlflow.log_metric("cv_auc_mean", metrics["auc"])
    mlflow.log_metric("training_time", time.time() - start_time)

    clf.fit(X_train, y_train)
    mlflow.sklearn.log_model(clf, "model")

    print(f"AUC: {metrics['auc']:.4f}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Business Cost: {metrics['business_cost']:.0f}")


=== LOGISTIC REGRESSION ===




AUC: 0.7589
Accuracy: 0.6967
Business Cost: 162679
üèÉ View run logistic_regression_baseline at: http://localhost:5000/#/experiments/1/runs/a8cfeedaffb9468ca986830a6a282bbd
üß™ View experiment at: http://localhost:5000/#/experiments/1


## 7. Entrainement Random Forest

In [7]:
print("\n=== RANDOM FOREST ===")
with mlflow.start_run(run_name="random_forest_baseline"):
    start_time = time.time()

    params = {
        'n_estimators': 100,
        'max_depth': 10,
        'min_samples_split': 10,
        'min_samples_leaf': 5,
        'random_state': 42,
        'class_weight': 'balanced',
        'n_jobs': -1
    }

    clf = Pipeline([
        ("preprocess", preprocess),
        ("model", RandomForestClassifier(**params))
    ])

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    y_pred_proba = cross_val_predict(clf, X_train, y_train, cv=skf, method="predict_proba")
    y_pred = (y_pred_proba[:, 1] >= 0.5).astype(int)

    metrics = calculate_metrics(y_train, y_pred, y_pred_proba)

    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    mlflow.log_metric("cv_auc_mean", metrics["auc"])
    mlflow.log_metric("training_time", time.time() - start_time)

    clf.fit(X_train, y_train)
    mlflow.sklearn.log_model(clf, "model")

    print(f"AUC: {metrics['auc']:.4f}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Business Cost: {metrics['business_cost']:.0f}")


=== RANDOM FOREST ===




AUC: 0.7374
Accuracy: 0.7349
Business Cost: 170962
üèÉ View run random_forest_baseline at: http://localhost:5000/#/experiments/1/runs/1fe7154d35954c8888c71e1cbd2eab4e
üß™ View experiment at: http://localhost:5000/#/experiments/1


## 8. Entrainement XGBoost

In [8]:
print("\n=== XGBOOST ===")
with mlflow.start_run(run_name="xgboost_baseline"):
    start_time = time.time()

    scale_pos = (y_train == 0).sum() / (y_train == 1).sum()

    params = {
        'n_estimators': 100,
        'max_depth': 5,
        'learning_rate': 0.1,
        'scale_pos_weight': scale_pos,
        'random_state': 42,
        'verbosity': 0,
        'use_label_encoder': False
    }

    clf = Pipeline([
        ("preprocess", preprocess),
        ("model", xgb.XGBClassifier(**params))
    ])

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    y_pred_proba = cross_val_predict(clf, X_train, y_train, cv=skf, method="predict_proba")
    y_pred = (y_pred_proba[:, 1] >= 0.5).astype(int)

    metrics = calculate_metrics(y_train, y_pred, y_pred_proba)

    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    mlflow.log_metric("cv_auc_mean", metrics["auc"])
    mlflow.log_metric("training_time", time.time() - start_time)

    clf.fit(X_train, y_train)
    mlflow.sklearn.log_model(clf, "model")

    print(f"AUC: {metrics['auc']:.4f}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Business Cost: {metrics['business_cost']:.0f}")


=== XGBOOST ===




AUC: 0.7656
Accuracy: 0.7149
Business Cost: 160061
üèÉ View run xgboost_baseline at: http://localhost:5000/#/experiments/1/runs/38db1493f69f4dda9c1dbad4270cd54e
üß™ View experiment at: http://localhost:5000/#/experiments/1


## 9. Entrainement LightGBM

In [9]:
print("\n=== LIGHTGBM ===")
with mlflow.start_run(run_name="lightgbm_baseline"):
    start_time = time.time()

    scale_pos = (y_train == 0).sum() / (y_train == 1).sum()

    params = {
        'n_estimators': 100,
        'max_depth': 5,
        'learning_rate': 0.1,
        'scale_pos_weight': scale_pos,
        'random_state': 42,
        'verbosity': -1
    }

    clf = Pipeline([
        ("preprocess", preprocess),
        ("model", lgb.LGBMClassifier(**params))
    ])

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    y_pred_proba = cross_val_predict(clf, X_train, y_train, cv=skf, method="predict_proba")
    y_pred = (y_pred_proba[:, 1] >= 0.5).astype(int)

    metrics = calculate_metrics(y_train, y_pred, y_pred_proba)

    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    mlflow.log_metric("cv_auc_mean", metrics["auc"])
    mlflow.log_metric("training_time", time.time() - start_time)

    clf.fit(X_train, y_train)
    mlflow.sklearn.log_model(clf, "model")

    print(f"AUC: {metrics['auc']:.4f}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Business Cost: {metrics['business_cost']:.0f}")


=== LIGHTGBM ===




AUC: 0.7658
Accuracy: 0.7122
Business Cost: 159839
üèÉ View run lightgbm_baseline at: http://localhost:5000/#/experiments/1/runs/bdb32dee09c84058aed582d4139e5234
üß™ View experiment at: http://localhost:5000/#/experiments/1


## 10. Recuperer et comparer tous les runs

In [10]:
print("\n=== COMPARAISON DES MODELES ===")

experiment = mlflow.get_experiment_by_name(experiment_name)
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

print(f"Colonnes disponibles: {list(runs.columns)}")
print(f"Nombre de runs: {len(runs)}\n")

# Creer DataFrame de comparaison avec les bonnes colonnes
comparison = runs[['tags.mlflow.runName', 'metrics.auc', 'metrics.accuracy', 'metrics.business_cost', 'metrics.cv_auc_mean']].copy()
comparison.columns = ['Model', 'AUC', 'Accuracy', 'Business Cost', 'CV AUC Mean']
comparison = comparison.sort_values('CV AUC Mean', ascending=False)

print(comparison.to_string(index=False))


=== COMPARAISON DES MODELES ===
Colonnes disponibles: ['run_id', 'experiment_id', 'status', 'artifact_uri', 'start_time', 'end_time', 'metrics.f1', 'metrics.accuracy', 'metrics.business_cost', 'metrics.recall', 'metrics.auc', 'metrics.training_time', 'metrics.precision', 'metrics.tp', 'metrics.fn', 'metrics.tn', 'metrics.fp', 'metrics.cv_auc_mean', 'params.n_estimators', 'params.learning_rate', 'params.verbosity', 'params.scale_pos_weight', 'params.random_state', 'params.max_depth', 'params.use_label_encoder', 'params.class_weight', 'params.n_jobs', 'params.min_samples_leaf', 'params.min_samples_split', 'params.max_iter', 'params.solver', 'tags.mlflow.source.type', 'tags.mlflow.runName', 'tags.mlflow.user', 'tags.mlflow.source.name']
Nombre de runs: 4

                       Model      AUC  Accuracy  Business Cost  CV AUC Mean
           lightgbm_baseline 0.765800  0.712218       159839.0     0.765800
            xgboost_baseline 0.765555  0.714891       160061.0     0.765555
logistic

## 11. R√©sum√©: Entra√Ænement des mod√®les

In [11]:
print("\n" + "="*70)
print("R√âSUM√â: √âTAPE 2 - ENTRA√éNEMENT DES MOD√àLES")
print("="*70)

print("\n‚úÖ MOD√àLES ENTRA√éN√âS:")
print(f"\n1. LOGISTIC REGRESSION (BASELINE)")
print(f"   ‚Ä¢ Mod√®le de r√©gression logistique simple")
print(f"   ‚Ä¢ Validation crois√©e stratifi√©e 5-folds")
print(f"   ‚Ä¢ Classe √©quilibr√©e avec class_weight='balanced'")

print(f"\n2. RANDOM FOREST (ENSEMBLE)")
print(f"   ‚Ä¢ 100 arbres, max_depth=10")
print(f"   ‚Ä¢ Validation crois√©e stratifi√©e 5-folds")
print(f"   ‚Ä¢ Classe √©quilibr√©e avec class_weight='balanced'")

print(f"\n3. XGBOOST (GRADIENT BOOSTING)")
print(f"   ‚Ä¢ 100 estimateurs, max_depth=5")
print(f"   ‚Ä¢ Scale_pos_weight calcul√© pour d√©s√©quilibre")
print(f"   ‚Ä¢ Validation crois√©e stratifi√©e 5-folds")

print(f"\n4. LIGHTGBM (GRADIENT BOOSTING AVANC√â)")
print(f"   ‚Ä¢ 100 estimateurs, max_depth=5")
print(f"   ‚Ä¢ Scale_pos_weight pour gestion du d√©s√©quilibre")
print(f"   ‚Ä¢ Validation crois√©e stratifi√©e 5-folds")
print(f"   ‚Ä¢ Parall√©lis√© avec n_jobs=-1")

print(f"\nüìä M√âTRIQUES CALCUL√âES:")
print(f"   ‚Ä¢ Accuracy: Score de pr√©cision globale")
print(f"   ‚Ä¢ Precision: Parmi pr√©dictions positives, combien sont correctes")
print(f"   ‚Ä¢ Recall: Parmi vrais positifs, combien sont d√©tect√©s")
print(f"   ‚Ä¢ F1-Score: Moyenne harmonique Precision et Recall")
print(f"   ‚Ä¢ AUC-ROC: Aire sous la courbe ROC")
print(f"   ‚Ä¢ Business Cost: Co√ªt m√©tier (FN*10 + FP*1)")

print(f"\nüîÑ VALIDATION CROIS√âE:")
print(f"   ‚Ä¢ Type: StratifiedKFold (5-splits)")
print(f"   ‚Ä¢ Conservation de la distribution de classes")
print(f"   ‚Ä¢ √âvaluation robuste de la performance r√©elle")

print(f"\nüìà MLFLOW TRACKING:")
print(f"   ‚Ä¢ Tous les runs logg√©s dans MLFlow")
print(f"   ‚Ä¢ Exp√©rience: 'credit_scoring_v1'")
print(f"   ‚Ä¢ Hyperparam√®tres log√©s pour tra√ßabilit√©")
print(f"   ‚Ä¢ Acc√®s via: http://localhost:5000")

print("\n" + "="*70)
print("‚úÖ √âTAPE 2 COMPL√àT√âE - 4 MOD√àLES ENTRA√éN√âS ET COMPAR√âS")
print("="*70)


R√âSUM√â: √âTAPE 2 - ENTRA√éNEMENT DES MOD√àLES

‚úÖ MOD√àLES ENTRA√éN√âS:

1. LOGISTIC REGRESSION (BASELINE)
   ‚Ä¢ Mod√®le de r√©gression logistique simple
   ‚Ä¢ Validation crois√©e stratifi√©e 5-folds
   ‚Ä¢ Classe √©quilibr√©e avec class_weight='balanced'

2. RANDOM FOREST (ENSEMBLE)
   ‚Ä¢ 100 arbres, max_depth=10
   ‚Ä¢ Validation crois√©e stratifi√©e 5-folds
   ‚Ä¢ Classe √©quilibr√©e avec class_weight='balanced'

3. XGBOOST (GRADIENT BOOSTING)
   ‚Ä¢ 100 estimateurs, max_depth=5
   ‚Ä¢ Scale_pos_weight calcul√© pour d√©s√©quilibre
   ‚Ä¢ Validation crois√©e stratifi√©e 5-folds

4. LIGHTGBM (GRADIENT BOOSTING AVANC√â)
   ‚Ä¢ 100 estimateurs, max_depth=5
   ‚Ä¢ Scale_pos_weight pour gestion du d√©s√©quilibre
   ‚Ä¢ Validation crois√©e stratifi√©e 5-folds
   ‚Ä¢ Parall√©lis√© avec n_jobs=-1

üìä M√âTRIQUES CALCUL√âES:
   ‚Ä¢ Accuracy: Score de pr√©cision globale
   ‚Ä¢ Precision: Parmi pr√©dictions positives, combien sont correctes
   ‚Ä¢ Recall: Parmi vrais positifs, combien s

## Prochaines etapes

Etape 3: Optimisation des hyperparametres (GridSearchCV/Optuna)  Etape 4: Optimisation du seuil de decision et feature importance