# Etape 2 - MLflow Tracking & Entrainement des Modeles

Objectif: Entra√Æner et comparer plusieurs mod√®les avec tracking MLflow pour l'exp√©rimentation.

## 1. Import et configuration

In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from pathlib import Path
import warnings
import json
import time

from sklearn.model_selection import StratifiedKFold, cross_validate, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score, 
    recall_score, f1_score, confusion_matrix
)
import xgboost as xgb
import lightgbm as lgb

warnings.filterwarnings('ignore')

# Paths
ROOT_DIR = Path('.').resolve().parent
DATA_DIR = ROOT_DIR / 'outputs'
MODELS_DIR = ROOT_DIR / 'models'
MODELS_DIR.mkdir(exist_ok=True)

print(f"Root: {ROOT_DIR}")
print(f"Data: {DATA_DIR}")
print(f"Models: {MODELS_DIR}")

Root: C:\Users\daniel.guedj_arondor\Downloads\perso\openclassrooms\MLOps
Data: C:\Users\daniel.guedj_arondor\Downloads\perso\openclassrooms\MLOps\outputs
Models: C:\Users\daniel.guedj_arondor\Downloads\perso\openclassrooms\MLOps\models


## 2. Charger les donnees

In [2]:
print("Chargement des donnees...")
train = pd.read_csv(DATA_DIR / 'train_processed.csv')
test = pd.read_csv(DATA_DIR / 'test_processed.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Verifier TARGET
print(f"\nDistribution TARGET:")
print(train['TARGET'].value_counts())

Chargement des donnees...
Train shape: (307511, 148)
Test shape: (48744, 121)

Distribution TARGET:
TARGET
0    282686
1     24825
Name: count, dtype: int64


## 3. Preparer les donnees

In [3]:
# Separer X et y
X_train = train.drop('TARGET', axis=1)
y_train = train['TARGET']

# Test: on suppose qu'il n'a pas TARGET (submission format)
if 'TARGET' in test.columns:
    X_test = test.drop('TARGET', axis=1)
    y_test = test['TARGET']
else:
    X_test = test.copy()
    y_test = None

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")

# Garder les ids pour plus tard (si present)
if 'SK_ID_CURR' in X_test.columns:
    test_ids = X_test['SK_ID_CURR'].copy()
    X_train = X_train.drop('SK_ID_CURR', axis=1, errors='ignore')
    X_test = X_test.drop('SK_ID_CURR', axis=1)

X_train shape: (307511, 147)
y_train shape: (307511,)
X_test shape: (48744, 121)


## 4. Configuration MLflow

In [4]:
# Configurer MLflow
mlflow.set_tracking_uri('http://localhost:5000')
experiment_name = 'credit_scoring_v1'

# Creer experiment si elle n'existe pas
try:
    experiment_id = mlflow.create_experiment(experiment_name)
except:
    experiment = mlflow.get_experiment_by_name(experiment_name)
    experiment_id = experiment.experiment_id

mlflow.set_experiment(experiment_name)
print(f"Experiment: {experiment_name}")
print(f"Experiment ID: {experiment_id}")

Experiment: credit_scoring_v1
Experiment ID: 1


## 5. Fonction de metriques

In [6]:
def calculate_metrics(y_true, y_pred, y_pred_proba=None, cost_fn=10, cost_fp=1):
    """
    Calcule les metriques de classification.
    
    Args:
        y_true: Labels vrais
        y_pred: Predictions (0/1)
        y_pred_proba: Probabilites (pour AUC)
        cost_fn: Cout d'un Faux Negatif (defaut: 10)
        cost_fp: Cout d'un Faux Positif (defaut: 1)
    """
    metrics = {}
    
    # Metriques standards
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['precision'] = precision_score(y_true, y_pred, zero_division=0)
    metrics['recall'] = recall_score(y_true, y_pred, zero_division=0)
    metrics['f1'] = f1_score(y_true, y_pred, zero_division=0)
    
    # AUC (si probabilites disponibles)
    if y_pred_proba is not None:
        metrics['auc'] = roc_auc_score(y_true, y_pred_proba[:, 1])
    
    # Co√ªt m√©tier
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics['business_cost'] = fn * cost_fn + fp * cost_fp
    metrics['tn'] = int(tn)
    metrics['fp'] = int(fp)
    metrics['fn'] = int(fn)
    metrics['tp'] = int(tp)
    
    return metrics

print("Fonction de metriques: OK")

Fonction de metriques: OK


## 6. Entrainement Logistic Regression

In [7]:
print("\n=== LOGISTIC REGRESSION ===")
with mlflow.start_run(run_name="logistic_regression_baseline"):
    start_time = time.time()
    
    # Parametres
    params = {
        'max_iter': 1000,
        'random_state': 42,
        'solver': 'lbfgs',
        'class_weight': 'balanced'
    }
    
    # Scaler + Model
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    
    model = LogisticRegression(**params)
    
    # Cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    cv_scores = cross_validate(
        model, X_train_scaled, y_train, 
        cv=skf, 
        scoring=['roc_auc', 'accuracy']
    )
    
    # Predictions
    y_pred_proba = cross_val_predict(model, X_train_scaled, y_train, cv=skf, method='predict_proba')
    y_pred = cross_val_predict(model, X_train_scaled, y_train, cv=skf)
    
    # Metriques
    metrics = calculate_metrics(y_train, y_pred, y_pred_proba)
    
    # Log
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    mlflow.log_metric('cv_auc_mean', cv_scores['test_roc_auc'].mean())
    mlflow.log_metric('cv_accuracy_mean', cv_scores['test_accuracy'].mean())
    mlflow.log_metric('training_time', time.time() - start_time)
    
    # Save model
    model.fit(X_train_scaled, y_train)
    mlflow.sklearn.log_model(model, 'model')
    
    print(f"AUC: {metrics['auc']:.4f}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Business Cost: {metrics['business_cost']:.0f}")
    print(f"CV AUC Mean: {cv_scores['test_roc_auc'].mean():.4f}")


=== LOGISTIC REGRESSION ===




AUC: 0.7573
Accuracy: 0.6950
Business Cost: 163281
CV AUC Mean: 0.7573
üèÉ View run logistic_regression_baseline at: http://localhost:5000/#/experiments/1/runs/cf6807d7429949c882f5fa41252a97b1
üß™ View experiment at: http://localhost:5000/#/experiments/1


## 7. Entrainement Random Forest

In [8]:
print("\n=== RANDOM FOREST ===")
with mlflow.start_run(run_name="random_forest_baseline"):
    start_time = time.time()
    
    params = {
        'n_estimators': 100,
        'max_depth': 10,
        'min_samples_split': 10,
        'min_samples_leaf': 5,
        'random_state': 42,
        'class_weight': 'balanced',
        'n_jobs': -1
    }
    
    model = RandomForestClassifier(**params)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    cv_scores = cross_validate(
        model, X_train, y_train, 
        cv=skf, 
        scoring=['roc_auc', 'accuracy']
    )
    
    y_pred_proba = cross_val_predict(model, X_train, y_train, cv=skf, method='predict_proba')
    y_pred = cross_val_predict(model, X_train, y_train, cv=skf)
    
    metrics = calculate_metrics(y_train, y_pred, y_pred_proba)
    
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    mlflow.log_metric('cv_auc_mean', cv_scores['test_roc_auc'].mean())
    mlflow.log_metric('cv_accuracy_mean', cv_scores['test_accuracy'].mean())
    mlflow.log_metric('training_time', time.time() - start_time)
    
    model.fit(X_train, y_train)
    mlflow.sklearn.log_model(model, 'model')
    
    print(f"AUC: {metrics['auc']:.4f}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Business Cost: {metrics['business_cost']:.0f}")
    print(f"CV AUC Mean: {cv_scores['test_roc_auc'].mean():.4f}")


=== RANDOM FOREST ===




AUC: 0.7413
Accuracy: 0.7382
Business Cost: 169766
CV AUC Mean: 0.7414
üèÉ View run random_forest_baseline at: http://localhost:5000/#/experiments/1/runs/15801d75ef9b4310a1c487cc979e5653
üß™ View experiment at: http://localhost:5000/#/experiments/1


## 8. Entrainement XGBoost

In [9]:
print("\n=== XGBOOST ===")
with mlflow.start_run(run_name="xgboost_baseline"):
    start_time = time.time()
    
    # Calculer scale_pos_weight pour le desequilibre
    scale_pos = (y_train == 0).sum() / (y_train == 1).sum()
    
    params = {
        'n_estimators': 100,
        'max_depth': 5,
        'learning_rate': 0.1,
        'scale_pos_weight': scale_pos,
        'random_state': 42,
        'verbosity': 0,
        'use_label_encoder': False
    }
    
    model = xgb.XGBClassifier(**params)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    cv_scores = cross_validate(
        model, X_train, y_train, 
        cv=skf, 
        scoring=['roc_auc', 'accuracy']
    )
    
    y_pred_proba = cross_val_predict(model, X_train, y_train, cv=skf, method='predict_proba')
    y_pred = cross_val_predict(model, X_train, y_train, cv=skf)
    
    metrics = calculate_metrics(y_train, y_pred, y_pred_proba)
    
    mlflow.log_params({k: v for k, v in params.items() if not isinstance(v, float)})
    mlflow.log_metrics(metrics)
    mlflow.log_metric('cv_auc_mean', cv_scores['test_roc_auc'].mean())
    mlflow.log_metric('cv_accuracy_mean', cv_scores['test_accuracy'].mean())
    mlflow.log_metric('training_time', time.time() - start_time)
    
    model.fit(X_train, y_train)
    mlflow.sklearn.log_model(model, 'model')
    
    print(f"AUC: {metrics['auc']:.4f}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Business Cost: {metrics['business_cost']:.0f}")
    print(f"CV AUC Mean: {cv_scores['test_roc_auc'].mean():.4f}")


=== XGBOOST ===




AUC: 0.7655
Accuracy: 0.7153
Business Cost: 159658
CV AUC Mean: 0.7655
üèÉ View run xgboost_baseline at: http://localhost:5000/#/experiments/1/runs/c46a360d1e1d48bd8e78883f1ce6c555
üß™ View experiment at: http://localhost:5000/#/experiments/1


## 9. Entrainement LightGBM

In [10]:
print("\n=== LIGHTGBM ===")
with mlflow.start_run(run_name="lightgbm_baseline"):
    start_time = time.time()
    
    scale_pos = (y_train == 0).sum() / (y_train == 1).sum()
    
    params = {
        'n_estimators': 100,
        'max_depth': 5,
        'learning_rate': 0.1,
        'scale_pos_weight': scale_pos,
        'random_state': 42,
        'verbosity': -1
    }
    
    model = lgb.LGBMClassifier(**params)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    cv_scores = cross_validate(
        model, X_train, y_train, 
        cv=skf, 
        scoring=['roc_auc', 'accuracy']
    )
    
    y_pred_proba = cross_val_predict(model, X_train, y_train, cv=skf, method='predict_proba')
    y_pred = cross_val_predict(model, X_train, y_train, cv=skf)
    
    metrics = calculate_metrics(y_train, y_pred, y_pred_proba)
    
    mlflow.log_params({k: v for k, v in params.items() if not isinstance(v, float)})
    mlflow.log_metrics(metrics)
    mlflow.log_metric('cv_auc_mean', cv_scores['test_roc_auc'].mean())
    mlflow.log_metric('cv_accuracy_mean', cv_scores['test_accuracy'].mean())
    mlflow.log_metric('training_time', time.time() - start_time)
    
    model.fit(X_train, y_train)
    mlflow.sklearn.log_model(model, 'model')
    
    print(f"AUC: {metrics['auc']:.4f}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Business Cost: {metrics['business_cost']:.0f}")
    print(f"CV AUC Mean: {cv_scores['test_roc_auc'].mean():.4f}")


=== LIGHTGBM ===




AUC: 0.7657
Accuracy: 0.7132
Business Cost: 159859
CV AUC Mean: 0.7657
üèÉ View run lightgbm_baseline at: http://localhost:5000/#/experiments/1/runs/78d4070744eb4a0591b788a6904bf563
üß™ View experiment at: http://localhost:5000/#/experiments/1


## 10. Recuperer et comparer tous les runs

In [12]:
print("\n=== COMPARAISON DES MODELES ===")

experiment = mlflow.get_experiment_by_name(experiment_name)
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

print(f"Colonnes disponibles: {list(runs.columns)}")
print(f"Nombre de runs: {len(runs)}\n")

# Creer DataFrame de comparaison avec les bonnes colonnes
comparison = runs[['tags.mlflow.runName', 'metrics.auc', 'metrics.accuracy', 'metrics.business_cost', 'metrics.cv_auc_mean']].copy()
comparison.columns = ['Model', 'AUC', 'Accuracy', 'Business Cost', 'CV AUC Mean']
comparison = comparison.sort_values('CV AUC Mean', ascending=False)

print(comparison.to_string(index=False))


=== COMPARAISON DES MODELES ===
Colonnes disponibles: ['run_id', 'experiment_id', 'status', 'artifact_uri', 'start_time', 'end_time', 'metrics.auc', 'metrics.fn', 'metrics.cv_auc_mean', 'metrics.recall', 'metrics.accuracy', 'metrics.tp', 'metrics.f1', 'metrics.tn', 'metrics.fp', 'metrics.precision', 'metrics.training_time', 'metrics.cv_accuracy_mean', 'metrics.business_cost', 'params.n_estimators', 'params.max_depth', 'params.random_state', 'params.verbosity', 'params.use_label_encoder', 'params.min_samples_split', 'params.class_weight', 'params.n_jobs', 'params.min_samples_leaf', 'params.max_iter', 'params.solver', 'tags.mlflow.runName', 'tags.mlflow.user', 'tags.mlflow.source.type', 'tags.mlflow.source.name']
Nombre de runs: 4

                       Model      AUC  Accuracy  Business Cost  CV AUC Mean
           lightgbm_baseline 0.765671  0.713236       159859.0     0.765709
            xgboost_baseline 0.765458  0.715265       159658.0     0.765482
logistic_regression_baseline 0.

## Prochaines etapes

Etape 3: Optimisation des hyperparametres (GridSearchCV/Optuna)  Etape 4: Optimisation du seuil de decision et feature importance