# PRÉDICTION DU DIABÈTE AVEC MLFLOW
Ce notebook intègre le tracking d'expériences, la journalisation des paramètres et la sauvegarde du modèle via MLflow.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import shap
import mlflow
import mlflow.sklearn
import mlflow.xgboost

from sklearn.model_selection import (
    train_test_split, StratifiedKFold,
    cross_validate, GridSearchCV
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    confusion_matrix, classification_report,
    roc_auc_score, average_precision_score,
    roc_curve, precision_recall_curve,
    recall_score, precision_score,
    f1_score
)

# --- MLFLOW CONFIGURATION ---
# All runs created below are recorded under this experiment name.
mlflow.set_experiment("Diabetes_Risk_Prediction")

# Enable autologging (parameters and metrics captured automatically) for the
# scikit-learn flavor first, then the XGBoost flavor.
for _flavor in (mlflow.sklearn, mlflow.xgboost):
    _flavor.autolog()

  from .autonotebook import tqdm as notebook_tqdm
2026/01/19 11:10:56 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.schemas
2026/01/19 11:10:56 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.tables
2026/01/19 11:10:56 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.types
2026/01/19 11:10:56 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.constraints
2026/01/19 11:10:56 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.defaults
2026/01/19 11:10:56 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.comments
2026/01/19 11:10:56 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/19 11:10:56 INFO mlflow.store.db.utils: Updating database tables
2026/01/19 11:10:56 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/19 11:10:56 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/19 11:10:56 INFO alembic.runtime.migration: Context impl SQ

In [2]:
# 1. Load and clean the data
try:
    df = pd.read_csv("save_final.csv")
except FileNotFoundError:
    # Fallback: build a dummy dataset so the example still runs without the CSV.
    from sklearn.datasets import load_diabetes
    data = load_diabetes(as_frame=True)
    # Start from the 10 feature columns only. `data.frame` also carries the
    # continuous `target` column: adding `Outcome` to it yields 12 columns, so
    # assigning 11 names raised a length-mismatch ValueError, and keeping
    # `target` among the features would leak the label into X.
    df = data.data.copy()
    df.columns = ["Age", "Sex", "BMI", "BP", "S1", "S2", "S3", "S4", "S5", "S6"]
    # Simulated binary label derived from the regression target.
    df["Outcome"] = (data.target > 140).astype(int)

# Zeros in these measurements are treated as missing values and replaced by NaN
# (they are imputed later by the preprocessing pipeline).
cols_zero_to_nan = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
for c in cols_zero_to_nan:
    if c in df.columns:
        df[c] = df[c].replace(0, np.nan)

# Split features (X) and label (y)
if "Outcome" in df.columns:
    X = df.drop("Outcome", axis=1)
    y = df["Outcome"]
else:
    # Fallback: assume the last column is the label.
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Every feature column goes through the numeric transformer below.
numeric_features = X.columns.tolist()

# Preprocessing pipeline: median imputation followed by standard scaling.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features)
    ]
)

## Entraînement avec MLflow Tracking (GridSearch)

In [3]:
# Stratified, shuffled 5-fold splitter shared by the three grid searches.
cv_inner = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def _as_pipeline(estimator):
    """Chain the shared `preprocess` transformer with the given estimator."""
    return Pipeline([("preprocess", preprocess), ("model", estimator)])


def _recall_search(pipeline, grid):
    """Build a grid search over `grid`, scored on recall with the `cv_inner` folds."""
    return GridSearchCV(pipeline, grid, scoring="recall", cv=cv_inner, n_jobs=-1)


# --- MODEL DEFINITIONS ---
pipe_lr = _as_pipeline(LogisticRegression(max_iter=1000, class_weight="balanced"))
param_grid_lr = {"model__C": [0.01, 0.1, 1, 10]}

pipe_rf = _as_pipeline(RandomForestClassifier(random_state=42, class_weight="balanced"))
param_grid_rf = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [5, 10],
}

pipe_xgb = _as_pipeline(XGBClassifier(random_state=42, eval_metric="logloss"))
param_grid_xgb = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [3, 5],
    "model__learning_rate": [0.01, 0.1],
}

# --- START OF THE MLFLOW-TRACKED TRAINING ---
print("Démarrage de l'entraînement avec tracking MLflow...")

# No explicit parent run is opened here: run creation is left to autologging.
grid_lr = _recall_search(pipe_lr, param_grid_lr)
grid_rf = _recall_search(pipe_rf, param_grid_rf)
grid_xgb = _recall_search(pipe_xgb, param_grid_xgb)

# Fit in the same order as before: logistic regression, random forest, XGBoost.
for _search in (grid_lr, grid_rf, grid_xgb):
    _search.fit(X_train, y_train)

print("Entraînement terminé.")

Démarrage de l'entraînement avec tracking MLflow...


2026/01/19 11:10:57 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '280f54f7abc24fcd8ad47cd2c3b88b27', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2026/01/19 11:11:21 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.
2026/01/19 11:11:22 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'c253ca1475274772a624bc7bd439c613', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2026/01/19 11:11:35 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.
2026/01/19 11:11:35 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'ed91b0ce422e4321a368a56eaba95dfd', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2026/01/19 11:11:51 INF

Entraînement terminé.


## Sélection du meilleur modèle et Logging Final

In [4]:
# Best estimator found by each grid search.
models_optimised = {
    "log_reg": grid_lr.best_estimator_,
    "random_forest": grid_rf.best_estimator_,
    "xgboost": grid_xgb.best_estimator_
}

# Scores are recomputed on the test set and used for the final choice.
# NOTE(review): because the champion is *selected* with test-set metrics, the
# test metrics logged below are optimistically biased. Consider selecting on a
# separate validation split and keeping the test set for the final report only.
results = {}
RECALL_MIN = 0.75
# Single decision threshold, used both for the predictions and the logged param.
THRESHOLD = 0.5

for name, model in models_optimised.items():
    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_proba >= THRESHOLD).astype(int)
    results[name] = {
        "roc_auc": roc_auc_score(y_test, y_proba),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "model": model,
        # Kept so the champion's probabilities are not recomputed for the ROC plot.
        "y_proba": y_proba,
    }

# Selection rule: among models reaching RECALL_MIN, take the best ROC AUC;
# if none reaches it, fall back to the model with the highest recall.
eligible = {k: v for k, v in results.items() if v["recall"] >= RECALL_MIN}
if eligible:
    best_name = max(eligible, key=lambda k: eligible[k]["roc_auc"])
else:
    best_name = max(results, key=lambda k: results[k]["recall"])

best_model = results[best_name]["model"]
print(f"Meilleur modèle sélectionné : {best_name}")

# --- FINAL MLFLOW LOGGING ---
# A dedicated run is created for the "champion" model.
with mlflow.start_run(run_name="Champion_Model_TestSet"):
    # 1. Log the selected parameters
    mlflow.log_param("selected_model_type", best_name)
    mlflow.log_param("threshold_clinique", THRESHOLD)

    # 2. Log the TEST-set metrics
    mlflow.log_metric("test_roc_auc", results[best_name]["roc_auc"])
    mlflow.log_metric("test_recall", results[best_name]["recall"])
    mlflow.log_metric("test_f1", results[best_name]["f1"])

    # 3. Log the full model (preprocessing + estimator pipeline)
    mlflow.sklearn.log_model(best_model, "model")

    # 4. Save the plots (ROC, SHAP) as run artifacts
    # ROC curve
    y_proba_best = results[best_name]["y_proba"]
    fpr, tpr, _ = roc_curve(y_test, y_proba_best)
    plt.figure()
    try:
        plt.plot(fpr, tpr, label=f"ROC (AUC = {results[best_name]['roc_auc']:.2f})")
        plt.plot([0, 1], [0, 1], 'k--')
        plt.title("Courbe ROC - Test Set")
        plt.legend()
        plt.savefig("roc_curve.png")
        mlflow.log_artifact("roc_curve.png")
    finally:
        # Always release the figure, even if saving or logging failed.
        plt.close()

    # SHAP summary, only for estimators exposing `feature_importances_`
    # (the tree-based models here). Best effort: a failure is reported, not raised.
    try:
        # Access the final estimator inside the pipeline.
        estimator = best_model.named_steps["model"]

        if hasattr(estimator, "feature_importances_"):
            X_test_trans = best_model.named_steps["preprocess"].transform(X_test)
            explainer = shap.TreeExplainer(estimator)
            shap_values = explainer.shap_values(X_test_trans)

            # SHAP output format for a binary classifier depends on the SHAP
            # version and the model: a list of per-class arrays, a 3-D array
            # (samples, features, classes), or a single 2-D array.
            # In the first two cases keep the positive class (index 1).
            if isinstance(shap_values, list):
                shap_val_to_plot = shap_values[1]
            elif np.ndim(shap_values) == 3:
                shap_val_to_plot = shap_values[:, :, 1]
            else:
                shap_val_to_plot = shap_values

            plt.figure()
            try:
                shap.summary_plot(shap_val_to_plot, X_test_trans, feature_names=numeric_features, show=False)
                plt.savefig("shap_summary.png", bbox_inches='tight')
                mlflow.log_artifact("shap_summary.png")
            finally:
                plt.close()
    except Exception as e:
        print(f"Erreur SHAP : {e}")

    print(f"Run MLflow terminé. Modèle {best_name} sauvegardé.")

Meilleur modèle sélectionné : random_forest




Run MLflow terminé. Modèle random_forest sauvegardé.


## API FASTAPI (Chargement via MLflow)
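Voici un exemple d'API FastAPI servant le modèle. Dans cet exemple, le modèle est sérialisé puis rechargé avec joblib ; le chargement équivalent depuis MLflow (`mlflow.pyfunc.load_model`) est indiqué en commentaire dans la cellule.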

In [5]:
# NOTE: Ceci est un code exemple pour l'API, à mettre dans un fichier api.py
# Pour charger le modèle MLflow :
# logged_model = 'runs:/<RUN_ID>/model'
# loaded_model = mlflow.pyfunc.load_model(logged_model)


from fastapi import FastAPI
from pydantic import BaseModel
import joblib

# FastAPI application object; the /predict route below is registered on it.
app = FastAPI()

class PatientFeatures(BaseModel):
    """Request body of the /predict endpoint: eight float-valued patient features.

    The field names are the keys expected in the incoming JSON payload.
    """
    Pregnancies: float
    Glucose: float
    BloodPressure: float
    SkinThickness: float
    Insulin: float
    BMI: float
    DiabetesPedigreeFunction: float
    Age: float

# Persist the selected pipeline to disk, then reload it from that same file
# so the API serves exactly what was serialized.
MODEL_FILE = "diabetes_risk_model.joblib"
joblib.dump(best_model, MODEL_FILE)
model = joblib.load(MODEL_FILE)

@app.post("/predict")
def predict_risk(features: PatientFeatures):
    """Return the predicted diabetes probability for one patient.

    Args:
        features: Validated request body holding the eight patient features.

    Returns:
        A dict with the positive-class probability (`probability`) plus the
        values returned by `risk_score` and `risk_level` for that probability.
    """
    # The pipeline's ColumnTransformer selects its columns by name, which only
    # works on a DataFrame; a plain list of lists is rejected at predict time.
    # NOTE(review): assumes the training columns carry exactly these names —
    # confirm against the columns of save_final.csv.
    frame = pd.DataFrame([{
        "Pregnancies": features.Pregnancies,
        "Glucose": features.Glucose,
        "BloodPressure": features.BloodPressure,
        "SkinThickness": features.SkinThickness,
        "Insulin": features.Insulin,
        "BMI": features.BMI,
        "DiabetesPedigreeFunction": features.DiabetesPedigreeFunction,
        "Age": features.Age,
    }])

    # Apply the same cleaning as at training time, where zeros in these columns
    # were replaced by NaN *before* the pipeline (so the imputer handles them).
    zero_means_missing = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
    frame[zero_means_missing] = frame[zero_means_missing].replace(0, np.nan)

    proba = model.predict_proba(frame)[0, 1]

    # NOTE(review): `risk_score` and `risk_level` are not defined in the visible
    # part of this notebook; they must be defined or imported before this
    # endpoint is called, otherwise the request fails with a NameError.
    return {
        "probability": float(proba),
        "score": risk_score(proba),
        "risk_level": risk_level(proba)
    }


# Closing message: points the reader to the `mlflow ui` command to inspect the runs.
print("Le notebook est terminé. Vérifiez l'interface MLflow avec la commande 'mlflow ui' dans le terminal.")

Le notebook est terminé. Vérifiez l'interface MLflow avec la commande 'mlflow ui' dans le terminal.
