## 1. Importations

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
import warnings
import logging

# Silence MLflow's environment-related noise: ignore the "Failed to resolve
# installed pip version" warning and raise the mlflow.utils.environment
# logger threshold to ERROR.
warnings.filterwarnings(
    action="ignore",
    message=".*Failed to resolve installed pip version.*",
)
mlflow_env_logger = logging.getLogger("mlflow.utils.environment")
mlflow_env_logger.setLevel(logging.ERROR)

print(f"MLflow Version: {mlflow.__version__}")

# URL of the MLflow tracking server that every run in this notebook is sent to.
TRACKING_URI = "http://localhost:5001"
mlflow.set_tracking_uri(TRACKING_URI)

MLflow Version: 3.4.0


## 2. Chargement des données

In [19]:
DATA_PATH = '../datas/02_preprocess/datas.csv'

# Load the preprocessed dataset. A missing file is fatal for every later cell,
# so fail here with an explicit error instead of printing and carrying on
# (which would either hit a NameError on `df` or silently reuse a stale `df`
# left over in the kernel).
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as exc:
    raise FileNotFoundError(
        f"Erreur: Le fichier {DATA_PATH} n'a pas √©t√© trouv√©. V√©rifier le chemin."
    ) from exc
print(f"Donn√©es charg√©es. Forme: {df.shape}")

# Split the features (X) from the target (y). Without the target column
# nothing downstream can run, so stop here with a clear message.
if 'TARGET' not in df.columns:
    raise KeyError(
        "Erreur: La colonne 'TARGET' n'a pas √©t√© trouv√©e. V√©rifier le nom de la colonne cible."
    )

X = df.drop(columns='TARGET')
y = df['TARGET']
print(f"X shape: {X.shape}, y shape: {y.shape}")

Donn√©es charg√©es. Forme: (307507, 139)
X shape: (307507, 138), y shape: (307507,)


## 3. Séparation des données en Train / Test

In [20]:
# Stratified 80/20 hold-out split; the fixed seed keeps it reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Taille de l'ensemble d'entra√Ænement : {n_train} √©chantillons")
print(f"Taille de l'ensemble de test : {n_test} √©chantillons")

# Report the mean of the target in each split, formatted as a percentage.
for split_name, labels in (("y_train", y_train), ("y_test", y_test)):
    print(f"Proportion de d√©parts dans {split_name} : {labels.mean():.2%}")

Taille de l'ensemble d'entra√Ænement : 246005 √©chantillons
Taille de l'ensemble de test : 61502 √©chantillons
Proportion de d√©parts dans y_train : 8.07%
Proportion de d√©parts dans y_test : 8.07%


## 4. Configuration MLflow

# Suivi d'expérimentations avec MLflow

Objectifs:
- Configurer MLflow (URI, expérience).
- Lancer un run de base et logger paramètres/métriques/artefacts.
- Comprendre la navigation dans l'UI et l'organisation des runs.

Contenu:
1. Chargement des données et split train/test.
2. Baseline avec un modèle simple et métriques standard.
3. Tracking MLflow:
   - mlflow.set_experiment, start_run
   - log_params, log_metrics, log_artifact
4. Conseils:
   - Nommage des runs (run_name)
   - Utilisation de tags pour filtrer (ex: stage, data_split)
   - Enregistrement d'un modèle dans le registry si pertinent

Bonnes pratiques:
- Toujours fixer un random_state pour la reproductibilité.
- Logger les versions critiques (librairies, données).

In [21]:
# Select the experiment that groups this notebook's runs; MLflow creates it
# on first use if it does not exist yet.
experiment_name = "Credit_Scoring_Baseline"
mlflow.set_experiment(experiment_name=experiment_name)
print(f"MLflow Experiment set to: {experiment_name}")

2025/10/02 15:24:57 INFO mlflow.tracking.fluent: Experiment with name 'Credit_Scoring_Baseline' does not exist. Creating a new experiment.


MLflow Experiment set to: Credit_Scoring_Baseline


## 5. Première Expérimentation MLflow avec un modèle de régression logistique

In [22]:
# Enable autologging for sklearn (optional)
# mlflow.sklearn.autolog() # Can be called here, or before start_run to cover everything.

In [23]:
with mlflow.start_run(run_name="Logistic_Regression_Baseline_Run_1") as run:
    # Keep the run ID so the printed metrics can be matched to the run in the MLflow UI.
    run_id = run.info.run_id
    print(f"MLflow Run ID: {run_id}")

    # Tag the run up front so it stays filterable even if a later step fails.
    # NOTE(review): the "comments" tag refers to Run_1 and Run_2 while this run
    # is itself named ..._Run_1 — confirm the intended run name.
    mlflow.set_tags({
        "stage": "baseline",
        "data_source": "Home_Credit_Kaggle",
        "author": "Christopher",
        "comments": "Idem que Run_1 et Run_2 mais suppression des corr√©lations > 0.7 avant split.",
    })

    # Model hyperparameters
    solver = 'lbfgs'
    max_iter = 3000
    C = 0.1
    random_state = 42
    class_weight = "balanced"

    # Log every setting that shapes the fitted model in a single call.
    # class_weight and the scaling step are included because both change the
    # resulting model and would otherwise be invisible in the tracking UI.
    mlflow.log_params({
        "solver": solver,
        "max_iter": max_iter,
        "C": C,
        "random_state": random_state,
        "class_weight": class_weight,
        "scaler": "StandardScaler",
        "model_type": "Logistic Regression",
    })

    # Standardise the features, then fit a class-weighted logistic regression.
    model = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(
            solver=solver,
            max_iter=max_iter,
            C=C,
            class_weight=class_weight,
            random_state=random_state
        ))
    ])
    model.fit(X_train, y_train)

    # Hard predictions feed F1/recall; positive-class probabilities feed ROC AUC.
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Evaluation metrics on the held-out test set
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)  # recall of the positive class (default)

    mlflow.log_metrics({
        "roc_auc": roc_auc,
        "f1_score": f1,
        "recall_score": recall,
    })

    print(f"Metrics for Logistic Regression (Run ID: {run_id}):")
    print(f"  ROC AUC: {roc_auc:.2f}")
    print(f"  F1-Score: {f1:.2f}")
    print(f"  Recall: {recall:.2f}")

    # Log the fitted pipeline and register it in the Model Registry.
    # The signature is inferred from the same small sample used as the input
    # example rather than by re-predicting on the whole training set.
    # Assumes X_train has no missing values (the pipeline has no imputer and
    # was fit on it), so the sample yields the same schema — TODO confirm.
    input_example = X_train.iloc[:5]
    mlflow.sklearn.log_model(
        sk_model=model,
        name="logistic_regression_model",
        input_example=input_example,
        registered_model_name="LogisticRegressionModel_CreditScoring",
        signature=infer_signature(input_example, model.predict(input_example)),
    )

    print("Experiment termin√© et logg√© dans MLflow UI.")

MLflow Run ID: d4d2e8d9d54f4724b3d3ffe2c559bb69
Metrics for Logistic Regression (Run ID: d4d2e8d9d54f4724b3d3ffe2c559bb69):
  ROC AUC: 0.74
  F1-Score: 0.26
  Recall: 0.67


Successfully registered model 'LogisticRegressionModel_CreditScoring'.
2025/10/02 15:25:16 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegressionModel_CreditScoring, version 1


Experiment termin√© et logg√© dans MLflow UI.
üèÉ View run Logistic_Regression_Baseline_Run_1 at: http://localhost:5000/#/experiments/337906899368494804/runs/d4d2e8d9d54f4724b3d3ffe2c559bb69
üß™ View experiment at: http://localhost:5000/#/experiments/337906899368494804


Created version '1' of model 'LogisticRegressionModel_CreditScoring'.
