In [16]:
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
from mlflow.tracking import MlflowClient
import mlflow.sklearn
import mlflow.lightgbm
import os
from lightgbm import LGBMClassifier
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from flaml import AutoML
import re

BASE_DIR = Path(r"C:/Users/coach/Desktop/datascientest/OpenClassrooms/Projects_MLops/Projet_1_initialisation_MLops")
DATA_RAW = BASE_DIR / "data/raw"
DATA_PROC = BASE_DIR / "data" / "proceed"
DATA_PATH = DATA_PROC / "homecredit_features.csv"
df = pd.read_csv(DATA_PATH, low_memory=False)
print(df.shape)
#print(df.head())

train_df = df[df["TARGET"].notna()]
train_autoML = train_df.drop(columns=["SK_ID_CURR"])
X = train_df.drop(columns=["TARGET", "SK_ID_CURR"])
X = X.replace([np.inf, -np.inf], np.nan)
imputer = SimpleImputer(strategy="median") #remplace les valeurs manquantes par la médiane de chaque colonne
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
y = train_df["TARGET"]
print(X.shape, y.shape)
print(y.value_counts(normalize=True)) #desequilibre des classes 0 (92%) et 1(8%)



(356251, 276)
(307507, 274) (307507,)
TARGET
0.0    0.91927
1.0    0.08073
Name: proportion, dtype: float64


In [None]:
#reduire la taille des donnees pour test rapide en conservant repartition des classes
X = X.sample(n=10000, random_state=42)
y = y.loc[X.index]
print(X.shape, y.shape)    

In [17]:
# metrique cout metier
def business_cost(y_true, y_pred, fn_cost=10, fp_cost=1):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return fn * fn_cost + fp * fp_cost

# meilleur seuil
def find_best_threshold(y_true, y_proba):
    thresholds = np.arange(0.05, 0.95, 0.05)
    costs = []
    for t in thresholds:
        y_pred = (y_proba >= t).astype(int)
        cost = business_cost(y_true, y_pred)
        costs.append(cost)
    best_idx = np.argmin(costs)
    return thresholds[best_idx], costs

# cross-validation stratifiée 5-fold
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

In [18]:
mlflow.set_tracking_uri("file:./mlruns") #Projet_1_initialisation_MLops/notebook/mlruns
mlflow.set_experiment("HomeCredit_Scoring")
#best parameters from tuning Best_params = {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': -1, 'n_estimators': 200, 'num_leaves': 31, 'subsample': 0.8}

best_params = {
    "colsample_bytree": 0.8,
    "learning_rate": 0.05,
    "max_depth": -1,
    "n_estimators": 200,
    "num_leaves": 31,
    "subsample": 0.8,
    "class_weight": "balanced",
    "random_state": 42,
    "n_jobs": -1
}

def clean_name(name):
    # keep letters, digits and underscore; replace others by underscore
    name = re.sub(r"[^0-9a-zA-Z_]", "_", str(name))
    # optional: collapse multiple underscores
    name = re.sub(r"_+", "_", name)
    return name

X = X.copy()
X.columns = [clean_name(c) for c in X.columns]

with mlflow.start_run(run_name="LightGBM_tuned"):
    # Log params
    mlflow.log_params(best_params)
    
    # Train model
    best_lgb = LGBMClassifier(**best_params)
    best_lgb.fit(X, y)
    model_info = mlflow.sklearn.log_model(sk_model=best_lgb, name="HomeCredit_LightGBM_Tuned")

    # Evaluate model
    y_proba = best_lgb.predict_proba(X)[:,1] #prediction avec le meilleur modele
    best_t, costs = find_best_threshold(y, y_proba) #meilleur seuil et cout
    best_cost = np.min(costs) #cout minimum
    auc = roc_auc_score(y, y_proba)
    acc = accuracy_score(y, best_lgb.predict(X))

    # Log metrics
    mlflow.log_metric("AUC", auc)
    mlflow.log_metric("Accuracy", acc)
    mlflow.log_metric("Business_Cost", best_cost)
    mlflow.log_metric("Threshold", best_t)

    # Tags 
    mlflow.set_tags({ "model": "LightGBM","project": "HomeCredit","stage": "tuned"})


print("Run saved in MLflow")

[LightGBM] [Info] Number of positive: 24825, number of negative: 282682
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.151311 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16819
[LightGBM] [Info] Number of data points in the train set: 307507, number of used features: 258
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Run saved in MLflow
