1/ Import frameworks, data, and models

In [8]:
import pandas as pd
import numpy as np

from pathlib import Path

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.metrics import (
    roc_auc_score,
    confusion_matrix,
    make_scorer
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

# Project root directory.
# NOTE(review): this is an absolute, machine-specific path. A relative form,
# Path().resolve().parents[2], was used previously -- confirm which one is
# intended before running this on another machine.
BASE_DIR = Path(r"C:/Users/coach/Desktop/datascientest/OpenClassrooms/Projects_MLops/Projet_1_initialisation_MLops")
DATA_PROC = BASE_DIR.joinpath("data", "proceed")
DATA_PATH = DATA_PROC / "homecredit_features.csv"

# Load the feature table and keep only the rows whose TARGET is known.
df = pd.read_csv(DATA_PATH)
train_df = df.loc[df["TARGET"].notna()]

# Target vector, and feature matrix without the target and identifier columns.
y = train_df["TARGET"]
X = train_df.drop(columns=["TARGET", "SK_ID_CURR"])

# Names of the integer/float feature columns.
num_cols = X.select_dtypes(
    include=["int32", "int64", "float32", "float64"]
).columns

def business_cost(y_true, y_pred, fn_cost=10, fp_cost=1):
    """Return the negated business cost of a set of binary predictions.

    A false negative (true label 1 predicted as 0) costs ``fn_cost`` and a
    false positive (true label 0 predicted as 1) costs ``fp_cost``. The total
    is negated so that a larger return value means a cheaper outcome, which
    is the convention expected by "greater is better" scorers.

    Args:
        y_true: Array-like of true labels encoded as 0/1.
        y_pred: Array-like of predicted labels encoded as 0/1, same length
            as ``y_true``.
        fn_cost: Cost charged for each false negative.
        fp_cost: Cost charged for each false positive.

    Returns:
        ``-(n_false_negatives * fn_cost + n_false_positives * fp_cost)``.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {actual.shape} and {predicted.shape}"
        )

    # Count the two error types directly rather than unpacking a confusion
    # matrix: when only one class is present the matrix is 1x1 and a
    # four-way unpack of its raveled values fails.
    fn = np.sum((actual == 1) & (predicted == 0))
    fp = np.sum((actual == 0) & (predicted == 1))
    return -(fn * fn_cost + fp * fp_cost)

# Scorer built from business_cost. That function already returns a negated
# cost, so the default "greater is better" behaviour of make_scorer applies.
business_scorer = make_scorer(business_cost)

# Numeric preprocessing: fill missing values with the median, then standardize.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

def find_best_threshold(y_true, y_proba):
    """Scan decision thresholds and pick the one with the best business score.

    Candidate thresholds run from 0.05 up to (but excluding) 0.95 in steps of
    0.02. For each one, probabilities greater than or equal to the threshold
    are mapped to class 1 and the result is scored with ``business_cost``.

    Args:
        y_true: True labels, passed through to ``business_cost``.
        y_proba: Array of predicted probabilities; it must support
            element-wise ``>=`` comparison and ``.astype``.

    Returns:
        A tuple ``(best_threshold, scores)`` where ``scores`` is the list of
        ``business_cost`` values in threshold order and ``best_threshold`` is
        the first threshold reaching the highest score.
    """
    candidates = np.arange(0.05, 0.95, 0.02)
    scores = [
        business_cost(y_true, (y_proba >= cut).astype(int))
        for cut in candidates
    ]
    return candidates[np.argmax(scores)], scores

ModuleNotFoundError: No module named 'lightgbm'

3/ Models and training

In [7]:
# Candidate estimators and their hyper-parameter grids.
# Each entry maps a display name to:
#   "model":  the unfitted estimator,
#   "params": the grid to search; keys carry the "model__" prefix so they
#             address the pipeline step named "model".
# NOTE(review): Logistic, RandomForest and LightGBM set
# class_weight="balanced"; XGBoost has no class re-weighting configured
# here -- confirm this is intentional.
models = {
    "Logistic": {
        "model": LogisticRegression(class_weight="balanced", max_iter=2000),
        "params": {
            "model__C": [0.01, 0.1, 1, 10],
        },
    },
    "RandomForest": {
        "model": RandomForestClassifier(
            class_weight="balanced",
            n_jobs=-1,
            random_state=42,
        ),
        "params": {
            "model__n_estimators": [200, 400],
            "model__max_depth": [8, 12],
        },
    },
    "XGBoost": {
        "model": xgb.XGBClassifier(
            eval_metric="auc",
            n_jobs=-1,
            random_state=42,
        ),
        "params": {
            "model__n_estimators": [200, 400],
            "model__max_depth": [4, 6],
            "model__learning_rate": [0.05, 0.1],
        },
    },
    "LightGBM": {
        "model": lgb.LGBMClassifier(class_weight="balanced", random_state=42),
        "params": {
            "model__n_estimators": [200, 400],
            "model__num_leaves": [31, 63],
            "model__learning_rate": [0.05, 0.1],
        },
    },
}

# Grid-search every candidate model and collect its best configuration.
# `results` holds one summary row per model; `best_models` maps the model
# name to the pipeline refitted on all of X, y with the best parameters.
# NOTE(review): `prep` and `cv` are not defined in the visible part of this
# notebook -- they presumably come from a cell that is not shown; confirm
# they exist before running.
results = []
best_models = {}

for name, cfg in models.items():
    print(f"\nOptimizing {name}")
    pipe = Pipeline([
        ("prep", prep),
        ("model", cfg["model"])
    ])
    grid = GridSearchCV(
        pipe,
        cfg["params"],
        # Two metrics are tracked; model selection and refit still use the
        # business score only.
        scoring={"business_score": business_scorer, "auc": "roc_auc"},
        refit="business_score",
        cv=cv,
        n_jobs=-1,
        verbose=2
    )
    grid.fit(X, y)
    best_models[name] = grid.best_estimator_
    # Report the cross-validated AUC of the selected configuration. Scoring
    # the refitted model on X itself would measure in-sample fit, because
    # that model was trained on exactly those rows.
    auc = grid.cv_results_["mean_test_auc"][grid.best_index_]
    results.append({
        "model": name,
        "best_params": grid.best_params_,
        "business_score": grid.best_score_,
        "auc": auc
    })

# sort_values returns a new frame, so the result must be assigned to be kept.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values("business_score", ascending=False)
results_df

# For each tuned model, find the decision threshold with the lowest business
# cost and record it.
# NOTE(review): thresholds are tuned on predictions for X, the same data the
# models were fitted on, so the reported costs are likely optimistic --
# consider out-of-fold or hold-out predictions.
final_results = []
for name, model in best_models.items():
    y_proba = model.predict_proba(X)[:, 1]
    t, costs = find_best_threshold(y, y_proba)
    final_results.append({
        "model": name,
        "best_threshold": t,
        # `costs` holds negated costs (higher is better), so the minimum
        # cost is the negation of the best score.
        "min_cost": -max(costs)
    })
pd.DataFrame(final_results)


NameError: name 'lgb' is not defined