Import framework et data

In [7]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
#import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

#BASE_DIR = Path().resolve().parents[1]
BASE_DIR = Path(r"C:/Users/coach/Desktop/datascientest/OpenClassrooms/Projects_MLops/Projet_1_initialisation_MLops")
DATA_RAW = BASE_DIR / "data/raw"
#DATA_PROC = BASE_DIR / "data/proceed"
DATA_PROC = BASE_DIR / "data" / "proceed"
DATA_PATH = DATA_PROC / "homecredit_features.csv"
df = pd.read_csv(DATA_PATH, low_memory=False)
print(df.shape)
df.head()

train_df = df[df["TARGET"].notna()]
X = train_df.drop(columns=["TARGET", "SK_ID_CURR"])
y = train_df["TARGET"]
print(X.shape, y.shape)
print(y.value_counts(normalize=True))

(356251, 276)
(307507, 274) (307507,)
TARGET
0.0    0.91927
1.0    0.08073
Name: proportion, dtype: float64


2/ Metriques & business optimization

In [8]:
# metrique cout metier
def business_cost(y_true, y_pred, fn_cost=10, fp_cost=1):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return fn * fn_cost + fp * fp_cost

# meilleur seuil
def find_best_threshold(y_true, y_proba):
    thresholds = np.arange(0.05, 0.95, 0.05)
    costs = []
    for t in thresholds:
        y_pred = (y_proba >= t).astype(int)
        cost = business_cost(y_true, y_pred)
        costs.append(cost)
    best_idx = np.argmin(costs)
    return thresholds[best_idx], costs


3/ Modeles IA

In [9]:

# calcul rapport des classes pour XGBoost
neg = (y == 0).sum()
pos = (y == 1).sum()
scale_pos_weight = neg / pos

# pipelines des modèles
models = {
    "Logistic": LogisticRegression(
        max_iter=1000,
        class_weight="balanced"
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=200,
        class_weight="balanced",
        n_jobs=-1,
        random_state=42
    ),
    "MLP": MLPClassifier(
        hidden_layer_sizes=(100,50),
        max_iter=300
    ),
    "XGBoost": xgb.XGBClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        scale_pos_weight=scale_pos_weight,
        n_jobs=-1
    )}
   # ,
   # "LightGBM": lgb.LGBMClassifier(
   #     n_estimators=300,
   #     class_weight="balanced"
   # )
#}

# cross-validation stratifiée 5-fold
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

3/ Training des modèles

In [None]:
results = []
for name, model in models.items():
    print(f"\nTraining {name}")
    aucs = []
    costs = []
    thresholds = []
    for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f" Fold {fold+1}")
        X_train, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[tr_idx], y.iloc[val_idx]
        X_train = X_train.fillna(-999).dtype(np.float32)
        X_val = X_val.fillna(-999).dtype(np.float32)
        model.fit(X_train, y_train)
        y_proba = model.predict_proba(X_val)[:,1]
        auc = roc_auc_score(y_val, y_proba)
        best_t, _ = find_best_threshold(y_val, y_proba)
        y_pred = (y_proba >= best_t).astype(int)
        cost = business_cost(y_val, y_pred)
        aucs.append(auc)
        costs.append(cost)
        thresholds.append(best_t)
    
    results.append({
        "model": name,
        "mean_auc": np.mean(aucs),
        "mean_cost": np.mean(costs),
        "mean_threshold": np.mean(thresholds)
    })

#stockage des résultats dans un DataFrame
results_df = pd.DataFrame(results)
results_df.sort_values("mean_cost", inplace=True)
print(results_df.sort_values("mean_cost"))

#visualisation des résultats
plt.figure(figsize=(8,5))
sns.barplot(data=results_df,x="model",y="mean_cost")
plt.title("Business Cost by Model")
plt.show()


Training Logistic
 Fold 1


ValueError: Input X contains infinity or a value too large for dtype('float64').