In [1]:
#import des bibliothèques 
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import optuna
import mlflow
import joblib
import time
import matplotlib.pyplot as plt
import logging
from sklearn.model_selection import train_test_split

# Load the cleaned and imputed dataset
df_clean_imputed = pd.read_csv('df_clean_imputed.csv')  # adjust the path if needed

# Target vector (y) and feature matrix (X)
y = df_clean_imputed['TARGET']
X = df_clean_imputed.drop(columns=['TARGET'])

# First split: 1% of the rows for training/validation, 99% set aside
X_train_eval, X_remaining, y_train_eval, y_remaining = train_test_split(
    X, y, train_size=0.01, random_state=42, stratify=y
)

# Second split: another 1% of the ORIGINAL rows for the API sample.
# The fraction is expressed relative to the 99% that remain.
api_share = 0.01 / 0.99
X_api, X_final, y_api, y_final = train_test_split(
    X_remaining, y_remaining, train_size=api_share, random_state=42, stratify=y_remaining
)

# Sanity check on the resulting dimensions
for label, part in (
    ("Taille de X_train_eval:", X_train_eval),
    ("Taille de y_train_eval:", y_train_eval),
    ("Taille de X_api:", X_api),
    ("Taille de y_api:", y_api),
    ("Taille de X_final:", X_final),
    ("Taille de y_final:", y_final),
):
    print(label, part.shape)

# X_train_eval / y_train_eval: used to train and validate the models
# X_api / y_api: sample used to test the API
# X_final / y_final: final dataset used for evaluation


  from .autonotebook import tqdm as notebook_tqdm


Taille de X_train_eval: (3075, 625)
Taille de y_train_eval: (3075,)
Taille de X_api: (3075, 625)
Taille de y_api: (3075,)
Taille de X_final: (301357, 625)
Taille de y_final: (301357,)


In [2]:
# Persist the three splits. The file names must match the ones the modelling
# cells read back ('X_train_eval.csv', 'y_train_eval.csv', 'X_final.csv',
# 'y_final.csv'); writing them with a '2' suffix meant a fresh run never used
# the split produced above.
X_train_eval.to_csv('X_train_eval.csv', index=False)
y_train_eval.to_csv('y_train_eval.csv', index=False)
X_api.to_csv('X_api.csv', index=False)
y_api.to_csv('y_api.csv', index=False)
X_final.to_csv('X_final.csv', index=False)
y_final.to_csv('y_final.csv', index=False)

In [3]:
# Standardise the features: statistics are learned on the training split only
# and then applied unchanged to the final evaluation split.
scaler = StandardScaler()
scaler.fit(X_train_eval)
X_train_eval_scaled = scaler.transform(X_train_eval)
X_final_scaled = scaler.transform(X_final)

# Persist the fitted scaler so it can be reused later through the API
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']

In [None]:
# Import des bibliothèques nécessaires
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
import optuna
import mlflow
import time
import logging

# Initial configuration
logging.getLogger('optuna').setLevel(logging.CRITICAL)
mlflow.set_experiment("Comparaison_de_Modèles")
start_time = time.time()
results = []  # one record per Optuna trial, appended by the objective functions
nb_runs = 5   # number of Optuna trials per model family
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Seeded like the splitter and the models so that resampling (and therefore
# every reported score) is reproducible from one run to the next.
smote = SMOTE(random_state=42)

# Load the data
X_train_eval = pd.read_csv('X_train_eval.csv')
y_train_eval = pd.read_csv('y_train_eval.csv').values.ravel()
X_final = pd.read_csv('X_final.csv')
y_final = pd.read_csv('y_final.csv').values.ravel()

# Fonction pour le calcul du score de coût métier
def cost_metric(y_true, y_pred):
    """Business cost of binary predictions: false positives + 10 x false negatives.

    The counts are computed directly instead of unpacking a confusion matrix,
    which raised a ValueError whenever only one class was present in both
    inputs (the matrix is then 1x1 instead of 2x2).

    Args:
        y_true: Array-like of true labels (0 or 1).
        y_pred: Array-like of predicted labels (0 or 1), same length.

    Returns:
        Integer cost ``fp + 10 * fn`` (lower is better).
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    fp = np.count_nonzero((y_true == 0) & (y_pred == 1))
    fn = np.count_nonzero((y_true == 1) & (y_pred == 0))
    return fp + 10 * fn

# Optimisation du seuil
def optimize_threshold(y_true, y_prob):
    """Scan 100 evenly spaced cut-offs in [0.1, 0.9] and keep the cheapest one.

    Each candidate turns ``y_prob`` into labels with ``y_prob >= candidate``
    and is scored with ``cost_metric``; the first candidate reaching the
    lowest cost wins.

    Returns:
        Tuple ``(best_threshold, best_score)``.
    """
    best_threshold = 0.5
    best_score = float('inf')
    for candidate in np.linspace(0.1, 0.9, 100):
        candidate_cost = cost_metric(y_true, (y_prob >= candidate).astype(int))
        if candidate_cost >= best_score:
            continue  # not a strict improvement
        best_threshold, best_score = candidate, candidate_cost
    return best_threshold, best_score

# Fonction d'optimisation pour Logistic Regression
def logistic_regression_optimization(trial):
    """Optuna objective for logistic regression: mean business cost over the CV folds.

    For every fold of the module-level ``cv`` splitter the training part is
    resampled with ``smote``, the model is fitted, and the decision threshold
    minimising ``cost_metric`` on the validation part is selected. Parameters
    and fold-averaged metrics are logged to MLflow and appended to the
    module-level ``results`` list.

    NOTE: the threshold is tuned and scored on the same validation fold, so the
    reported cost is an optimistic estimate.

    Args:
        trial: Optuna trial used to sample ``C``, ``solver`` and ``max_iter``.

    Returns:
        Mean business cost across folds (lower is better).
    """
    with mlflow.start_run(run_name="Optimisation_Régression_Logistique"):
        # Model built from the hyper-parameters sampled for this trial
        model = LogisticRegression(
            C=trial.suggest_float('C', 50, 200, log=True),
            solver=trial.suggest_categorical('solver', ['liblinear', 'lbfgs']),
            max_iter=trial.suggest_int('max_iter', 100, 1000),
            random_state=42
        )

        auc_scores, acc_scores, cost_scores, thresholds = [], [], [], []

        for train_idx, val_idx in cv.split(X_train_eval, y_train_eval):
            X_train, X_val = X_train_eval.iloc[train_idx], X_train_eval.iloc[val_idx]
            y_train, y_val = y_train_eval[train_idx], y_train_eval[val_idx]

            # Resample the training part only; the validation part stays untouched
            X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
            model.fit(X_train_smote, y_train_smote)

            y_prob = model.predict_proba(X_val)[:, 1]
            threshold, _ = optimize_threshold(y_val, y_prob)
            thresholds.append(threshold)

            y_pred = (y_prob >= threshold).astype(int)
            auc_scores.append(roc_auc_score(y_val, y_prob))
            acc_scores.append(accuracy_score(y_val, y_pred))
            cost_scores.append(cost_metric(y_val, y_pred))

        # Fold averages; the retained threshold is the mean of the per-fold optima
        auc, acc, cost = np.mean(auc_scores), np.mean(acc_scores), np.mean(cost_scores)
        optimal_threshold = np.mean(thresholds)

        hyperparams = {"C": model.C, "solver": model.solver, "max_iter": model.max_iter}

        # MLflow tracking
        mlflow.log_params({**hyperparams, "threshold": optimal_threshold})
        mlflow.log_metrics({"AUC": auc, "Accuracy": acc, "Business_Score": cost})

        # Store the hyper-parameters next to the metrics: without them the best
        # row of the results table cannot be turned back into a model.
        results.append({"Model": "Logistic Regression", "AUC": auc, "Accuracy": acc,
                        "Business Score": cost, "Threshold": optimal_threshold, **hyperparams})
        return cost

# Fonction d'optimisation pour Random Forest
def random_forest_optimization(trial):
    """Optuna objective for a random forest: mean business cost over the CV folds.

    For every fold of the module-level ``cv`` splitter the training part is
    resampled with ``smote``, the model is fitted, and the decision threshold
    minimising ``cost_metric`` on the validation part is selected. Parameters
    and fold-averaged metrics are logged to MLflow and appended to the
    module-level ``results`` list.

    NOTE: the threshold is tuned and scored on the same validation fold, so the
    reported cost is an optimistic estimate.

    Args:
        trial: Optuna trial used to sample ``n_estimators``, ``max_depth``,
            ``min_samples_split`` and ``min_samples_leaf``.

    Returns:
        Mean business cost across folds (lower is better).
    """
    with mlflow.start_run(run_name="Optimisation_Random_Forest"):
        model = RandomForestClassifier(
            n_estimators=trial.suggest_int('n_estimators', 50, 300),
            max_depth=trial.suggest_int('max_depth', 5, 20),
            min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
            min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 5),
            random_state=42
        )

        auc_scores, acc_scores, cost_scores, thresholds = [], [], [], []

        for train_idx, val_idx in cv.split(X_train_eval, y_train_eval):
            X_train, X_val = X_train_eval.iloc[train_idx], X_train_eval.iloc[val_idx]
            y_train, y_val = y_train_eval[train_idx], y_train_eval[val_idx]

            # Resample the training part only; the validation part stays untouched
            X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
            model.fit(X_train_smote, y_train_smote)

            y_prob = model.predict_proba(X_val)[:, 1]
            threshold, _ = optimize_threshold(y_val, y_prob)
            thresholds.append(threshold)

            y_pred = (y_prob >= threshold).astype(int)
            auc_scores.append(roc_auc_score(y_val, y_prob))
            acc_scores.append(accuracy_score(y_val, y_pred))
            cost_scores.append(cost_metric(y_val, y_pred))

        # Fold averages; the retained threshold is the mean of the per-fold optima
        auc, acc, cost = np.mean(auc_scores), np.mean(acc_scores), np.mean(cost_scores)
        optimal_threshold = np.mean(thresholds)

        hyperparams = {"n_estimators": model.n_estimators, "max_depth": model.max_depth,
                       "min_samples_split": model.min_samples_split,
                       "min_samples_leaf": model.min_samples_leaf}

        # MLflow tracking
        mlflow.log_params({**hyperparams, "threshold": optimal_threshold})
        mlflow.log_metrics({"AUC": auc, "Accuracy": acc, "Business_Score": cost})

        # Store the hyper-parameters next to the metrics: without them the best
        # row of the results table cannot be turned back into a model.
        results.append({"Model": "Random Forest", "AUC": auc, "Accuracy": acc,
                        "Business Score": cost, "Threshold": optimal_threshold, **hyperparams})
        return cost

# Fonction d'optimisation pour LightGBM
def lightgbm_optimization(trial):
    """Optuna objective for LightGBM: mean business cost over the CV folds.

    For every fold of the module-level ``cv`` splitter the training part is
    resampled with ``smote``, the model is fitted, and the decision threshold
    minimising ``cost_metric`` on the validation part is selected. Parameters
    and fold-averaged metrics are logged to MLflow and appended to the
    module-level ``results`` list.

    NOTE: the threshold is tuned and scored on the same validation fold, so the
    reported cost is an optimistic estimate.

    Args:
        trial: Optuna trial used to sample ``num_leaves``, ``max_depth``,
            ``learning_rate`` and ``n_estimators``.

    Returns:
        Mean business cost across folds (lower is better).
    """
    with mlflow.start_run(run_name="Optimisation_LightGBM"):
        model = lgb.LGBMClassifier(
            num_leaves=trial.suggest_int('num_leaves', 20, 150),
            max_depth=trial.suggest_int('max_depth', 5, 20),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
            n_estimators=trial.suggest_int('n_estimators', 50, 300),
            random_state=42
        )

        auc_scores, acc_scores, cost_scores, thresholds = [], [], [], []

        for train_idx, val_idx in cv.split(X_train_eval, y_train_eval):
            X_train, X_val = X_train_eval.iloc[train_idx], X_train_eval.iloc[val_idx]
            y_train, y_val = y_train_eval[train_idx], y_train_eval[val_idx]

            # Resample the training part only; the validation part stays untouched
            X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
            model.fit(X_train_smote, y_train_smote)

            y_prob = model.predict_proba(X_val)[:, 1]
            threshold, _ = optimize_threshold(y_val, y_prob)
            thresholds.append(threshold)

            y_pred = (y_prob >= threshold).astype(int)
            auc_scores.append(roc_auc_score(y_val, y_prob))
            acc_scores.append(accuracy_score(y_val, y_pred))
            cost_scores.append(cost_metric(y_val, y_pred))

        # Fold averages; the retained threshold is the mean of the per-fold optima
        auc, acc, cost = np.mean(auc_scores), np.mean(acc_scores), np.mean(cost_scores)
        optimal_threshold = np.mean(thresholds)

        hyperparams = {"num_leaves": model.num_leaves, "max_depth": model.max_depth,
                       "learning_rate": model.learning_rate, "n_estimators": model.n_estimators}

        # MLflow tracking
        mlflow.log_params({**hyperparams, "threshold": optimal_threshold})
        mlflow.log_metrics({"AUC": auc, "Accuracy": acc, "Business_Score": cost})

        # Store the hyper-parameters next to the metrics: without them the best
        # row of the results table cannot be turned back into a model.
        results.append({"Model": "LightGBM", "AUC": auc, "Accuracy": acc,
                        "Business Score": cost, "Threshold": optimal_threshold, **hyperparams})
        return cost

# Hyper-parameter search: one study per model family. The studies are kept in
# named variables (so best_params / best_value stay available) and use a seeded
# sampler so the search is reproducible.
logreg_study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
logreg_study.optimize(logistic_regression_optimization, n_trials=nb_runs)

rf_study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
rf_study.optimize(random_forest_optimization, n_trials=nb_runs)

lgb_study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
lgb_study.optimize(lightgbm_optimization, n_trials=nb_runs)

# Rank the trials: lowest business cost first, ties broken by the HIGHEST AUC
# and then the highest accuracy (an all-ascending sort picked the worst AUC).
results_df = pd.DataFrame(results)
best_model = results_df.sort_values(
    by=['Business Score', 'AUC', 'Accuracy'],
    ascending=[True, False, False],
).iloc[0]



In [None]:
# Final training with the best hyper-parameters of each model family.
#
# `best_model` is a single row of `results_df` holding metrics and a threshold
# only, so indexing it with 'C', 'n_estimators', ... raised a KeyError, and the
# same row was used for all three families. The hyper-parameters are instead
# read back from the MLflow runs logged by the optimisation functions (active
# experiment, lowest Business_Score per run name). MLflow returns logged
# parameters as strings, hence the explicit casts.
#
# Models are fitted on the training split and scored on X_final: fitting and
# scoring on the same X_final only measured training-set performance.

def _best_logged_run(run_name):
    """Return the MLflow run called `run_name` with the lowest Business_Score."""
    runs = mlflow.search_runs(
        filter_string=f"tags.mlflow.runName = '{run_name}'",
        order_by=["metrics.Business_Score ASC"],
        max_results=1,
    )
    if runs.empty:
        raise RuntimeError(f"No MLflow run named '{run_name}': run the optimisation cell first")
    return runs.iloc[0]


def _fit_on_train_score_on_final(model):
    """Fit `model` on the SMOTE-resampled training split; return P(class 1) on X_final."""
    X_balanced, y_balanced = smote.fit_resample(X_train_eval, y_train_eval)
    model.fit(X_balanced, y_balanced)
    return model.predict_proba(X_final)[:, 1]


# Logistic regression
logreg_run = _best_logged_run("Optimisation_Régression_Logistique")
logreg_best_params = {
    "C": float(logreg_run["params.C"]),
    "solver": logreg_run["params.solver"],
    "max_iter": int(logreg_run["params.max_iter"])
}
logreg_model = LogisticRegression(random_state=42, **logreg_best_params)
y_logreg_pred = _fit_on_train_score_on_final(logreg_model)
logreg_threshold = float(logreg_run["params.threshold"])
logreg_final_pred = (y_logreg_pred >= logreg_threshold).astype(int)

# Random Forest
rf_run = _best_logged_run("Optimisation_Random_Forest")
rf_best_params = {
    "n_estimators": int(rf_run["params.n_estimators"]),
    "max_depth": int(rf_run["params.max_depth"]),
    "min_samples_split": int(rf_run["params.min_samples_split"]),
    "min_samples_leaf": int(rf_run["params.min_samples_leaf"])
}
rf_model = RandomForestClassifier(random_state=42, **rf_best_params)
y_rf_pred = _fit_on_train_score_on_final(rf_model)
rf_threshold = float(rf_run["params.threshold"])
rf_final_pred = (y_rf_pred >= rf_threshold).astype(int)

# LightGBM
lgb_run = _best_logged_run("Optimisation_LightGBM")
lgb_best_params = {
    "num_leaves": int(lgb_run["params.num_leaves"]),
    "max_depth": int(lgb_run["params.max_depth"]),
    "learning_rate": float(lgb_run["params.learning_rate"]),
    "n_estimators": int(lgb_run["params.n_estimators"])
}
lgb_model = lgb.LGBMClassifier(random_state=42, **lgb_best_params)
y_lgb_pred = _fit_on_train_score_on_final(lgb_model)
lgb_threshold = float(lgb_run["params.threshold"])
lgb_final_pred = (y_lgb_pred >= lgb_threshold).astype(int)

# Final comparison of the three models on the held-out final set
final_results = {
    "Logistic Regression AUC": roc_auc_score(y_final, y_logreg_pred),
    "Random Forest AUC": roc_auc_score(y_final, y_rf_pred),
    "LightGBM AUC": roc_auc_score(y_final, y_lgb_pred),
}
print(final_results)


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import mlflow

# Fonction pour le calcul du score de coût métier
def cost_metric(y_true, y_pred):
    """Business cost of binary predictions: false positives + 10 x false negatives.

    The counts are computed directly instead of unpacking a confusion matrix,
    which raised a ValueError whenever only one class was present in both
    inputs (the matrix is then 1x1 instead of 2x2).

    Args:
        y_true: Array-like of true labels (0 or 1).
        y_pred: Array-like of predicted labels (0 or 1), same length.

    Returns:
        Integer cost ``fp + 10 * fn`` (lower is better).
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    fp = np.count_nonzero((y_true == 0) & (y_pred == 1))
    fn = np.count_nonzero((y_true == 1) & (y_pred == 0))
    return fp + 10 * fn

# Fonction pour optimiser le seuil
def optimize_threshold(y_true, y_prob):
    """Scan 100 evenly spaced cut-offs in [0.1, 0.9] and keep the cheapest one.

    Each candidate turns ``y_prob`` into labels with ``y_prob >= candidate``
    and is scored with ``cost_metric``; the first candidate reaching the
    lowest cost wins.

    Returns:
        Tuple ``(best_threshold, best_score)``.
    """
    best_threshold = 0.5
    best_score = float('inf')
    for candidate in np.linspace(0.1, 0.9, 100):
        candidate_cost = cost_metric(y_true, (y_prob >= candidate).astype(int))
        if candidate_cost >= best_score:
            continue  # not a strict improvement
        best_threshold, best_score = candidate, candidate_cost
    return best_threshold, best_score

# Fonction de comparaison finale des modèles
def compare_models(y_true, model_results, X=None):
    """Score fitted models on one evaluation set and rank them.

    Args:
        y_true: True binary labels of the evaluation set.
        model_results: Mapping of model name -> dict with keys ``'model'``
            (fitted estimator exposing ``predict_proba``) and ``'threshold'``
            (decision threshold applied to the class-1 probability).
        X: Evaluation features aligned with ``y_true``. Defaults to the
            module-level ``X_final``, which the function previously always
            used regardless of its arguments.

    Returns:
        DataFrame with columns Model, AUC, Business Score and Threshold,
        sorted by ascending Business Score, then descending AUC. Empty (but
        with those columns) when ``model_results`` is empty.
    """
    if X is None:
        X = X_final  # backward-compatible default

    columns = ['Model', 'AUC', 'Business Score', 'Threshold']
    comparison_results = []
    for model_name, model_data in model_results.items():
        # Class-1 probabilities turned into labels with the model's own threshold
        threshold = model_data['threshold']
        y_prob = model_data['model'].predict_proba(X)[:, 1]
        y_pred = (y_prob >= threshold).astype(int)

        comparison_results.append({
            'Model': model_name,
            'AUC': roc_auc_score(y_true, y_prob),
            'Business Score': cost_metric(y_true, y_pred),
            'Threshold': threshold
        })

    # Explicit columns keep the sort below valid even when nothing was scored
    comparison_df = pd.DataFrame(comparison_results, columns=columns)

    # Lowest business cost first; ties broken by the highest AUC
    return comparison_df.sort_values(by=['Business Score', 'AUC'], ascending=[True, False])

# Final training of each model family, followed by a comparison on X_final.
# The models are fitted on the training split (X_train_eval / y_train_eval):
# fitting on X_final and then scoring on that same X_final only reported
# training-set performance.
model_results = {}

# 1. Logistic Regression
logreg_best_params = {
    "C": 100,  # replace with the best value found
    "solver": 'liblinear',  # replace with the best value found
    "max_iter": 1000  # replace with the best value found
}
logreg_model = LogisticRegression(random_state=42, **logreg_best_params)
logreg_model.fit(X_train_eval, y_train_eval)
logreg_best_threshold = 0.3  # replace with the best threshold found
model_results['Logistic Regression'] = {
    'model': logreg_model,
    'threshold': logreg_best_threshold
}

# 2. Random Forest
rf_best_params = {
    "n_estimators": 100,  # replace with the best value found
    "max_depth": 10,  # replace with the best value found
    "min_samples_split": 2,  # replace with the best value found
    "min_samples_leaf": 1  # replace with the best value found
}
rf_model = RandomForestClassifier(random_state=42, **rf_best_params)
rf_model.fit(X_train_eval, y_train_eval)
rf_best_threshold = 0.35  # replace with the best threshold found
model_results['Random Forest'] = {
    'model': rf_model,
    'threshold': rf_best_threshold
}

# 3. LightGBM
lgb_best_params = {
    "num_leaves": 50,  # replace with the best value found
    "max_depth": 10,  # replace with the best value found
    "learning_rate": 0.05,  # replace with the best value found
    "n_estimators": 100  # replace with the best value found
}
lgb_model = lgb.LGBMClassifier(random_state=42, **lgb_best_params)
lgb_model.fit(X_train_eval, y_train_eval)
lgb_best_threshold = 0.3  # replace with the best threshold found
model_results['LightGBM'] = {
    'model': lgb_model,
    'threshold': lgb_best_threshold
}

# Compare the models on the held-out final set
final_comparison = compare_models(y_final, model_results)

# Display the ranked results
print(final_comparison)


In [None]:
# Import des bibliothèques nécessaires
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
import optuna
import mlflow
import time
import logging

# Initial configuration
logging.getLogger('optuna').setLevel(logging.CRITICAL)
mlflow.set_experiment("Comparaison_de_Modèles")
start_time = time.time()
results = []  # one record per Optuna trial, appended by the objective functions
nb_runs = 5   # number of Optuna trials per model family
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Seeded like the splitter and the models so that resampling (and therefore
# every reported score) is reproducible from one run to the next.
smote = SMOTE(random_state=42)

# Load the data
X_train_eval = pd.read_csv('X_train_eval.csv')
y_train_eval = pd.read_csv('y_train_eval.csv').values.ravel()
X_final = pd.read_csv('X_final.csv')
y_final = pd.read_csv('y_final.csv').values.ravel()

# Fonction pour le calcul du score de coût métier
def cost_metric(y_true, y_pred):
    """Business cost of binary predictions: false positives + 10 x false negatives.

    The counts are computed directly instead of unpacking a confusion matrix,
    which raised a ValueError whenever only one class was present in both
    inputs (the matrix is then 1x1 instead of 2x2).

    Args:
        y_true: Array-like of true labels (0 or 1).
        y_pred: Array-like of predicted labels (0 or 1), same length.

    Returns:
        Integer cost ``fp + 10 * fn`` (lower is better).
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    fp = np.count_nonzero((y_true == 0) & (y_pred == 1))
    fn = np.count_nonzero((y_true == 1) & (y_pred == 0))
    return fp + 10 * fn

# Container meant to hold the best model and hyper-parameters of each model family.
# NOTE(review): no code visible in this cell ever writes to it — confirm it is still needed.
best_model_results = {}

# 1. Régression Logistique
def logistic_regression_optimization(trial):
    """Optuna objective: cross-validated business cost of a logistic regression.

    Samples ``C``, ``solver`` and ``max_iter`` from ``trial``, fits the model on
    the training part of each ``cv`` fold after resampling it with ``smote``,
    and scores the validation part at a fixed 0.5 cut-off. The sampled
    parameters and fold-averaged AUC, accuracy and business cost are logged to
    MLflow and appended to the module-level ``results`` list.

    Returns the mean business cost across folds.
    """
    with mlflow.start_run(run_name="Régression_Logistique"):
        # Hyper-parameters proposed by Optuna for this trial
        sampled = {
            'C': trial.suggest_float('C', 50, 200, log=True),
            'solver': trial.suggest_categorical('solver', ['liblinear', 'lbfgs']),
            'max_iter': trial.suggest_int('max_iter', 100, 1000),
        }
        model = LogisticRegression(random_state=42, **sampled)

        fold_scores = []
        for fit_rows, held_rows in cv.split(X_train_eval, y_train_eval):
            # Resample the training part only
            X_fit, y_fit = smote.fit_resample(X_train_eval.iloc[fit_rows], y_train_eval[fit_rows])
            model.fit(X_fit, y_fit)

            y_held = y_train_eval[held_rows]
            proba = model.predict_proba(X_train_eval.iloc[held_rows])[:, 1]
            decisions = (proba > 0.5).astype(int)
            fold_scores.append((
                roc_auc_score(y_held, proba),
                accuracy_score(y_held, decisions),
                cost_metric(y_held, decisions),
            ))

        # Average each metric over the folds
        auc, acc, cost = (np.mean(column) for column in zip(*fold_scores))

        # MLflow tracking
        mlflow.log_params({"C": model.C, "solver": model.solver, "max_iter": model.max_iter})
        mlflow.log_metrics({"AUC": auc, "Accuracy": acc, "Business_Score": cost})

        # Keep the trial summary for the final comparison table
        results.append({"Model": "Logistic Regression", "AUC": auc, "Accuracy": acc, "Business Score": cost})
        return cost

# 2. Random Forest
def random_forest_optimization(trial):
    """Optuna objective: cross-validated business cost of a random forest.

    Samples ``n_estimators``, ``max_depth``, ``min_samples_split`` and
    ``min_samples_leaf`` from ``trial``, fits the model on the training part of
    each ``cv`` fold after resampling it with ``smote``, and scores the
    validation part at a fixed 0.5 cut-off. Parameters and fold-averaged
    metrics are logged to MLflow and appended to ``results``.

    Returns the mean business cost across folds.
    """
    with mlflow.start_run(run_name="Random_Forest"):
        # Hyper-parameters proposed by Optuna for this trial
        sampled = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'max_depth': trial.suggest_int('max_depth', 5, 20),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        }
        model = RandomForestClassifier(random_state=42, **sampled)

        fold_scores = []
        for fit_rows, held_rows in cv.split(X_train_eval, y_train_eval):
            # Resample the training part only
            X_fit, y_fit = smote.fit_resample(X_train_eval.iloc[fit_rows], y_train_eval[fit_rows])
            model.fit(X_fit, y_fit)

            y_held = y_train_eval[held_rows]
            proba = model.predict_proba(X_train_eval.iloc[held_rows])[:, 1]
            decisions = (proba > 0.5).astype(int)
            fold_scores.append((
                roc_auc_score(y_held, proba),
                accuracy_score(y_held, decisions),
                cost_metric(y_held, decisions),
            ))

        # Average each metric over the folds
        auc, acc, cost = (np.mean(column) for column in zip(*fold_scores))

        # MLflow tracking
        mlflow.log_params({"n_estimators": model.n_estimators, "max_depth": model.max_depth,
                           "min_samples_split": model.min_samples_split, "min_samples_leaf": model.min_samples_leaf})
        mlflow.log_metrics({"AUC": auc, "Accuracy": acc, "Business_Score": cost})

        results.append({"Model": "Random Forest", "AUC": auc, "Accuracy": acc, "Business Score": cost})
        return cost

# 3. LightGBM
def lightgbm_optimization(trial):
    """Optuna objective: cross-validated business cost of a LightGBM classifier.

    Samples ``num_leaves``, ``max_depth``, ``learning_rate`` and
    ``n_estimators`` from ``trial``, fits the model on the training part of
    each ``cv`` fold after resampling it with ``smote``, and scores the
    validation part at a fixed 0.5 cut-off. Parameters and fold-averaged
    metrics are logged to MLflow and appended to ``results``.

    Returns the mean business cost across folds.
    """
    with mlflow.start_run(run_name="LightGBM"):
        # Hyper-parameters proposed by Optuna for this trial
        sampled = {
            'num_leaves': trial.suggest_int('num_leaves', 20, 150),
            'max_depth': trial.suggest_int('max_depth', 5, 20),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        }
        model = lgb.LGBMClassifier(random_state=42, **sampled)

        fold_scores = []
        for fit_rows, held_rows in cv.split(X_train_eval, y_train_eval):
            # Resample the training part only
            X_fit, y_fit = smote.fit_resample(X_train_eval.iloc[fit_rows], y_train_eval[fit_rows])
            model.fit(X_fit, y_fit)

            y_held = y_train_eval[held_rows]
            proba = model.predict_proba(X_train_eval.iloc[held_rows])[:, 1]
            decisions = (proba > 0.5).astype(int)
            fold_scores.append((
                roc_auc_score(y_held, proba),
                accuracy_score(y_held, decisions),
                cost_metric(y_held, decisions),
            ))

        # Average each metric over the folds
        auc, acc, cost = (np.mean(column) for column in zip(*fold_scores))

        # MLflow tracking
        mlflow.log_params({"num_leaves": model.num_leaves, "max_depth": model.max_depth,
                           "learning_rate": model.learning_rate, "n_estimators": model.n_estimators})
        mlflow.log_metrics({"AUC": auc, "Accuracy": acc, "Business_Score": cost})

        results.append({"Model": "LightGBM", "AUC": auc, "Accuracy": acc, "Business Score": cost})
        return cost

# Hyper-parameter search with Optuna: one study per model family. The studies
# are kept in named variables (so best_params / best_value stay available) and
# use a seeded sampler so the search is reproducible.
print("Optimisation : Logistic Regression")
logreg_study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
logreg_study.optimize(logistic_regression_optimization, n_trials=nb_runs)

print("Optimisation : Random Forest")
rf_study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
rf_study.optimize(random_forest_optimization, n_trials=nb_runs)

print("Optimisation : LightGBM")
lgb_study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
lgb_study.optimize(lightgbm_optimization, n_trials=nb_runs)

# Final ranking: lowest business cost first, ties broken by highest AUC, then accuracy.
# Note that the three rows displayed are the three best TRIALS, which may all
# belong to the same model family.
results_df = pd.DataFrame(results)
results_df_sorted = results_df.sort_values(by=['Business Score', 'AUC', 'Accuracy'], ascending=[True, False, False])
print("\nTop Modèles : Comparaison:\n", results_df_sorted.head(3))

# Recommended model = best-ranked trial
best_model = results_df_sorted.iloc[0]
print(f"\nLe modèle recommandé pour la production est : {best_model['Model']} avec AUC: {best_model['AUC']}, Accuracy: {best_model['Accuracy']}, et Business Score: {best_model['Business Score']}")


In [None]:
#import des bibliothèques 
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import optuna
import mlflow
import joblib
import time
import matplotlib.pyplot as plt
import logging

# Initial configuration
logging.getLogger('optuna').setLevel(logging.CRITICAL)
mlflow.set_experiment("Modèles_Comparaison")
start_time = time.time()
results = []  # one record per Optuna trial, appended by the objective functions
nb_runs = 5   # number of Optuna trials per model family
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Seeded like the splitter and the models so that resampling (and therefore
# every reported score) is reproducible from one run to the next.
smote = SMOTE(random_state=42)

# Load the data
X_train_eval = pd.read_csv('X_train_eval.csv')
y_train_eval = pd.read_csv('y_train_eval.csv').values.ravel()
X_final = pd.read_csv('X_final.csv')
y_final = pd.read_csv('y_final.csv').values.ravel()

# Fonction pour le calcul du score de coût métier
def cost_metric(y_true, y_pred):
    """Business cost of binary predictions: false positives + 10 x false negatives.

    The counts are computed directly instead of unpacking a confusion matrix,
    which raised a ValueError whenever only one class was present in both
    inputs (the matrix is then 1x1 instead of 2x2).

    Args:
        y_true: Array-like of true labels (0 or 1).
        y_pred: Array-like of predicted labels (0 or 1), same length.

    Returns:
        Integer cost ``fp + 10 * fn`` (lower is better).
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    fp = np.count_nonzero((y_true == 0) & (y_pred == 1))
    fn = np.count_nonzero((y_true == 1) & (y_pred == 0))
    return fp + 10 * fn

# Container meant to hold the best model and hyper-parameters of each model family.
# NOTE(review): no code visible in this cell ever writes to it — confirm it is still needed.
best_model_results = {}

# 1. Régression Logistique
def logistic_regression_optimization(trial):
    """Optuna objective: cross-validated business cost of a logistic regression.

    Samples ``C``, ``solver`` and ``max_iter`` from ``trial``, fits the model on
    the training part of each ``cv`` fold after resampling it with ``smote``,
    and scores the validation part at a fixed 0.5 cut-off. The fold-averaged
    AUC, accuracy and business cost are appended to the module-level
    ``results`` list.

    Returns the mean business cost across folds.
    """
    # Hyper-parameters proposed by Optuna for this trial
    sampled = {
        'C': trial.suggest_float('C', 50, 200, log=True),
        'solver': trial.suggest_categorical('solver', ['liblinear', 'lbfgs']),
        'max_iter': trial.suggest_int('max_iter', 100, 1000),
    }
    model = LogisticRegression(random_state=42, **sampled)

    fold_scores = []
    for fit_rows, held_rows in cv.split(X_train_eval, y_train_eval):
        # Resample the training part only
        X_fit, y_fit = smote.fit_resample(X_train_eval.iloc[fit_rows], y_train_eval[fit_rows])
        model.fit(X_fit, y_fit)

        y_held = y_train_eval[held_rows]
        proba = model.predict_proba(X_train_eval.iloc[held_rows])[:, 1]
        decisions = (proba > 0.5).astype(int)
        fold_scores.append((
            roc_auc_score(y_held, proba),
            accuracy_score(y_held, decisions),
            cost_metric(y_held, decisions),
        ))

    # Average each metric over the folds
    auc, acc, cost = (np.mean(column) for column in zip(*fold_scores))
    results.append({"Model": "Logistic Regression", "AUC": auc, "Accuracy": acc, "Business Score": cost})
    return cost

# 2. Random Forest
def random_forest_optimization(trial):
    """Optuna objective: cross-validated business cost of a random forest.

    Samples ``n_estimators``, ``max_depth``, ``min_samples_split`` and
    ``min_samples_leaf`` from ``trial``, fits the model on the training part of
    each ``cv`` fold after resampling it with ``smote``, and scores the
    validation part at a fixed 0.5 cut-off. The fold-averaged metrics are
    appended to the module-level ``results`` list.

    Returns the mean business cost across folds.
    """
    # Hyper-parameters proposed by Optuna for this trial
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
    }
    model = RandomForestClassifier(random_state=42, **sampled)

    fold_scores = []
    for fit_rows, held_rows in cv.split(X_train_eval, y_train_eval):
        # Resample the training part only
        X_fit, y_fit = smote.fit_resample(X_train_eval.iloc[fit_rows], y_train_eval[fit_rows])
        model.fit(X_fit, y_fit)

        y_held = y_train_eval[held_rows]
        proba = model.predict_proba(X_train_eval.iloc[held_rows])[:, 1]
        decisions = (proba > 0.5).astype(int)
        fold_scores.append((
            roc_auc_score(y_held, proba),
            accuracy_score(y_held, decisions),
            cost_metric(y_held, decisions),
        ))

    # Average each metric over the folds
    auc, acc, cost = (np.mean(column) for column in zip(*fold_scores))
    results.append({"Model": "Random Forest", "AUC": auc, "Accuracy": acc, "Business Score": cost})
    return cost

# 3. LightGBM
def lightgbm_optimization(trial):
    """Optuna objective: cross-validated business cost of a LightGBM classifier.

    Samples ``num_leaves``, ``max_depth``, ``learning_rate`` and
    ``n_estimators`` from ``trial``, fits the model on the training part of
    each ``cv`` fold after resampling it with ``smote``, and scores the
    validation part at a fixed 0.5 cut-off. The fold-averaged metrics are
    appended to the module-level ``results`` list.

    Returns the mean business cost across folds.
    """
    # Hyper-parameters proposed by Optuna for this trial
    sampled = {
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
    }
    model = lgb.LGBMClassifier(random_state=42, **sampled)

    fold_scores = []
    for fit_rows, held_rows in cv.split(X_train_eval, y_train_eval):
        # Resample the training part only
        X_fit, y_fit = smote.fit_resample(X_train_eval.iloc[fit_rows], y_train_eval[fit_rows])
        model.fit(X_fit, y_fit)

        y_held = y_train_eval[held_rows]
        proba = model.predict_proba(X_train_eval.iloc[held_rows])[:, 1]
        decisions = (proba > 0.5).astype(int)
        fold_scores.append((
            roc_auc_score(y_held, proba),
            accuracy_score(y_held, decisions),
            cost_metric(y_held, decisions),
        ))

    # Average each metric over the folds
    auc, acc, cost = (np.mean(column) for column in zip(*fold_scores))
    results.append({"Model": "LightGBM", "AUC": auc, "Accuracy": acc, "Business Score": cost})
    return cost

# Hyper-parameter search with Optuna: one study per model family. The studies
# are kept in named variables (so best_params / best_value stay available) and
# use a seeded sampler so the search is reproducible.
print("Optimizing Logistic Regression")
logreg_study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
logreg_study.optimize(logistic_regression_optimization, n_trials=nb_runs)

print("Optimizing Random Forest")
rf_study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
rf_study.optimize(random_forest_optimization, n_trials=nb_runs)

print("Optimizing LightGBM")
lgb_study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
lgb_study.optimize(lightgbm_optimization, n_trials=nb_runs)

# Collect the trial summaries and rank them: lowest business cost first, ties
# broken by highest AUC, then accuracy. The three rows displayed are the three
# best TRIALS, which may all belong to the same model family.
results_df = pd.DataFrame(results)
results_df_sorted = results_df.sort_values(by=['Business Score', 'AUC', 'Accuracy'], ascending=[True, False, False])
print("\nTop Models Comparison:\n", results_df_sorted.head(3))

# Display the recommended model (best-ranked trial)
best_model = results_df_sorted.iloc[0]
print(f"\nLe modèle recommandé pour la production est {best_model['Model']} avec AUC: {best_model['AUC']}, Accuracy: {best_model['Accuracy']}, et Business Score: {best_model['Business Score']}")