In [None]:
#import des bibliothèques 
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import optuna
import mlflow
import joblib
import time
import matplotlib.pyplot as plt
import logging

# Initial configuration
# Only CRITICAL messages from the 'optuna' logger are let through.
logging.getLogger('optuna').setLevel(logging.CRITICAL)
mlflow.set_experiment("Model_Comparison")
start_time = time.time()
# One row per evaluated trial; appended to by the objective functions below.
results = []
# Number of Optuna trials run for each model family.
nb_runs = 5
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Seed SMOTE like the CV splitter and the models: an unseeded sampler makes
# the synthetic samples (and therefore every trial's cost) differ between runs.
smote = SMOTE(random_state=42)

# Data loading
X_train_eval = pd.read_csv('X_train_eval.csv')
# Targets are flattened to 1-D arrays so they can be indexed by fold position.
y_train_eval = pd.read_csv('y_train_eval.csv').values.ravel()
X_final = pd.read_csv('X_final.csv')
y_final = pd.read_csv('y_final.csv').values.ravel()

# Business-cost score: a false negative costs ten times a false positive.
def cost_metric(y_true, y_pred):
    """Return the business cost ``FP + 10 * FN`` for binary 0/1 labels.

    The counts are computed directly rather than by unpacking a confusion
    matrix, so the function also works when only one class is present in
    the inputs (a confusion matrix collapses to a single cell in that case
    and cannot be unpacked into four values).

    Args:
        y_true: Array-like of ground-truth labels (0 or 1).
        y_pred: Array-like of predicted labels (0 or 1), same length.

    Returns:
        The integer cost; lower is better.

    Raises:
        ValueError: If the inputs differ in length or contain a value other
            than 0 or 1.
    """
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()

    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {actual.shape[0]} and {predicted.shape[0]}"
        )
    # Anything outside {0, 1} would be silently ignored by the masks below.
    if not (np.isin(actual, (0, 1)).all() and np.isin(predicted, (0, 1)).all()):
        raise ValueError("cost_metric expects binary labels encoded as 0 and 1")

    false_positives = np.count_nonzero((actual == 0) & (predicted == 1))
    false_negatives = np.count_nonzero((actual == 1) & (predicted == 0))
    return false_positives + 10 * false_negatives

# Registry meant to hold the best model / hyperparameters per model family.
# It starts out empty.
best_model_results = dict()

# Shared cross-validation routine used by the three Optuna objectives below.
def _cross_validated_cost(model, model_name):
    """Cross-validate ``model`` on the training/evaluation set and log the trial.

    For every fold of the module-level ``cv`` splitter, the training part is
    oversampled with the module-level ``smote`` sampler, the model is fitted
    on it, and AUC, accuracy and business cost are measured on the untouched
    validation part. The fold means are appended to the module-level
    ``results`` list as one row.

    Args:
        model: Unfitted classifier exposing ``fit`` and ``predict_proba``.
        model_name: Value stored under the ``"Model"`` key of the results row.

    Returns:
        The mean business cost over the folds (the value Optuna minimises).
    """
    # Fold indices are positional, so labels must be a plain ndarray: a
    # pandas Series would be indexed by label instead of by position.
    labels = np.asarray(y_train_eval).ravel()
    auc_scores, acc_scores, cost_scores = [], [], []

    for train_idx, val_idx in cv.split(X_train_eval, labels):
        X_train, X_val = X_train_eval.iloc[train_idx], X_train_eval.iloc[val_idx]
        y_train, y_val = labels[train_idx], labels[val_idx]

        # Resample the training fold only; the validation fold is scored as-is.
        X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
        model.fit(X_train_smote, y_train_smote)

        # Second probability column, thresholded at 0.5 for hard predictions.
        y_prob = model.predict_proba(X_val)[:, 1]
        y_pred = (y_prob > 0.5).astype(int)

        auc_scores.append(roc_auc_score(y_val, y_prob))
        acc_scores.append(accuracy_score(y_val, y_pred))
        cost_scores.append(cost_metric(y_val, y_pred))

    auc, acc, cost = np.mean(auc_scores), np.mean(acc_scores), np.mean(cost_scores)
    results.append({"Model": model_name, "AUC": auc, "Accuracy": acc, "Business Score": cost})
    return cost


# 1. Logistic Regression
def logistic_regression_optimization(trial):
    """Optuna objective for logistic regression.

    Samples ``C``, ``solver`` and ``max_iter`` from ``trial`` and returns the
    mean cross-validated business cost of the resulting model.
    """
    model = LogisticRegression(
        C=trial.suggest_float('C', 50, 200, log=True),
        solver=trial.suggest_categorical('solver', ['liblinear', 'lbfgs']),
        max_iter=trial.suggest_int('max_iter', 100, 1000),
        random_state=42
    )
    return _cross_validated_cost(model, "Logistic Regression")


# 2. Random Forest
def random_forest_optimization(trial):
    """Optuna objective for a random forest.

    Samples ``n_estimators``, ``max_depth``, ``min_samples_split`` and
    ``min_samples_leaf`` from ``trial`` and returns the mean cross-validated
    business cost of the resulting model.
    """
    model = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 5, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 5),
        random_state=42
    )
    return _cross_validated_cost(model, "Random Forest")


# 3. LightGBM
def lightgbm_optimization(trial):
    """Optuna objective for LightGBM.

    Samples ``num_leaves``, ``max_depth``, ``learning_rate`` and
    ``n_estimators`` from ``trial`` and returns the mean cross-validated
    business cost of the resulting model.
    """
    model = lgb.LGBMClassifier(
        num_leaves=trial.suggest_int('num_leaves', 20, 150),
        max_depth=trial.suggest_int('max_depth', 5, 20),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        random_state=42
    )
    return _cross_validated_cost(model, "LightGBM")

# Model optimisation: one Optuna study per model family. Each study is kept
# long enough to record its best hyperparameters in `best_model_results`;
# discarding the study would lose the parameters of the recommended model.
objectives = {
    "Logistic Regression": logistic_regression_optimization,
    "Random Forest": random_forest_optimization,
    "LightGBM": lightgbm_optimization,
}
for model_name, objective in objectives.items():
    print(f"Optimizing {model_name}")
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=nb_runs)
    # Parameters and cost of the study's lowest-cost trial.
    best_model_results[model_name] = {
        "params": study.best_params,
        "Business Score": study.best_value,
    }

# Collect every trial's metrics and rank them: lowest business cost first,
# ties broken by higher AUC, then higher accuracy.
results_df = pd.DataFrame(results)
results_df_sorted = results_df.sort_values(by=['Business Score', 'AUC', 'Accuracy'], ascending=[True, False, False])
print("\nTop Models Comparison:\n", results_df_sorted.head(3))

# Report the recommended model (top-ranked row) and its tuned hyperparameters.
best_model = results_df_sorted.iloc[0]
print(f"\nLe modèle recommandé pour la production est {best_model['Model']} avec AUC: {best_model['AUC']}, Accuracy: {best_model['Accuracy']}, et Business Score: {best_model['Business Score']}")
print(f"Meilleurs hyperparamètres pour {best_model['Model']}: {best_model_results[best_model['Model']]['params']}")

In [None]:
from sklearn.model_selection import train_test_split

# Load the data from `df_clean_imputed`.
# Assumes `df_clean_imputed` holds the features plus a 'target' column.
df_clean_imputed = pd.read_csv('df_clean_imputed.csv')  # Adjust to the correct path

# Separate the features (X) from the target (y).
# Replace 'target' with the real name of the target column.
X = df_clean_imputed.drop(columns=['target'])
y = df_clean_imputed['target']

# Stratified split: 1% goes to the train/eval subset, the rest to the final set.
split_options = dict(
    train_size=0.01,  # 1% for training and validation
    stratify=y,
    random_state=42,
)
splits = train_test_split(X, y, **split_options)
X_train_eval, X_final, y_train_eval, y_final = splits

# Print the shape of each part as a sanity check.
for label, part in (
    ("Taille de X_train_eval:", X_train_eval),
    ("Taille de y_train_eval:", y_train_eval),
    ("Taille de X_final:", X_final),
    ("Taille de y_final:", y_final),
):
    print(label, part.shape)

# From here on, use X_train_eval / y_train_eval for modelling and
# X_final / y_final for the final evaluation.
