In [None]:
#import des bibliothèques 

import pandas as pd 
from sklearn.model_selection import train_test_split 

# Chargement des données depuis `df_clean_imputed`
df_clean_imputed = pd.read_csv('df_clean_imputed.csv')  

# Séparer les features (X) et la cible (y)
X = df_clean_imputed.drop(columns=['TARGET'])  
y = df_clean_imputed['TARGET']

# Première division : 1% pour l'entraînement et validation (X_train_eval, y_train_eval) et 99% pour le reste
X_train_eval, X_remaining, y_train_eval, y_remaining = train_test_split(
    X, y,
    train_size=0.01,
    stratify=y,
    random_state=42
)

# Deuxième division : 1% pour l'API (X_api, y_api) et 98% pour le jeu final
X_api, X_final, y_api, y_final = train_test_split(
    X_remaining, y_remaining,
    train_size=0.01 / 0.99,  # Calculé pour obtenir 1% de l'original sur les données restantes
    stratify=y_remaining,
    random_state=42
)

# Vérification des dimensions pour confirmation
print("Taille de X_train_eval:", X_train_eval.shape)
print("Taille de y_train_eval:", y_train_eval.shape)
print("Taille de X_api:", X_api.shape)
print("Taille de y_api:", y_api.shape)
print("Taille de X_final:", X_final.shape)
print("Taille de y_final:", y_final.shape)

# X_train_eval et y_train_eval : Utilisés pour l'entraînement et la validation des modèles
# X_api et y_api : Échantillon pour tester l'API
# X_final et y_final : Jeu de données final pour évaluation


  from .autonotebook import tqdm as notebook_tqdm


Taille de X_train_eval: (3075, 625)
Taille de y_train_eval: (3075,)
Taille de X_api: (3075, 625)
Taille de y_api: (3075,)
Taille de X_final: (301357, 625)
Taille de y_final: (301357,)


In [2]:
X_train_eval.to_csv('X_train_eval2.csv', index=False)
y_train_eval.to_csv('y_train_eval2.csv', index=False)
X_api.to_csv('X_api.csv', index=False)
y_api.to_csv('y_api.csv', index=False)
X_final.to_csv('X_final2.csv', index=False)
y_final.to_csv('y_final2.csv', index=False)

In [8]:
#2 eme essai
import time
import logging
import gc
import mlflow
import optuna
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, roc_curve
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Initialisation
smote = SMOTE()
cv = StratifiedKFold(n_splits=5)
results = []
nb_runs = 15
logging.getLogger('optuna').setLevel(logging.CRITICAL)

# Chronomètre d'entraînement
start_time = time.time()

# Définir l'expérience MLflow
mlflow.set_experiment('Logistic_Regression')

# Chargement des fichiers  :
X_train_eval = pd.read_csv('X_train_eval2.csv')
y_train_eval = pd.read_csv('y_train_eval2.csv')
X_final = pd.read_csv('X_final2.csv')
y_final = pd.read_csv('y_final2.csv')
X_api = pd.read_csv('X_api.csv')
y_api = pd.read_csv('y_api.csv')

# Standardisation des données
scaler = StandardScaler()

# Applique la transformation sur les données d'entraînement et de test
X_train_eval_scaled = scaler.fit_transform(X_train_eval)
X_final_scaled = scaler.transform(X_final)

# Sauvegarde du scaler pour réutilisation ultérieure
#joblib.dump(scaler, 'scaler.joblib')

# Conversion en float32 pour économiser la mémoire
X_final_scaled = X_final_scaled.astype(np.float32)

# Fonction de régression logistique pour l'optimisation Optuna
def logistic_r(trial):
    model = LogisticRegression(
        C=trial.suggest_float('C', 50, 200, log=True),
        solver='saga',
        max_iter=trial.suggest_int('max_iter', 100, 1000),
        random_state=42,
        n_jobs=-1
    )

    auc_scores, acc_scores, cost_scores = [], [], []

    # Validation croisée avec suréchantillonnage SMOTE
    for train_idx, test_idx in cv.split(X_train_eval_scaled, y_train_eval):
        X_train, X_test = X_train_eval_scaled[train_idx], X_train_eval_scaled[test_idx]
        y_train, y_test = y_train_eval.iloc[train_idx], y_train_eval.iloc[test_idx]
        X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
        model.fit(X_train_smote, y_train_smote)
        y_prob = model.predict_proba(X_test)[:, 1]
        y_pred = y_prob > 0.5

        # Calcul des métriques
        auc_scores.append(roc_auc_score(y_test, y_prob))
        acc_scores.append(accuracy_score(y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        cost_scores.append(fp + 10 * fn)

    auc = np.mean(auc_scores)
    acc = np.mean(acc_scores)
    cost = np.mean(cost_scores)

    results.append({"AUC": auc, "Accuracy": acc, "Business Score": cost})

    # Enregistrement dans MLflow
    with mlflow.start_run():
        mlflow.log_params(trial.params)
        mlflow.log_metrics({"AUC": auc, "Accuracy": acc, "Business Score": cost})
        
        # Courbe ROC
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        plt.figure(figsize=(10, 7))
        plt.plot(fpr, tpr, label=f'AUC: {auc:.2f}')
        plt.title('ROC Curve')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc='lower right')
        plt.savefig("roc_curve_lr.png")
        plt.close()
        mlflow.log_artifact("roc_curve_lr.png")

        mlflow.sklearn.log_model(model, "logistic_regression_model")

    gc.collect()
    return cost

# Optimisation avec Optuna
study_lr = optuna.create_study(direction='minimize')
study_lr.optimize(logistic_r, n_trials=nb_runs)

# Affichage du temps d'entraînement
end_time = time.time()
print(f"Temps d'exécution total: {end_time - start_time:.2f} secondes")

# Meilleurs résultats et paramètres
best_params_lr = study_lr.best_params
best_auc_lr = study_lr.best_value
best_acc_lr = max([res['Accuracy'] for res in results])
best_cost_lr = min([res['Business Score'] for res in results])

# Affichage des meilleurs paramètres
print(study_lr.best_params)
with mlflow.start_run():
    mlflow.log_params(best_params_lr)
joblib.dump(best_params_lr, 'best_params_lr.pkl')

# Entraînement final avec les meilleurs paramètres
start_time = time.time()
final_model_lr = LogisticRegression(**best_params_lr, random_state=42)
X_train_final_smote_lr, y_train_final_smote_lr = smote.fit_resample(X_final_scaled, y_final)
final_model_lr.fit(X_train_final_smote_lr, y_train_final_smote_lr)

# Affichage du temps d'entraînement final
end_time = time.time()
print(f"Temps d'exécution total: {end_time - start_time:.2f} secondes")

# Application du même scaler sur X_api (données de test final) avant la prédiction
X_api_scaled = scaler.transform(X_api)

# Prédictions avec le modèle final
y_prob_final_lr = final_model_lr.predict_proba(X_api_scaled)[:, 1]
y_pred_final_lr = y_prob_final_lr > 0.5

# Enregistrement du modèle final
joblib.dump(final_model_lr, 'logistic_regression_model_f.joblib')

# Sauvegarde des résultats
results_df = pd.DataFrame(results)
results_df_sorted = results_df.sort_values(by='Accuracy', ascending=False)
results_df.to_csv('results_logistic_regression.csv', index=False)

print("Les résultats ont été exportés dans 'results_logistic_regression.csv'")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Temps d'exécution total: 1568.40 secondes
{'C': 50.90201045762875, 'max_iter': 102}


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Temps d'exécution total: 154.30 secondes
Les résultats ont été exportés dans 'results_logistic_regression.csv'


In [9]:
# Prédictions finales et métriques
#y_prob_final_lr = final_model_lr.predict_proba(X_api_scaled)[:, 1]
#y_pred_final_lr = y_prob_final_lr > 0.5

# Calcul des métriques finales
final_auc = roc_auc_score(y_api, y_prob_final_lr)
final_accuracy = accuracy_score(y_api, y_pred_final_lr)
tn, fp, fn, tp = confusion_matrix(y_api, y_pred_final_lr).ravel()
final_cost = fp + 10 * fn  

print("Métriques du modèle logistic_regression final avec les meilleurs paramètres:")
print(f"AUC: {final_auc:.4f}")
print(f"Accuracy: {final_accuracy:.4f}")
print(f"Business Score: {final_cost:.4f}")

# Enregistrement des métriques finales dans MLflow
with mlflow.start_run():
    mlflow.log_metrics({"Final AUC logistic_regression": final_auc, "Final Accuracy logistic_regression": final_accuracy, "Final Business Score logistic_regression": final_cost})


Métriques du modèle logistic_regression final avec les meilleurs paramètres:
AUC: 0.7066
Accuracy: 0.5447
Business Score: 1904.0000


In [10]:
#2 eme essai
#trier par Business Score
results_df_sorted = results_df.sort_values(by='Business Score', ascending=False)
results_df_sorted

Unnamed: 0,AUC,Accuracy,Business Score
9,0.684334,0.790244,415.2
4,0.681837,0.798374,412.0
3,0.684937,0.793171,411.6
1,0.68214,0.796748,411.2
0,0.684509,0.798374,408.4
6,0.683714,0.798699,408.2
7,0.687786,0.790569,406.0
2,0.688162,0.79122,405.6
13,0.688434,0.798049,403.2
5,0.691538,0.791545,398.2


In [None]:
import time
import logging
import gc
import mlflow
import optuna
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, roc_curve
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Initialisation
smote = SMOTE()
cv = StratifiedKFold(n_splits=5)
results = []
nb_runs = 15
logging.getLogger('optuna').setLevel(logging.CRITICAL)

# Chronomètre d'entraînement
start_time = time.time()

# Définir l'expérience MLflow
mlflow.set_experiment('Logistic_Regression')

# Chargement des fichiers  :
X_train_eval = pd.read_csv('X_train_eval2.csv')
y_train_eval = pd.read_csv('y_train_eval2.csv')
X_final = pd.read_csv('X_final2.csv')
y_final = pd.read_csv('y_final2.csv')
X_api = pd.read_csv('X_api.csv')
y_api = pd.read_csv('y_api.csv')

# Standardisation des données
scaler = StandardScaler()

# Applique la transformation sur les données d'entraînement et de test
X_train_eval_scaled = scaler.fit_transform(X_train_eval)
X_final_scaled = scaler.transform(X_final)

# Sauvegarde du scaler pour réutilisation ultérieure
#joblib.dump(scaler, 'scaler.joblib')

# Conversion en float32 pour économiser la mémoire
X_final_scaled = X_final_scaled.astype(np.float32)

# Fonction de régression logistique pour l'optimisation Optuna
def logistic_r(trial):
    model = LogisticRegression(
        C=trial.suggest_float('C', 50, 200, log=True),
        solver='saga',
        max_iter=trial.suggest_int('max_iter', 100, 1000),
        random_state=42,
        n_jobs=-1
    )

    auc_scores, acc_scores, cost_scores = [], [], []

    # Validation croisée avec suréchantillonnage SMOTE
    for train_idx, test_idx in cv.split(X_train_eval_scaled, y_train_eval):
        X_train, X_test = X_train_eval_scaled[train_idx], X_train_eval_scaled[test_idx]
        y_train, y_test = y_train_eval.iloc[train_idx], y_train_eval.iloc[test_idx]
        X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
        model.fit(X_train_smote, y_train_smote)
        y_prob = model.predict_proba(X_test)[:, 1]
        y_pred = y_prob > 0.5

        # Calcul des métriques
        auc_scores.append(roc_auc_score(y_test, y_prob))
        acc_scores.append(accuracy_score(y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        cost_scores.append(fp + 10 * fn)

    auc = np.mean(auc_scores)
    acc = np.mean(acc_scores)
    cost = np.mean(cost_scores)

    results.append({"AUC": auc, "Accuracy": acc, "Business Score": cost})

    # Enregistrement dans MLflow
    with mlflow.start_run():
        mlflow.log_params(trial.params)
        mlflow.log_metrics({"AUC": auc, "Accuracy": acc, "Business Score": cost})
        
        # Courbe ROC
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        plt.figure(figsize=(10, 7))
        plt.plot(fpr, tpr, label=f'AUC: {auc:.2f}')
        plt.title('ROC Curve')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc='lower right')
        plt.savefig("roc_curve_lr.png")
        plt.close()
        mlflow.log_artifact("roc_curve_lr.png")

        mlflow.sklearn.log_model(model, "logistic_regression_model")

    gc.collect()
    return cost

# Optimisation avec Optuna
study_lr = optuna.create_study(direction='minimize')
study_lr.optimize(logistic_r, n_trials=nb_runs)

# Affichage du temps d'entraînement
end_time = time.time()
print(f"Temps d'exécution total: {end_time - start_time:.2f} secondes")

# Meilleurs résultats et paramètres
best_params_lr = study_lr.best_params
best_auc_lr = study_lr.best_value
best_acc_lr = max([res['Accuracy'] for res in results])
best_cost_lr = min([res['Business Score'] for res in results])

# Affichage des meilleurs paramètres
print(study_lr.best_params)
with mlflow.start_run():
    mlflow.log_params(best_params_lr)
joblib.dump(best_params_lr, 'best_params_lr.pkl')

# Entraînement final avec les meilleurs paramètres
start_time = time.time()
final_model_lr = LogisticRegression(**best_params_lr, random_state=42)
X_train_final_smote_lr, y_train_final_smote_lr = smote.fit_resample(X_final_scaled, y_final)
final_model_lr.fit(X_train_final_smote_lr, y_train_final_smote_lr)

# Affichage du temps d'entraînement final
end_time = time.time()
print(f"Temps d'exécution total: {end_time - start_time:.2f} secondes")

# Application du même scaler sur X_api (données de test final) avant la prédiction
X_api_scaled = scaler.transform(X_api)

# Prédictions avec le modèle final
y_prob_final_lr = final_model_lr.predict_proba(X_api_scaled)[:, 1]
y_pred_final_lr = y_prob_final_lr > 0.5

# Enregistrement du modèle final
joblib.dump(final_model_lr, 'logistic_regression_model_f.joblib')

# Sauvegarde des résultats
results_df = pd.DataFrame(results)
results_df_sorted = results_df.sort_values(by='Accuracy', ascending=False)
results_df.to_csv('results_logistic_regression.csv', index=False)

print("Les résultats ont été exportés dans 'results_logistic_regression.csv'")


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown 

Temps d'exécution total: 2092.88 secondes
{'C': 198.33101299775342, 'solver': 'liblinear', 'max_iter': 638}


  y = column_or_1d(y, warn=True)


Temps d'exécution total: 8866.44 secondes
Les résultats ont été exportés dans 'results_logistic_regression.csv'


In [12]:
# Prédictions finales et métriques
#y_prob_final_lr = final_model_lr.predict_proba(X_api_scaled)[:, 1]
#y_pred_final_lr = y_prob_final_lr > 0.5

# Calcul des métriques finales
final_auc = roc_auc_score(y_api, y_prob_final_lr)
final_accuracy = accuracy_score(y_api, y_pred_final_lr)
tn, fp, fn, tp = confusion_matrix(y_api, y_pred_final_lr).ravel()
final_cost = fp + 10 * fn  

print("Métriques du modèle logistic_regression final avec les meilleurs paramètres:")
print(f"AUC: {final_auc:.4f}")
print(f"Accuracy: {final_accuracy:.4f}")
print(f"Business Score: {final_cost:.4f}")

# Enregistrement des métriques finales dans MLflow
with mlflow.start_run():
    mlflow.log_metrics({"Final AUC logistic_regression": final_auc, "Final Accuracy logistic_regression": final_accuracy, "Final Business Score logistic_regression": final_cost})


Métriques du modèle logistic_regression final avec les meilleurs paramètres:
AUC: 0.7534
Accuracy: 0.6059
Business Score: 1716.0000


In [None]:
results_df

Unnamed: 0,AUC,Accuracy,Business Score
0,0.637121,0.818211,426.8
1,0.64306,0.814634,418.2
2,0.646018,0.812358,426.8
3,0.640788,0.819837,427.6
4,0.646863,0.813333,426.2
5,0.646549,0.813984,422.2
6,0.647015,0.814309,420.2
7,0.639624,0.818211,426.8
8,0.641552,0.814309,416.6
9,0.64721,0.814634,421.8


In [10]:
#trier par AUC
results_df_sorted = results_df.sort_values(by='AUC', ascending=False)
results_df_sorted

Unnamed: 0,AUC,Accuracy,Business Score
9,0.64721,0.814634,421.8
6,0.647015,0.814309,420.2
4,0.646863,0.813333,426.2
5,0.646549,0.813984,422.2
2,0.646018,0.812358,426.8
14,0.643984,0.812683,421.2
1,0.64306,0.814634,418.2
12,0.642031,0.813984,418.6
13,0.641975,0.814634,416.4
8,0.641552,0.814309,416.6


In [None]:
#trier par Business Score
results_df_sorted = results_df.sort_values(by='Business Score', ascending=False)
results_df_sorted

In [3]:
import time
import logging
import gc
import mlflow
import optuna
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, roc_curve
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Initialisation
smote = SMOTE()
cv = StratifiedKFold(n_splits=5)
results_rf = []
nb_runs = 15
logging.getLogger('optuna').setLevel(logging.CRITICAL)

# Chronomètre d'entraînement
start_time = time.time()

# Définir l'expérience MLflow pour la forêt aléatoire
mlflow.set_experiment('Random_Forest')

# Chargement des fichiers
#X_train_eval = pd.read_csv('X_train_eval2.csv')
#y_train_eval = pd.read_csv('y_train_eval2.csv')
#X_final = pd.read_csv('X_final2.csv')
#y_final = pd.read_csv('y_final2.csv')
#X_api = pd.read_csv('X_api.csv')

y_train_eval = y_train_eval.iloc[:, 0].values.ravel()  # Récupère la première colonne et aplatit
y_final = y_final.iloc[:, 0].values.ravel()

# Standardisation des données
scaler = StandardScaler()
X_train_eval_scaled = scaler.fit_transform(X_train_eval)
X_final_scaled = scaler.transform(X_final)
#joblib.dump(scaler, 'scaler.joblib')

# Conversion en float32 pour économiser la mémoire
X_final_scaled = X_final_scaled.astype(np.float32)

# Fonction de forêt aléatoire pour l'optimisation Optuna
def random_forest_r(trial):
    model = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 100, 200, log=True),
        max_depth=trial.suggest_int('max_depth', 5, 30),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        random_state=42,
        n_jobs=-1  #tous les cœurs pour l'entraînement parallèle
    )

    auc_scores, acc_scores, cost_scores = [], [], []

    # Validation croisée avec suréchantillonnage SMOTE
    for train_idx, test_idx in cv.split(X_train_eval_scaled, y_train_eval):
        X_train, X_test = X_train_eval_scaled[train_idx], X_train_eval_scaled[test_idx]
        y_train, y_test = y_train_eval[train_idx], y_train_eval[test_idx]
        X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
        model.fit(X_train_smote, y_train_smote)
        y_prob = model.predict_proba(X_test)[:, 1]
        y_pred = y_prob > 0.5

        # Calcul des métriques
        auc_scores.append(roc_auc_score(y_test, y_prob))
        acc_scores.append(accuracy_score(y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        cost_scores.append(fp + 10 * fn)

    auc = np.mean(auc_scores)
    acc = np.mean(acc_scores)
    cost = np.mean(cost_scores)

    results_rf.append({"AUC": auc, "Accuracy": acc, "Business Score": cost})

    # Enregistrement dans MLflow
    with mlflow.start_run():
        mlflow.log_params(trial.params)
        mlflow.log_metrics({"AUC": auc, "Accuracy": acc, "Business Score": cost})
        
        # Courbe ROC
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        plt.figure(figsize=(10, 7))
        plt.plot(fpr, tpr, label=f'AUC: {auc:.2f}')
        plt.title('ROC Curve')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc='lower right')
        plt.savefig("roc_curve_rf.png")
        plt.close()
        mlflow.log_artifact("roc_curve_rf.png")

        mlflow.sklearn.log_model(model, "random_forest_model", input_example=X_train_eval_scaled[:5])


    gc.collect()
    return cost

# Optimisation avec Optuna
study_rf = optuna.create_study(direction='minimize')
study_rf.optimize(random_forest_r, n_trials=nb_runs)

# Affichage du temps d'entraînement
end_time = time.time()
print(f"Temps d'exécution total: {end_time - start_time:.2f} secondes")

# Meilleurs résultats et paramètres
best_params_rf = study_rf.best_params
best_auc_rf = study_rf.best_value
best_acc_rf = max([res['Accuracy'] for res in results_rf])
best_cost_rf = min([res['Business Score'] for res in results_rf])

# Affichage des meilleurs paramètres
print(study_rf.best_params)
with mlflow.start_run():
    mlflow.log_params(best_params_rf)
joblib.dump(best_params_rf, 'best_params_rf.pkl')

# Entraînement final avec les meilleurs paramètres
start_time = time.time()
final_model_rf = RandomForestClassifier(**best_params_rf, random_state=42, n_jobs=-1)
X_train_final_smote_rf, y_train_final_smote_rf = smote.fit_resample(X_final_scaled, y_final)
final_model_rf.fit(X_train_final_smote_rf, y_train_final_smote_rf)

# Affichage du temps d'entraînement final
end_time = time.time()
print(f"Temps d'exécution total: {end_time - start_time:.2f} secondes")

# Application du même scaler sur X_api (données de test final) avant la prédiction
X_api_scaled = scaler.transform(X_api)

# Prédictions avec le modèle final
y_prob_final_rf = final_model_rf.predict_proba(X_api_scaled)[:, 1]
y_pred_final_rf = y_prob_final_rf > 0.5

# Enregistrement du modèle final
joblib.dump(final_model_rf, 'random_forest_model_f.joblib')

# Sauvegarde des résultats
results_df_rf = pd.DataFrame(results_rf)
results_df_rf_sorted = results_df_rf.sort_values(by='Accuracy', ascending=False)
results_df_rf.to_csv('results_random_forest.csv', index=False)

print("Les résultats ont été exportés dans 'results_random_forest.csv'")

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1400.24it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 413.94it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 916.82it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 350.67it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 389.84it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 422.11it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1050.04it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1169.68it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 398.13it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<?, ?it/s]


Temps d'exécution total: 2116.54 secondes
{'n_estimators': 200, 'max_depth': 5, 'min_samples_split': 10, 'max_features': 'log2'}
Temps d'exécution total: 315.80 secondes
Les résultats ont été exportés dans 'results_random_forest.csv'


In [5]:
# Calcul des métriques finales
y_api = pd.read_csv('y_api.csv')

final_auc = roc_auc_score(y_api, y_prob_final_rf)
final_accuracy = accuracy_score(y_api, y_pred_final_rf)
tn, fp, fn, tp = confusion_matrix(y_api, y_pred_final_rf).ravel()
final_cost = fp + 10 * fn  

print("Métriques du modèle random_forest final avec les meilleurs paramètres:")
print(f"AUC: {final_auc:.4f}")
print(f"Accuracy: {final_accuracy:.4f}")
print(f"Business Score: {final_cost:.4f}")

# Enregistrement des métriques finales dans MLflow
with mlflow.start_run():
    mlflow.log_metrics({"Final AUC random_forest": final_auc, "Final Accuracy random_forest": final_accuracy, "Final Business Score random_forest": final_cost})


Métriques du modèle random_forest final avec les meilleurs paramètres:
AUC: 0.6436
Accuracy: 0.8117
Business Score: 2190.0000


In [6]:
#trier par Business Score
results_df_rf_sorted = results_df_rf.sort_values(by='Business Score', ascending=False)
results_df_rf_sorted

Unnamed: 0,AUC,Accuracy,Business Score
4,0.694783,0.913821,490.4
9,0.697098,0.91252,489.4
6,0.708584,0.914797,488.0
7,0.701605,0.914472,486.4
13,0.694973,0.908293,483.0
0,0.694018,0.909268,478.8
14,0.699841,0.902439,477.6
3,0.672151,0.909593,471.4
1,0.685633,0.894959,469.6
5,0.681303,0.905691,464.8


In [4]:
print(X_train_eval.shape)
print(y_train_eval.shape)


(3075, 625)
(3075, 1)


In [1]:
import time
import logging
import gc
import mlflow
import optuna
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, roc_curve
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Initialisation
smote = SMOTE(random_state=42)
cv = StratifiedKFold(n_splits=5)
results_l = []
nb_runs = 15
logging.getLogger('optuna').setLevel(logging.CRITICAL)

# Chronomètre d'entraînement
start_time = time.time()

# Définir l'expérience MLflow pour LightGBM
mlflow.set_experiment('LightGBM')

# Chargement des fichiers
X_train_eval = pd.read_csv('X_train_eval2.csv')
y_train_eval = pd.read_csv('y_train_eval2.csv')
X_final = pd.read_csv('X_final2.csv')
y_final = pd.read_csv('y_final2.csv')
X_api = pd.read_csv('X_api.csv')

# Standardisation des données
scaler = StandardScaler()
X_train_eval_scaled = scaler.fit_transform(X_train_eval)
X_final_scaled = scaler.transform(X_final)
#joblib.dump(scaler, 'scaler.joblib')

# Conversion en float32 pour économiser la mémoire
X_final_scaled = X_final_scaled.astype(np.float32)

y_train_eval = y_train_eval.values.ravel()  

# Fonction de LightGBM pour l'optimisation Optuna
def lightgbm_r(trial):
    model_lgbm = LGBMClassifier(
        force_col_wise=True,
        n_estimators=trial.suggest_int('n_estimators', 100, 1000, log=True),
        learning_rate=trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        num_leaves=trial.suggest_int('num_leaves', 30, 50),
        max_depth=trial.suggest_int('max_depth', 3, 15),
        min_child_samples=trial.suggest_int('min_child_samples', 10, 100),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
        random_state=42,
        n_jobs=-1
    )

    auc_scores, acc_scores, cost_scores = [], [], []

    # Validation croisée avec suréchantillonnage SMOTE
    for train_idx, test_idx in cv.split(X_train_eval_scaled, y_train_eval):
        X_train, X_test = X_train_eval_scaled[train_idx], X_train_eval_scaled[test_idx]
        y_train, y_test = y_train_eval[train_idx], y_train_eval[test_idx]
        X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
        model_lgbm.fit(X_train_smote, y_train_smote)
        y_prob = model_lgbm.predict_proba(X_test)[:, 1]
        y_pred = y_prob > 0.5

        # Calcul des métriques
        auc_scores.append(roc_auc_score(y_test, y_prob))
        acc_scores.append(accuracy_score(y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        cost_scores.append(fp + 10 * fn)

    auc = np.mean(auc_scores)
    acc = np.mean(acc_scores)
    cost = np.mean(cost_scores)

    results_l.append({"AUC": auc, "Accuracy": acc, "Business Score": cost})

    # Enregistrement dans MLflow
    with mlflow.start_run():
        mlflow.log_params(trial.params)
        mlflow.log_metrics({"AUC": auc, "Accuracy": acc, "Business Score": cost})
        
        # Courbe ROC
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        plt.figure(figsize=(10, 7))
        plt.plot(fpr, tpr, label=f'AUC: {auc:.2f}')
        plt.title('ROC Curve')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc='lower right')
        plt.savefig("roc_curve_lgbm.png")
        plt.close()
        mlflow.log_artifact("roc_curve_lgbm.png")

        mlflow.lightgbm.log_model(model_lgbm, "lightgbm_model", input_example=X_train_eval_scaled[:5])

    gc.collect()
    return cost

# Optimisation avec Optuna
study_lgbm = optuna.create_study(direction='minimize')
study_lgbm.optimize(lightgbm_r, n_trials=nb_runs)

# Affichage du temps d'entraînement
end_time = time.time()
print(f"Temps d'exécution total: {end_time - start_time:.2f} secondes")

# Meilleurs résultats et paramètres
best_params_lgbm = study_lgbm.best_params
best_auc_lgbm = study_lgbm.best_value
best_acc_lgbm = max([res['Accuracy'] for res in results_l])
best_cost_lgbm = min([res['Business Score'] for res in results_l])

# Affichage des meilleurs paramètres
print(study_lgbm.best_params)
with mlflow.start_run():
    mlflow.log_params(best_params_lgbm)
joblib.dump(best_params_lgbm, 'best_params_lgbm.pkl')

# Entraînement final avec les meilleurs paramètres
start_time = time.time()
final_model_lgbm = LGBMClassifier(**best_params_lgbm, random_state=42, n_jobs=-1)
X_train_final_smote_lgbm, y_train_final_smote_lgbm = smote.fit_resample(X_final_scaled, y_final)
final_model_lgbm.fit(X_train_final_smote_lgbm, y_train_final_smote_lgbm)

# Affichage du temps d'entraînement final
end_time = time.time()
print(f"Temps d'exécution total: {end_time - start_time:.2f} secondes")

# Application du même scaler sur X_api (données de test final) avant la prédiction
X_api_scaled = scaler.transform(X_api)

# Prédictions avec le modèle final
y_prob_final_lgbm = final_model_lgbm.predict_proba(X_api_scaled)[:, 1]
y_pred_final_lgbm = y_prob_final_lgbm > 0.5

# Enregistrement du modèle final
joblib.dump(final_model_lgbm, 'lightgbm_model_f.joblib')

# Sauvegarde des résultats
results_df_l = pd.DataFrame(results_l)
results_df_l_sorted = results_df_l.sort_values(by='Accuracy', ascending=False)
results_df_l.to_csv('results_lightgbm.csv', index=False)

print("Les résultats ont été exportés dans 'results_lightgbm.csv'")


  from .autonotebook import tqdm as notebook_tqdm
2024/11/07 22:00:40 INFO mlflow.tracking.fluent: Experiment with name 'LightGBM' does not exist. Creating a new experiment.


[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 103960
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 548
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 104206
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 550
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104406
[LightGBM] [Info] Number of data points in the train set: 4524, number of used features: 548
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104411
[LightGBM] [Info] Number of data points in the train set: 4524, 

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1000.75it/s]


[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 103978
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 550
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 104215
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 551
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104409
[LightGBM] [Info] Number of data points in the train set: 4524, number of used features: 549
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104420
[LightGBM] [Info] Number of data points in the train set: 4524, 

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1167.45it/s]


[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 103784
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 532
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 104015
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 532
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104216
[LightGBM] [Info] Number of data points in the train set: 4524, number of used features: 534
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104230
[LightGBM] [Info] Number of data points in the train set: 4524, 

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1000.52it/s]


[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 103658
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 525
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 103944
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 527
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104177
[LightGBM] [Info] Number of data points in the train set: 4524, number of used features: 531
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104110
[LightGBM] [Info] Number of data points in the train set: 4524, 

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1400.84it/s]


[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 104041
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 558
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 104265
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 559
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104447
[LightGBM] [Info] Number of data points in the train set: 4524, number of used features: 556
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104464
[LightGBM] [Info] Number of data points in the train set: 4524, 

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1000.79it/s]


[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 103895
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 541
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 104110
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 540
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104333
[LightGBM] [Info] Number of data points in the train set: 4524, number of used features: 542
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104302
[LightGBM] [Info] Number of data points in the train set: 4524, 

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1000.28it/s]


[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 103823
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 536
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 104015
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 532
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104278
[LightGBM] [Info] Number of data points in the train set: 4524, number of used features: 538
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104262
[LightGBM] [Info] Number of data points in the train set: 4524, 

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1750.75it/s]


[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 104115
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 572
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 104352
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 574
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104531
[LightGBM] [Info] Number of data points in the train set: 4524, number of used features: 571
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104561
[LightGBM] [Info] Number of data points in the train set: 4524, 

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1167.68it/s]


[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 103679
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 526
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 103967
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 528
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104197
[LightGBM] [Info] Number of data points in the train set: 4524, number of used features: 532
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104153
[LightGBM] [Info] Number of data points in the train set: 4524, 

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 875.48it/s] 


[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 103696
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 527
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 103967
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 528
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104197
[LightGBM] [Info] Number of data points in the train set: 4524, number of used features: 532
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104153
[LightGBM] [Info] Number of data points in the train set: 4524, 

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1400.84it/s]


[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 104132
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 576
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 104378
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 581
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104562
[LightGBM] [Info] Number of data points in the train set: 4524, number of used features: 579
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104590
[LightGBM] [Info] Number of data points in the train set: 4524, 

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1400.77it/s]


[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 103988
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 551
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 104215
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 551
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104416
[LightGBM] [Info] Number of data points in the train set: 4524, number of used features: 550
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104420
[LightGBM] [Info] Number of data points in the train set: 4524, 

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 778.25it/s]


[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 104016
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 555
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 104244
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 555
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104423
[LightGBM] [Info] Number of data points in the train set: 4524, number of used features: 551
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104450
[LightGBM] [Info] Number of data points in the train set: 4524, 

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1400.57it/s]


[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 104055
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 561
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 104265
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 559
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104450
[LightGBM] [Info] Number of data points in the train set: 4524, number of used features: 557
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104464
[LightGBM] [Info] Number of data points in the train set: 4524, 

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1400.90it/s]


[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 103913
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 543
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2261, number of negative: 2261
[LightGBM] [Info] Total Bins 104134
[LightGBM] [Info] Number of data points in the train set: 4522, number of used features: 542
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104372
[LightGBM] [Info] Number of data points in the train set: 4524, number of used features: 545
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2262, number of negative: 2262
[LightGBM] [Info] Total Bins 104355
[LightGBM] [Info] Number of data points in the train set: 4524, 

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 875.61it/s] 


Temps d'exécution total: 1126.67 secondes
{'n_estimators': 473, 'learning_rate': 0.001033040024280201, 'num_leaves': 30, 'max_depth': 3, 'min_child_samples': 30, 'subsample': 0.7935383680418213, 'colsample_bytree': 0.7864896193162279}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 277028, number of negative: 277028
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 4.465672 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 142264
[LightGBM] [Info] Number of data points in the train set: 554056, number of used features: 618
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Temps d'exécution total: 246.19 secondes
Les résultats ont été exportés dans 'results_lightgbm.csv'


In [2]:
# Calcul des métriques finales
y_api = pd.read_csv('y_api.csv')

# Calcul des métriques finales pour lgbm
final_auc = roc_auc_score(y_api, y_prob_final_lgbm)
final_accuracy = accuracy_score(y_api, y_pred_final_lgbm)
tn, fp, fn, tp = confusion_matrix(y_api, y_pred_final_lgbm).ravel()
final_cost = fp + 10 * fn  

print("Métriques du modèle lightgbm final avec les meilleurs paramètres:")
print(f"AUC: {final_auc:.4f}")
print(f"Accuracy: {final_accuracy:.4f}")
print(f"Business Score: {final_cost:.4f}")

# Enregistrement des métriques finales dans MLflow
with mlflow.start_run():
    mlflow.log_metrics({"Final AUC lightgbm": final_auc, "Final Accuracy lightgbm": final_accuracy, "Final Business Score lightgbm": final_cost})


Métriques du modèle lightgbm final avec les meilleurs paramètres:
AUC: 0.6556
Accuracy: 0.8098
Business Score: 2214.0000


In [3]:
#trier par Business Score
results_df_l_sorted = results_df_l.sort_values(by='Business Score', ascending=False)
results_df_l_sorted

Unnamed: 0,AUC,Accuracy,Business Score
1,0.718332,0.917398,490.0
6,0.719976,0.915122,486.0
7,0.723941,0.915447,484.0
0,0.726774,0.902764,477.4
10,0.711584,0.916748,476.0
5,0.70254,0.914797,468.2
13,0.691072,0.88748,458.0
2,0.68345,0.887154,456.4
8,0.688696,0.88813,452.2
3,0.670303,0.87187,451.4


In [4]:
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score

# Chargement des probabilités prédites et des vraies valeurs
y_prob_final_lgbm = final_model_lgbm.predict_proba(X_api_scaled)[:, 1]
y_api = pd.read_csv('y_api.csv')

# Plage de seuils à tester
seuils = [i / 100 for i in range(10, 90, 5)]  # seuils de 0.1 à 0.9 par pas de 0.05

# Variables pour stocker le meilleur seuil et le meilleur score métier
meilleur_seuil = 0.5
meilleur_cost = float("inf")  # Plus bas est le coût, meilleur est le résultat

for seuil in seuils:
    y_pred = (y_prob_final_lgbm >= seuil).astype(int)
    
    # Calcul du coût métier pour ce seuil
    tn, fp, fn, tp = confusion_matrix(y_api, y_pred).ravel()
    cost = fp + 10 * fn  # "Business Score" avec faux positifs et faux négatifs pondérés
    
    if cost < meilleur_cost:
        meilleur_cost = cost
        meilleur_seuil = seuil

print(f"Meilleur seuil pour le Business Score : {meilleur_seuil}")
print(f"Business Score optimal : {meilleur_cost}")



Meilleur seuil pour le Business Score : 0.45
Business Score optimal : 2062


In [5]:
# Prédictions finales avec le seuil optimal
y_pred_final_lgbm = (y_prob_final_lgbm >= meilleur_seuil).astype(int)

# Calcul des métriques finales
final_auc = roc_auc_score(y_api, y_prob_final_lgbm)
final_accuracy = accuracy_score(y_api, y_pred_final_lgbm)
tn, fp, fn, tp = confusion_matrix(y_api, y_pred_final_lgbm).ravel()
final_cost = fp + 10 * fn  

print("Métriques du modèle lightgbm final avec le meilleur seuil :")
print(f"AUC: {final_auc:.4f}")
print(f"Accuracy: {final_accuracy:.4f}")
print(f"Business Score: {final_cost:.4f}")

# Enregistrement des métriques finales dans MLflow
with mlflow.start_run():
    mlflow.log_metrics({
        "Final AUC lightgbm": final_auc, 
        "Final Accuracy lightgbm": final_accuracy, 
        "Final Business Score lightgbm": final_cost
    })


Métriques du modèle lightgbm final avec le meilleur seuil :
AUC: 0.6556
Accuracy: 0.5694
Business Score: 2062.0000


In [6]:
import optuna

# Fonction objective pour optimiser le seuil
def objective(trial):
    seuil = trial.suggest_float("seuil", 0.0, 1.0)
    y_pred = (y_prob_final_lgbm >= seuil).astype(int)
    
    tn, fp, fn, tp = confusion_matrix(y_api, y_pred).ravel()
    cost = fp + 10 * fn  # Business Score
    
    return cost

# Exécution de l'optimisation
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Résultats
meilleur_seuil = study.best_params['seuil']
meilleur_cost = study.best_value

print(f"Seuil optimal trouvé avec Optuna : {meilleur_seuil}")
print(f"Business Score optimal : {meilleur_cost}")


Seuil optimal trouvé avec Optuna : 0.4603532551603117
Business Score optimal : 2000.0


In [7]:
# Prédictions finales avec le seuil optimal
y_pred_final_lgbm = (y_prob_final_lgbm >= meilleur_seuil).astype(int)

# Calcul des métriques finales
final_auc = roc_auc_score(y_api, y_prob_final_lgbm)
final_accuracy = accuracy_score(y_api, y_pred_final_lgbm)
tn, fp, fn, tp = confusion_matrix(y_api, y_pred_final_lgbm).ravel()
final_cost = fp + 10 * fn  

print("Métriques du modèle lightgbm final avec le meilleur seuil :")
print(f"AUC: {final_auc:.4f}")
print(f"Accuracy: {final_accuracy:.4f}")
print(f"Business Score: {final_cost:.4f}")

# Enregistrement des métriques finales dans MLflow
with mlflow.start_run():
    mlflow.log_metrics({
        "Final AUC lightgbm": final_auc, 
        "Final Accuracy lightgbm": final_accuracy, 
        "Final Business Score lightgbm": final_cost
    })

Métriques du modèle lightgbm final avec le meilleur seuil :
AUC: 0.6556
Accuracy: 0.6364
Business Score: 2000.0000
