In [47]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import scipy.stats as stats
import pickle

from sklearn.model_selection import RandomizedSearchCV, train_test_split,StratifiedKFold
from sklearn.metrics import accuracy_score,make_scorer
from scipy.stats import randint, uniform, loguniform
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import label_binarize
from lightgbm import LGBMClassifier,early_stopping, log_evaluation
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, learning_curve
from sklearn.ensemble import RandomForestClassifier,StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import xgboost as xgb
from sklearn.utils.class_weight import compute_sample_weight


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.base import BaseEstimator, ClassifierMixin
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical


In [48]:
df_train = pd.read_csv(r"../data\OUT\diabetes_train.csv")
df_val = pd.read_csv(r"../data\OUT\diabetes_valid.csv")
df_test = pd.read_csv(r"../data\OUT\diabetes_test.csv")

In [49]:
df_train.columns

Index(['Pregnancies', 'Glucose', 'SkinThickness', 'Insulin', 'BMI',
       'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

# Split Train-Val-Test

In [50]:
X_train = df_train.drop('Outcome', axis=1)
y_train = df_train['Outcome']

X_val = df_val.drop('Outcome', axis=1)
y_val = df_val['Outcome']

X_test = df_test.drop('Outcome', axis=1)
y_test = df_test['Outcome']

In [51]:
def dataframe_metrics(model, X_val, y_val, model_name=None, df_metrics=None):
    # Prédire sur le jeu de validation
    y_pred = model.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average='weighted')
    recall = recall_score(y_val, y_pred, average='weighted')
    f1_class1 = f1_score(y_val, y_pred, pos_label=1)
    f1 = f1_score(y_val, y_pred, average='weighted')


    # Créer un DataFrame pour les résultats
    results = pd.DataFrame([{
        'Model': model_name,  # Utiliser le nom du modèle
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score class 1': f1_class1,
        'F1 Score global weighted': f1,
    }])

    # Ajouter les résultats au DataFrame existant (en utilisant pd.concat)
    if df_metrics is None:
        df_metrics = results
    else:
        df_metrics = pd.concat([df_metrics, results], ignore_index=True)

    return df_metrics


# Train without balancing

In [52]:
df_metrics_val= pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score class 1', 'F1 Score global weighted'])

In [53]:
# Autre modèle, par exemple un modèle LogisticRegression
logistic_reg = LogisticRegression(max_iter=1000, random_state=42)
logistic_reg.fit(X_train, y_train)

# Appeler à nouveau la fonction pour ajouter les métriques du modèle LogisticRegression
df_metrics_val = dataframe_metrics(logistic_reg,X_val, y_val, model_name="logistic_reg", df_metrics=df_metrics_val)
print(df_metrics_val)


          Model  Accuracy  Precision    Recall  F1 Score class 1  \
0  logistic_reg  0.804878    0.80196  0.804878          0.707317   

   F1 Score global weighted  
0                  0.802499  


  df_metrics = pd.concat([df_metrics, results], ignore_index=True)


In [54]:
# Premier modèle : Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)

# Appeler la fonction pour ajouter les métriques du premier modèle
df_metrics_val = dataframe_metrics(decision_tree,X_val, y_val, model_name="decision_tree",df_metrics=df_metrics_val)
print(df_metrics_val)

           Model  Accuracy  Precision    Recall  F1 Score class 1  \
0   logistic_reg  0.804878   0.801960  0.804878          0.707317   
1  decision_tree  0.715447   0.724255  0.715447          0.615385   

   F1 Score global weighted  
0                  0.802499  
1                  0.718675  


In [55]:
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(random_forest,X_val, y_val, model_name="random_forest",df_metrics=df_metrics_val)
print(df_metrics_val)

           Model  Accuracy  Precision    Recall  F1 Score class 1  \
0   logistic_reg  0.804878   0.801960  0.804878          0.707317   
1  decision_tree  0.715447   0.724255  0.715447          0.615385   
2  random_forest  0.780488   0.777579  0.780488          0.674699   

   F1 Score global weighted  
0                  0.802499  
1                  0.718675  
2                  0.778541  


In [56]:
adaboost = AdaBoostClassifier()
adaboost.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(adaboost, X_val, y_val, model_name="adaboost",df_metrics=df_metrics_val)
print(df_metrics_val)

           Model  Accuracy  Precision    Recall  F1 Score class 1  \
0   logistic_reg  0.804878   0.801960  0.804878          0.707317   
1  decision_tree  0.715447   0.724255  0.715447          0.615385   
2  random_forest  0.780488   0.777579  0.780488          0.674699   
3       adaboost  0.764228   0.758613  0.764228          0.632911   

   F1 Score global weighted  
0                  0.802499  
1                  0.718675  
2                  0.778541  
3                  0.758723  


In [57]:
xgboost = XGBClassifier(eval_metric='logloss')
xgboost.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(xgboost,X_val, y_val, model_name="xgboost",df_metrics=df_metrics_val)
print(df_metrics_val)

           Model  Accuracy  Precision    Recall  F1 Score class 1  \
0   logistic_reg  0.804878   0.801960  0.804878          0.707317   
1  decision_tree  0.715447   0.724255  0.715447          0.615385   
2  random_forest  0.780488   0.777579  0.780488          0.674699   
3       adaboost  0.764228   0.758613  0.764228          0.632911   
4        xgboost  0.731707   0.727850  0.731707          0.602410   

   F1 Score global weighted  
0                  0.802499  
1                  0.718675  
2                  0.778541  
3                  0.758723  
4                  0.729328  


In [58]:
lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(lgbm, X_val, y_val, model_name="lgbm",df_metrics=df_metrics_val)
print(df_metrics_val)

[LightGBM] [Info] Number of positive: 171, number of negative: 320
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000063 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 522
[LightGBM] [Info] Number of data points in the train set: 491, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.348269 -> initscore=-0.626657
[LightGBM] [Info] Start training from score -0.626657
           Model  Accuracy  Precision    Recall  F1 Score class 1  \
0   logistic_reg  0.804878   0.801960  0.804878          0.707317   
1  decision_tree  0.715447   0.724255  0.715447          0.615385   
2  random_forest  0.780488   0.777579  0.780488          0.674699   
3       adaboost  0.764228   0.758613  0.764228          0.632911   
4        xgboost  0.731707   0.727850  0.731707          0.602410   
5           lgbm  0.739837   0.737260  0.739837          0.619048   

   F1 Score global weighted  


In [59]:
linear_svm = SVC(kernel='linear', probability=True)
linear_svm.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(linear_svm, X_val, y_val, model_name="linear_svm",df_metrics=df_metrics_val)
print(df_metrics_val)

           Model  Accuracy  Precision    Recall  F1 Score class 1  \
0   logistic_reg  0.804878   0.801960  0.804878          0.707317   
1  decision_tree  0.715447   0.724255  0.715447          0.615385   
2  random_forest  0.780488   0.777579  0.780488          0.674699   
3       adaboost  0.764228   0.758613  0.764228          0.632911   
4        xgboost  0.731707   0.727850  0.731707          0.602410   
5           lgbm  0.739837   0.737260  0.739837          0.619048   
6     linear_svm  0.829268   0.828688  0.829268          0.727273   

   F1 Score global weighted  
0                  0.802499  
1                  0.718675  
2                  0.778541  
3                  0.758723  
4                  0.729328  
5                  0.738346  
6                  0.823837  


In [60]:
svm = SVC(probability=True)
svm.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(svm,X_val, y_val, model_name="svm",df_metrics=df_metrics_val)
print(df_metrics_val)

           Model  Accuracy  Precision    Recall  F1 Score class 1  \
0   logistic_reg  0.804878   0.801960  0.804878          0.707317   
1  decision_tree  0.715447   0.724255  0.715447          0.615385   
2  random_forest  0.780488   0.777579  0.780488          0.674699   
3       adaboost  0.764228   0.758613  0.764228          0.632911   
4        xgboost  0.731707   0.727850  0.731707          0.602410   
5           lgbm  0.739837   0.737260  0.739837          0.619048   
6     linear_svm  0.829268   0.828688  0.829268          0.727273   
7            svm  0.780488   0.775918  0.780488          0.649351   

   F1 Score global weighted  
0                  0.802499  
1                  0.718675  
2                  0.778541  
3                  0.758723  
4                  0.729328  
5                  0.738346  
6                  0.823837  
7                  0.773504  


In [61]:
KNN = KNeighborsClassifier(n_neighbors=5)
KNN.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(KNN,X_val, y_val, model_name="KNN",df_metrics=df_metrics_val)
print(df_metrics_val)

           Model  Accuracy  Precision    Recall  F1 Score class 1  \
0   logistic_reg  0.804878   0.801960  0.804878          0.707317   
1  decision_tree  0.715447   0.724255  0.715447          0.615385   
2  random_forest  0.780488   0.777579  0.780488          0.674699   
3       adaboost  0.764228   0.758613  0.764228          0.632911   
4        xgboost  0.731707   0.727850  0.731707          0.602410   
5           lgbm  0.739837   0.737260  0.739837          0.619048   
6     linear_svm  0.829268   0.828688  0.829268          0.727273   
7            svm  0.780488   0.775918  0.780488          0.649351   
8            KNN  0.723577   0.720801  0.723577          0.595238   

   F1 Score global weighted  
0                  0.802499  
1                  0.718675  
2                  0.778541  
3                  0.758723  
4                  0.729328  
5                  0.738346  
6                  0.823837  
7                  0.773504  
8                  0.721993  


In [62]:
df_metrics_val= df_metrics_val.reset_index(drop=True)
df_metrics_val.sort_values(by='F1 Score class 1', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score class 1,F1 Score global weighted
6,linear_svm,0.829268,0.828688,0.829268,0.727273,0.823837
0,logistic_reg,0.804878,0.80196,0.804878,0.707317,0.802499
2,random_forest,0.780488,0.777579,0.780488,0.674699,0.778541
7,svm,0.780488,0.775918,0.780488,0.649351,0.773504
3,adaboost,0.764228,0.758613,0.764228,0.632911,0.758723
5,lgbm,0.739837,0.73726,0.739837,0.619048,0.738346
1,decision_tree,0.715447,0.724255,0.715447,0.615385,0.718675
4,xgboost,0.731707,0.72785,0.731707,0.60241,0.729328
8,KNN,0.723577,0.720801,0.723577,0.595238,0.721993


# Train balanced

In [63]:
# Entraîner une régression logistique avec des poids équilibrés
logistic_reg_balanced = LogisticRegression(class_weight='balanced')
logistic_reg_balanced.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(logistic_reg_balanced,X_val, y_val, model_name="logistic_reg_balanced",df_metrics=df_metrics_val)
print(df_metrics_val)

                   Model  Accuracy  Precision    Recall  F1 Score class 1  \
0           logistic_reg  0.804878   0.801960  0.804878          0.707317   
1          decision_tree  0.715447   0.724255  0.715447          0.615385   
2          random_forest  0.780488   0.777579  0.780488          0.674699   
3               adaboost  0.764228   0.758613  0.764228          0.632911   
4                xgboost  0.731707   0.727850  0.731707          0.602410   
5                   lgbm  0.739837   0.737260  0.739837          0.619048   
6             linear_svm  0.829268   0.828688  0.829268          0.727273   
7                    svm  0.780488   0.775918  0.780488          0.649351   
8                    KNN  0.723577   0.720801  0.723577          0.595238   
9  logistic_reg_balanced  0.780488   0.792041  0.780488          0.709677   

   F1 Score global weighted  
0                  0.802499  
1                  0.718675  
2                  0.778541  
3                  0.758723  
4 

In [64]:
decision_tree_balanced = DecisionTreeClassifier(class_weight='balanced')
decision_tree_balanced.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(decision_tree_balanced,X_val, y_val, model_name="decision_tree_balanced",df_metrics=df_metrics_val)
print(df_metrics_val)

                     Model  Accuracy  Precision    Recall  F1 Score class 1  \
0             logistic_reg  0.804878   0.801960  0.804878          0.707317   
1            decision_tree  0.715447   0.724255  0.715447          0.615385   
2            random_forest  0.780488   0.777579  0.780488          0.674699   
3                 adaboost  0.764228   0.758613  0.764228          0.632911   
4                  xgboost  0.731707   0.727850  0.731707          0.602410   
5                     lgbm  0.739837   0.737260  0.739837          0.619048   
6               linear_svm  0.829268   0.828688  0.829268          0.727273   
7                      svm  0.780488   0.775918  0.780488          0.649351   
8                      KNN  0.723577   0.720801  0.723577          0.595238   
9    logistic_reg_balanced  0.780488   0.792041  0.780488          0.709677   
10  decision_tree_balanced  0.756098   0.751839  0.756098          0.634146   

    F1 Score global weighted  
0                   

In [65]:
random_forest_balanced = RandomForestClassifier(class_weight='balanced')

# Entraîner le modèle sur les données d'entraînement
random_forest_balanced.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(random_forest_balanced,X_val, y_val, model_name="random_forest_balanced",df_metrics=df_metrics_val)
print(df_metrics_val)

                     Model  Accuracy  Precision    Recall  F1 Score class 1  \
0             logistic_reg  0.804878   0.801960  0.804878          0.707317   
1            decision_tree  0.715447   0.724255  0.715447          0.615385   
2            random_forest  0.780488   0.777579  0.780488          0.674699   
3                 adaboost  0.764228   0.758613  0.764228          0.632911   
4                  xgboost  0.731707   0.727850  0.731707          0.602410   
5                     lgbm  0.739837   0.737260  0.739837          0.619048   
6               linear_svm  0.829268   0.828688  0.829268          0.727273   
7                      svm  0.780488   0.775918  0.780488          0.649351   
8                      KNN  0.723577   0.720801  0.723577          0.595238   
9    logistic_reg_balanced  0.780488   0.792041  0.780488          0.709677   
10  decision_tree_balanced  0.756098   0.751839  0.756098          0.634146   
11  random_forest_balanced  0.780488   0.777579  0.7

In [66]:
scale_pos_weight = len(y_train) / (2 * np.bincount(y_train)[1])

# Créer un modèle XGBoost avec ajustement des poids de classe
params = {
    'objective': 'binary:logistic',  # Classification binaire
    'eval_metric': 'logloss',        # Fonction de perte
    'scale_pos_weight': scale_pos_weight # Ajustement du poids de la classe positive
}

xgboost_balanced = XGBClassifier(eval_metric='logloss', scale_pos_weight=scale_pos_weight)
xgboost_balanced.fit(X_train, y_train)


df_metrics_val = dataframe_metrics(xgboost_balanced,X_val, y_val, model_name="xgboost_balanced",df_metrics=df_metrics_val)
print(df_metrics_val)

                     Model  Accuracy  Precision    Recall  F1 Score class 1  \
0             logistic_reg  0.804878   0.801960  0.804878          0.707317   
1            decision_tree  0.715447   0.724255  0.715447          0.615385   
2            random_forest  0.780488   0.777579  0.780488          0.674699   
3                 adaboost  0.764228   0.758613  0.764228          0.632911   
4                  xgboost  0.731707   0.727850  0.731707          0.602410   
5                     lgbm  0.739837   0.737260  0.739837          0.619048   
6               linear_svm  0.829268   0.828688  0.829268          0.727273   
7                      svm  0.780488   0.775918  0.780488          0.649351   
8                      KNN  0.723577   0.720801  0.723577          0.595238   
9    logistic_reg_balanced  0.780488   0.792041  0.780488          0.709677   
10  decision_tree_balanced  0.756098   0.751839  0.756098          0.634146   
11  random_forest_balanced  0.780488   0.777579  0.7

In [67]:
sample_weight = compute_sample_weight(class_weight='balanced', y=y_train)

adaboost_balanced = AdaBoostClassifier(n_estimators=50)

# Entraîner le modèle avec les poids des échantillons
adaboost_balanced.fit(X_train, y_train, sample_weight=sample_weight)

df_metrics_val = dataframe_metrics(adaboost_balanced,X_val, y_val, model_name="adaboost_balanced",df_metrics=df_metrics_val)
print(df_metrics_val)

                     Model  Accuracy  Precision    Recall  F1 Score class 1  \
0             logistic_reg  0.804878   0.801960  0.804878          0.707317   
1            decision_tree  0.715447   0.724255  0.715447          0.615385   
2            random_forest  0.780488   0.777579  0.780488          0.674699   
3                 adaboost  0.764228   0.758613  0.764228          0.632911   
4                  xgboost  0.731707   0.727850  0.731707          0.602410   
5                     lgbm  0.739837   0.737260  0.739837          0.619048   
6               linear_svm  0.829268   0.828688  0.829268          0.727273   
7                      svm  0.780488   0.775918  0.780488          0.649351   
8                      KNN  0.723577   0.720801  0.723577          0.595238   
9    logistic_reg_balanced  0.780488   0.792041  0.780488          0.709677   
10  decision_tree_balanced  0.756098   0.751839  0.756098          0.634146   
11  random_forest_balanced  0.780488   0.777579  0.7

In [68]:
lgbm_balanced = LGBMClassifier(class_weight='balanced', n_estimators=100)

# Entraîner le modèle
lgbm_balanced.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(lgbm_balanced,X_val, y_val, model_name="lgbm_balanced",df_metrics=df_metrics_val)
print(df_metrics_val)

[LightGBM] [Info] Number of positive: 171, number of negative: 320
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000088 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 522
[LightGBM] [Info] Number of data points in the train set: 491, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
                     Model  Accuracy  Precision    Recall  F1 Score class 1  \
0             logistic_reg  0.804878   0.801960  0.804878          0.707317   
1            decision_tree  0.715447   0.724255  0.715447          0.615385   
2            random_forest  0.780488   0.777579  0.780488          0.674699   
3                 adaboost  0.764228   0.758613  0.764228          0.632911   
4                  xgboost  0.731707   0.727850  0.731707          0.602410   
5                     lgbm  0.739837   0.7

In [69]:
svm_balanced = SVC(class_weight='balanced', probability=True)

# Entraîner le modèle
svm_balanced.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(svm_balanced,X_val, y_val, model_name="svm_balanced",df_metrics=df_metrics_val)
print(df_metrics_val)

                     Model  Accuracy  Precision    Recall  F1 Score class 1  \
0             logistic_reg  0.804878   0.801960  0.804878          0.707317   
1            decision_tree  0.715447   0.724255  0.715447          0.615385   
2            random_forest  0.780488   0.777579  0.780488          0.674699   
3                 adaboost  0.764228   0.758613  0.764228          0.632911   
4                  xgboost  0.731707   0.727850  0.731707          0.602410   
5                     lgbm  0.739837   0.737260  0.739837          0.619048   
6               linear_svm  0.829268   0.828688  0.829268          0.727273   
7                      svm  0.780488   0.775918  0.780488          0.649351   
8                      KNN  0.723577   0.720801  0.723577          0.595238   
9    logistic_reg_balanced  0.780488   0.792041  0.780488          0.709677   
10  decision_tree_balanced  0.756098   0.751839  0.756098          0.634146   
11  random_forest_balanced  0.780488   0.777579  0.7

In [70]:
knn_balanced = KNeighborsClassifier(weights='distance')

# Entraîner le modèle
knn_balanced.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(knn_balanced,X_val, y_val, model_name="knn_balanced",df_metrics=df_metrics_val)
print(df_metrics_val)

                     Model  Accuracy  Precision    Recall  F1 Score class 1  \
0             logistic_reg  0.804878   0.801960  0.804878          0.707317   
1            decision_tree  0.715447   0.724255  0.715447          0.615385   
2            random_forest  0.780488   0.777579  0.780488          0.674699   
3                 adaboost  0.764228   0.758613  0.764228          0.632911   
4                  xgboost  0.731707   0.727850  0.731707          0.602410   
5                     lgbm  0.739837   0.737260  0.739837          0.619048   
6               linear_svm  0.829268   0.828688  0.829268          0.727273   
7                      svm  0.780488   0.775918  0.780488          0.649351   
8                      KNN  0.723577   0.720801  0.723577          0.595238   
9    logistic_reg_balanced  0.780488   0.792041  0.780488          0.709677   
10  decision_tree_balanced  0.756098   0.751839  0.756098          0.634146   
11  random_forest_balanced  0.780488   0.777579  0.7

In [71]:
df_metrics_val.sort_values(by='F1 Score class 1', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score class 1,F1 Score global weighted
6,linear_svm,0.829268,0.828688,0.829268,0.727273,0.823837
15,svm_balanced,0.772358,0.808667,0.772358,0.72549,0.777565
9,logistic_reg_balanced,0.780488,0.792041,0.780488,0.709677,0.783727
13,adaboost_balanced,0.772358,0.79105,0.772358,0.708333,0.776626
0,logistic_reg,0.804878,0.80196,0.804878,0.707317,0.802499
11,random_forest_balanced,0.780488,0.777579,0.780488,0.674699,0.778541
2,random_forest,0.780488,0.777579,0.780488,0.674699,0.778541
12,xgboost_balanced,0.756098,0.756098,0.756098,0.651163,0.756098
7,svm,0.780488,0.775918,0.780488,0.649351,0.773504
10,decision_tree_balanced,0.756098,0.751839,0.756098,0.634146,0.753123


# Train with the best weights

## Logistic regression

In [72]:
weights_range = np.arange(1, 3, 0.001)  # précision très fine

best_model = None
best_f1_score = -np.inf
best_weights = None

for weight in weights_range:
    class_weights = {0: 1, 1: weight}
    logreg = LogisticRegression(class_weight=class_weights, solver='liblinear', max_iter=1000, random_state=42)
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_val)
    f1 = f1_score(y_val, y_pred, pos_label=1)

    print(f"Poids 1: {weight:.3f} - F1 Score: {f1:.4f}")

    if f1 > best_f1_score:
        best_f1_score = f1
        logreg_best_weights = logreg
        best_weights = class_weights

print("\n🏆 Meilleur modèle ultra-précis :")
print(f"F1 Score : {best_f1_score:.4f}")
print(f"Poids optimaux : {best_weights}")

df_metrics_val = dataframe_metrics(logreg_best_weights, X_val, y_val, model_name="logreg_best_weights", df_metrics=df_metrics_val)
print(df_metrics_val)


Poids 1: 1.000 - F1 Score: 0.7073
Poids 1: 1.001 - F1 Score: 0.7073
Poids 1: 1.002 - F1 Score: 0.7073
Poids 1: 1.003 - F1 Score: 0.7073
Poids 1: 1.004 - F1 Score: 0.7073
Poids 1: 1.005 - F1 Score: 0.7073
Poids 1: 1.006 - F1 Score: 0.7073
Poids 1: 1.007 - F1 Score: 0.7073
Poids 1: 1.008 - F1 Score: 0.7073
Poids 1: 1.009 - F1 Score: 0.7073
Poids 1: 1.010 - F1 Score: 0.7073
Poids 1: 1.011 - F1 Score: 0.7073
Poids 1: 1.012 - F1 Score: 0.7073
Poids 1: 1.013 - F1 Score: 0.7073
Poids 1: 1.014 - F1 Score: 0.7073
Poids 1: 1.015 - F1 Score: 0.7073
Poids 1: 1.016 - F1 Score: 0.7073
Poids 1: 1.017 - F1 Score: 0.7073
Poids 1: 1.018 - F1 Score: 0.7073
Poids 1: 1.019 - F1 Score: 0.7073
Poids 1: 1.020 - F1 Score: 0.7073
Poids 1: 1.021 - F1 Score: 0.7073
Poids 1: 1.022 - F1 Score: 0.7073
Poids 1: 1.023 - F1 Score: 0.7073
Poids 1: 1.024 - F1 Score: 0.7073
Poids 1: 1.025 - F1 Score: 0.7073
Poids 1: 1.026 - F1 Score: 0.7073
Poids 1: 1.027 - F1 Score: 0.7073
Poids 1: 1.028 - F1 Score: 0.7073
Poids 1: 1.029

In [73]:
df_metrics_val.sort_values(by='F1 Score class 1', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score class 1,F1 Score global weighted
17,logreg_best_weights,0.821138,0.819552,0.821138,0.738095,0.820113
6,linear_svm,0.829268,0.828688,0.829268,0.727273,0.823837
15,svm_balanced,0.772358,0.808667,0.772358,0.72549,0.777565
9,logistic_reg_balanced,0.780488,0.792041,0.780488,0.709677,0.783727
13,adaboost_balanced,0.772358,0.79105,0.772358,0.708333,0.776626
0,logistic_reg,0.804878,0.80196,0.804878,0.707317,0.802499
2,random_forest,0.780488,0.777579,0.780488,0.674699,0.778541
11,random_forest_balanced,0.780488,0.777579,0.780488,0.674699,0.778541
12,xgboost_balanced,0.756098,0.756098,0.756098,0.651163,0.756098
7,svm,0.780488,0.775918,0.780488,0.649351,0.773504


## Adaboost

In [74]:

# Liste des poids à tester entre 9.0 et 11.0 avec un pas de 0.5
weights_range = np.arange(2, 5, 0.01)

best_model = None
best_f1_score = -np.inf
best_weights = None

for w in weights_range:
    class_weights = {0: 1, 1: w}
    sample_weight = compute_sample_weight(class_weight=class_weights, y=y_train)

    ada_model = AdaBoostClassifier(n_estimators=50, random_state=42, learning_rate=1.0)
    ada_model.fit(X_train, y_train, sample_weight=sample_weight)

    y_pred = ada_model.predict(X_val)
    f1 = f1_score(y_val, y_pred, pos_label=1)

    print(f"Poids classe 1: {w:.1f} - F1 Score: {f1:.4f}")

    if f1 > best_f1_score:
        best_f1_score = f1
        adaboost_best_weights = ada_model
        best_weights = class_weights

print("\n✅ Meilleur modèle AdaBoost :")
print(f"F1 Score : {best_f1_score:.4f}")
print(f"Poids optimaux : {best_weights}")

# Évaluer et stocker les métriques du meilleur modèle
df_metrics_val = dataframe_metrics(adaboost_best_weights, X_val, y_val, model_name="adaboost_best_weights", df_metrics=df_metrics_val)
print(df_metrics_val)


Poids classe 1: 2.0 - F1 Score: 0.7010
Poids classe 1: 2.0 - F1 Score: 0.7010
Poids classe 1: 2.0 - F1 Score: 0.7010
Poids classe 1: 2.0 - F1 Score: 0.7010
Poids classe 1: 2.0 - F1 Score: 0.7010
Poids classe 1: 2.0 - F1 Score: 0.7010
Poids classe 1: 2.1 - F1 Score: 0.7010
Poids classe 1: 2.1 - F1 Score: 0.7010
Poids classe 1: 2.1 - F1 Score: 0.7010
Poids classe 1: 2.1 - F1 Score: 0.7010
Poids classe 1: 2.1 - F1 Score: 0.7010
Poids classe 1: 2.1 - F1 Score: 0.7010
Poids classe 1: 2.1 - F1 Score: 0.7010
Poids classe 1: 2.1 - F1 Score: 0.7071
Poids classe 1: 2.1 - F1 Score: 0.7071
Poids classe 1: 2.1 - F1 Score: 0.7071
Poids classe 1: 2.2 - F1 Score: 0.7071
Poids classe 1: 2.2 - F1 Score: 0.7071
Poids classe 1: 2.2 - F1 Score: 0.7071
Poids classe 1: 2.2 - F1 Score: 0.7071
Poids classe 1: 2.2 - F1 Score: 0.7071
Poids classe 1: 2.2 - F1 Score: 0.7071
Poids classe 1: 2.2 - F1 Score: 0.7071
Poids classe 1: 2.2 - F1 Score: 0.7071
Poids classe 1: 2.2 - F1 Score: 0.7071
Poids classe 1: 2.2 - F1 

In [75]:
df_metrics_val.sort_values(by='F1 Score class 1', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score class 1,F1 Score global weighted
17,logreg_best_weights,0.821138,0.819552,0.821138,0.738095,0.820113
18,adaboost_best_weights,0.772358,0.815734,0.772358,0.730769,0.77763
6,linear_svm,0.829268,0.828688,0.829268,0.727273,0.823837
15,svm_balanced,0.772358,0.808667,0.772358,0.72549,0.777565
9,logistic_reg_balanced,0.780488,0.792041,0.780488,0.709677,0.783727
13,adaboost_balanced,0.772358,0.79105,0.772358,0.708333,0.776626
0,logistic_reg,0.804878,0.80196,0.804878,0.707317,0.802499
11,random_forest_balanced,0.780488,0.777579,0.780488,0.674699,0.778541
2,random_forest,0.780488,0.777579,0.780488,0.674699,0.778541
12,xgboost_balanced,0.756098,0.756098,0.756098,0.651163,0.756098


## SVM

In [76]:
weights_range = np.arange(1, 2, 0.01)  # précision très fine

best_model = None
best_f1_score = -np.inf
best_weights = None

for weight in weights_range:
    class_weights = {0: 1, 1: weight}
    svm = SVC(class_weight=class_weights, max_iter=10000, random_state=42, probability=True)
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_val)
    f1 = f1_score(y_val, y_pred, pos_label=1)

    print(f"Poids 1: {weight:.3f} - F1 Score: {f1:.4f}")

    if f1 > best_f1_score:
        best_f1_score = f1
        svm_best_weights = svm
        best_weights = class_weights

print("\n🏆 Meilleur modèle SVM ultra-précis :")
print(f"F1 Score : {best_f1_score:.4f}")
print(f"Poids optimaux : {best_weights}")



Poids 1: 1.000 - F1 Score: 0.6494
Poids 1: 1.010 - F1 Score: 0.6494
Poids 1: 1.020 - F1 Score: 0.6494
Poids 1: 1.030 - F1 Score: 0.6494
Poids 1: 1.040 - F1 Score: 0.6494
Poids 1: 1.050 - F1 Score: 0.6494
Poids 1: 1.060 - F1 Score: 0.6667
Poids 1: 1.070 - F1 Score: 0.6667
Poids 1: 1.080 - F1 Score: 0.6667
Poids 1: 1.090 - F1 Score: 0.6667
Poids 1: 1.100 - F1 Score: 0.6667
Poids 1: 1.110 - F1 Score: 0.6667
Poids 1: 1.120 - F1 Score: 0.6914
Poids 1: 1.130 - F1 Score: 0.6988
Poids 1: 1.140 - F1 Score: 0.6988
Poids 1: 1.150 - F1 Score: 0.6988
Poids 1: 1.160 - F1 Score: 0.6988
Poids 1: 1.170 - F1 Score: 0.6988
Poids 1: 1.180 - F1 Score: 0.6988
Poids 1: 1.190 - F1 Score: 0.6988
Poids 1: 1.200 - F1 Score: 0.6824
Poids 1: 1.210 - F1 Score: 0.6824
Poids 1: 1.220 - F1 Score: 0.6824
Poids 1: 1.230 - F1 Score: 0.6824
Poids 1: 1.240 - F1 Score: 0.6824
Poids 1: 1.250 - F1 Score: 0.6824
Poids 1: 1.260 - F1 Score: 0.6824
Poids 1: 1.270 - F1 Score: 0.6824
Poids 1: 1.280 - F1 Score: 0.6824
Poids 1: 1.290

In [77]:
df_metrics_val = dataframe_metrics(svm_best_weights, X_val, y_val, model_name="svm_best_weights", df_metrics=df_metrics_val)
print(df_metrics_val)

                     Model  Accuracy  Precision    Recall  F1 Score class 1  \
0             logistic_reg  0.804878   0.801960  0.804878          0.707317   
1            decision_tree  0.715447   0.724255  0.715447          0.615385   
2            random_forest  0.780488   0.777579  0.780488          0.674699   
3                 adaboost  0.764228   0.758613  0.764228          0.632911   
4                  xgboost  0.731707   0.727850  0.731707          0.602410   
5                     lgbm  0.739837   0.737260  0.739837          0.619048   
6               linear_svm  0.829268   0.828688  0.829268          0.727273   
7                      svm  0.780488   0.775918  0.780488          0.649351   
8                      KNN  0.723577   0.720801  0.723577          0.595238   
9    logistic_reg_balanced  0.780488   0.792041  0.780488          0.709677   
10  decision_tree_balanced  0.756098   0.751839  0.756098          0.634146   
11  random_forest_balanced  0.780488   0.777579  0.7

# Hypertuning

In [78]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

## Logistic regression

In [79]:
logreg = LogisticRegression(
    solver='saga',
    max_iter=2000,
    random_state=42,
    class_weight={0: 1, 1: 1.25}
)

# Espace des hyperparamètres
param_distributions = {
    'C': loguniform(1e-3, 1e3),
    'penalty': ['l1', 'l2']
}

# Randomized Search
random_search = RandomizedSearchCV(
    estimator=logreg,
    param_distributions=param_distributions,
    n_iter=50,
    cv=cv,
    scoring=make_scorer(f1_score, pos_label=1),
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

# Résultats
print("Best set of hyperparameters: ", random_search.best_params_)
print("Best score (mean CV accuracy): ", random_search.best_score_)

logreg_best = random_search.best_estimator_
y_pred = logreg_best.predict(X_val)

print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("F1 score global (weighted):", f1_score(y_val, y_pred, average='weighted'))
print("F1 score class 1:", f1_score(y_val, y_pred, pos_label=1))

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best set of hyperparameters:  {'C': np.float64(1.408146893930582), 'penalty': 'l1'}
Best score (mean CV accuracy):  0.6676080953546086
Validation accuracy: 0.8130081300813008
F1 score global (weighted): 0.8113495430171982
F1 score class 1: 0.7228915662650602


In [80]:

logreg = LogisticRegression(
    solver='saga',
    max_iter=2000,
    random_state=42,
    class_weight={0: 1, 1: 1.25}
)


param_grid = {
    'penalty': ['l1', 'l2'],
    'C': np.arange(1, 2, 0.01),

}

# F1 Score (positif sur la classe 1 sous-représentée)
scorer = make_scorer(f1_score, pos_label=1)

grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    cv=cv,
    scoring=scorer,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

# Résultats
print("Meilleurs hyperparamètres :", grid_search.best_params_)
print("Meilleur F1 (classe 1) moyenne en CV :", grid_search.best_score_)

logreg_best = grid_search.best_estimator_
y_pred = logreg_best.predict(X_val)

print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("F1 score global (weighted):", f1_score(y_val, y_pred, average='weighted'))
print("F1 score class 1:", f1_score(y_val, y_pred, pos_label=1))


Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Meilleurs hyperparamètres : {'C': np.float64(1.0), 'penalty': 'l1'}
Meilleur F1 (classe 1) moyenne en CV : 0.6676080953546086
Validation accuracy: 0.8130081300813008
F1 score global (weighted): 0.8113495430171982
F1 score class 1: 0.7228915662650602


In [81]:
df_metrics_val = dataframe_metrics(logreg_best, X_val, y_val, model_name="logreg_best", df_metrics=df_metrics_val)
df_metrics_val.sort_values(by='F1 Score class 1', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score class 1,F1 Score global weighted
19,svm_best_weights,0.813008,0.819986,0.813008,0.747253,0.815129
17,logreg_best_weights,0.821138,0.819552,0.821138,0.738095,0.820113
18,adaboost_best_weights,0.772358,0.815734,0.772358,0.730769,0.77763
6,linear_svm,0.829268,0.828688,0.829268,0.727273,0.823837
15,svm_balanced,0.772358,0.808667,0.772358,0.72549,0.777565
20,logreg_best,0.813008,0.810731,0.813008,0.722892,0.81135
9,logistic_reg_balanced,0.780488,0.792041,0.780488,0.709677,0.783727
13,adaboost_balanced,0.772358,0.79105,0.772358,0.708333,0.776626
0,logistic_reg,0.804878,0.80196,0.804878,0.707317,0.802499
11,random_forest_balanced,0.780488,0.777579,0.780488,0.674699,0.778541


## Adaboost

In [82]:
class_weights = {0: 1, 1: 2.97}  # Poids manuels pour chaque classe

# Calculer les poids d'échantillons à l'aide des poids de classe
sample_weight = compute_sample_weight(class_weight=class_weights, y=y_train)



# Définir les hyperparamètres à tester dans RandomizedSearchCV
param_dist = {
    'n_estimators': [400, 500, 600, 700, 800],        # Nombre d'estimateurs
    'learning_rate': [0.1, 0.5, 1.0, 1.5],      # Taux d'apprentissage
}

# Créer un modèle AdaBoost
ada_model = AdaBoostClassifier(random_state=42)

# Configurer RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=ada_model, 
    param_distributions=param_dist, 
    n_iter=30,               # Nombre d'itérations aléatoires
    scoring=scorer,      # Mesure de la performance (ici, l'accuracy)
    cv=cv,                    # Cross-validation avec 5 splits
    random_state=42,         # Fixer la graine pour la reproductibilité
    n_jobs=-1                # Utiliser tous les cœurs du CPU
)

# Effectuer la recherche aléatoire avec les poids d'échantillons
random_search.fit(X_train, y_train, sample_weight=sample_weight)


# Afficher les meilleurs hyperparamètres trouvés
print("Meilleurs hyperparamètres :")
print(random_search.best_params_)

adaboost_best = random_search.best_estimator_
y_pred = adaboost_best.predict(X_val)

print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("F1 score global (weighted):", f1_score(y_val, y_pred, average='weighted'))
print("F1 score class 1:", f1_score(y_val, y_pred, pos_label=1))



Meilleurs hyperparamètres :
{'n_estimators': 500, 'learning_rate': 0.5}
Validation accuracy: 0.7398373983739838
F1 score global (weighted): 0.7457979970635286
F1 score class 1: 0.6981132075471698


In [83]:
class_weights = {0: 1, 1: 2.97}  # Poids manuels pour chaque classe

# Calculer les poids d'échantillons à l'aide des poids de classe
sample_weight = compute_sample_weight(class_weight=class_weights, y=y_train)

# Définir les hyperparamètres à tester dans GridSearchCV
param_grid = {
    'n_estimators': np.arange(400, 600, 25),        # Nombre d'estimateurs
    'learning_rate': np.arange(0.5, 1.5, 0.25),      # Taux d'apprentissage
}

# Créer un modèle AdaBoost
ada_model = AdaBoostClassifier(random_state=42)

# Configurer GridSearchCV
grid_search = GridSearchCV(
    estimator=ada_model, 
    param_grid=param_grid, 
    scoring=scorer,      # Mesure de la performance (ici, l'accuracy)
    cv=cv,                    # Cross-validation avec 5 splits
    n_jobs=-1,               # Utiliser tous les cœurs du CPU
)

# Effectuer la recherche en grille avec les poids d'échantillons
grid_search.fit(X_train, y_train, sample_weight=sample_weight)

# Résultats
print("Meilleurs hyperparamètres :", grid_search.best_params_)
print("Meilleur F1 (classe 1) moyenne en CV :", grid_search.best_score_)

adaboost_best = grid_search.best_estimator_
y_pred = adaboost_best.predict(X_val)

print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("F1 score global (weighted):", f1_score(y_val, y_pred, average='weighted'))
print("F1 score class 1:", f1_score(y_val, y_pred, pos_label=1))

Meilleurs hyperparamètres : {'learning_rate': np.float64(0.5), 'n_estimators': np.int64(550)}
Meilleur F1 (classe 1) moyenne en CV : 0.6838950937801511
Validation accuracy: 0.7398373983739838
F1 score global (weighted): 0.7457979970635286
F1 score class 1: 0.6981132075471698


In [84]:
df_metrics_val = dataframe_metrics(adaboost_best, X_val, y_val, model_name="adaboost_best", df_metrics=df_metrics_val)
df_metrics_val.sort_values(by='F1 Score class 1', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score class 1,F1 Score global weighted
19,svm_best_weights,0.813008,0.819986,0.813008,0.747253,0.815129
17,logreg_best_weights,0.821138,0.819552,0.821138,0.738095,0.820113
18,adaboost_best_weights,0.772358,0.815734,0.772358,0.730769,0.77763
6,linear_svm,0.829268,0.828688,0.829268,0.727273,0.823837
15,svm_balanced,0.772358,0.808667,0.772358,0.72549,0.777565
20,logreg_best,0.813008,0.810731,0.813008,0.722892,0.81135
9,logistic_reg_balanced,0.780488,0.792041,0.780488,0.709677,0.783727
13,adaboost_balanced,0.772358,0.79105,0.772358,0.708333,0.776626
0,logistic_reg,0.804878,0.80196,0.804878,0.707317,0.802499
21,adaboost_best,0.739837,0.790683,0.739837,0.698113,0.745798


## SVM

In [85]:
custom_weights = {0: 1, 1: 1.44}

# Définir le modèle de base
svm = SVC(class_weight=custom_weights, random_state=42, probability=True)

# Définir l'espace de recherche
param_dist = {
    'C': np.linspace(0.001, 100, 20),  # de 0.001 à 100
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']  # utilisé pour certains kernels
}

# Scorer basé sur le F1 score
f1_scorer = make_scorer(f1_score, pos_label=1)

# Randomized Search
random_search = RandomizedSearchCV(
    estimator=svm,
    param_distributions=param_dist,
    n_iter=30,  # augmente si tu veux une recherche plus exhaustive
    scoring=f1_scorer,
    cv=cv,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Lancer la recherche
random_search.fit(X_train, y_train)

print("Meilleurs hyperparamètres :")
print(random_search.best_params_)

svm_best = random_search.best_estimator_
y_pred = svm_best.predict(X_val)

print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("F1 score global (weighted):", f1_score(y_val, y_pred, average='weighted'))
print("F1 score class 1:", f1_score(y_val, y_pred, pos_label=1))


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Meilleurs hyperparamètres :
{'kernel': 'rbf', 'gamma': 'scale', 'C': np.float64(5.264105263157894)}
Validation accuracy: 0.7886178861788617
F1 score global (weighted): 0.7906052393857272
F1 score class 1: 0.7111111111111111


In [86]:
custom_weights = {0: 1, 1: 1.44}

# Définir le modèle de base
svm = SVC(class_weight=custom_weights, random_state=42, probability=True)

# Définir l'espace de recherche (plus restreint que RandomizedSearch, sinon c'est trop long)
param_grid = {
    'C':np.arange(4, 6, 0.1),  # de 0.001 à 100 avec un pas de 0.1
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']  # gamma n'est utile que pour 'rbf', mais pas grave ici
}

# Scorer basé sur le F1 score
f1_scorer = make_scorer(f1_score, pos_label=1)

# Grid Search
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=cv,  # défini plus haut dans ton script
    verbose=1,
    n_jobs=-1
)

# Lancer la recherche
grid_search.fit(X_train, y_train)
svm_best = grid_search.best_estimator_
y_pred = svm_best.predict(X_val)

print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("F1 score global (weighted):", f1_score(y_val, y_pred, average='weighted'))
print("F1 score class 1:", f1_score(y_val, y_pred, pos_label=1))

Fitting 5 folds for each of 80 candidates, totalling 400 fits
Validation accuracy: 0.8048780487804879
F1 score global (weighted): 0.8058603946223581
F1 score class 1: 0.7272727272727273


In [87]:
df_metrics_val = dataframe_metrics(svm_best, X_val, y_val, model_name="svm_best", df_metrics=df_metrics_val)
df_metrics_val.sort_values(by='F1 Score class 1', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score class 1,F1 Score global weighted
19,svm_best_weights,0.813008,0.819986,0.813008,0.747253,0.815129
17,logreg_best_weights,0.821138,0.819552,0.821138,0.738095,0.820113
18,adaboost_best_weights,0.772358,0.815734,0.772358,0.730769,0.77763
22,svm_best,0.804878,0.807282,0.804878,0.727273,0.80586
6,linear_svm,0.829268,0.828688,0.829268,0.727273,0.823837
15,svm_balanced,0.772358,0.808667,0.772358,0.72549,0.777565
20,logreg_best,0.813008,0.810731,0.813008,0.722892,0.81135
9,logistic_reg_balanced,0.780488,0.792041,0.780488,0.709677,0.783727
13,adaboost_balanced,0.772358,0.79105,0.772358,0.708333,0.776626
0,logistic_reg,0.804878,0.80196,0.804878,0.707317,0.802499


# Validar con el dataset de test

In [89]:
df_metrics_test= pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score class 1', 'F1 Score global weighted'])

In [90]:
models = {
    "adaboost_best": adaboost_best,
    "adaboost_best_weights": adaboost_best_weights,
    "adaboost_balanced": adaboost_balanced,
    "logreg_best": logreg_best,
    "logreg_best_weights": logreg_best_weights,
    "logistic_reg_balanced": logistic_reg_balanced,
    "svm_best": svm_best,
    "svm_best_weights": svm_best_weights,
    "svm_balanced": svm_balanced
}

for model_name, model in models.items():
    df_metrics_test = dataframe_metrics(model, X_test, y_test, model_name=model_name, df_metrics=df_metrics_test)
    print(df_metrics_test)
df_metrics_test.sort_values(by='F1 Score class 1', ascending=False)

           Model  Accuracy  Precision    Recall  F1 Score class 1  \
0  adaboost_best  0.766234   0.802387  0.766234           0.71875   

   F1 Score global weighted  
0                   0.77151  
                   Model  Accuracy  Precision    Recall  F1 Score class 1  \
0          adaboost_best  0.766234   0.802387  0.766234           0.71875   
1  adaboost_best_weights  0.740260   0.776676  0.740260           0.68750   

   F1 Score global weighted  
0                  0.771510  
1                  0.746122  
                   Model  Accuracy  Precision    Recall  F1 Score class 1  \
0          adaboost_best  0.766234   0.802387  0.766234          0.718750   
1  adaboost_best_weights  0.740260   0.776676  0.740260          0.687500   
2      adaboost_balanced  0.753247   0.755500  0.753247          0.654545   

   F1 Score global weighted  
0                  0.771510  
1                  0.746122  
2                  0.754244  
                   Model  Accuracy  Precision    R

  df_metrics = pd.concat([df_metrics, results], ignore_index=True)


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score class 1,F1 Score global weighted
0,adaboost_best,0.766234,0.802387,0.766234,0.71875,0.77151
1,adaboost_best_weights,0.74026,0.776676,0.74026,0.6875,0.746122
5,logistic_reg_balanced,0.733766,0.750299,0.733766,0.655462,0.738324
2,adaboost_balanced,0.753247,0.7555,0.753247,0.654545,0.754244
8,svm_balanced,0.688312,0.707792,0.688312,0.6,0.693949
3,logreg_best,0.714286,0.711969,0.714286,0.584906,0.713005
4,logreg_best_weights,0.707792,0.706576,0.707792,0.579439,0.707154
7,svm_best_weights,0.681818,0.689157,0.681818,0.566372,0.684778
6,svm_best,0.701299,0.694621,0.701299,0.54902,0.696863


In [94]:
from sklearn.metrics import roc_auc_score
import pandas as pd

# Création du DataFrame pour stocker les résultats
df_auc = pd.DataFrame(columns=['Model', 'AUC'])

# === ADA BOOST ===
adaboost_models = [adaboost_best, adaboost_best_weights, adaboost_balanced]
adaboost_names = ["AdaBoost - best", "AdaBoost - best_weights", "AdaBoost - balanced"]

for name, model in zip(adaboost_names, adaboost_models):
    y_probs_ada = model.predict_proba(X_test)[:, 1]
    auc_ada = roc_auc_score(y_test, y_probs_ada)
    print(f"🟠 {name} - AUC: {auc_ada:.4f}")
    # Ajout des résultats dans le DataFrame
    df_auc = pd.concat([df_auc, pd.DataFrame({'Model': [name], 'AUC': [auc_ada]})], ignore_index=True)

# === REGRESSION LOGISTIQUE ===
regression_logistic_models = [logreg_best, logreg_best_weights, logistic_reg_balanced]
logreg_names = ["LogReg - best", "LogReg - best_weights", "LogReg - balanced"]

for name, model in zip(logreg_names, regression_logistic_models):
    y_probs_logreg = model.predict_proba(X_test)[:, 1]
    auc_logreg = roc_auc_score(y_test, y_probs_logreg)
    print(f"🔺 {name} - AUC: {auc_logreg:.4f}")
    # Ajout des résultats dans le DataFrame
    df_auc = pd.concat([df_auc, pd.DataFrame({'Model': [name], 'AUC': [auc_logreg]})], ignore_index=True)

# === SVM ===
svm_models = [svm_best, svm_best_weights, svm_balanced]
svm_names = ["SVM - best", "SVM - best_weights", "SVM - balanced"]

for name, model in zip(svm_names, svm_models):
    y_probs_svm = model.predict_proba(X_test)[:, 1]
    auc_svm = roc_auc_score(y_test, y_probs_svm)
    print(f"🔷 {name} - AUC: {auc_svm:.4f}")
    # Ajout des résultats dans le DataFrame
    df_auc = pd.concat([df_auc, pd.DataFrame({'Model': [name], 'AUC': [auc_svm]})], ignore_index=True)

# Affichage du DataFrame complet et trié par AUC
print("\n🏆 Résultats AUC :")
df_auc_sorted = df_auc.sort_values(by='AUC', ascending=False)
print(df_auc_sorted)


🟠 AdaBoost - best - AUC: 0.8284
🟠 AdaBoost - best_weights - AUC: 0.8138
🟠 AdaBoost - balanced - AUC: 0.8286
🔺 LogReg - best - AUC: 0.8056
🔺 LogReg - best_weights - AUC: 0.8059
🔺 LogReg - balanced - AUC: 0.8065
🔷 SVM - best - AUC: 0.8009
🔷 SVM - best_weights - AUC: 0.7880
🔷 SVM - balanced - AUC: 0.7784

🏆 Résultats AUC :
                     Model       AUC
2      AdaBoost - balanced  0.828611
0          AdaBoost - best  0.828426
1  AdaBoost - best_weights  0.813796
5        LogReg - balanced  0.806481
4    LogReg - best_weights  0.805926
3            LogReg - best  0.805556
6               SVM - best  0.800926
7       SVM - best_weights  0.787963
8           SVM - balanced  0.778426


  df_auc = pd.concat([df_auc, pd.DataFrame({'Model': [name], 'AUC': [auc_ada]})], ignore_index=True)
