In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import scipy.stats as stats
import pickle

from sklearn.model_selection import RandomizedSearchCV, train_test_split,StratifiedKFold
from sklearn.metrics import accuracy_score,make_scorer
from scipy.stats import randint, uniform, loguniform
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import label_binarize
from lightgbm import LGBMClassifier,early_stopping, log_evaluation
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, learning_curve
from sklearn.ensemble import RandomForestClassifier,StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import xgboost as xgb
from sklearn.utils.class_weight import compute_sample_weight


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.base import BaseEstimator, ClassifierMixin
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical


In [2]:
df_train = pd.read_csv(r"../data\OUT\diabetes_clean.csv")

In [3]:
df_train.columns

Index(['Pregnancies', 'Glucose', 'SkinThickness', 'Insulin', 'BMI',
       'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

# Split Train-Val-Test

In [4]:
X = df_train.drop('Outcome', axis=1)
y = df_train['Outcome']

# Dividir el dataset en entrenamiento y prueba
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_val, y_val, stratify=y_val, test_size=0.5, random_state=42)

In [5]:
def dataframe_metrics(model, X_val, y_val, model_name=None, df_metrics=None):
    # Prédire sur le jeu de validation
    y_pred = model.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average='weighted')
    recall = recall_score(y_val, y_pred, average='weighted')
    f1_class1 = f1_score(y_val, y_pred, pos_label=1)
    f1 = f1_score(y_val, y_pred, average='weighted')


    # Créer un DataFrame pour les résultats
    results = pd.DataFrame([{
        'Model': model_name,  # Utiliser le nom du modèle
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score class 1': f1_class1,
        'F1 Score global weighted': f1,
    }])

    # Ajouter les résultats au DataFrame existant (en utilisant pd.concat)
    if df_metrics is None:
        df_metrics = results
    else:
        df_metrics = pd.concat([df_metrics, results], ignore_index=True)

    return df_metrics


# Train without balancing

In [6]:
df_metrics_val= pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score class 1', 'F1 Score global weighted'])

In [7]:
# Autre modèle, par exemple un modèle LogisticRegression
logistic_reg = LogisticRegression(max_iter=1000, random_state=42)
logistic_reg.fit(X_train, y_train)

# Appeler à nouveau la fonction pour ajouter les métriques du modèle LogisticRegression
df_metrics_val = dataframe_metrics(logistic_reg,X_val, y_val, model_name="logistic_reg", df_metrics=df_metrics_val)
print(df_metrics_val)


          Model  Accuracy  Precision    Recall  F1 Score class 1  \
0  logistic_reg  0.758621   0.755808  0.758621          0.588235   

   F1 Score global weighted  
0                  0.744076  


  df_metrics = pd.concat([df_metrics, results], ignore_index=True)


In [8]:
# Premier modèle : Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)

# Appeler la fonction pour ajouter les métriques du premier modèle
df_metrics_val = dataframe_metrics(decision_tree,X_val, y_val, model_name="decision_tree",df_metrics=df_metrics_val)
print(df_metrics_val)

           Model  Accuracy  Precision    Recall  F1 Score class 1  \
0   logistic_reg  0.758621   0.755808  0.758621          0.588235   
1  decision_tree  0.655172   0.644736  0.655172          0.473684   

   F1 Score global weighted  
0                  0.744076  
1                  0.648192  


In [9]:
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(random_forest,X_val, y_val, model_name="random_forest",df_metrics=df_metrics_val)
print(df_metrics_val)

           Model  Accuracy  Precision    Recall  F1 Score class 1  \
0   logistic_reg  0.758621   0.755808  0.758621          0.588235   
1  decision_tree  0.655172   0.644736  0.655172          0.473684   
2  random_forest  0.732759   0.724754  0.732759          0.550725   

   F1 Score global weighted  
0                  0.744076  
1                  0.648192  
2                  0.718241  


In [10]:
adaboost = AdaBoostClassifier()
adaboost.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(adaboost, X_val, y_val, model_name="adaboost",df_metrics=df_metrics_val)
print(df_metrics_val)

           Model  Accuracy  Precision    Recall  F1 Score class 1  \
0   logistic_reg  0.758621   0.755808  0.758621          0.588235   
1  decision_tree  0.655172   0.644736  0.655172          0.473684   
2  random_forest  0.732759   0.724754  0.732759          0.550725   
3       adaboost  0.793103   0.790879  0.793103          0.666667   

   F1 Score global weighted  
0                  0.744076  
1                  0.648192  
2                  0.718241  
3                  0.785201  


In [11]:
xgboost = XGBClassifier(eval_metric='logloss')
xgboost.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(xgboost,X_val, y_val, model_name="xgboost",df_metrics=df_metrics_val)
print(df_metrics_val)

           Model  Accuracy  Precision    Recall  F1 Score class 1  \
0   logistic_reg  0.758621   0.755808  0.758621          0.588235   
1  decision_tree  0.655172   0.644736  0.655172          0.473684   
2  random_forest  0.732759   0.724754  0.732759          0.550725   
3       adaboost  0.793103   0.790879  0.793103          0.666667   
4        xgboost  0.732759   0.724561  0.732759          0.575342   

   F1 Score global weighted  
0                  0.744076  
1                  0.648192  
2                  0.718241  
3                  0.785201  
4                  0.723848  


In [12]:
lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(lgbm, X_val, y_val, model_name="lgbm",df_metrics=df_metrics_val)
print(df_metrics_val)

[LightGBM] [Info] Number of positive: 187, number of negative: 350
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001179 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 553
[LightGBM] [Info] Number of data points in the train set: 537, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.348231 -> initscore=-0.626825
[LightGBM] [Info] Start training from score -0.626825
           Model  Accuracy  Precision    Recall  F1 Score class 1  \
0   logistic_reg  0.758621   0.755808  0.758621          0.588235   
1  decision_tree  0.655172   0.644736  0.655172          0.473684   
2  random_forest  0.732759   0.724754  0.732759          0.550725   
3       adaboost  0.793103   0.790879  0.793103          0.666667   
4        xgboost  0.732759   0.724561  0.732759          0.575342   
5           lgbm  0.724138   0.71567

In [13]:
linear_svm = SVC(kernel='linear', probability=True)
linear_svm.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(linear_svm, X_val, y_val, model_name="linear_svm",df_metrics=df_metrics_val)
print(df_metrics_val)

           Model  Accuracy  Precision    Recall  F1 Score class 1  \
0   logistic_reg  0.758621   0.755808  0.758621          0.588235   
1  decision_tree  0.655172   0.644736  0.655172          0.473684   
2  random_forest  0.732759   0.724754  0.732759          0.550725   
3       adaboost  0.793103   0.790879  0.793103          0.666667   
4        xgboost  0.732759   0.724561  0.732759          0.575342   
5           lgbm  0.724138   0.715678  0.724138          0.567568   
6     linear_svm  0.750000   0.746795  0.750000          0.567164   

   F1 Score global weighted  
0                  0.744076  
1                  0.648192  
2                  0.718241  
3                  0.785201  
4                  0.723848  
5                  0.716210  
6                  0.733379  


In [14]:
svm = SVC(probability=True)
svm.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(svm,X_val, y_val, model_name="svm",df_metrics=df_metrics_val)
print(df_metrics_val)

           Model  Accuracy  Precision    Recall  F1 Score class 1  \
0   logistic_reg  0.758621   0.755808  0.758621          0.588235   
1  decision_tree  0.655172   0.644736  0.655172          0.473684   
2  random_forest  0.732759   0.724754  0.732759          0.550725   
3       adaboost  0.793103   0.790879  0.793103          0.666667   
4        xgboost  0.732759   0.724561  0.732759          0.575342   
5           lgbm  0.724138   0.715678  0.724138          0.567568   
6     linear_svm  0.750000   0.746795  0.750000          0.567164   
7            svm  0.758621   0.758863  0.758621          0.575758   

   F1 Score global weighted  
0                  0.744076  
1                  0.648192  
2                  0.718241  
3                  0.785201  
4                  0.723848  
5                  0.716210  
6                  0.733379  
7                  0.740995  


In [15]:
KNN = KNeighborsClassifier(n_neighbors=5)
KNN.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(KNN,X_val, y_val, model_name="KNN",df_metrics=df_metrics_val)
print(df_metrics_val)

           Model  Accuracy  Precision    Recall  F1 Score class 1  \
0   logistic_reg  0.758621   0.755808  0.758621          0.588235   
1  decision_tree  0.655172   0.644736  0.655172          0.473684   
2  random_forest  0.732759   0.724754  0.732759          0.550725   
3       adaboost  0.793103   0.790879  0.793103          0.666667   
4        xgboost  0.732759   0.724561  0.732759          0.575342   
5           lgbm  0.724138   0.715678  0.724138          0.567568   
6     linear_svm  0.750000   0.746795  0.750000          0.567164   
7            svm  0.758621   0.758863  0.758621          0.575758   
8            KNN  0.715517   0.704783  0.715517          0.521739   

   F1 Score global weighted  
0                  0.744076  
1                  0.648192  
2                  0.718241  
3                  0.785201  
4                  0.723848  
5                  0.716210  
6                  0.733379  
7                  0.740995  
8                  0.700063  


In [16]:
df_metrics_val= df_metrics_val.reset_index(drop=True)
df_metrics_val.sort_values(by='F1 Score class 1', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score class 1,F1 Score global weighted
3,adaboost,0.793103,0.790879,0.793103,0.666667,0.785201
0,logistic_reg,0.758621,0.755808,0.758621,0.588235,0.744076
7,svm,0.758621,0.758863,0.758621,0.575758,0.740995
4,xgboost,0.732759,0.724561,0.732759,0.575342,0.723848
5,lgbm,0.724138,0.715678,0.724138,0.567568,0.71621
6,linear_svm,0.75,0.746795,0.75,0.567164,0.733379
2,random_forest,0.732759,0.724754,0.732759,0.550725,0.718241
8,KNN,0.715517,0.704783,0.715517,0.521739,0.700063
1,decision_tree,0.655172,0.644736,0.655172,0.473684,0.648192


# Train balanced

In [17]:
# Entraîner une régression logistique avec des poids équilibrés
logistic_reg_balanced = LogisticRegression(class_weight='balanced')
logistic_reg_balanced.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(logistic_reg_balanced,X_val, y_val, model_name="logistic_reg_balanced",df_metrics=df_metrics_val)
print(df_metrics_val)

                   Model  Accuracy  Precision    Recall  F1 Score class 1  \
0           logistic_reg  0.758621   0.755808  0.758621          0.588235   
1          decision_tree  0.655172   0.644736  0.655172          0.473684   
2          random_forest  0.732759   0.724754  0.732759          0.550725   
3               adaboost  0.793103   0.790879  0.793103          0.666667   
4                xgboost  0.732759   0.724561  0.732759          0.575342   
5                   lgbm  0.724138   0.715678  0.724138          0.567568   
6             linear_svm  0.750000   0.746795  0.750000          0.567164   
7                    svm  0.758621   0.758863  0.758621          0.575758   
8                    KNN  0.715517   0.704783  0.715517          0.521739   
9  logistic_reg_balanced  0.784483   0.783371  0.784483          0.691358   

   F1 Score global weighted  
0                  0.744076  
1                  0.648192  
2                  0.718241  
3                  0.785201  
4 

In [18]:
decision_tree_balanced = DecisionTreeClassifier(class_weight='balanced')
decision_tree_balanced.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(decision_tree_balanced,X_val, y_val, model_name="decision_tree_balanced",df_metrics=df_metrics_val)
print(df_metrics_val)

                     Model  Accuracy  Precision    Recall  F1 Score class 1  \
0             logistic_reg  0.758621   0.755808  0.758621          0.588235   
1            decision_tree  0.655172   0.644736  0.655172          0.473684   
2            random_forest  0.732759   0.724754  0.732759          0.550725   
3                 adaboost  0.793103   0.790879  0.793103          0.666667   
4                  xgboost  0.732759   0.724561  0.732759          0.575342   
5                     lgbm  0.724138   0.715678  0.724138          0.567568   
6               linear_svm  0.750000   0.746795  0.750000          0.567164   
7                      svm  0.758621   0.758863  0.758621          0.575758   
8                      KNN  0.715517   0.704783  0.715517          0.521739   
9    logistic_reg_balanced  0.784483   0.783371  0.784483          0.691358   
10  decision_tree_balanced  0.646552   0.627807  0.646552          0.422535   

    F1 Score global weighted  
0                   

In [19]:
random_forest_balanced = RandomForestClassifier(class_weight='balanced')

# Entraîner le modèle sur les données d'entraînement
random_forest_balanced.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(random_forest_balanced,X_val, y_val, model_name="random_forest_balanced",df_metrics=df_metrics_val)
print(df_metrics_val)

                     Model  Accuracy  Precision    Recall  F1 Score class 1  \
0             logistic_reg  0.758621   0.755808  0.758621          0.588235   
1            decision_tree  0.655172   0.644736  0.655172          0.473684   
2            random_forest  0.732759   0.724754  0.732759          0.550725   
3                 adaboost  0.793103   0.790879  0.793103          0.666667   
4                  xgboost  0.732759   0.724561  0.732759          0.575342   
5                     lgbm  0.724138   0.715678  0.724138          0.567568   
6               linear_svm  0.750000   0.746795  0.750000          0.567164   
7                      svm  0.758621   0.758863  0.758621          0.575758   
8                      KNN  0.715517   0.704783  0.715517          0.521739   
9    logistic_reg_balanced  0.784483   0.783371  0.784483          0.691358   
10  decision_tree_balanced  0.646552   0.627807  0.646552          0.422535   
11  random_forest_balanced  0.741379   0.734245  0.7

In [20]:
scale_pos_weight = len(y_train) / (2 * np.bincount(y_train)[1])

# Créer un modèle XGBoost avec ajustement des poids de classe
params = {
    'objective': 'binary:logistic',  # Classification binaire
    'eval_metric': 'logloss',        # Fonction de perte
    'scale_pos_weight': scale_pos_weight # Ajustement du poids de la classe positive
}

xgboost_balanced = XGBClassifier(eval_metric='logloss', scale_pos_weight=scale_pos_weight)
xgboost_balanced.fit(X_train, y_train)


df_metrics_val = dataframe_metrics(xgboost_balanced,X_val, y_val, model_name="xgboost_balanced",df_metrics=df_metrics_val)
print(df_metrics_val)

                     Model  Accuracy  Precision    Recall  F1 Score class 1  \
0             logistic_reg  0.758621   0.755808  0.758621          0.588235   
1            decision_tree  0.655172   0.644736  0.655172          0.473684   
2            random_forest  0.732759   0.724754  0.732759          0.550725   
3                 adaboost  0.793103   0.790879  0.793103          0.666667   
4                  xgboost  0.732759   0.724561  0.732759          0.575342   
5                     lgbm  0.724138   0.715678  0.724138          0.567568   
6               linear_svm  0.750000   0.746795  0.750000          0.567164   
7                      svm  0.758621   0.758863  0.758621          0.575758   
8                      KNN  0.715517   0.704783  0.715517          0.521739   
9    logistic_reg_balanced  0.784483   0.783371  0.784483          0.691358   
10  decision_tree_balanced  0.646552   0.627807  0.646552          0.422535   
11  random_forest_balanced  0.741379   0.734245  0.7

In [21]:
sample_weight = compute_sample_weight(class_weight='balanced', y=y_train)

adaboost_balanced = AdaBoostClassifier(n_estimators=50)

# Entraîner le modèle avec les poids des échantillons
adaboost_balanced.fit(X_train, y_train, sample_weight=sample_weight)

df_metrics_val = dataframe_metrics(adaboost_balanced,X_val, y_val, model_name="adaboost_balanced",df_metrics=df_metrics_val)
print(df_metrics_val)

                     Model  Accuracy  Precision    Recall  F1 Score class 1  \
0             logistic_reg  0.758621   0.755808  0.758621          0.588235   
1            decision_tree  0.655172   0.644736  0.655172          0.473684   
2            random_forest  0.732759   0.724754  0.732759          0.550725   
3                 adaboost  0.793103   0.790879  0.793103          0.666667   
4                  xgboost  0.732759   0.724561  0.732759          0.575342   
5                     lgbm  0.724138   0.715678  0.724138          0.567568   
6               linear_svm  0.750000   0.746795  0.750000          0.567164   
7                      svm  0.758621   0.758863  0.758621          0.575758   
8                      KNN  0.715517   0.704783  0.715517          0.521739   
9    logistic_reg_balanced  0.784483   0.783371  0.784483          0.691358   
10  decision_tree_balanced  0.646552   0.627807  0.646552          0.422535   
11  random_forest_balanced  0.741379   0.734245  0.7

In [22]:
lgbm_balanced = LGBMClassifier(class_weight='balanced', n_estimators=100)

# Entraîner le modèle
lgbm_balanced.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(lgbm_balanced,X_val, y_val, model_name="lgbm_balanced",df_metrics=df_metrics_val)
print(df_metrics_val)

[LightGBM] [Info] Number of positive: 187, number of negative: 350


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000089 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 553
[LightGBM] [Info] Number of data points in the train set: 537, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
                     Model  Accuracy  Precision    Recall  F1 Score class 1  \
0             logistic_reg  0.758621   0.755808  0.758621          0.588235   
1            decision_tree  0.655172   0.644736  0.655172          0.473684   
2            random_forest  0.732759   0.724754  0.732759          0.550725   
3                 adaboost  0.793103   0.790879  0.793103          0.666667   
4                  xgboost  0.732759   0.724561  0.732759          0.575342   
5                     lgbm  0.724138   0.715678  0.724138          0.567568   
6               linear_svm  0

In [23]:
svm_balanced = SVC(class_weight='balanced')

# Entraîner le modèle
svm_balanced.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(svm_balanced,X_val, y_val, model_name="svm_balanced",df_metrics=df_metrics_val)
print(df_metrics_val)

                     Model  Accuracy  Precision    Recall  F1 Score class 1  \
0             logistic_reg  0.758621   0.755808  0.758621          0.588235   
1            decision_tree  0.655172   0.644736  0.655172          0.473684   
2            random_forest  0.732759   0.724754  0.732759          0.550725   
3                 adaboost  0.793103   0.790879  0.793103          0.666667   
4                  xgboost  0.732759   0.724561  0.732759          0.575342   
5                     lgbm  0.724138   0.715678  0.724138          0.567568   
6               linear_svm  0.750000   0.746795  0.750000          0.567164   
7                      svm  0.758621   0.758863  0.758621          0.575758   
8                      KNN  0.715517   0.704783  0.715517          0.521739   
9    logistic_reg_balanced  0.784483   0.783371  0.784483          0.691358   
10  decision_tree_balanced  0.646552   0.627807  0.646552          0.422535   
11  random_forest_balanced  0.741379   0.734245  0.7

In [24]:
knn_balanced = KNeighborsClassifier(weights='distance')

# Entraîner le modèle
knn_balanced.fit(X_train, y_train)

df_metrics_val = dataframe_metrics(knn_balanced,X_val, y_val, model_name="knn_balanced",df_metrics=df_metrics_val)
print(df_metrics_val)

                     Model  Accuracy  Precision    Recall  F1 Score class 1  \
0             logistic_reg  0.758621   0.755808  0.758621          0.588235   
1            decision_tree  0.655172   0.644736  0.655172          0.473684   
2            random_forest  0.732759   0.724754  0.732759          0.550725   
3                 adaboost  0.793103   0.790879  0.793103          0.666667   
4                  xgboost  0.732759   0.724561  0.732759          0.575342   
5                     lgbm  0.724138   0.715678  0.724138          0.567568   
6               linear_svm  0.750000   0.746795  0.750000          0.567164   
7                      svm  0.758621   0.758863  0.758621          0.575758   
8                      KNN  0.715517   0.704783  0.715517          0.521739   
9    logistic_reg_balanced  0.784483   0.783371  0.784483          0.691358   
10  decision_tree_balanced  0.646552   0.627807  0.646552          0.422535   
11  random_forest_balanced  0.741379   0.734245  0.7

In [25]:
df_metrics_val.sort_values(by='F1 Score class 1', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score class 1,F1 Score global weighted
9,logistic_reg_balanced,0.784483,0.783371,0.784483,0.691358,0.783866
13,adaboost_balanced,0.767241,0.771748,0.767241,0.682353,0.768974
3,adaboost,0.793103,0.790879,0.793103,0.666667,0.785201
15,svm_balanced,0.724138,0.739935,0.724138,0.644444,0.728628
0,logistic_reg,0.758621,0.755808,0.758621,0.588235,0.744076
7,svm,0.758621,0.758863,0.758621,0.575758,0.740995
4,xgboost,0.732759,0.724561,0.732759,0.575342,0.723848
11,random_forest_balanced,0.741379,0.734245,0.741379,0.571429,0.72879
5,lgbm,0.724138,0.715678,0.724138,0.567568,0.71621
12,xgboost_balanced,0.724138,0.715678,0.724138,0.567568,0.71621


# Train with the best weights

In [26]:
weights_range = np.arange(4.0, 10, 0.001)  # précision très fine

best_model = None
best_f1_score = -np.inf
best_weights = None

for weight in weights_range:
    class_weights = {0: 1, 1: weight}
    logreg = LogisticRegression(class_weight=class_weights, solver='liblinear', max_iter=1000, random_state=42)
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_val)
    f1 = f1_score(y_val, y_pred, pos_label=1)

    print(f"Poids 1: {weight:.3f} - F1 Score: {f1:.4f}")

    if f1 > best_f1_score:
        best_f1_score = f1
        best_model = logreg
        best_weights = class_weights

print("\n🏆 Meilleur modèle ultra-précis :")
print(f"F1 Score : {best_f1_score:.4f}")
print(f"Poids optimaux : {best_weights}")

df_metrics_val = dataframe_metrics(best_model, X_val, y_val, model_name="logreg_best_weights", df_metrics=df_metrics_val)
print(df_metrics_val)


Poids 1: 4.000 - F1 Score: 0.7273
Poids 1: 4.001 - F1 Score: 0.7273
Poids 1: 4.002 - F1 Score: 0.7273
Poids 1: 4.003 - F1 Score: 0.7273
Poids 1: 4.004 - F1 Score: 0.7273
Poids 1: 4.005 - F1 Score: 0.7273
Poids 1: 4.006 - F1 Score: 0.7273
Poids 1: 4.007 - F1 Score: 0.7273
Poids 1: 4.008 - F1 Score: 0.7273
Poids 1: 4.009 - F1 Score: 0.7273
Poids 1: 4.010 - F1 Score: 0.7273
Poids 1: 4.011 - F1 Score: 0.7273
Poids 1: 4.012 - F1 Score: 0.7273
Poids 1: 4.013 - F1 Score: 0.7273
Poids 1: 4.014 - F1 Score: 0.7273
Poids 1: 4.015 - F1 Score: 0.7273
Poids 1: 4.016 - F1 Score: 0.7273
Poids 1: 4.017 - F1 Score: 0.7273
Poids 1: 4.018 - F1 Score: 0.7273
Poids 1: 4.019 - F1 Score: 0.7273
Poids 1: 4.020 - F1 Score: 0.7273
Poids 1: 4.021 - F1 Score: 0.7273
Poids 1: 4.022 - F1 Score: 0.7273
Poids 1: 4.023 - F1 Score: 0.7273
Poids 1: 4.024 - F1 Score: 0.7273
Poids 1: 4.025 - F1 Score: 0.7273
Poids 1: 4.026 - F1 Score: 0.7273
Poids 1: 4.027 - F1 Score: 0.7273
Poids 1: 4.028 - F1 Score: 0.7273
Poids 1: 4.029

In [27]:
df_metrics_val.sort_values(by='F1 Score class 1', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score class 1,F1 Score global weighted
17,logreg_best_weights,0.767241,0.810196,0.767241,0.727273,0.77235
9,logistic_reg_balanced,0.784483,0.783371,0.784483,0.691358,0.783866
13,adaboost_balanced,0.767241,0.771748,0.767241,0.682353,0.768974
3,adaboost,0.793103,0.790879,0.793103,0.666667,0.785201
15,svm_balanced,0.724138,0.739935,0.724138,0.644444,0.728628
0,logistic_reg,0.758621,0.755808,0.758621,0.588235,0.744076
7,svm,0.758621,0.758863,0.758621,0.575758,0.740995
4,xgboost,0.732759,0.724561,0.732759,0.575342,0.723848
11,random_forest_balanced,0.741379,0.734245,0.741379,0.571429,0.72879
12,xgboost_balanced,0.724138,0.715678,0.724138,0.567568,0.71621


In [28]:

# Liste des poids à tester entre 9.0 et 11.0 avec un pas de 0.5
weights_range = np.arange(4, 10, 0.01)

best_model = None
best_f1_score = -np.inf
best_weights = None

for w in weights_range:
    class_weights = {0: 1, 1: w}
    sample_weight = compute_sample_weight(class_weight=class_weights, y=y_train)

    ada_model = AdaBoostClassifier(n_estimators=50, random_state=42, learning_rate=1.0)
    ada_model.fit(X_train, y_train, sample_weight=sample_weight)

    y_pred = ada_model.predict(X_val)
    f1 = f1_score(y_val, y_pred, pos_label=1)

    print(f"Poids classe 1: {w:.1f} - F1 Score: {f1:.4f}")

    if f1 > best_f1_score:
        best_f1_score = f1
        best_model = ada_model
        best_weights = class_weights

print("\n✅ Meilleur modèle AdaBoost avec poids entre 9 et 11 :")
print(f"F1 Score : {best_f1_score:.4f}")
print(f"Poids optimaux : {best_weights}")

# Évaluer et stocker les métriques du meilleur modèle
df_metrics_val = dataframe_metrics(best_model, X_val, y_val, model_name="adaboost_best_weights", df_metrics=df_metrics_val)
print(df_metrics_val)


Poids classe 1: 4.0 - F1 Score: 0.6875
Poids classe 1: 4.0 - F1 Score: 0.6875
Poids classe 1: 4.0 - F1 Score: 0.6875
Poids classe 1: 4.0 - F1 Score: 0.6869
Poids classe 1: 4.0 - F1 Score: 0.6869
Poids classe 1: 4.0 - F1 Score: 0.6869
Poids classe 1: 4.1 - F1 Score: 0.6869
Poids classe 1: 4.1 - F1 Score: 0.6869
Poids classe 1: 4.1 - F1 Score: 0.6869
Poids classe 1: 4.1 - F1 Score: 0.6869
Poids classe 1: 4.1 - F1 Score: 0.6869
Poids classe 1: 4.1 - F1 Score: 0.6931
Poids classe 1: 4.1 - F1 Score: 0.6931
Poids classe 1: 4.1 - F1 Score: 0.6931
Poids classe 1: 4.1 - F1 Score: 0.6931
Poids classe 1: 4.1 - F1 Score: 0.6931
Poids classe 1: 4.2 - F1 Score: 0.6931
Poids classe 1: 4.2 - F1 Score: 0.6931
Poids classe 1: 4.2 - F1 Score: 0.6931
Poids classe 1: 4.2 - F1 Score: 0.6931
Poids classe 1: 4.2 - F1 Score: 0.6869
Poids classe 1: 4.2 - F1 Score: 0.7000
Poids classe 1: 4.2 - F1 Score: 0.7000
Poids classe 1: 4.2 - F1 Score: 0.6931
Poids classe 1: 4.2 - F1 Score: 0.6931
Poids classe 1: 4.2 - F1 

In [29]:
df_metrics_val.sort_values(by='F1 Score class 1', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score class 1,F1 Score global weighted
17,logreg_best_weights,0.767241,0.810196,0.767241,0.727273,0.77235
18,adaboost_best_weights,0.741379,0.788166,0.741379,0.7,0.747022
9,logistic_reg_balanced,0.784483,0.783371,0.784483,0.691358,0.783866
13,adaboost_balanced,0.767241,0.771748,0.767241,0.682353,0.768974
3,adaboost,0.793103,0.790879,0.793103,0.666667,0.785201
15,svm_balanced,0.724138,0.739935,0.724138,0.644444,0.728628
0,logistic_reg,0.758621,0.755808,0.758621,0.588235,0.744076
7,svm,0.758621,0.758863,0.758621,0.575758,0.740995
4,xgboost,0.732759,0.724561,0.732759,0.575342,0.723848
11,random_forest_balanced,0.741379,0.734245,0.741379,0.571429,0.72879


# Hypertuning

In [30]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [31]:
logreg = LogisticRegression(
    solver='saga',
    max_iter=2000,
    random_state=42,
    class_weight={0: 1, 1: 4}
)

# Espace des hyperparamètres
param_distributions = {
    'C': loguniform(1e-3, 1e3),
    'penalty': ['l1', 'l2', 'elasticnet'],
    'l1_ratio': [0.0, 0.25, 0.5, 0.75, 1.0]  # uniquement utilisé avec elasticnet
}

# Randomized Search
random_search = RandomizedSearchCV(
    estimator=logreg,
    param_distributions=param_distributions,
    n_iter=50,
    cv=cv,
    scoring=make_scorer(f1_score, pos_label=1),
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

# Résultats
print("Best set of hyperparameters: ", random_search.best_params_)
print("Best score (mean CV accuracy): ", random_search.best_score_)

logreg_best = random_search.best_estimator_
y_pred = logreg_best.predict(X_val)

print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("F1 score global (weighted):", f1_score(y_val, y_pred, average='weighted'))
print("F1 score class 1:", f1_score(y_val, y_pred, pos_label=1))

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best set of hyperparameters:  {'C': np.float64(24.658329458549105), 'l1_ratio': 1.0, 'penalty': 'l1'}
Best score (mean CV accuracy):  0.6530204278384892
Validation accuracy: 0.7672413793103449
F1 score global (weighted): 0.7723501543828223
F1 score class 1: 0.7272727272727273




In [32]:

logreg = LogisticRegression(
    solver='saga',
    max_iter=2000,
    random_state=42,
    class_weight={0: 1, 1: 4.0}
)

# ⚠️ l1_ratio n’est pris en compte que si penalty='elasticnet'
# On doit donc faire un grid cohérent selon les pénalités

param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': np.arange(22, 26, 0.1),
    'l1_ratio': [0.0, 0.25, 0.5, 0.75, 1.0]  # sera ignoré sauf si penalty='elasticnet'
}

# F1 Score (positif sur la classe 1 sous-représentée)
scorer = make_scorer(f1_score, pos_label=1)

grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    cv=cv,
    scoring=scorer,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

# Résultats
print("Meilleurs hyperparamètres :", grid_search.best_params_)
print("Meilleur F1 (classe 1) moyenne en CV :", grid_search.best_score_)

logreg_best = grid_search.best_estimator_
y_pred = logreg_best.predict(X_val)

print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("F1 score global (weighted):", f1_score(y_val, y_pred, average='weighted'))
print("F1 score class 1:", f1_score(y_val, y_pred, pos_label=1))


Fitting 5 folds for each of 600 candidates, totalling 3000 fits
Meilleurs hyperparamètres : {'C': np.float64(22.0), 'l1_ratio': 0.0, 'penalty': 'l1'}
Meilleur F1 (classe 1) moyenne en CV : 0.6530204278384892
Validation accuracy: 0.7672413793103449
F1 score global (weighted): 0.7723501543828223
F1 score class 1: 0.7272727272727273




In [33]:
df_metrics_val = dataframe_metrics(logreg_best, X_val, y_val, model_name="logreg_best", df_metrics=df_metrics_val)
df_metrics_val.sort_values(by='F1 Score class 1', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score class 1,F1 Score global weighted
19,logreg_best,0.767241,0.810196,0.767241,0.727273,0.77235
17,logreg_best_weights,0.767241,0.810196,0.767241,0.727273,0.77235
18,adaboost_best_weights,0.741379,0.788166,0.741379,0.7,0.747022
9,logistic_reg_balanced,0.784483,0.783371,0.784483,0.691358,0.783866
13,adaboost_balanced,0.767241,0.771748,0.767241,0.682353,0.768974
3,adaboost,0.793103,0.790879,0.793103,0.666667,0.785201
15,svm_balanced,0.724138,0.739935,0.724138,0.644444,0.728628
0,logistic_reg,0.758621,0.755808,0.758621,0.588235,0.744076
7,svm,0.758621,0.758863,0.758621,0.575758,0.740995
4,xgboost,0.732759,0.724561,0.732759,0.575342,0.723848


# Adaboost

In [36]:
class_weights = {0: 1, 1: 4.02}  # Poids manuels pour chaque classe

# Calculer les poids d'échantillons à l'aide des poids de classe
sample_weight = compute_sample_weight(class_weight=class_weights, y=y_train)



# Définir les hyperparamètres à tester dans RandomizedSearchCV
param_dist = {
    'n_estimators': [50, 100, 200, 300],        # Nombre d'estimateurs
    'learning_rate': [0.1, 0.5, 1.0, 1.5],      # Taux d'apprentissage
}

# Créer un modèle AdaBoost
ada_model = AdaBoostClassifier(random_state=42)

# Configurer RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=ada_model, 
    param_distributions=param_dist, 
    n_iter=10,               # Nombre d'itérations aléatoires
    scoring='accuracy',      # Mesure de la performance (ici, l'accuracy)
    cv=5,                    # Cross-validation avec 5 splits
    random_state=42,         # Fixer la graine pour la reproductibilité
    n_jobs=-1                # Utiliser tous les cœurs du CPU
)

# Effectuer la recherche aléatoire avec les poids d'échantillons
random_search.fit(X_train, y_train, sample_weight=sample_weight)


# Afficher les meilleurs hyperparamètres trouvés
print("Meilleurs hyperparamètres :")
print(random_search.best_params_)

adaboost_best = random_search.best_estimator_
y_pred = adaboost_best.predict(X_val)

print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("F1 score global (weighted):", f1_score(y_val, y_pred, average='weighted'))
print("F1 score class 1:", f1_score(y_val, y_pred, pos_label=1))

Meilleurs hyperparamètres :
{'n_estimators': 100, 'learning_rate': 1.0}
Validation accuracy: 0.7241379310344828
F1 score global (weighted): 0.7301567398119123
F1 score class 1: 0.68
