# Bibliothèques

In [1]:
%load_ext autoreload
%autoreload 2

# Importations de bibliothèques standard
import os
import sys
import warnings

# Configuration du chemin pour les importations
sys.path.append("../")

import pandas as pd 
import numpy as np


from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.base import clone
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from imblearn.ensemble import EasyEnsembleClassifier, BalancedRandomForestClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier


from imblearn.pipeline import Pipeline as imPipeline
from sklearn.metrics import classification_report, confusion_matrix, auc, accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve

from tqdm import tqdm

from utils.functions import *
import joblib


# Directory where intermediate training checkpoints are written.
checkpoint_dir = "../models/checkpoints/"

# Create the directory (and any missing parents). `exist_ok=True` makes the
# call a no-op when the directory already exists, so re-running the cell is safe.
# (The previous keyword `existant=True` is not accepted by os.makedirs and
# raised a TypeError.)
os.makedirs(checkpoint_dir, exist_ok=True)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Force a reload of the project helper module so that edits made to
# utils/functions.py are picked up without restarting the kernel.
import importlib
import utils.functions 
importlib.reload(utils.functions)

# Re-run the star import so the names bound in this namespace point at the
# reloaded definitions.
from utils.functions import *  # refreshes the function definitions



# 1 - Les données

**Données** : "Credit Card Fraud Detection"<br> 
**Dimensions des données** : L'ensemble de données comprend 284,807 lignes et 31 colonnes. Les variables sont les suivantes : 

- **Time** : Nombre de secondes écoulées depuis la première transaction dans l'ensemble de données.
- **V1, V2, ..., V28** : Caractéristiques techniques résultant d'une ACP (Analyse en Composantes Principales) pour protéger la confidentialité des données.
- **Amount** : Montant de la transaction.
- **Class** : Classe cible où 1 représente une transaction frauduleuse et 0 une transaction non frauduleuse.


In [33]:
# Load the gzip-compressed transactions file and preview the first rows.
csv_path = "../data/creditcard.csv.gz"
data = pd.read_csv(csv_path, compression="gzip")
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## 1.1 Préparation des données pour la modélisation.
### 1.1.1 Robust scaler sur l'Amount
En présence de valeurs aberrantes, l'utilisation du RobustScaler permet de réduire leur impact sur la normalisation.

In [4]:
# Build the feature matrix: every column of `data` except "Time"; the target
# column "Class" is popped out of X into a NumPy array.
# NOTE(review): the preview of `data` shows an "Unnamed: 0" column; it is not
# dropped here, so it stays in X as a feature - confirm this is intended.
X = data.drop(columns="Time")  # drop() already returns a new frame
y = X.pop("Class").to_numpy()

# Robust-scale "Amount" in place.
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# train/test split performed afterwards - confirm this is acceptable.
rs = RobustScaler()
X[['Amount']] = rs.fit_transform(X[['Amount']])

### 1.1.2 - Séparation des données en données d'apprentissage et données de test

In [5]:
# Stratified 80/20 hold-out split (stratify=y keeps the class ratio in both parts).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=42,
)

In [6]:
# Relative class frequencies in each partition, to check the split kept the ratio.
train_class_share = pd.Series(y_train).value_counts(normalize=True)
test_class_share = pd.Series(y_test).value_counts(normalize=True)

print("repartition des transactions dans les données d'entrainements")
display(train_class_share)
print("repartition des transactions dans les données de test")
test_class_share

repartition des transactions dans les données d'entrainements


0    0.998271
1    0.001729
Name: proportion, dtype: float64

repartition des transactions dans les données de test


0    0.99828
1    0.00172
Name: proportion, dtype: float64

La proportion des transactions frauduleuses ou non est respectée dans les données d'entrainement et les données de test.

# 2 - Modélisation avec la Régression Logistique
## 2.1 - 1er modèle : Régression Logistique



In [7]:
# Baseline model: logistic regression using scikit-learn's "balanced" class
# weighting, fitted on the training partition and persisted under the key "RL_01".
baseline_params = dict(random_state=42, max_iter=1000, class_weight="balanced")
RL = LogisticRegression(**baseline_params)
RL.fit(X_train, y_train)
save_final_model("../models/models_fitted/", "RL_01", {"RL_01": RL})

Models saved in ../models/models_fitted/RL_01.pkl


## 2.2 Variation des poids de la classe minoritaire et effet sur le modèle.

In [8]:
# Sweep the weight given to class 1 (class 0 stays at weight 1) and keep one
# fitted logistic regression per setting, keyed by "LR_<class_weight dict>".
weights = [2, 3, 4, 5, 7, 10, 15, 20, 50, 100, 500]
LR_with_weight_fitted = {}

for minority_weight in weights:
    class_weight = {0: 1, 1: minority_weight}
    RL = LogisticRegression(
        random_state=42,
        max_iter=1000,
        class_weight=class_weight,
    )
    RL.fit(X_train, y_train)
    LR_with_weight_fitted[f"LR_{class_weight}"] = RL

save_final_model("../models/models_fitted/", "LR_with_weight_fitted", LR_with_weight_fitted)

Models saved in ../models/models_fitted/LR_with_weight_fitted.pkl


# 3 - Autres modèles robustes aux données déséquilibrées.

In [9]:
# 
# Candidate classifiers, keyed by display name. Every estimator that is given
# a seed here uses random_state=42.
models = dict(
    KNN=KNeighborsClassifier(),
    SVM=SVC(probability=True, random_state=42),
    XGBoost=XGBClassifier(random_state=42),
    CatBoost=CatBoostClassifier(verbose=0, random_state=42),
    LightGBM=LGBMClassifier(random_state=42, verbose=-1),
    RandomForest=RandomForestClassifier(random_state=42),
    BalancedRandomForest=BalancedRandomForestClassifier(random_state=42),
    EasyEnsemble=EasyEnsembleClassifier(random_state=42),
)

# Hyper-parameter grids for the search; one entry per key of `models`.
param_grids = dict(
    KNN=dict(
        n_neighbors=[3, 5, 7, 9],
        weights=['uniform', 'distance'],
    ),
    SVM=dict(
        C=[0.1, 1, 10],
        kernel=['linear', 'rbf'],
        gamma=['scale', 'auto'],
    ),
    XGBoost=dict(
        n_estimators=[100, 200, 500],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 6, 10],
    ),
    CatBoost=dict(
        iterations=[100, 200, 500],
        learning_rate=[0.01, 0.1, 0.2],
        depth=[3, 6, 10],
    ),
    LightGBM=dict(
        n_estimators=[100, 200, 500],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 6, 10],
    ),
    RandomForest=dict(
        n_estimators=[100, 200, 500],
        max_depth=[None, 10, 20],
        min_samples_split=[2, 5],
    ),
    BalancedRandomForest=dict(
        n_estimators=[100, 200, 500],
        max_depth=[None, 10, 20],
    ),
    EasyEnsemble=dict(
        n_estimators=[50, 100, 500],
    ),
)

In [10]:
# Resume from the checkpoint: `models_fitted` holds the best estimator per
# model name and `results` the (best_params, best_score) pair.
model_name_in_folder = "various_models_fitted_with_best_parameters"
models_fitted, results = load_checkpoint_model(
    checkpoint_dir=checkpoint_dir,
    model_name_in_folder=model_name_in_folder,
)

for model_name, model in tqdm(models.items()):
    if model_name not in models_fitted:
        print(f"Searching for best parameters for {model_name}...")
        # 5-fold grid search optimising the F1 score, using all CPU cores.
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grids[model_name],
            scoring="f1",
            cv=5,
            n_jobs=-1,
            verbose=1,
        )
        grid_search.fit(X_train, y_train)
        results[model_name] = (grid_search.best_params_, grid_search.best_score_)
        models_fitted[model_name] = grid_search.best_estimator_

        # Persist progress after each model so an interrupted run can resume.
        save_checkpoint(checkpoint_dir, model_name_in_folder, models_fitted, results)
    else:
        # Already present in the checkpoint: nothing to do for this model.
        print(f"{model_name} has already been trained. Skipping.")
print("End")
# Final save of the dictionary of fitted models.
save_final_model("../models/models_fitted/", model_name_in_folder, models_fitted)

Checkpoint loaded from ../models/checkpoints/various_models_fitted_with_best_parameters_checkpoint.pkl and ../models/checkpoints/various_models_fitted_with_best_parameters_results.pkl


100%|██████████| 8/8 [00:00<00:00, 59283.45it/s]

KNN has already been trained. Skipping.
SVM has already been trained. Skipping.
XGBoost has already been trained. Skipping.
CatBoost has already been trained. Skipping.
LightGBM has already been trained. Skipping.
RandomForest has already been trained. Skipping.
BalancedRandomForest has already been trained. Skipping.
EasyEnsemble has already been trained. Skipping.
End





Models saved in ../models/models_fitted/various_models_fitted_with_best_parameters.pkl


In [11]:
# Print the best hyper-parameters and the best cross-validated score per model.
for model_name, summary in results.items():
    best_params, best_score = summary
    print(f"{model_name}: Best Params: {best_params}, Best Score: {best_score:.4f}")

KNN: Best Params: {'n_neighbors': 3, 'weights': 'distance'}, Best Score: 0.8430
SVM: Best Params: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}, Best Score: 0.8285
XGBoost: Best Params: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 500}, Best Score: 0.8647
CatBoost: Best Params: {'depth': 6, 'iterations': 500, 'learning_rate': 0.2}, Best Score: 0.8652
LightGBM: Best Params: {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 500}, Best Score: 0.8443
RandomForest: Best Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}, Best Score: 0.8435
BalancedRandomForest: Best Params: {'max_depth': 10, 'n_estimators': 200}, Best Score: 0.1132
EasyEnsemble: Best Params: {'n_estimators': 50}, Best Score: 0.0901


# 4 - Techniques d'ensemble learning


In [12]:
# Load the tuned models saved by the grid-search step.
Various_models = load_final_model("../models/models_fitted/", 'various_models_fitted_with_best_parameters')

# Metrics of every tuned model on the hold-out set.
# NOTE(review): the ranking below is computed on X_test/y_test, so the choice
# of ensemble members uses test data - confirm this is acceptable.
models_pr_dict_VM = {
    model_name: calculate_metrics(model, X_test, y_test)
    for model_name, model in Various_models.items()
}


def _by_auc_pr_then_f1(item):
    """Sort key: highest 'auc_pr' first, ties broken by highest 'f1'."""
    metrics = item[1]
    return (-metrics['auc_pr'], -metrics['f1'])


# 1. Rank the models.
sorted_models = sorted(models_pr_dict_VM.items(), key=_by_auc_pr_then_f1)

# 2. The four best-ranked models become the base estimators.
top_models_metrics = dict(sorted_models[:4])
top_model_names = list(top_models_metrics)

# 3. Look the corresponding estimators up in Various_models.
top_models = {name: Various_models[name] for name in top_model_names}

# 4. The fifth-ranked model is used as the stacking meta-learner.
final_estimator_name = sorted_models[4][0]
final_estimator = Various_models[final_estimator_name]

# Ensembles built on the selected base estimators.
voting_model = VotingClassifier(
    estimators=list(top_models.items()),
    voting='soft',
)

stacking_model = StackingClassifier(
    estimators=list(top_models.items()),
    final_estimator=final_estimator,
)

# Ensemble models keyed by display name.
ensemble_models = {
    'Voting': voting_model,
    'Stacking': stacking_model,
}


Final models loaded from ../models/models_fitted/various_models_fitted_with_best_parameters.pkl 




In [13]:
# Resume from the checkpoint of fitted ensembles and their metrics.
model_name_in_folder = "ensemble_models_fitted_with_best_parameters"
ensemble_models_fitted, models_pr_dict_ensemble = load_checkpoint_model(
    checkpoint_dir=checkpoint_dir,
    model_name_in_folder=model_name_in_folder,
)

for model_name, model in ensemble_models.items():
    if model_name not in models_pr_dict_ensemble:
        # Fit on the training partition and evaluate on the hold-out set.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        models_pr_dict_ensemble[model_name] = calculate_metrics(model, X_test, y_test)
        ensemble_models_fitted[model_name] = model
        print(f"\nClassification Report for {model_name}:\n")
        print(classification_report(y_test, y_pred))

        # Persist progress after each ensemble.
        save_checkpoint(checkpoint_dir, model_name_in_folder, ensemble_models_fitted, models_pr_dict_ensemble)
    else:
        # Metrics already stored in the checkpoint: skip this ensemble.
        print(f"{model_name} has already been trained. Skipping.")

# Final save of the dictionary of fitted ensembles.
save_final_model("../models/models_fitted/", model_name_in_folder, ensemble_models_fitted)

Checkpoint loaded from ../models/checkpoints/ensemble_models_fitted_with_best_parameters_checkpoint.pkl and ../models/checkpoints/ensemble_models_fitted_with_best_parameters_results.pkl
Voting has already been trained. Skipping.
Stacking has already been trained. Skipping.
Models saved in ../models/models_fitted/ensemble_models_fitted_with_best_parameters.pkl


# 5 - Rééchantillonnage

## 5.1 - Undersampling

In [14]:
# Randomly undersample the full dataset with sampling_strategy=0.1, then show
# the resulting class counts.
rus = RandomUnderSampler(random_state=42, sampling_strategy=0.1)
X_rus, y_rus = rus.fit_resample(X, y)
rus_class_counts = pd.Series(y_rus).value_counts()
rus_class_counts

0    4920
1     492
Name: count, dtype: int64

In [15]:
# Stratified 70/30 split of the undersampled data, then the class counts of
# the training part followed by those of the test part.
X_train_rus, X_test_rus, y_train_rus, y_test_rus = train_test_split(
    X_rus,
    y_rus,
    train_size=0.7,
    stratify=y_rus,
    random_state=42,
)
for labels in (y_train_rus, y_test_rus):
    print(pd.Series(labels).value_counts())

0    3444
1     344
Name: count, dtype: int64
0    1476
1     148
Name: count, dtype: int64


In [17]:
# Full dataset (features + label) used to look up the rows that the
# undersampler discarded.
df = X.copy()
df['Class'] = y

# Apply the undersampling (same parameters as the earlier undersampling cell).
rus = RandomUnderSampler(sampling_strategy=0.1, random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)

# NOTE(review): this re-binds the *_rus split variables with a split that is
# NOT stratified, unlike the split made in the preceding cell - confirm which
# one downstream cells are meant to use.
X_train_rus, X_test_rus, y_train_rus, y_test_rus = train_test_split(X_rus, y_rus, train_size=0.7, random_state=42)

# DataFrame of the retained (undersampled) examples.
df_rus = pd.DataFrame(X_rus, columns=X.columns)
df_rus['Class'] = y_rus

# Labels of the rows kept by the sampler. `sample_indices_` holds the
# positions (in the data passed to fit_resample) of the selected samples, so
# this is correct whether or not fit_resample preserves the original index
# on its output. The previous `df_rus.index` was only right if it did.
indices_retained = df.index[rus.sample_indices_]

# Labels of the rows that were discarded by the sampler.
indices_non_retenus = np.setdiff1d(df.index, indices_retained)

# DataFrame of the discarded examples.
df_non_retenus = df.loc[indices_non_retenus]

# Split the discarded examples into features and labels.
X_not_retained = df_non_retenus.drop(columns='Class')
y_not_retained = df_non_retenus['Class']

In [18]:
# Train the default-parameter models on the undersampled training split,
# resuming from a checkpoint when one exists.

model_name_in_folder = "RUS_01_models_fitted_with_best_parameters"

# Load previously saved models and metrics.
trained_models_rus, models_pr_dict_RUS = load_checkpoint_model(checkpoint_dir=checkpoint_dir, model_name_in_folder=model_name_in_folder)

# Untrained estimators, keyed by display name.
models = {
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(probability=True, random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(verbose=0,random_state=42),
    "LightGBM": LGBMClassifier(random_state=42, verbose=-1),
    "RandomForest": RandomForestClassifier(random_state=42)
}

# Training loop.
for model_name, model in models.items():
    if model_name in trained_models_rus :
        print(f"{model_name} has already been trained. Loading from the dictionary.")
        continue 

    model.fit(X_train_rus, y_train_rus)

    # The explicit predict() call whose result was never used has been
    # removed; metrics are computed by calculate_metrics below.
    trained_models_rus[model_name] = model
    models_pr_dict_RUS[model_name] = calculate_metrics(model, X_test_rus, y_test_rus)

    # Persist progress after each model.
    save_checkpoint(checkpoint_dir, model_name_in_folder, trained_models_rus, models_pr_dict_RUS)

# Final save of the dictionary of fitted models.
save_final_model("../models/models_fitted/", model_name_in_folder, trained_models_rus)


Checkpoint loaded from ../models/checkpoints/RUS_01_models_fitted_with_best_parameters_checkpoint.pkl and ../models/checkpoints/RUS_01_models_fitted_with_best_parameters_results.pkl
KNN has already been trained. Loading from the dictionary.
SVM has already been trained. Loading from the dictionary.
XGBoost has already been trained. Loading from the dictionary.
CatBoost has already been trained. Loading from the dictionary.
LightGBM has already been trained. Loading from the dictionary.
RandomForest has already been trained. Loading from the dictionary.
Models saved in ../models/models_fitted/RUS_01_models_fitted_with_best_parameters.pkl


In [19]:
# Untrained model templates. A fresh clone is fitted for every sampling
# strategy so that each stored model is an independent object.
models = {
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(probability=True, random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=42),
    "LightGBM": LGBMClassifier(random_state=42, verbose=-1),
    "RandomForest": RandomForestClassifier(random_state=42)
}

# Undersampling ratios to compare (passed as RandomUnderSampler's sampling_strategy).
sampling_strategies = [0.1, 0.2, 0.3, 0.4]
model_name_in_folder = "RUS-sampling_strategies_models_fitted_with_best_parameters"

# Load previously saved models and results.
trained_models_rus, results = load_checkpoint_model(checkpoint_dir=checkpoint_dir, model_name_in_folder=model_name_in_folder)

for sampling_strategy in sampling_strategies:
    strategy_key = str(sampling_strategy)

    # Undersample the full dataset with the current ratio.
    rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)
    X_res, y_res = rus.fit_resample(X, y)

    # 70/30 split of the resampled data. Dedicated names are used so the
    # notebook-level X_train / X_test / y_train / y_test are not overwritten.
    X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(
        X_res, y_res, test_size=0.3, random_state=42
    )

    # Per-strategy containers for metrics and fitted models.
    model_performance = {}
    models_dict = {}

    for model_name, model in tqdm(models.items()):
        already_trained = (
            strategy_key in trained_models_rus
            and model_name in trained_models_rus[strategy_key]
        )
        if already_trained:
            print(f"{model_name} has already been trained with sampling strategy {sampling_strategy}. Skipping.")
            # Reuse the fitted model from the checkpoint; it is only re-evaluated.
            fitted_model = trained_models_rus[strategy_key][model_name]
        else:
            # Fit an unfitted copy. Fitting `model` itself would store the
            # same object under every strategy, and each refit would
            # overwrite what was learnt for the previous strategies.
            fitted_model = clone(model)
            fitted_model.fit(X_train_res, y_train_res)

        models_dict[model_name] = fitted_model

        # Predictions and positive-class probabilities on the test split.
        y_pred = fitted_model.predict(X_test_res)
        y_prob = fitted_model.predict_proba(X_test_res)[:, 1]

        # F1 and recall of the positive class.
        report = classification_report(y_test_res, y_pred, output_dict=True)
        f1_score_ = report['1']['f1-score']
        recall_ = report['1']['recall']

        # Area under the precision-recall curve.
        precision, recall_curve, _ = precision_recall_curve(y_test_res, y_prob)
        auc_pr = auc(recall_curve, precision)

        model_performance[model_name] = {
            'F1 Score': f1_score_,
            'Recall': recall_,
            'AUC PR': auc_pr
        }

    # Record metrics and models for this ratio, then checkpoint.
    results[strategy_key] = model_performance
    trained_models_rus[strategy_key] = models_dict
    save_checkpoint(checkpoint_dir, model_name_in_folder, trained_models_rus, results)

# Final save of the dictionary of fitted models.
save_final_model("../models/models_fitted/", model_name_in_folder, trained_models_rus)


Checkpoint loaded from ../models/checkpoints/RUS-sampling_strategies_models_fitted_with_best_parameters_checkpoint.pkl and ../models/checkpoints/RUS-sampling_strategies_models_fitted_with_best_parameters_results.pkl


 33%|███▎      | 2/6 [00:00<00:00, 14.86it/s]

KNN has already been trained with sampling strategy 0.1. Skipping.
SVM has already been trained with sampling strategy 0.1. Skipping.
XGBoost has already been trained with sampling strategy 0.1. Skipping.
CatBoost has already been trained with sampling strategy 0.1. Skipping.
LightGBM has already been trained with sampling strategy 0.1. Skipping.
RandomForest has already been trained with sampling strategy 0.1. Skipping.


100%|██████████| 6/6 [00:00<00:00, 33.99it/s]


Checkpoint saved successfully at ../models/checkpoints/RUS-sampling_strategies_models_fitted_with_best_parameters_checkpoint.pkl and ../models/checkpoints/RUS-sampling_strategies_models_fitted_with_best_parameters_results.pkl


100%|██████████| 6/6 [00:00<00:00, 88.64it/s]


KNN has already been trained with sampling strategy 0.2. Skipping.
SVM has already been trained with sampling strategy 0.2. Skipping.
XGBoost has already been trained with sampling strategy 0.2. Skipping.
CatBoost has already been trained with sampling strategy 0.2. Skipping.
LightGBM has already been trained with sampling strategy 0.2. Skipping.
RandomForest has already been trained with sampling strategy 0.2. Skipping.
Checkpoint saved successfully at ../models/checkpoints/RUS-sampling_strategies_models_fitted_with_best_parameters_checkpoint.pkl and ../models/checkpoints/RUS-sampling_strategies_models_fitted_with_best_parameters_results.pkl


  0%|          | 0/6 [00:00<?, ?it/s]

KNN has already been trained with sampling strategy 0.3. Skipping.
SVM has already been trained with sampling strategy 0.3. Skipping.
XGBoost has already been trained with sampling strategy 0.3. Skipping.
CatBoost has already been trained with sampling strategy 0.3. Skipping.
LightGBM has already been trained with sampling strategy 0.3. Skipping.


100%|██████████| 6/6 [00:00<00:00, 90.62it/s]


RandomForest has already been trained with sampling strategy 0.3. Skipping.
Checkpoint saved successfully at ../models/checkpoints/RUS-sampling_strategies_models_fitted_with_best_parameters_checkpoint.pkl and ../models/checkpoints/RUS-sampling_strategies_models_fitted_with_best_parameters_results.pkl


100%|██████████| 6/6 [00:00<00:00, 121.25it/s]


KNN has already been trained with sampling strategy 0.4. Skipping.
SVM has already been trained with sampling strategy 0.4. Skipping.
XGBoost has already been trained with sampling strategy 0.4. Skipping.
CatBoost has already been trained with sampling strategy 0.4. Skipping.
LightGBM has already been trained with sampling strategy 0.4. Skipping.
RandomForest has already been trained with sampling strategy 0.4. Skipping.
Checkpoint saved successfully at ../models/checkpoints/RUS-sampling_strategies_models_fitted_with_best_parameters_checkpoint.pkl and ../models/checkpoints/RUS-sampling_strategies_models_fitted_with_best_parameters_results.pkl
Models saved in ../models/models_fitted/RUS-sampling_strategies_models_fitted_with_best_parameters.pkl


## 5.2 - SMOTE avec StratifiedKFold

In [20]:
# Containers for the best SMOTE pipelines and their results.
best_models_sm = {}
results_sm = {}

# Classifiers combined with SMOTE. The SVM entry is left commented out, as in
# the original notebook.
models = dict(
    KNN=KNeighborsClassifier(),
    # SVM=SVC(probability=True, random_state=42),
    XGBoost=XGBClassifier(random_state=42),
    CatBoost=CatBoostClassifier(verbose=0, random_state=42),
    RandomForest=RandomForestClassifier(random_state=42),
)

# Search grids: "smote__*" keys target the SMOTE step and "model__*" keys the
# classifier step of the pipeline.
param_grids = dict(
    KNN=dict(
        smote__sampling_strategy=[0.1, 0.3, 0.5],
        smote__k_neighbors=[3, 5, 7],
        model__n_neighbors=[3, 5, 7],
        model__weights=['uniform', 'distance'],
    ),
    # SVM=dict(
    #     smote__sampling_strategy=[0.1, 0.3, 0.5],
    #     smote__k_neighbors=[3, 5],
    #     model__C=[0.1, 1, 10],
    #     model__kernel=['linear', 'rbf'],
    # ),
    XGBoost=dict(
        smote__sampling_strategy=[0.1, 0.3, 0.5],
        smote__k_neighbors=[3, 5],
        model__learning_rate=[0.01, 0.1],
        model__n_estimators=[100, 200],
        model__max_depth=[3, 5],
    ),
    CatBoost=dict(
        smote__sampling_strategy=[0.1, 0.3, 0.5],
        smote__k_neighbors=[3, 5],
        model__iterations=[100, 200],
        model__depth=[4, 6],
        model__learning_rate=[0.01, 0.1],
    ),
    RandomForest=dict(
        smote__sampling_strategy=[0.1, 0.3, 0.5],
        smote__k_neighbors=[3, 5],
        model__n_estimators=[100, 200],
        model__max_depth=[None, 10],
        model__min_samples_split=[2, 5],
    ),
)

In [21]:
# Resume from the checkpoint of SMOTE pipelines and their results.
model_name_in_folder = "SMOTE+models_fitted_with_best_parameters"
best_models_sm, results_sm = load_checkpoint_model(checkpoint_dir=checkpoint_dir, model_name_in_folder=model_name_in_folder)

# Stratified 70/30 split of the full (non-resampled) data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Shuffled, stratified 5-fold cross-validation.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics tracked during the search. scikit-learn's built-in scorer names
# 'f1' and 'recall' score the positive class (label 1) of a binary problem,
# which is what make_scorer(f1_score / recall_score, pos_label=1) expressed.
# Using the names removes the dependency on `make_scorer`, which this
# notebook never imports explicitly. Defined once, outside the loop, because
# it does not depend on the model.
scorers = {
    'F1': 'f1',
    'Recall': 'recall'
}

# Run one grid search per model and keep the best pipeline of each.
for model_name, model in models.items() :
    if model_name in best_models_sm :
        print(f"{model_name} has already been trained. Skipping.")
        continue  # move on to the next model

    print(f"Training for {model_name}")
    # imblearn pipeline: the SMOTE step resamples only the data the pipeline
    # is fitted on; prediction goes straight to the classifier.
    pipeline = imPipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', model)
    ])

    grid_search = GridSearchCV(
        pipeline,
        param_grids[model_name],
        cv=skf,
        scoring=scorers,
        refit='F1',           # the best pipeline is chosen and refitted on F1
        n_jobs=-1,            # use every available CPU core
        error_score='raise'   # surface fitting errors instead of scoring them
    )

    # Search on the training partition only.
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Evaluate on the test partition, which is never oversampled.
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)[:, 1]

    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision, recall_curve, _ = precision_recall_curve(y_test, y_prob)
    auc_pr = auc(recall_curve, precision)

    # Store the fitted pipeline, its best parameters and its test metrics.
    best_models_sm[model_name] = best_model
    results_sm[model_name] = {
        'best_params': best_params,
        'F1 Score': f1,
        'Recall': recall,
        'AUC PR': auc_pr
    }
    # Persist progress after each model.
    save_checkpoint(checkpoint_dir, model_name_in_folder, best_models_sm, results_sm)
print("End of training.")
# Final save of the dictionary of fitted pipelines.
save_final_model("../models/models_fitted/", model_name_in_folder, best_models_sm)

Checkpoint loaded from ../models/checkpoints/SMOTE+models_fitted_with_best_parameters_checkpoint.pkl and ../models/checkpoints/SMOTE+models_fitted_with_best_parameters_results.pkl
KNN has already been trained. Skipping.
XGBoost has already been trained. Skipping.
CatBoost has already been trained. Skipping.
RandomForest has already been trained. Skipping.
End of training.
Models saved in ../models/models_fitted/SMOTE+models_fitted_with_best_parameters.pkl
