In [1]:
import os
os.environ['OMP_NUM_THREADS'] = '1'  
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'
os.environ['VECLIB_MAXIMUM_THREADS'] = '1'
os.environ['NUMEXPR_NUM_THREADS'] = '1'

In [2]:
import numpy as np
import pandas as pd
import time

# Pour les pipelines imblearn
from imblearn.pipeline import Pipeline as ImbPipeline

# Sampling
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek

# RandomForest
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold, GridSearchCV


In [8]:
# Load the data
train_df = pd.read_parquet("../preprocessed_train_data.parquet")
test_df = pd.read_parquet("../preprocessed_test_data.parquet")


In [38]:
X_train = train_df.drop(columns=["FlagImpaye"])
y_train = train_df["FlagImpaye"]

In [39]:
X_test = test_df.drop(columns=["FlagImpaye"])
y_test = test_df["FlagImpaye"]

In [40]:
def make_pipeline(method):
    """
    Fabrique une pipeline selon la méthode de sampling.
    'clf' sera le RandomForest. 
    """
    if method == "Undersampled":
        pipe = ImbPipeline([
            ('under', RandomUnderSampler(random_state=42)),
            ('clf', RandomForestClassifier(random_state=42))
        ])

    elif method == "Oversampled (SMOTE)":
        pipe = ImbPipeline([
            ('smote', SMOTE(random_state=42)),
            ('clf', RandomForestClassifier(random_state=42))
        ])
        
    elif method == "Oversampled (ADASYN)":
        pipe = ImbPipeline([
            ('adasyn', ADASYN(random_state=42)),
            ('clf', RandomForestClassifier(random_state=42))
        ])
        
    elif method == "Hybrid (SMOTE+Tomek)":
        pipe = ImbPipeline([
            ('smote_tomek', SMOTETomek(random_state=42)),
            ('clf', RandomForestClassifier(random_state=42))
        ])
    else:
        raise ValueError(f"Méthode {method} inconnue.")
        
    return pipe

In [41]:
# Liste des approches qu'on veut tester
methods = [
    "Undersampled",
    "Oversampled (SMOTE)",
    "Oversampled (ADASYN)",
    "Hybrid (SMOTE+Tomek)"
]

In [None]:

# Définir un param_grid pour chaque pipeline

param_grids = {

    "Undersampled": {
        'under__sampling_strategy': [0.5, 1.0],  
        'clf__n_estimators': [50, 100],
        'clf__max_depth': [10, 20],
    },
    
    "Oversampled (SMOTE)": {
        'smote__sampling_strategy': [0.5, 1.0],
        'clf__n_estimators': [50, 100],
        'clf__max_depth': [10, 20],
    },
    
    "Oversampled (ADASYN)": {
        'adasyn__sampling_strategy': [0.5, 1.0],
        'clf__n_estimators': [50, 100],
        'clf__max_depth': [10, 20],
    },
    
    "Hybrid (SMOTE+Tomek)": {
        'smote_tomek__sampling_strategy': [0.5, 1.0],
        'clf__n_estimators': [50, 100],
        'clf__max_depth': [10, 20],
    }
    
}

In [None]:


# Boucler sur chaque pipeline + gridsearch

def run_all_pipelines_with_gridsearch(X, y):
    """
    Cette fonction boucle sur chaque methode de sampling,
    lance un GridSearchCV pour trouver la meilleure combo
    d'hyperparametres. On retient la F1-score comme scoring.
    Renvoie un DataFrame avec un résumé des résultats.
    """
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scoring = {
    'f1': 'f1',
    'precision': 'precision',
    'recall': 'recall',
    'pr_auc': 'average_precision'
}
    # On stocke les resultats
    results_list = []
    
    for method in methods:
        print(f"\n=== Méthode: {method} ===")
        
        pipeline = make_pipeline(method)
        param_grid = param_grids[method]
        
        # On utilise la F1 comme métrique principale
        
        grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid,
            scoring=scoring,     
            cv=cv,
             refit='f1',
            n_jobs=12,        
            verbose=1
        )
        
        start_time = time.time()
        grid_search.fit(X, y)
        end_time = time.time()
        
        best_f1 = grid_search.best_score_
        best_params = grid_search.best_params_
        exec_time = end_time - start_time
        
        print(f"Meilleure F1 trouvée: {best_f1:.4f}")
        print(f"Meilleurs paramètres: {best_params}")
        print(f"Temps d'execution: {exec_time:.2f}s")
        
        results_list.append({
            'Method': method,
            'Best F1': best_f1,
            'Best Params': best_params,
            'Execution Time (s)': exec_time
        })
        
    return pd.DataFrame(results_list)

In [None]:
results_df = run_all_pipelines_with_gridsearch(X_train, y_train)
display(results_df)


=== Méthode: Undersampled ===
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Meilleure F1 trouvée: 0.0866
Meilleurs paramètres: {'clf__max_depth': 20, 'clf__n_estimators': 100, 'under__sampling_strategy': 0.5}
Temps d'execution: 192.53s

=== Méthode: Oversampled (SMOTE) ===
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Meilleure F1 trouvée: 0.1685
Meilleurs paramètres: {'clf__max_depth': 20, 'clf__n_estimators': 100, 'smote__sampling_strategy': 0.5}
Temps d'execution: 20022.63s

=== Méthode: Oversampled (ADASYN) ===
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Meilleure F1 trouvée: 0.1684
Meilleurs paramètres: {'adasyn__sampling_strategy': 0.5, 'clf__max_depth': 20, 'clf__n_estimators': 100}
Temps d'execution: 17652.11s

=== Méthode: Hybrid (SMOTE+Tomek) ===
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Meilleure F1 trouvée: 0.1685
Meilleurs paramètres: {'clf__max_depth': 20, 'clf__n_estimators': 100, 'smote_tomek__sampling_strat

Unnamed: 0,Method,Best F1,Best Params,Execution Time (s)
0,Undersampled,0.086578,"{'clf__max_depth': 20, 'clf__n_estimators': 10...",192.532778
1,Oversampled (SMOTE),0.168499,"{'clf__max_depth': 20, 'clf__n_estimators': 10...",20022.630821
2,Oversampled (ADASYN),0.168384,"{'adasyn__sampling_strategy': 0.5, 'clf__max_d...",17652.105721
3,Hybrid (SMOTE+Tomek),0.1685,"{'clf__max_depth': 20, 'clf__n_estimators': 10...",21313.250225


In [52]:
#On cherche la ligne qui a la F1 la plus elevée
best_idx = results_df['Best F1'].idxmax()
best_method = results_df.loc[best_idx, 'Method']
best_f1_score = results_df.loc[best_idx, 'Best F1']
best_params = results_df.loc[best_idx, 'Best Params']

print(f"\nMeilleure méthode: {best_method}")
print(f"Meilleure F1 en cross-val: {best_f1_score:.4f}")
print(f"Meilleurs paramètres: {best_params}")


Meilleure méthode: Hybrid (SMOTE+Tomek)
Meilleure F1 en cross-val: 0.1685
Meilleurs paramètres: {'clf__max_depth': 20, 'clf__n_estimators': 100, 'smote_tomek__sampling_strategy': 0.5}


In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
# Barplot for F1 Scores
plt.figure(figsize=(10, 6))
sns.barplot(x='Best F1', y='Method', data=results_df, palette='viridis')
plt.title('F1 Score Comparison by Sampling Method')
plt.show()

NameError: name 'results_df' is not defined

<Figure size 1000x600 with 0 Axes>