# Imports

In [None]:
import os

import catboost as cb
import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import shap
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
# tqdm is used by train_and_evaluate_models for its progress bar.
from tqdm.auto import tqdm

# Settings

In [14]:
# --- Export locations ---------------------------------------------------------
# Root folder of all exports.
LOCAL_EXPORT_FOLDER_PATH = '/content/exports'
# 'manual_check_patch' sub-folder of the export root; passed as `base_path` to the functions below.
LOCAL_EXPORT_MANUAL_CHECK_PATCH_FOLDER_PATH = f'{LOCAL_EXPORT_FOLDER_PATH}/manual_check_patch'
# 'modelization' sub-folder of the export root; passed as `output_folder` to the training function.
LOCAL_EXPORT_MODELIZATION_FOLDER_PATH = f'{LOCAL_EXPORT_FOLDER_PATH}/modelization'

# --- Modelling ----------------------------------------------------------------
# Columns used as prediction targets.
TARGET_COLUMNS = ['TARGET']
MLFLOW_EXPERIMENT_NAME = 'generic_model_experiment'

# --- General settings ---------------------------------------------------------
TESTING_MODE = True
TESTING_MODE_MAX_LINES = 1_000
TESTING_MODE_SUB_FOLDER_NAME = 'testing_data'
GENERAL_CHUNK_SIZE = 100_000

In [26]:
def display_head_of_files(base_path, file_extension='csv', chunk_size=1000):
    """
    Walk a directory tree and print the first rows of every matching CSV file.

    Args:
        base_path (str): Root directory that is searched recursively.
        file_extension (str): Extension of the files to preview, with or
            without the leading dot (default 'csv').
        chunk_size (int): Maximum number of rows parsed from each file before
            taking its head.

    Returns:
        None
    """
    # Match on the real extension so that a name such as "notcsv" is not
    # mistaken for a CSV file.
    suffix = '.' + file_extension.lstrip('.')

    for root, _dirs, files in os.walk(base_path):
        for file in files:
            if not file.endswith(suffix):
                continue

            file_path = os.path.join(root, file)
            print(f"Processing file: {file_path}")

            # nrows bounds the read and returns a plain DataFrame, so no chunk
            # iterator (and its open file handle) is left behind.
            preview = pd.read_csv(file_path, nrows=chunk_size)
            print(preview.head())
            print("\n" + "="*80 + "\n")

# Example usage

In [27]:
# Print the head of every CSV file found under the manual-check export folder.
display_head_of_files(base_path=LOCAL_EXPORT_MANUAL_CHECK_PATCH_FOLDER_PATH)

Processing file: /content/exports/manual_check_patch/testing_data/mean/LOF/ordinal/application_test.csv
   AMT_INCOME_TOTAL  NONLIVINGAPARTMENTS_MODE  FLAG_DOCUMENT_9  \
0            180000                  0.007361                0   
1            180000                  0.007361                0   
2            166500                  0.007361                0   
3             67500                  0.007361                0   
4            247500                  0.007361                0   

   TOTAL_CREDIT_BUREAU_REQUESTS  FLAG_EMAIL  HOUSETYPE_MODE  \
0                             0           0             0.0   
1                             1           0             0.0   
2                             2           0             0.0   
3                             0           0             0.0   
4                             4           0             0.0   

   LIVINGAPARTMENTS_MODE  FLAG_CONT_MOBILE  REGION_RATING_CLIENT  \
0                0.10255                 1          

# Models

In [25]:
# Registry of candidate models and their hyperparameter search spaces.
# Each entry maps a display name to:
#   'model'  -> the estimator instance to tune
#   'params' -> candidate values for each hyperparameter
models = {
    'Logistic Regression': dict(
        model=LogisticRegression(),
        params=dict(
            C=np.logspace(-3, 3, 7),
            solver=['newton-cg', 'lbfgs', 'liblinear'],
        ),
    ),
    'Random Forest': dict(
        model=RandomForestClassifier(),
        params=dict(
            n_estimators=[100, 200, 300],
            max_depth=[10, 20, 30],
            min_samples_split=[2, 5, 10],
            min_samples_leaf=[1, 2, 4],
        ),
    ),
    # Disabled candidates, kept for reference:
    # 'Gradient Boosting': {
    #     'model': GradientBoostingClassifier(),
    #     'params': {
    #         'n_estimators': [100, 200, 300],
    #         'learning_rate': [0.01, 0.05, 0.1],
    #         'max_depth': [3, 4, 5],
    #         'subsample': [0.8, 0.9, 1.0]
    #     }
    # },
    # 'XGBoost': {
    #     'model': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    #     'params': {
    #         'n_estimators': [100, 200, 300],
    #         'learning_rate': [0.01, 0.05, 0.1],
    #         'max_depth': [3, 4, 5],
    #         'colsample_bytree': [0.3, 0.7]
    #     }
    # },
    # 'LightGBM': {
    #     'model': lgb.LGBMClassifier(),
    #     'params': {
    #         'n_estimators': [100, 200, 300],
    #         'learning_rate': [0.01, 0.05, 0.1],
    #         'num_leaves': [31, 62, 127],
    #         'boosting_type': ['gbdt', 'dart']
    #     }
    # },
    # 'CatBoost': {
    #     'model': cb.CatBoostClassifier(verbose=0),
    #     'params': {
    #         'iterations': [100, 200, 300],
    #         'learning_rate': [0.01, 0.05, 0.1],
    #         'depth': [3, 4, 5],
    #         'l2_leaf_reg': [3, 5, 7]
    #     }
    # },
    # 'SVM': {
    #     'model': SVC(probability=True),
    #     'params': {
    #         'C': np.logspace(-3, 3, 7),
    #         'kernel': ['linear', 'rbf', 'poly'],
    #         'degree': [3, 4, 5]
    #     }
    # },
    # 'KNN': {
    #     'model': KNeighborsClassifier(),
    #     'params': {
    #         'n_neighbors': [5, 10, 20],
    #         'weights': ['uniform', 'distance'],
    #         'metric': ['euclidean', 'manhattan']
    #     }
    # },
    # 'Neural Network': {
    #     'model': MLPClassifier(max_iter=500),
    #     'params': {
    #         'hidden_layer_sizes': ['50,50', '100', '100,50'],
    #         'activation': ['tanh', 'relu'],
    #         'alpha': [0.0001, 0.001, 0.01]
    #     }
    # }
}

# Training

## Hyperparameter optimization method

In [28]:
# Hyperparameter optimization function using Optuna
def objective(trial, X_train, y_train):
    """
    Optuna objective: sample a classifier and its hyperparameters, then score them.

    The classifier is drawn from the keys of the module-level ``models``
    registry. Every entry of that model's ``params`` becomes one Optuna
    suggestion: integer candidates give an integer range between their minimum
    and maximum, float candidates a float range, anything else a categorical
    choice. ``hidden_layer_sizes`` is sampled as a comma-separated string and
    converted to a tuple of ints.

    Args:
        trial: Optuna trial used to draw the suggestions.
        X_train: Training features passed to ``cross_val_score``.
        y_train: Training labels passed to ``cross_val_score``.

    Returns:
        Mean accuracy over the 5 cross-validation folds.

    Note:
        ``set_params`` is applied to the estimator instance stored in
        ``models``, so the registry entry keeps the last sampled values.
    """
    chosen_name = trial.suggest_categorical('classifier', list(models.keys()))
    registry_entry = models[chosen_name]
    estimator = registry_entry['model']
    search_space = registry_entry['params']

    def _suggest(name, candidates):
        # Translate one search-space entry into the matching Optuna suggestion.
        if name == 'hidden_layer_sizes':
            encoded_sizes = trial.suggest_categorical(name, candidates)
            return tuple(map(int, encoded_sizes.split(',')))
        if isinstance(candidates[0], int):
            return trial.suggest_int(name, min(candidates), max(candidates))
        if isinstance(candidates[0], float):
            return trial.suggest_float(name, min(candidates), max(candidates))
        return trial.suggest_categorical(name, candidates)

    # Dict comprehension keeps the registry order, so suggestions are drawn in
    # the same sequence as the search space declares them.
    sampled_params = {name: _suggest(name, candidates) for name, candidates in search_space.items()}
    estimator.set_params(**sampled_params)

    pipeline = Pipeline(steps=[('classifier', estimator)])
    fold_scores = cross_val_score(
        pipeline, X_train, y_train, n_jobs=-1, cv=5, scoring='accuracy', error_score='raise'
    )
    return fold_scores.mean()

## Optuna hyperparameter optimization

In [None]:
# Hyperparameter optimization function using Optuna
def objective(trial, X_train, y_train):
    """
    Optuna objective: sample a classifier and its hyperparameters, then score them.

    The classifier is drawn from the keys of the module-level ``models``
    registry. Every entry of that model's ``params`` becomes one Optuna
    suggestion: integer candidates give an integer range between their minimum
    and maximum, float candidates a float range, anything else a categorical
    choice. ``hidden_layer_sizes`` is sampled as a comma-separated string and
    converted to a tuple of ints.

    Args:
        trial: Optuna trial used to draw the suggestions.
        X_train: Training features passed to ``cross_val_score``.
        y_train: Training labels passed to ``cross_val_score``.

    Returns:
        Mean accuracy over the 5 cross-validation folds.

    Note:
        ``set_params`` is applied to the estimator instance stored in
        ``models``, so the registry entry keeps the last sampled values.
    """
    chosen_name = trial.suggest_categorical('classifier', list(models.keys()))
    registry_entry = models[chosen_name]
    estimator = registry_entry['model']
    search_space = registry_entry['params']

    def _suggest(name, candidates):
        # Translate one search-space entry into the matching Optuna suggestion.
        if name == 'hidden_layer_sizes':
            encoded_sizes = trial.suggest_categorical(name, candidates)
            return tuple(map(int, encoded_sizes.split(',')))
        if isinstance(candidates[0], int):
            return trial.suggest_int(name, min(candidates), max(candidates))
        if isinstance(candidates[0], float):
            return trial.suggest_float(name, min(candidates), max(candidates))
        return trial.suggest_categorical(name, candidates)

    # Dict comprehension keeps the registry order, so suggestions are drawn in
    # the same sequence as the search space declares them.
    sampled_params = {name: _suggest(name, candidates) for name, candidates in search_space.items()}
    estimator.set_params(**sampled_params)

    pipeline = Pipeline(steps=[('classifier', estimator)])
    fold_scores = cross_val_score(
        pipeline, X_train, y_train, n_jobs=-1, cv=5, scoring='accuracy', error_score='raise'
    )
    return fold_scores.mean()

# Retraining method for incremental improvement

# Function that retrains a model until the accuracy gain falls below a threshold
def retrain_model(best_pipeline, X_train, y_train, X_test, y_test, threshold=0.01, max_iter=10):
    """
    Fit a pipeline repeatedly until its test accuracy stops improving.

    Args:
        best_pipeline: Estimator or pipeline exposing ``fit`` and ``predict``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Features used to score each fit; ignored when ``y_test`` is None.
        y_test: Labels used to score each fit, or None when no labels exist.
        threshold (float): Minimum accuracy gain required to keep iterating.
        max_iter (int): Maximum number of fits; must be at least 1.

    Returns:
        tuple: ``(best_pipeline, score)`` where ``score`` is the accuracy of the
        last fit on the test data, or None when ``y_test`` is None.

    Raises:
        ValueError: If ``max_iter`` is smaller than 1.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    # Without labels there is nothing to score against: fit once and return
    # instead of failing inside accuracy_score.
    if y_test is None:
        best_pipeline.fit(X_train, y_train)
        return best_pipeline, None

    previous_score = 0
    current_score = None
    for iteration in range(max_iter):
        best_pipeline.fit(X_train, y_train)
        y_pred = best_pipeline.predict(X_test)
        current_score = accuracy_score(y_test, y_pred)
        improvement = current_score - previous_score
        # Stop as soon as a refit no longer gains at least `threshold`.
        if improvement < threshold:
            break
        previous_score = current_score
        print(f"Iteration {iteration + 1}, Accuracy: {current_score}, Improvement: {improvement}")
    return best_pipeline, current_score

## Main function to train the models

In [None]:
# Main function to train and evaluate the models
def _top_correlated_features(chunk, target_column, max_features):
    """Return up to ``max_features`` column names ranked by absolute correlation with the target."""
    # Restrict to numeric/boolean columns so that text columns cannot make corr() fail.
    numeric_chunk = chunk.select_dtypes(include=['number', 'bool'])
    correlations = (
        numeric_chunk.corr()[target_column]
        .drop(target_column)  # never select the target as its own feature
        .abs()
        .dropna()  # constant columns have an undefined correlation
        .sort_values(ascending=False)
    )
    return correlations.index[:max_features].tolist()


def train_and_evaluate_models(base_path, output_folder, target_columns, max_features=5, testing=False, chunk_size=1000, testing_sub_path_name='test', n_trials=100):
    """
    Tune, train, save and evaluate a model for every training file under ``base_path``.

    For each file whose name ends with 'application_train.csv' the file is read
    in chunks. For every chunk and every target column the function selects the
    features most correlated with the target, runs an Optuna study through
    ``objective`` (which samples from the module-level ``models`` registry),
    refits the best configuration and saves it with joblib. When the sibling
    'application_test.csv' contains the target column, metrics are printed and
    the run is logged to MLflow. A box plot of the collected cross-validation
    scores is written to ``output_folder`` at the end.

    Args:
        base_path (str): Root directory searched recursively for training files.
        output_folder (str): Root directory for saved models and the score plot.
        target_columns (list[str]): Target columns; one model is trained per column.
        max_features (int): Number of top-correlated features to keep.
        testing (bool): When True, outputs go under ``testing_sub_path_name``.
        chunk_size (int): Number of rows per training chunk.
        testing_sub_path_name (str): Sub-folder inserted in the output path when ``testing``.
        n_trials (int): Number of Optuna trials per study.

    Returns:
        None

    Note:
        Every chunk trains its own model, and the model path does not depend on
        the chunk, so a later chunk overwrites the file of an earlier one when
        both select the same classifier.
    """
    train_suffix = 'application_train.csv'
    test_name = 'application_test.csv'

    # Collect the training files first so the progress-bar total matches the
    # number of pbar.update() calls (one per training file).
    train_files = [
        (root, file)
        for root, _dirs, files in os.walk(base_path)
        for file in files
        if file.endswith(train_suffix)
    ]

    all_scores = {}
    pbar = tqdm(total=len(train_files), desc="Processing files")
    try:
        for root, file in train_files:
            file_path = os.path.join(root, file)
            # Swap only the file-name suffix; a replace on the whole path could
            # also rewrite a directory name.
            test_file_path = os.path.join(root, file[:-len(train_suffix)] + test_name)
            relative_path = os.path.relpath(root, base_path)

            print(f"Processing file: {file_path}")

            # The test set does not depend on the chunk: read it once per file.
            test_data = pd.read_csv(test_file_path)

            # Read the training CSV in chunks
            for chunk in pd.read_csv(file_path, chunksize=chunk_size):
                for target_column in target_columns:
                    print(f"Using target column: {target_column}")

                    # Feature selection by correlation with the target
                    top_features = _top_correlated_features(chunk, target_column, max_features)

                    # Split features and target
                    X_train = chunk[top_features]
                    y_train = chunk[target_column]

                    if target_column in test_data.columns:
                        X_test = test_data[top_features]
                        y_test = test_data[target_column]
                    else:
                        X_test = None
                        y_test = None
                        print(f"Target column {target_column} not found in test data. Skipping evaluation.")

                    # Hyperparameter optimization with Optuna
                    study = optuna.create_study(direction='maximize')
                    study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=n_trials)

                    print('Number of finished trials: ', len(study.trials))
                    print('Best trial:')
                    trial = study.best_trial

                    print('  Value: ', trial.value)
                    print('  Params: ')
                    for key, value in trial.params.items():
                        print('    {}: {}'.format(key, value))

                    # Configure the best model
                    best_classifier_name = trial.params['classifier']
                    best_classifier = models[best_classifier_name]['model']
                    best_params = {}
                    for key, value in trial.params.items():
                        if key == 'classifier':
                            continue
                        # Optuna stores the layer sizes as the sampled string
                        # (e.g. '50,50'); the estimator needs a tuple of ints.
                        if key == 'hidden_layer_sizes' and isinstance(value, str):
                            value = tuple(map(int, value.split(',')))
                        best_params[key] = value
                    best_classifier.set_params(**best_params)

                    best_pipeline = Pipeline(steps=[
                        ('classifier', best_classifier)
                    ])

                    if y_test is not None:
                        best_pipeline, _ = retrain_model(best_pipeline, X_train, y_train, X_test, y_test)
                    else:
                        # No labels to score against: a single fit is all that can be done.
                        best_pipeline.fit(X_train, y_train)

                    # Store each trial's score under the classifier that trial
                    # actually used, not under the overall winner.
                    for finished_trial in study.trials:
                        if finished_trial.value is None:
                            continue
                        scored_name = finished_trial.params.get('classifier', best_classifier_name)
                        all_scores.setdefault(scored_name, []).append(finished_trial.value)

                    # Resolve the output directory
                    if testing:
                        output_dir = os.path.join(output_folder, testing_sub_path_name, relative_path, target_column)
                    else:
                        output_dir = os.path.join(output_folder, relative_path, target_column)
                    os.makedirs(output_dir, exist_ok=True)

                    model_path = os.path.join(output_dir, f'best_{best_classifier_name}_model.pkl')
                    joblib.dump(best_pipeline, model_path)
                    print(f'Model saved at {model_path}')

                    if y_test is not None:
                        # Model evaluation
                        y_pred = best_pipeline.predict(X_test)
                        # Keep the second probability column only, as evaluate_models
                        # does; roc_auc_score rejects the 2-D output for binary targets.
                        # NOTE(review): assumes a binary target - confirm.
                        y_pred_proba = best_pipeline.predict_proba(X_test)[:, 1]
                        accuracy = accuracy_score(y_test, y_pred)
                        roc_auc = roc_auc_score(y_test, y_pred_proba)

                        print(f"Accuracy: {accuracy}")
                        print(f"ROC AUC: {roc_auc}")
                        print(confusion_matrix(y_test, y_pred))
                        print(classification_report(y_test, y_pred))

                        # Logging with mlflow
                        # NOTE(review): the experiment name is hard-coded; the
                        # MLFLOW_EXPERIMENT_NAME setting is not used here.
                        mlflow.set_experiment('credit_scoring')
                        with mlflow.start_run():
                            mlflow.log_params(trial.params)
                            mlflow.log_metric('accuracy', accuracy)
                            mlflow.log_metric('roc_auc', roc_auc)
                            mlflow.sklearn.log_model(best_pipeline, 'model')
                            mlflow.log_artifact(file_path)
                            mlflow.log_artifact(test_file_path)
                    else:
                        print(f"Skipping model evaluation for {target_column} as target column is not in test data.")

            pbar.update(1)
    finally:
        # Close the progress bar even when a file fails.
        pbar.close()

    if not all_scores:
        print("No cross-validation scores collected. Skipping score plot.")
        return

    # Plot the cross-validation scores
    os.makedirs(output_folder, exist_ok=True)
    model_names_repeated = [name for name, scores in all_scores.items() for _ in scores]
    model_scores = [score for scores in all_scores.values() for score in scores]

    plot_cross_val_scores(model_scores, model_names_repeated, output_folder)

# Plotting function for the cross-validation scores
def plot_cross_val_scores(model_scores, model_names, output_dir):
    """
    Save a box plot of the cross-validation scores grouped by model.

    Args:
        model_scores: Sequence of scores, one per observation.
        model_names: Sequence of model labels aligned with ``model_scores``.
        output_dir (str): Directory in which 'cross_val_scores.png' is written.

    Returns:
        None
    """
    destination = os.path.join(output_dir, 'cross_val_scores.png')

    figure = plt.figure(figsize=(12, 8))
    axes = sns.boxplot(x=model_names, y=model_scores)
    axes.set_xlabel('Model')
    axes.set_ylabel('Cross-Validation Score')
    axes.set_title('Model Comparison - Cross-Validation Scores')
    plt.xticks(rotation=45)
    figure.savefig(destination)
    plt.close(figure)

## Calling the modeling function

In [24]:
# Run the full tuning/training pass on the manual-check exports, using the
# notebook-level settings for targets, testing mode and chunk size.
train_and_evaluate_models(
    base_path=LOCAL_EXPORT_MANUAL_CHECK_PATCH_FOLDER_PATH, 
    output_folder=LOCAL_EXPORT_MODELIZATION_FOLDER_PATH, 
    target_columns=TARGET_COLUMNS, 
    testing=TESTING_MODE, 
    chunk_size=GENERAL_CHUNK_SIZE, 
    testing_sub_path_name=TESTING_MODE_SUB_FOLDER_NAME)

[I 2024-07-01 11:56:38,644] A new study created in memory with name: no-name-6cc39c00-1d89-4c6f-9329-db733cd2b64b


Processing file: /content/exports/manual_check_patch/testing_data/mean/LOF/ordinal/application_train.csv
Using target column: TARGET
Target column TARGET not found in test data. Skipping evaluation.


[I 2024-07-01 11:56:39,862] Trial 0 finished with value: 0.9181818181818182 and parameters: {'classifier': 'Random Forest', 'n_estimators': 170, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.9181818181818182.
[I 2024-07-01 11:56:40,856] Trial 1 finished with value: 0.9181818181818182 and parameters: {'classifier': 'Random Forest', 'n_estimators': 196, 'max_depth': 12, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.9181818181818182.
[I 2024-07-01 11:56:41,806] Trial 2 finished with value: 0.9181818181818182 and parameters: {'classifier': 'Random Forest', 'n_estimators': 195, 'max_depth': 22, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.9181818181818182.
[I 2024-07-01 11:56:42,662] Trial 3 finished with value: 0.9181818181818182 and parameters: {'classifier': 'Random Forest', 'n_estimators': 241, 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 

KeyboardInterrupt: 

In [None]:
# Hyperparameter optimization function using Optuna
def objective(trial, X_train, y_train):
    """
    Optuna objective: sample a classifier and its hyperparameters, then score them.

    The classifier is drawn from the keys of the module-level ``models``
    registry. Every entry of that model's ``params`` becomes one Optuna
    suggestion: integer candidates give an integer range between their minimum
    and maximum, float candidates a float range, anything else a categorical
    choice. ``hidden_layer_sizes`` is sampled as a comma-separated string and
    converted to a tuple of ints.

    Args:
        trial: Optuna trial used to draw the suggestions.
        X_train: Training features passed to ``cross_val_score``.
        y_train: Training labels passed to ``cross_val_score``.

    Returns:
        Mean accuracy over the 5 cross-validation folds.

    Note:
        ``set_params`` is applied to the estimator instance stored in
        ``models``, so the registry entry keeps the last sampled values.
    """
    chosen_name = trial.suggest_categorical('classifier', list(models.keys()))
    registry_entry = models[chosen_name]
    estimator = registry_entry['model']
    search_space = registry_entry['params']

    def _suggest(name, candidates):
        # Translate one search-space entry into the matching Optuna suggestion.
        if name == 'hidden_layer_sizes':
            encoded_sizes = trial.suggest_categorical(name, candidates)
            return tuple(map(int, encoded_sizes.split(',')))
        if isinstance(candidates[0], int):
            return trial.suggest_int(name, min(candidates), max(candidates))
        if isinstance(candidates[0], float):
            return trial.suggest_float(name, min(candidates), max(candidates))
        return trial.suggest_categorical(name, candidates)

    # Dict comprehension keeps the registry order, so suggestions are drawn in
    # the same sequence as the search space declares them.
    sampled_params = {name: _suggest(name, candidates) for name, candidates in search_space.items()}
    estimator.set_params(**sampled_params)

    pipeline = Pipeline(steps=[('classifier', estimator)])
    fold_scores = cross_val_score(
        pipeline, X_train, y_train, n_jobs=-1, cv=5, scoring='accuracy', error_score='raise'
    )
    return fold_scores.mean()

# Function that retrains a model until the accuracy gain falls below a threshold
def retrain_model(best_pipeline, X_train, y_train, X_test, y_test, threshold=0.01, max_iter=10):
    """
    Fit a pipeline repeatedly until its test accuracy stops improving.

    Args:
        best_pipeline: Estimator or pipeline exposing ``fit`` and ``predict``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Features used to score each fit; ignored when ``y_test`` is None.
        y_test: Labels used to score each fit, or None when no labels exist.
        threshold (float): Minimum accuracy gain required to keep iterating.
        max_iter (int): Maximum number of fits; must be at least 1.

    Returns:
        tuple: ``(best_pipeline, score)`` where ``score`` is the accuracy of the
        last fit on the test data, or None when ``y_test`` is None.

    Raises:
        ValueError: If ``max_iter`` is smaller than 1.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    # Without labels there is nothing to score against: fit once and return
    # instead of failing inside accuracy_score.
    if y_test is None:
        best_pipeline.fit(X_train, y_train)
        return best_pipeline, None

    previous_score = 0
    current_score = None
    for iteration in range(max_iter):
        best_pipeline.fit(X_train, y_train)
        y_pred = best_pipeline.predict(X_test)
        current_score = accuracy_score(y_test, y_pred)
        improvement = current_score - previous_score
        # Stop as soon as a refit no longer gains at least `threshold`.
        if improvement < threshold:
            break
        previous_score = current_score
        print(f"Iteration {iteration + 1}, Accuracy: {current_score}, Improvement: {improvement}")
    return best_pipeline, current_score

# Main function to train and evaluate the models
def _top_correlated_features(chunk, target_column, max_features):
    """Return up to ``max_features`` column names ranked by absolute correlation with the target."""
    # Restrict to numeric/boolean columns so that text columns cannot make corr() fail.
    numeric_chunk = chunk.select_dtypes(include=['number', 'bool'])
    correlations = (
        numeric_chunk.corr()[target_column]
        .drop(target_column)  # never select the target as its own feature
        .abs()
        .dropna()  # constant columns have an undefined correlation
        .sort_values(ascending=False)
    )
    return correlations.index[:max_features].tolist()


def train_and_evaluate_models(base_path, output_folder, target_columns, max_features=5, testing=False, chunk_size=1000, testing_sub_path_name='test', n_trials=100):
    """
    Tune, train, save and evaluate a model for every training file under ``base_path``.

    For each file whose name ends with 'application_train.csv' the file is read
    in chunks. For every chunk and every target column the function selects the
    features most correlated with the target, runs an Optuna study through
    ``objective`` (which samples from the module-level ``models`` registry),
    refits the best configuration and saves it with joblib. When the sibling
    'application_test.csv' contains the target column, metrics are printed and
    the run is logged to MLflow. A box plot of the collected cross-validation
    scores is written to ``output_folder`` at the end.

    Args:
        base_path (str): Root directory searched recursively for training files.
        output_folder (str): Root directory for saved models and the score plot.
        target_columns (list[str]): Target columns; one model is trained per column.
        max_features (int): Number of top-correlated features to keep.
        testing (bool): When True, outputs go under ``testing_sub_path_name``.
        chunk_size (int): Number of rows per training chunk.
        testing_sub_path_name (str): Sub-folder inserted in the output path when ``testing``.
        n_trials (int): Number of Optuna trials per study.

    Returns:
        None

    Note:
        Every chunk trains its own model, and the model path does not depend on
        the chunk, so a later chunk overwrites the file of an earlier one when
        both select the same classifier.
    """
    train_suffix = 'application_train.csv'
    test_name = 'application_test.csv'

    # Collect the training files first so the progress-bar total matches the
    # number of pbar.update() calls (one per training file).
    train_files = [
        (root, file)
        for root, _dirs, files in os.walk(base_path)
        for file in files
        if file.endswith(train_suffix)
    ]

    all_scores = {}
    pbar = tqdm(total=len(train_files), desc="Processing files")
    try:
        for root, file in train_files:
            file_path = os.path.join(root, file)
            # Swap only the file-name suffix; a replace on the whole path could
            # also rewrite a directory name.
            test_file_path = os.path.join(root, file[:-len(train_suffix)] + test_name)
            relative_path = os.path.relpath(root, base_path)

            print(f"Processing file: {file_path}")

            # The test set does not depend on the chunk: read it once per file.
            test_data = pd.read_csv(test_file_path)

            # Read the training CSV in chunks
            for chunk in pd.read_csv(file_path, chunksize=chunk_size):
                for target_column in target_columns:
                    print(f"Using target column: {target_column}")

                    # Feature selection by correlation with the target
                    top_features = _top_correlated_features(chunk, target_column, max_features)

                    # Split features and target
                    X_train = chunk[top_features]
                    y_train = chunk[target_column]

                    if target_column in test_data.columns:
                        X_test = test_data[top_features]
                        y_test = test_data[target_column]
                    else:
                        X_test = None
                        y_test = None
                        print(f"Target column {target_column} not found in test data. Skipping evaluation.")

                    # Hyperparameter optimization with Optuna
                    study = optuna.create_study(direction='maximize')
                    study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=n_trials)

                    print('Number of finished trials: ', len(study.trials))
                    print('Best trial:')
                    trial = study.best_trial

                    print('  Value: ', trial.value)
                    print('  Params: ')
                    for key, value in trial.params.items():
                        print('    {}: {}'.format(key, value))

                    # Configure the best model
                    best_classifier_name = trial.params['classifier']
                    best_classifier = models[best_classifier_name]['model']
                    best_params = {}
                    for key, value in trial.params.items():
                        if key == 'classifier':
                            continue
                        # Optuna stores the layer sizes as the sampled string
                        # (e.g. '50,50'); the estimator needs a tuple of ints.
                        if key == 'hidden_layer_sizes' and isinstance(value, str):
                            value = tuple(map(int, value.split(',')))
                        best_params[key] = value
                    best_classifier.set_params(**best_params)

                    best_pipeline = Pipeline(steps=[
                        ('classifier', best_classifier)
                    ])

                    if y_test is not None:
                        best_pipeline, _ = retrain_model(best_pipeline, X_train, y_train, X_test, y_test)
                    else:
                        # No labels to score against: a single fit is all that can be done.
                        best_pipeline.fit(X_train, y_train)

                    # Store each trial's score under the classifier that trial
                    # actually used, and accumulate across studies instead of
                    # overwriting the previous entry.
                    for finished_trial in study.trials:
                        if finished_trial.value is None:
                            continue
                        scored_name = finished_trial.params.get('classifier', best_classifier_name)
                        all_scores.setdefault(scored_name, []).append(finished_trial.value)

                    # Resolve the output directory
                    if testing:
                        output_dir = os.path.join(output_folder, testing_sub_path_name, relative_path, target_column)
                    else:
                        output_dir = os.path.join(output_folder, relative_path, target_column)
                    os.makedirs(output_dir, exist_ok=True)

                    model_path = os.path.join(output_dir, f'best_{best_classifier_name}_model.pkl')
                    joblib.dump(best_pipeline, model_path)
                    print(f'Model saved at {model_path}')

                    if y_test is not None:
                        # Model evaluation
                        y_pred = best_pipeline.predict(X_test)
                        # Keep the second probability column only, as evaluate_models
                        # does; roc_auc_score rejects the 2-D output for binary targets.
                        # NOTE(review): assumes a binary target - confirm.
                        y_pred_proba = best_pipeline.predict_proba(X_test)[:, 1]
                        accuracy = accuracy_score(y_test, y_pred)
                        roc_auc = roc_auc_score(y_test, y_pred_proba)

                        print(f"Accuracy: {accuracy}")
                        print(f"ROC AUC: {roc_auc}")
                        print(confusion_matrix(y_test, y_pred))
                        print(classification_report(y_test, y_pred))

                        # Logging with mlflow
                        # NOTE(review): the experiment name is hard-coded; the
                        # MLFLOW_EXPERIMENT_NAME setting is not used here.
                        mlflow.set_experiment('credit_scoring')
                        with mlflow.start_run():
                            mlflow.log_params(trial.params)
                            mlflow.log_metric('accuracy', accuracy)
                            mlflow.log_metric('roc_auc', roc_auc)
                            mlflow.sklearn.log_model(best_pipeline, 'model')
                            mlflow.log_artifact(file_path)
                            mlflow.log_artifact(test_file_path)
                    else:
                        print(f"Skipping model evaluation for {target_column} as target column is not in test data.")

            pbar.update(1)
    finally:
        # Close the progress bar even when a file fails.
        pbar.close()

    if not all_scores:
        print("No cross-validation scores collected. Skipping score plot.")
        return

    # Plot the cross-validation scores
    os.makedirs(output_folder, exist_ok=True)
    model_names_repeated = [name for name, scores in all_scores.items() for _ in scores]
    model_scores = [score for scores in all_scores.values() for score in scores]

    plot_cross_val_scores(model_scores, model_names_repeated, output_folder)

# Plotting function for the cross-validation scores
def plot_cross_val_scores(model_scores, model_names, output_dir):
    """
    Save a box plot of the cross-validation scores grouped by model.

    Args:
        model_scores: Sequence of scores, one per observation.
        model_names: Sequence of model labels aligned with ``model_scores``.
        output_dir (str): Directory in which 'cross_val_scores.png' is written.

    Returns:
        None
    """
    destination = os.path.join(output_dir, 'cross_val_scores.png')

    figure = plt.figure(figsize=(12, 8))
    axes = sns.boxplot(x=model_names, y=model_scores)
    axes.set_xlabel('Model')
    axes.set_ylabel('Cross-Validation Score')
    axes.set_title('Model Comparison - Cross-Validation Scores')
    plt.xticks(rotation=45)
    figure.savefig(destination)
    plt.close(figure)

# Model evaluation

In [None]:
def evaluate_models(base_path, output_folder, target_columns, chunk_size=1000, testing=False, testing_sub_path_name='test'):
    """
    Evaluate every saved model under ``output_folder`` on its matching test file.

    A model is a '.pkl' file stored in a directory named after one of
    ``target_columns``. The parent of that directory, relative to
    ``output_folder``, is mapped onto ``base_path`` to locate the
    'application_test.csv' file. Predictions are accumulated over all chunks of
    the test file and scored once per model. The ROC curve and confusion matrix
    are saved next to the model, and all metrics are written to
    'model_performance_metrics.csv' in ``output_folder``.

    Args:
        base_path (str): Root directory of the datasets.
        output_folder (str): Root directory holding the saved models.
        target_columns (list[str]): Target column names to look for.
        chunk_size (int): Number of test rows read per chunk.
        testing (bool): Set to True when the models were saved with a leading
            ``testing_sub_path_name`` folder that must be stripped from the path.
        testing_sub_path_name (str): Name of that leading folder.

    Returns:
        pandas.DataFrame: One row per evaluated model with the columns
        model, target, accuracy, precision, recall, f1_score and roc_auc.
    """
    metric_columns = ['model', 'target', 'accuracy', 'precision', 'recall', 'f1_score', 'roc_auc']
    performance_metrics = []

    for root, _dirs, files in os.walk(output_folder):
        # Exact match on the directory name; a substring test on the whole
        # path could match unrelated folders.
        target_column = os.path.basename(root)
        if target_column not in target_columns:
            continue

        # Map the model directory back to its data directory: drop the target
        # sub-folder and, in testing mode, the leading testing sub-folder.
        data_relative_path = os.path.relpath(os.path.dirname(root), output_folder)
        if testing:
            parts = data_relative_path.split(os.sep)
            if parts and parts[0] == testing_sub_path_name:
                remaining = parts[1:]
                data_relative_path = os.path.join(*remaining) if remaining else os.curdir
        test_file_path = os.path.normpath(os.path.join(base_path, data_relative_path, 'application_test.csv'))

        for file in files:
            if not file.endswith('.pkl'):
                continue

            model_path = os.path.join(root, file)
            # Strip the 'best_' prefix and '_model' suffix rather than splitting
            # on '_', so model names containing underscores stay intact.
            stem = file[:-len('.pkl')]
            if stem.startswith('best_') and stem.endswith('_model'):
                model_name = stem[len('best_'):-len('_model')]
            else:
                model_name = stem

            print(f"Analyzing model: {model_name} for target: {target_column}")

            if not os.path.exists(test_file_path):
                print(f"Test file {test_file_path} not found. Skipping evaluation.")
                continue

            # NOTE: joblib.load unpickles arbitrary objects; only load model
            # files that come from a trusted location.
            best_pipeline = joblib.load(model_path)
            # Columns the pipeline was fitted on, when it recorded them.
            feature_names = getattr(best_pipeline, 'feature_names_in_', None)

            y_true_parts, y_pred_parts, y_proba_parts = [], [], []
            for chunk in pd.read_csv(test_file_path, chunksize=chunk_size):
                if target_column not in chunk.columns:
                    # Every chunk shares the same header: nothing to evaluate.
                    break

                if feature_names is not None:
                    X_test = chunk[list(feature_names)]
                else:
                    X_test = chunk.drop(target_column, axis=1)

                y_true_parts.append(chunk[target_column].to_numpy())
                y_pred_parts.append(best_pipeline.predict(X_test))
                y_proba_parts.append(best_pipeline.predict_proba(X_test)[:, 1])

            if not y_true_parts:
                print(f"Target column {target_column} not found in test data. Skipping evaluation.")
                continue

            # Score once on the whole test set so a chunk with a single class
            # cannot break the ROC AUC and plots are not overwritten per chunk.
            y_test = np.concatenate(y_true_parts)
            y_pred = np.concatenate(y_pred_parts)
            y_pred_proba = np.concatenate(y_proba_parts)

            # Performance metrics
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            roc_auc = roc_auc_score(y_test, y_pred_proba)

            performance_metrics.append({
                'model': model_name,
                'target': target_column,
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1_score': f1,
                'roc_auc': roc_auc
            })

            print(f"Accuracy: {accuracy}")
            print(f"Precision: {precision}")
            print(f"Recall: {recall}")
            print(f"F1 Score: {f1}")
            print(f"ROC AUC: {roc_auc}")

            # Result plots
            plot_roc_curve(y_test, y_pred_proba, model_name, root)
            plot_confusion_matrix(y_test, y_pred, model_name, root)

    # Explicit columns keep the frame usable even when no model was evaluated.
    performance_df = pd.DataFrame(performance_metrics, columns=metric_columns)
    performance_df.to_csv(os.path.join(output_folder, 'model_performance_metrics.csv'), index=False)

    return performance_df

def plot_roc_curve(y_test, y_pred_proba, model_name, output_dir):
    """
    Save the ROC curve of a model as 'roc_curve_<model_name>.png'.

    Args:
        y_test: True labels.
        y_pred_proba: Predicted scores passed to ``roc_curve``.
        model_name (str): Name used in the title and the file name.
        output_dir (str): Directory in which the image is written.

    Returns:
        None
    """
    false_positive_rate, true_positive_rate, _thresholds = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    destination = os.path.join(output_dir, f'roc_curve_{model_name}.png')

    figure = plt.figure(figsize=(10, 8))
    # Model curve, then the diagonal drawn as a dashed reference line.
    plt.plot(
        false_positive_rate,
        true_positive_rate,
        color='darkorange',
        lw=2,
        label=f'ROC curve (area = {roc_auc:.2f})',
    )
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic - {model_name}')
    plt.legend(loc="lower right")
    figure.savefig(destination)
    plt.close(figure)

def plot_confusion_matrix(y_test, y_pred, model_name, output_dir):
    """
    Save the confusion matrix of a model as 'confusion_matrix_<model_name>.png'.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        model_name (str): Name used in the title and the file name.
        output_dir (str): Directory in which the image is written.

    Returns:
        None
    """
    matrix = confusion_matrix(y_test, y_pred)
    destination = os.path.join(output_dir, f'confusion_matrix_{model_name}.png')

    figure = plt.figure(figsize=(8, 6))
    axes = sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
    axes.set_xlabel('Predicted')
    axes.set_ylabel('Actual')
    axes.set_title(f'Confusion Matrix - {model_name}')
    figure.savefig(destination)
    plt.close(figure)

# Final model selection and hyperparameter tuning

In [None]:
def select_and_tune_best_model(performance_df, base_path, output_folder, target_column):
    """
    Pick the model with the highest ROC AUC, tune it with Optuna and save it.

    Args:
        performance_df (pandas.DataFrame): Metrics table with at least the
            'model' and 'roc_auc' columns.
        base_path (str): Directory containing 'application_train.csv'.
        output_folder (str): Root directory in which the tuned model is saved.
        target_column (str): Name of the target column in the training file.

    Returns:
        The fitted pipeline wrapping the tuned classifier.

    Raises:
        ValueError: If ``performance_df`` is empty.
    """
    if performance_df.empty:
        raise ValueError("performance_df is empty: no evaluated model to select from")

    best_model_row = performance_df.loc[performance_df['roc_auc'].idxmax()]
    best_model_name = best_model_row['model']
    print(f"Best model: {best_model_name} with ROC AUC: {best_model_row['roc_auc']}")

    # Read the training file once and split it into features and target.
    train_data = pd.read_csv(os.path.join(base_path, 'application_train.csv'))
    X_train = train_data.drop(target_column, axis=1)
    y_train = train_data[target_column]

    # Hyperparameter tuning with Optuna
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=100)

    print('Number of finished trials: ', len(study.trials))
    print('Best trial:')

    # `objective` samples the classifier itself, so the overall best trial may
    # belong to another model. Prefer the best trial that used the selected
    # model; its parameters are the only ones valid for that estimator.
    matching_trials = [
        candidate for candidate in study.trials
        if candidate.value is not None and candidate.params.get('classifier') == best_model_name
    ]
    if matching_trials:
        trial = max(matching_trials, key=lambda candidate: candidate.value)
    else:
        # The selected model was never sampled: fall back to the overall best
        # trial and tune the classifier that trial actually used.
        trial = study.best_trial
        best_model_name = trial.params.get('classifier', best_model_name)

    print('  Value: ', trial.value)
    print('  Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

    best_classifier = models[best_model_name]['model']
    best_params = {}
    for key, value in trial.params.items():
        if key == 'classifier':
            continue
        # Optuna stores the layer sizes as the sampled string (e.g. '50,50').
        if key == 'hidden_layer_sizes' and isinstance(value, str):
            value = tuple(map(int, value.split(',')))
        best_params[key] = value
    best_classifier.set_params(**best_params)

    best_pipeline = Pipeline(steps=[
        ('classifier', best_classifier)
    ])

    best_pipeline.fit(X_train, y_train)

    # The previously saved model is not loaded: it was never used, and its
    # path may not exist yet. Create the directory before writing.
    model_path = os.path.join(output_folder, best_model_name, target_column, f'best_{best_model_name}_model.pkl')
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump(best_pipeline, model_path)
    print(f"Model saved at {model_path}")

    return best_pipeline

# Interpretability and explainability

In [None]:
def explain_model(best_pipeline, X_train, output_folder):
    """
    Save SHAP explanation plots for the classifier of a fitted pipeline.

    Two images are written to ``output_folder``: 'shap_summary_plot.png' for
    all rows of ``X_train`` and 'shap_force_plot.png' for its first row.

    Args:
        best_pipeline: Fitted pipeline with a 'classifier' step. The step is
            passed to ``shap.TreeExplainer``.
            NOTE(review): presumably only tree-based models are supported - confirm.
        X_train (pandas.DataFrame): Data to explain.
        output_folder (str): Directory in which the images are written.

    Returns:
        None
    """
    explainer = shap.TreeExplainer(best_pipeline.named_steps['classifier'])
    shap_values = explainer.shap_values(X_train)
    expected_value = explainer.expected_value

    # Classifiers can yield one set of SHAP values per class, either as a list
    # or as a trailing array axis. Keep the last class (the positive class of a
    # binary problem) so the arrays below are (samples, features).
    if isinstance(shap_values, list):
        class_index = len(shap_values) - 1
        shap_values = shap_values[class_index]
        expected_value = np.ravel(expected_value)[class_index]
    elif np.ndim(shap_values) == 3:
        class_index = np.shape(shap_values)[-1] - 1
        shap_values = shap_values[:, :, class_index]
        expected_value = np.ravel(expected_value)[class_index]

    os.makedirs(output_folder, exist_ok=True)

    # show=False keeps the figure open so savefig captures the plot instead of
    # an empty canvas.
    shap.summary_plot(shap_values, X_train, show=False)
    plt.savefig(os.path.join(output_folder, 'shap_summary_plot.png'), bbox_inches='tight')
    plt.close()

    # matplotlib=True draws the force plot as a Matplotlib figure; the default
    # JavaScript widget cannot be written by savefig.
    shap.force_plot(expected_value, shap_values[0, :], X_train.iloc[0, :], matplotlib=True, show=False)
    plt.savefig(os.path.join(output_folder, 'shap_force_plot.png'), bbox_inches='tight')
    plt.close()