# Projet

## Prérequis

In [6]:

import sys
import os
import importlib
from ydata_profiling import ProfileReport
import pandas as pd

# The project root is the directory the notebook kernel was started in;
# the local helper modules are expected in its "src" sub-folder.
PROJECT_PATH = os.getcwd()
SRC_PATH = os.path.join(PROJECT_PATH, "src")

# Register the src folder on the import path only once, so re-running
# this cell does not keep appending duplicate entries.
src_already_registered = SRC_PATH in sys.path
if not src_already_registered:
    sys.path.append(SRC_PATH)

# Echo both locations so the reader can check what was resolved.
for label, location in (
    ("Chemin du projet :", PROJECT_PATH),
    ("Chemin du dossier src :", SRC_PATH),
):
    print(label, location)

# Import the project-local modules by name first so that they can be
# passed to importlib.reload below (they are found through SRC_PATH,
# which was appended to sys.path above).
import explore_data
import preprocess

# Reload both modules so that edits made to their source files since the
# kernel started are picked up when this cell is re-run.
importlib.reload(explore_data)
importlib.reload(preprocess)

# Pull every public name of both modules into the notebook namespace.
# The star imports must come AFTER the reloads, otherwise the notebook
# would keep references to the pre-reload objects.
# NOTE(review): if both modules export a same-named symbol, the one from
# `preprocess` wins because it is imported last.
from explore_data import * 
from preprocess import * 

Chemin du projet : c:\Users\marct\Documents\GIF-7005\Yasmine\Projet
Chemin du dossier src : c:\Users\marct\Documents\GIF-7005\Yasmine\Projet\src


## Prétraitement des données

In [None]:
# Call the project helper (star-imported in the setup cell) for the "FM12"
# window and the "red" segment only.
# NOTE(review): presumably this writes the processed files that the loading
# cell reads back with load_processed_data — confirm in the src modules.
process_and_save_all(PROJECT_PATH, windows=["FM12"], segments=["red"])

## Importation des données prétraitées

In [3]:
def _load_split(split):
    """Load one processed split and separate the 'DFlag' label from the features.

    Returns a 3-tuple ``(frame, features, label)`` where ``frame`` is the
    object returned by ``load_processed_data`` for the "FM12" window and
    "red" segment, ``features`` is ``frame`` without the 'DFlag' column and
    ``label`` is the 'DFlag' column itself.
    """
    frame = load_processed_data(PROJECT_PATH, windows=["FM12"], segments=['red'], splits=[split])
    return frame, frame.drop(columns=['DFlag']), frame['DFlag']


# Training split, then the three evaluation splits, loaded in the same order
# as before; every notebook-level name is kept so later cells still work.
data_train, X_train, y_train = _load_split("train")
data_test, X_test, y_test = _load_split("OOS")
oot_test, X_oot_test, y_oot_test = _load_split("OOT")
oou_test, X_oou_test, y_oou_test = _load_split("OOU")

## Exploration des données

In [None]:
# Re-import the exploration helpers into the notebook namespace.
# NOTE(review): this repeats the star import of the setup cell; it only has
# an effect if `preprocess` (star-imported after `explore_data` there)
# exports a same-named symbol, in which case this line rebinds the name to
# the `explore_data` version.
from explore_data import *

# Destination of the HTML report, anchored on the project root.
save_path = os.path.join(PROJECT_PATH, "outputs", "exploration", "rapport_FM12.html")

# No `splits` argument is passed here, unlike the per-split loads made
# earlier in the notebook (presumably this loads every split — confirm in
# load_processed_data).
data_to_explore = load_processed_data(PROJECT_PATH, windows=["FM12"], segments=['red'])
summarize_data_to_html(data_to_explore, "FM12 - Rapport", save_path)

# Last expression of the cell, so its return value is what the cell displays
# (the saved output is a tuple of three dicts).
# NOTE(review): "outputs/drift.png" is relative to the current working
# directory, whereas save_path above is built from PROJECT_PATH.
get_drift(data_train, data_test, oot_test, oou_test, "outputs/drift.png")

({'Train': np.float64(1.0876450641946582),
  'OOS': np.float64(1.1049952235423532),
  'OOT': np.float64(1.3444518838638992),
  'OOU': np.float64(4.954330475065752)},
 {'Train–OOS': 0, 'Train–OOT': 5, 'Train–OOU': 2},
 {'Train–OOS': 0, 'Train–OOT': 2, 'Train–OOU': 2})

### T-SNE
Cette image a été générée à partir du notebook tsne.ipynb

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Read the pre-rendered t-SNE picture from disk.
tsne_path = os.path.join(PROJECT_PATH, "outputs", "exploration", "tsne.png")
tsne_img = mpimg.imread(tsne_path)

# Show it on an explicit 8x6 figure with the axes hidden, since the image
# is already a finished plot.
fig, ax = plt.subplots(figsize=(8, 6))
ax.imshow(tsne_img)
ax.axis("off")
plt.show()

### Dérive des données

In [None]:
# Folder handed to the drift helper as its output location, anchored on the
# project root.
drift_path = os.path.join(PROJECT_PATH, "outputs", "drift")

# Pass the training frame and the three evaluation frames (OOS, OOT, OOU)
# loaded earlier in the notebook to the project helper.
# NOTE(review): presumably the reports are written under `output_path` —
# confirm in the src modules.
run_drift_reports(
    data_train=data_train,
    data_oos=data_test,
    data_oot=oot_test,
    data_oou=oou_test,
    output_path=drift_path
)

## Entraînement et évaluation des modèles

In [23]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score

from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss, roc_curve

def train_and_eval(model, model_name):
    """Fit ``model`` on the training split and score it on OOS, OOT and OOU.

    The data is not passed in: the function reads the notebook-level
    variables ``X_train``/``y_train``, ``X_test``/``y_test``,
    ``X_oot_test``/``y_oot_test`` and ``X_oou_test``/``y_oou_test``.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``; it is
        fitted in place.
    model_name
        Label stored under the 'Model' key of every result row.

    Returns
    -------
    list of dict
        One dict per evaluation set, in the order OOS, OOT, OOU, with the
        keys 'Model', 'Dataset', 'Gini', 'PR-AUC', 'KS' and 'Brier'.
    """
    model.fit(X_train, y_train)

    evaluation_sets = (
        ('OOS', X_test, y_test),
        ('OOT', X_oot_test, y_oot_test),
        ('OOU', X_oou_test, y_oou_test),
    )

    def _score(dataset_name, features, target):
        # Probability of the positive class (second column of predict_proba).
        positive_proba = model.predict_proba(features)[:, 1]

        # Discrimination: Gini is derived from ROC-AUC as 2 * AUC - 1.
        roc_auc = roc_auc_score(target, positive_proba)

        # KS: largest gap between TPR and FPR along the ROC curve.
        false_pos_rate, true_pos_rate, _ = roc_curve(target, positive_proba)

        return {
            'Model': model_name,
            'Dataset': dataset_name,
            'Gini': 2 * roc_auc - 1,
            'PR-AUC': average_precision_score(target, positive_proba),
            'KS': max(true_pos_rate - false_pos_rate),
            # Calibration: Brier score of the predicted probabilities.
            'Brier': brier_score_loss(target, positive_proba),
        }

    return [_score(*eval_set) for eval_set in evaluation_sets]


# Accumulator for the result rows returned by train_and_eval; each model cell
# below extends it. Re-running this cell resets it to an empty list.
all_results = []

### Régression logistique

In [24]:
# Imported in this cell because this is the first place the notebook uses
# CalibratedClassifierCV: on a fresh kernel (Restart & Run All) the name
# would otherwise be undefined here and the cell would raise NameError.
from sklearn.calibration import CalibratedClassifierCV

# L2-regularised logistic regression with balanced class weights and a
# fixed seed so the run is repeatable.
log_reg = LogisticRegression(
    C=0.5, penalty="l2", class_weight="balanced",
    max_iter=3000, random_state=42
)

# Wrap the classifier in isotonic probability calibration with 3-fold
# cross-validation.
log_reg_model_cal = CalibratedClassifierCV(log_reg, method='isotonic', cv=3)

# Fit, score on OOS / OOT / OOU, and append the three result rows.
all_results.extend(train_and_eval(log_reg_model_cal, "LogisticRegression"))

### Random Forest

In [25]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

# Hyper-parameters of the balanced random forest, gathered in one place.
rf_params = dict(
    n_estimators=100,
    sampling_strategy="all",
    replacement=True,
    max_depth=12,
    class_weight="balanced_subsample",
    random_state=42,
    n_jobs=-1,
    bootstrap=True,
)
rf_model = BalancedRandomForestClassifier(**rf_params)

# Same calibration wrapper as the other models: isotonic, 3-fold CV.
rf_model_cal = CalibratedClassifierCV(rf_model, method='isotonic', cv=3)

# Fit, score on OOS / OOT / OOU, and append the three result rows.
rf_results = train_and_eval(rf_model_cal, "BalancedRandomForest")
all_results.extend(rf_results)

### XGBoost

In [26]:
import xgboost as xgb

# Hyper-parameters of the gradient-boosted model, gathered in one place.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
)
xgb_model = xgb.XGBClassifier(**xgb_params)

# Same calibration wrapper as the other models: isotonic, 3-fold CV.
xgb_model_cal = CalibratedClassifierCV(xgb_model, method='isotonic', cv=3)

# Fit, score on OOS / OOT / OOU, and append the three result rows.
xgb_results = train_and_eval(xgb_model_cal, "XGBoost")
all_results.extend(xgb_results)

## Résultats

In [27]:
import pandas as pd

# One row per (model, dataset); identical rows produced by re-running a
# model cell are collapsed.
results_df = pd.DataFrame(all_results).drop_duplicates()

# Wide layout: one row per model, one "<metric>_<dataset>" column per pair.
pivot_df = results_df.pivot(index="Model", columns="Dataset", values=["Gini", "PR-AUC", "Brier"])
pivot_df.columns = [f"{metric_name}_{dataset}" for metric_name, dataset in pivot_df.columns]
pivot_df = pivot_df.reset_index()

# Robustness table: keeps every performance column and adds the gap between
# the OOS score and the OOT / OOU scores for Gini and PR-AUC.
robustness = pivot_df.copy()

drop_columns = {
    "Gini_Drop_OOT": ("Gini_OOS", "Gini_OOT"),
    "Gini_Drop_OOU": ("Gini_OOS", "Gini_OOU"),
    "PR_Drop_OOT": ("PR-AUC_OOS", "PR-AUC_OOT"),
    "PR_Drop_OOU": ("PR-AUC_OOS", "PR-AUC_OOU"),
}
for drop_name, (baseline_col, shifted_col) in drop_columns.items():
    robustness[drop_name] = pivot_df[baseline_col] - pivot_df[shifted_col]

print("\n=== Performances des modèles par jeu de données ===\n")
print(pivot_df)

print("\n=== Écarts entre OOS et les autres jeux de données ===\n")
print(robustness[["Model", *drop_columns]])


=== Performances des modèles par jeu de données ===

                  Model  Gini_OOS  Gini_OOT  Gini_OOU  PR-AUC_OOS  PR-AUC_OOT  \
0  BalancedRandomForest  0.394950  0.376980  0.228824    0.026458    0.027134   
1    LogisticRegression  0.416448  0.405617  0.252156    0.027661    0.030461   
2               XGBoost  0.392643  0.374856  0.230267    0.026758    0.027575   

   PR-AUC_OOU  Brier_OOS  Brier_OOT  Brier_OOU  
0    0.071104   0.010855   0.013185   0.048298  
1    0.075736   0.010852   0.013165   0.048225  
2    0.070842   0.010856   0.013184   0.048307  

=== Écarts entre OOS et les autres jeux de données ===

                  Model  Gini_Drop_OOT  Gini_Drop_OOU  PR_Drop_OOT  \
0  BalancedRandomForest       0.017970       0.166126    -0.000677   
1    LogisticRegression       0.010831       0.164292    -0.002799   
2               XGBoost       0.017787       0.162376    -0.000818   

   PR_Drop_OOU  
0    -0.044646  
1    -0.048074  
2    -0.044084  
