In [1]:
import sys
import os

# Make the directory that contains 'src' importable: resolve the parent of the
# current working directory to an absolute path and register it on sys.path
# only if it is not already listed (so re-running the cell adds no duplicate).
parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(parent_of_cwd)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)


In [2]:
# Cellule 1 - IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.data.data_loader import DataLoader
from src.features.feature_engineer import FeatureEngineer
from src.models.model_trainer import ModelTrainer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

# Configuration
# Plot appearance for the whole notebook: select matplotlib's 'default' style
# sheet, then set the seaborn colour palette to "husl".
# NOTE(review): the palette is set after the style sheet, presumably so the
# style reset does not undo it — keep this order unless confirmed otherwise.
plt.style.use('default')
sns.set_palette("husl")
# IPython line magic (not plain Python): selects the inline matplotlib backend
# so figures are rendered in the notebook output.
%matplotlib inline

# Banner printed once to mark the start of the training run.
print("🤖 Entraînement des Modèles - VERSION CORRIGÉE")

🤖 Entraînement des Modèles - VERSION CORRIGÉE


In [3]:
# Introspection helper: build a FeatureEngineer and print every public name it
# exposes. dir() reports attributes as well as methods, so non-callable
# members are listed too.
fe = FeatureEngineer()
print("Méthodes disponibles:")
public_names = [attr for attr in dir(fe) if not attr.startswith('_')]  # skip private/dunder names
for attr in public_names:
    print(f"  - {attr}")

Méthodes disponibles:
  - clean_data
  - create_features
  - fit
  - fit_transform
  - get_metadata_routing
  - get_params
  - scaler
  - set_output
  - set_params
  - transform


In [4]:
# Cell 2 - LOADING AND PREPARATION (corrected)
# Load the raw transactions, run them through the feature engineer, then build
# the feature matrix X and the target vector y used by the following cells.
loader = DataLoader()
df = loader.load_raw_data()

fe = FeatureEngineer()

# Correction: the pipeline is create_features() followed by clean_data().
# NOTE(review): both steps run on the full dataset before the train/test
# split; if they fit any statistics (the object exposes a `scaler`), confirm
# that no test-set information leaks into training.
print("1. Création des features...")
df_engineered = fe.create_features(df)

print("2. Nettoyage des données...")
df_clean = fe.clean_data(df_engineered)

print("3. Préparation pour ML...")
target_column = 'Class'
X = df_clean.drop(columns=target_column)
y = df_clean[target_column]

# Report how many missing values are still present after clean_data().
remaining_nan = X.isna().sum().sum()
print(f"   NaN après clean_data: {remaining_nan}")

# Extra safety net: keep numeric columns only, then drop every column that
# still contains a NaN (columns are dropped, rows are not, so X stays aligned
# with y).
X = X.select_dtypes(include=[np.number]).dropna(axis=1)

print(f"✅ Dimensions finales: X{X.shape}, y{y.shape}")

 Données chargées: 100000 transactions
1. Création des features...
2. Nettoyage des données...
3. Préparation pour ML...
   NaN après clean_data: 0
✅ Dimensions finales: X(100000, 38), y(100000,)


In [5]:
# Cell 3 - SPLIT AND SMOTE
# Hold out 20% of the rows as a test set, stratified on y, with a fixed seed
# so the split is reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

split_summary = [
    f"📊 Après split train/test:",
    f"   X_train: {X_train.shape}, y_train: {y_train.shape}",
    f"   X_test: {X_test.shape}, y_test: {y_test.shape}",
    f"   Fraudes dans y_train: {y_train.sum()}, y_test: {y_test.sum()}",
]
print("\n".join(split_summary))

# Apply SMOTE to the training partition only; the test partition is never
# resampled.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled

class_counts = pd.Series(y_train_res).value_counts().to_dict()
smote_summary = [
    f"🔄 Après SMOTE:",
    f"   X_train_res: {X_train_res.shape}",
    f"   y_train_res: {y_train_res.shape}",
    f"   Distribution: {class_counts}",
]
print("\n".join(smote_summary))

📊 Après split train/test:
   X_train: (80000, 38), y_train: (80000,)
   X_test: (20000, 38), y_test: (20000,)
   Fraudes dans y_train: 136, y_test: 34




🔄 Après SMOTE:
   X_train_res: (159728, 38)
   y_train_res: (159728,)
   Distribution: {0: 79864, 1: 79864}


In [6]:
# Cell 4 - MODEL TRAINING
# Hand the SMOTE-resampled training data and the untouched test split to the
# project's ModelTrainer, which trains every candidate model it knows about.
# NOTE(review): the recorded run shows logistic_regression stopping at its
# iteration limit; feature scaling / max_iter would have to be addressed inside
# ModelTrainer, which is not visible here.
trainer = ModelTrainer()
results = trainer.train_models(X_train_res, y_train_res, X_test, y_test)

# Retrieve the winning model together with its score (printed as an AUC).
# NOTE(review): presumably the ranking uses the AUC measured on X_test/y_test;
# if so the reported best score is optimistically biased — confirm, and
# consider selecting on a separate validation split.
best_model, best_score = trainer.get_best_model()
print(f"🎯 MEILLEUR MODÈLE: AUC = {best_score:.4f}")

🔧 Entraînement de logistic_regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


✅ logistic_regression - AUC: 0.9803
🔧 Entraînement de random_forest...
✅ random_forest - AUC: 0.9998
🔧 Entraînement de xgboost...
✅ xgboost - AUC: 0.9999
🔧 Entraînement de lightgbm...
[LightGBM] [Info] Number of positive: 79864, number of negative: 79864
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030645 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9206
[LightGBM] [Info] Number of data points in the train set: 159728, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
✅ lightgbm - AUC: 0.9999
🎯 MEILLEUR MODÈLE: AUC = 0.9999


In [7]:
# Cell 5 - Detailed evaluation
from sklearn.metrics import precision_recall_curve

# Score the held-out test split with the selected model: hard class labels for
# the report and the matrix, plus column 1 of predict_proba (presumably the
# fraud-class probability — verify against best_model.classes_).
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# Each section prints its heading first, then computes and prints the metric.
evaluation_sections = (
    ("📊 RAPPORT DE CLASSIFICATION:", classification_report),
    ("\n🎯 MATRICE DE CONFUSION:", confusion_matrix),
)
for heading, metric_fn in evaluation_sections:
    print(heading)
    print(metric_fn(y_test, y_pred))

📊 RAPPORT DE CLASSIFICATION:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19966
           1       0.84      0.94      0.89        34

    accuracy                           1.00     20000
   macro avg       0.92      0.97      0.94     20000
weighted avg       1.00      1.00      1.00     20000


🎯 MATRICE DE CONFUSION:
[[19960     6]
 [    2    32]]


In [8]:
import datetime
import os

import joblib

# Make sure the output directory exists before anything is written.
# NOTE(review): 'models' is relative to the current working directory, while
# the first cell treats the parent of the working directory as the project
# root — confirm this is the intended location.
os.makedirs('models', exist_ok=True)

# A single timestamp is shared by both artefact names so they can be paired.
saved_at = datetime.datetime.now()
timestamp = saved_at.strftime("%Y%m%d_%H%M%S")
model_filename = f'models/fraud_detector_{timestamp}.pkl'
pipeline_filename = f'models/fraud_pipeline_{timestamp}.pkl'

# 1) Stand-alone dump of the selected model.
joblib.dump(best_model, model_filename)
print(f"✅ Modèle enregistré: {model_filename}")

# 2) Bundle: the same model plus the feature engineer and descriptive
#    metadata (model class name, score, timestamp, column names of X).
feature_names = X.columns.tolist()
model_class_name = type(best_model).__name__
pipeline = {
    'model': best_model,
    'feature_engineer': fe,
    'model_type': model_class_name,
    'auc_score': best_score,
    'timestamp': timestamp,
    'features_used': feature_names
}

joblib.dump(pipeline, pipeline_filename)
print(f"✅ Pipeline complet enregistré: {pipeline_filename}")

✅ Modèle enregistré: models/fraud_detector_20250927_210157.pkl
✅ Pipeline complet enregistré: models/fraud_pipeline_20250927_210157.pkl
