In [2]:
import sys
import os

# Make the 'src' package importable: put the parent of the current working
# directory on the Python path, unless it is already listed there.
parent_dir = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(parent_dir)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)


In [3]:
# Cell 1 - Imports
# The `src.*` imports below are project-local; they resolve only when the
# directory that contains the `src` package is on sys.path.
import pandas as pd
import numpy as np
from src.data.data_loader import DataLoader
from src.features.feature_engineer import FeatureEngineer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Banner announcing the topic of the notebook.
print("🔧 Feature Engineering")

🔧 Feature Engineering


In [4]:
# Cell 2 - Load the raw data and apply feature engineering
loader = DataLoader()
df = loader.load_raw_data()

# Derive the engineered frame from the raw one.
fe = FeatureEngineer()
df_engineered = fe.create_features(df)

# Summarise the effect: column counts before/after, and the names of the
# columns present in the engineered frame but absent from the raw one
# (kept in the engineered frame's column order).
n_original = len(df.columns)
n_engineered = len(df_engineered.columns)
is_new = ~df_engineered.columns.isin(df.columns)
added_columns = df_engineered.columns[is_new].tolist()

print(f"✅ Features originales: {n_original}")
print(f"✅ Features après engineering: {n_engineered}")
print(f"📊 Nouvelles colonnes: {added_columns}")

 Données chargées: 100000 transactions
✅ Features originales: 31
✅ Features après engineering: 40
📊 Nouvelles colonnes: ['hour_of_day', 'is_night', 'amount_log', 'amount_category', 'V1_amount_interaction', 'V2_amount_interaction', 'V3_amount_interaction', 'V4_amount_interaction', 'V5_amount_interaction']


In [5]:
# Cell 3 - Prepare the data for ML
# 'Class' is the target; every other column is used as a feature.
y = df_engineered['Class']
X = df_engineered.drop(columns='Class')

# Hold out 20% for testing. The split is stratified on the target and seeded
# so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Sum of the target in each split; reported as the fraud count, which
# presumes 'Class' is a 0/1 indicator (TODO confirm against the data loader).
fraud_train = y_train.sum()
fraud_test = y_test.sum()

print(f"📈 Données après split:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")
print(f"🎯 Fraudes dans train: {fraud_train}, test: {fraud_test}")

📈 Données après split:
X_train: (80000, 39), y_train: (80000,)
X_test: (20000, 39), y_test: (20000,)
🎯 Fraudes dans train: 136, test: 34


In [None]:
# Cell 4 - Handle class imbalance with SMOTE
print("🔄 Application de SMOTE pour équilibrer les classes...")

# Resample the training split only; the test split is left as drawn.
# NOTE(review): SMOTE interpolates between samples, so this presumably needs
# every feature column to be numeric - confirm for the engineered columns
# (e.g. 'amount_category').
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled

# Per-class sample counts after resampling.
class_counts = pd.Series(y_train_res).value_counts().to_dict()

print(
    f"✅ Après SMOTE:",
    f"X_train_res: {X_train_res.shape}",
    f"y_train_res: {y_train_res.shape}",
    f"Distribution: {class_counts}",
    sep="\n",
)