In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 1. Charger les donn√©es
df = pd.read_csv('fraudTrain.csv')

# 2. Undersampling initial (optionnel)
fraud_df = df[df['is_fraud'] == 1]
non_fraud_df = df[df['is_fraud'] == 0]
sampled_non_fraud = non_fraud_df.sample(n=50000, random_state=42)
balanced_df = pd.concat([sampled_non_fraud, fraud_df], ignore_index=True)
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# 3. Pr√©parer X et y
y = balanced_df['is_fraud']
X = balanced_df.drop('is_fraud', axis=1)

# üîç V√âRIFIER LES COLONNES - Exclure les colonnes suspectes
colonnes_a_exclure = [
    'trans_num',           # ID de transaction
    'unix_time',           # Peut leaker l'info
    'trans_date_trans_time', # Timestamp exact
    # Ajoute ici toute colonne suspecte
]

# Garder seulement les colonnes num√©riques
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
# Exclure les colonnes suspectes
numeric_cols = [col for col in numeric_cols if col not in colonnes_a_exclure]

X_numeric = X[numeric_cols]

print(f"Colonnes utilis√©es pour l'entra√Ænement : {numeric_cols}")
print(f"Shape avant split : {X_numeric.shape}")

# ‚úÖ 4. SPLIT TRAIN/TEST D'ABORD (AVANT SMOTE !)
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f"\nTrain shape : {X_train.shape}")
print(f"Test shape : {X_test.shape}")
print(f"Distribution train :\n{y_train.value_counts()}")
print(f"Distribution test :\n{y_test.value_counts()}")

# ‚úÖ 5. SMOTE SEULEMENT SUR LE TRAIN SET
smote = SMOTE(sampling_strategy=0.8, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f"\nApr√®s SMOTE - Train shape : {X_train_resampled.shape}")
print(f"Distribution train apr√®s SMOTE :\n{pd.Series(y_train_resampled).value_counts()}")

# 6. Entra√Æner le mod√®le
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,      # Limite la profondeur pour √©viter l'overfitting
    min_samples_split=20,
    random_state=42
)

model.fit(X_train_resampled, y_train_resampled)

# 7. √âvaluer sur TRAIN (pour voir l'overfitting)
train_accuracy = model.score(X_train_resampled, y_train_resampled)
print(f"\nüìä Accuracy sur TRAIN : {train_accuracy:.4f}")

# 8. √âvaluer sur TEST (la vraie performance)
test_accuracy = model.score(X_test, y_test)
print(f"üìä Accuracy sur TEST : {test_accuracy:.4f}")

# 9. Rapport d√©taill√©
y_pred = model.predict(X_test)
print("\n" + "="*50)
print("RAPPORT DE CLASSIFICATION (TEST SET) :")
print("="*50)
print(classification_report(y_test, y_pred))

print("\nMATRICE DE CONFUSION :")
print(confusion_matrix(y_test, y_pred))

# 10. V√©rifier l'importance des features
feature_importance = pd.DataFrame({
    'feature': numeric_cols,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nüìà TOP 10 FEATURES LES PLUS IMPORTANTES :")
print(feature_importance.head(10))

# üö® ALERTE si accuracy train >> test
if train_accuracy - test_accuracy > 0.1:
    print("\n‚ö†Ô∏è  ATTENTION : OVERFITTING D√âTECT√â !")
    print(f"   Diff√©rence train-test : {train_accuracy - test_accuracy:.4f}")
    print("   ‚Üí R√©duire max_depth ou augmenter min_samples_split")

Colonnes utilis√©es pour l'entra√Ænement : ['Unnamed: 0', 'cc_num', 'amt', 'zip', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long']
Shape avant split : (57506, 9)

Train shape : (46004, 9)
Test shape : (11502, 9)
Distribution train :
is_fraud
0    39999
1     6005
Name: count, dtype: int64
Distribution test :
is_fraud
0    10001
1     1501
Name: count, dtype: int64

Apr√®s SMOTE - Train shape : (71998, 9)
Distribution train apr√®s SMOTE :
is_fraud
0    39999
1    31999
Name: count, dtype: int64

üìä Accuracy sur TRAIN : 0.9032
üìä Accuracy sur TEST : 0.9357

RAPPORT DE CLASSIFICATION (TEST SET) :
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     10001
           1       0.76      0.75      0.75      1501

    accuracy                           0.94     11502
   macro avg       0.86      0.86      0.86     11502
weighted avg       0.94      0.94      0.94     11502


MATRICE DE CONFUSION :
[[9638  363]
 [ 377 1124]]

üìà TOP 10

In [13]:
# Sauvegarder le dataset final
df.to_csv('fraudTrain_balanced_smote.csv', index=False)
print(f"‚úÖ Dataset sauvegard√© : {df.shape[0]} lignes, {df.shape[1]} colonnes")
print(f"üìä Distribution finale : \n{df['is_fraud'].value_counts()}")

‚úÖ Dataset sauvegard√© : 70000 lignes, 11 colonnes
üìä Distribution finale : 
is_fraud
0    50000
1    20000
Name: count, dtype: int64
