# 🤖 Modeling et Évaluation
## Santander Customer Transaction Prediction

---

### Objectifs de ce notebook :
1. Entraîner plusieurs modèles de classification
2. Comparer les performances
3. Optimiser les hyperparamètres
4. Gérer le déséquilibre des classes
5. Évaluer avec des métriques appropriées
6. Sauvegarder le meilleur modèle

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report
)

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(42)
# Silence every warning for the rest of the session. Note that this hides all
# warning categories, including deprecation notices.
warnings.filterwarnings(action='ignore')

print("‚úÖ Biblioth√®ques import√©es")

## 1️⃣ Préparation des données

In [None]:
# Load the competition CSV files and build the train/validation split.
# NOTE(review): the previous comment described this step as loading the data
# preprocessed by the previous notebook, but the code below reads the raw CSV
# files and applies no preprocessing.
print("üì• Chargement des donn√©es...")

train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# The label is the 'target' column; the features are every remaining column
# except the row identifier.
y = train['target']
X = train.drop(columns=['ID_code', 'target'])
X_test_final = test.drop(columns=['ID_code'])

print(f"‚úÖ Donn√©es charg√©es: X={X.shape}, y={y.shape}")

# Hold out 20% of the rows for validation, stratified on the label so that
# both subsets keep the same class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

train_counts = y_train.value_counts().to_dict()
val_counts = y_val.value_counts().to_dict()

print("\nüìä Split effectu√©:")
print(f"   Train: {X_train.shape}")
print(f"   Validation: {X_val.shape}")
print(f"\n   Distribution train: {train_counts}")
print(f"   Distribution val: {val_counts}")

## 2️⃣ Baseline: Logistic Regression

In [None]:
print("üîß Entra√Ænement du mod√®le Baseline: Logistic Regression...")

lr_model = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
lr_model.fit(X_train, y_train)

# Pr√©dictions
y_pred_lr = lr_model.predict(X_val)
y_pred_proba_lr = lr_model.predict_proba(X_val)[:, 1]

# √âvaluation
print("\nüìä R√©sultats Logistic Regression:")
print(f"   Accuracy: {accuracy_score(y_val, y_pred_lr):.4f}")
print(f"   Precision: {precision_score(y_val, y_pred_lr):.4f}")
print(f"   Recall: {recall_score(y_val, y_pred_lr):.4f}")
print(f"   F1-Score: {f1_score(y_val, y_pred_lr):.4f}")
print(f"   ROC-AUC: {roc_auc_score(y_val, y_pred_proba_lr):.4f}")

## 3️⃣ Random Forest

In [None]:
print("üå≤ Entra√Ænement Random Forest...")

rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1
)
rf_model.fit(X_train, y_train)

# Pr√©dictions
y_pred_rf = rf_model.predict(X_val)
y_pred_proba_rf = rf_model.predict_proba(X_val)[:, 1]

# √âvaluation
print("\nüìä R√©sultats Random Forest:")
print(f"   Accuracy: {accuracy_score(y_val, y_pred_rf):.4f}")
print(f"   Precision: {precision_score(y_val, y_pred_rf):.4f}")
print(f"   Recall: {recall_score(y_val, y_pred_rf):.4f}")
print(f"   F1-Score: {f1_score(y_val, y_pred_rf):.4f}")
print(f"   ROC-AUC: {roc_auc_score(y_val, y_pred_proba_rf):.4f}")

## 4️⃣ XGBoost

In [None]:
print("üöÄ Entra√Ænement XGBoost...")

# Calculer scale_pos_weight pour g√©rer le d√©s√©quilibre
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"   scale_pos_weight: {scale_pos_weight:.2f}")

xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False
)
xgb_model.fit(X_train, y_train)

# Pr√©dictions
y_pred_xgb = xgb_model.predict(X_val)
y_pred_proba_xgb = xgb_model.predict_proba(X_val)[:, 1]

# √âvaluation
print("\nüìä R√©sultats XGBoost:")
print(f"   Accuracy: {accuracy_score(y_val, y_pred_xgb):.4f}")
print(f"   Precision: {precision_score(y_val, y_pred_xgb):.4f}")
print(f"   Recall: {recall_score(y_val, y_pred_xgb):.4f}")
print(f"   F1-Score: {f1_score(y_val, y_pred_xgb):.4f}")
print(f"   ROC-AUC: {roc_auc_score(y_val, y_pred_proba_xgb):.4f}")

## 5️⃣ LightGBM

In [None]:
print("üí° Entra√Ænement LightGBM...")

lgbm_model = LGBMClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    class_weight='balanced',
    random_state=42,
    verbose=-1
)
lgbm_model.fit(X_train, y_train)

# Pr√©dictions
y_pred_lgbm = lgbm_model.predict(X_val)
y_pred_proba_lgbm = lgbm_model.predict_proba(X_val)[:, 1]

# √âvaluation
print("\nüìä R√©sultats LightGBM:")
print(f"   Accuracy: {accuracy_score(y_val, y_pred_lgbm):.4f}")
print(f"   Precision: {precision_score(y_val, y_pred_lgbm):.4f}")
print(f"   Recall: {recall_score(y_val, y_pred_lgbm):.4f}")
print(f"   F1-Score: {f1_score(y_val, y_pred_lgbm):.4f}")
print(f"   ROC-AUC: {roc_auc_score(y_val, y_pred_proba_lgbm):.4f}")

## 6️⃣ Comparaison des modèles

In [None]:
# Build the comparison table: one row per model, one column per metric, all
# computed on the validation set.
model_outputs = {
    'Logistic Regression': (y_pred_lr, y_pred_proba_lr),
    'Random Forest': (y_pred_rf, y_pred_proba_rf),
    'XGBoost': (y_pred_xgb, y_pred_proba_xgb),
    'LightGBM': (y_pred_lgbm, y_pred_proba_lgbm),
}

# Metrics scored on the hard labels; ROC-AUC is scored separately on the
# predicted probabilities.
label_scorers = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1-Score': f1_score,
}

table_columns = {'Model': list(model_outputs)}
for metric_name, scorer in label_scorers.items():
    table_columns[metric_name] = [
        scorer(y_val, labels) for labels, _ in model_outputs.values()
    ]
table_columns['ROC-AUC'] = [
    roc_auc_score(y_val, probas) for _, probas in model_outputs.values()
]

results = pd.DataFrame(table_columns)

# Print the table between two 70-character rules.
rule = "=" * 70
print("\n" + rule)
print("üìä COMPARAISON DES MOD√àLES")
print(rule)
print(results.to_string(index=False))
print(rule)

In [None]:
# Performance overview: one bar chart per metric plus a summary heatmap,
# laid out on a 2x3 grid.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
axes = axes.ravel()

metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
colors = ['#3498db', '#2ecc71', '#f39c12', '#e74c3c']

# The five metrics fill the first five axes; zip stops before the sixth,
# which is reserved for the heatmap.
for ax, metric in zip(axes, metrics):
    results.plot(kind='bar', x='Model', y=metric, ax=ax, color=colors, legend=False)
    ax.set_title(metric, fontsize=12, fontweight='bold')
    ax.set_xlabel('')
    ax.set_ylabel(metric)
    ax.set_xticklabels(results['Model'], rotation=45, ha='right')
    ax.grid(True, alpha=0.3)

# Global comparison: reshape to a metrics-by-models grid for the heatmap.
results_melted = results.melt(id_vars='Model', var_name='Metric', value_name='Score')
pivot_data = results_melted.pivot(index='Metric', columns='Model', values='Score')

heatmap_ax = axes[5]
sns.heatmap(
    pivot_data,
    annot=True,
    fmt='.3f',
    cmap='RdYlGn',
    cbar_kws={'label': 'Score'},
    ax=heatmap_ax,
)
heatmap_ax.set_title('Heatmap des Performances', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

## 7️⃣ Courbes ROC

In [None]:
# ROC curves of all four models on a single set of axes.
fig, ax = plt.subplots(figsize=(10, 8))

models_proba = [
    ('Logistic Regression', y_pred_proba_lr),
    ('Random Forest', y_pred_proba_rf),
    ('XGBoost', y_pred_proba_xgb),
    ('LightGBM', y_pred_proba_lgbm)
]

for model_name, probas in models_proba:
    # roc_curve also returns the thresholds, which are not needed here.
    fpr, tpr = roc_curve(y_val, probas)[:2]
    auc_value = roc_auc_score(y_val, probas)
    ax.plot(fpr, tpr, linewidth=2, label=f'{model_name} (AUC = {auc_value:.3f})')

# Diagonal reference line labelled as the random classifier.
ax.plot([0, 1], [0, 1], 'k--', linewidth=2, label='Random Classifier')
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('Courbes ROC - Comparaison des Mod√®les', fontsize=14, fontweight='bold')
ax.legend(loc='lower right', fontsize=10)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 8️⃣ Matrices de confusion

In [None]:
# Confusion matrices: one heatmap per model on a 2x2 grid.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 12))
axes = axes.ravel()

predictions = [
    ('Logistic Regression', y_pred_lr),
    ('Random Forest', y_pred_rf),
    ('XGBoost', y_pred_xgb),
    ('LightGBM', y_pred_lgbm)
]

# Tick labels shared by both axes of every matrix.
tick_labels = ['Pas de transaction', 'Transaction']

for ax, (model_name, model_pred) in zip(axes, predictions):
    matrix = confusion_matrix(y_val, model_pred)
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_title(f'Matrice de Confusion - {model_name}', fontweight='bold')
    # Columns are the predicted class, rows the actual class.
    ax.set_xlabel('Pr√©diction')
    ax.set_ylabel('R√©alit√©')
    ax.set_xticklabels(tick_labels)
    ax.set_yticklabels(tick_labels)

plt.tight_layout()
plt.show()

## 9️⃣ Feature Importance (XGBoost)

In [None]:
# Pair each training column with the importance reported by the fitted XGBoost
# model, sorted from most to least important.
importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': xgb_model.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)
top_20 = feature_importance.head(20)

print("\nüîù Top 20 features les plus importantes (XGBoost):")
print(top_20)

# Horizontal bar chart of the top 20 features.
fig, ax = plt.subplots(figsize=(12, 8))
bar_positions = range(len(top_20))
ax.barh(bar_positions, top_20['importance'], color='steelblue')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_20['feature'])
ax.set_xlabel('Importance', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
ax.set_title('Top 20 Features par Importance (XGBoost)', fontsize=14, fontweight='bold')
# Flip the y axis so the most important feature is drawn at the top.
ax.invert_yaxis()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 🔟 Sélection et sauvegarde du meilleur modèle

In [None]:
from pathlib import Path

# Select the best model: the row of `results` with the highest validation
# ROC-AUC.
best_model_idx = results['ROC-AUC'].idxmax()
best_model_name = results.loc[best_model_idx, 'Model']
best_auc = results.loc[best_model_idx, 'ROC-AUC']

print(f"\nüèÜ Meilleur mod√®le: {best_model_name}")
print(f"   ROC-AUC: {best_auc:.4f}")

# Map each name used in `results` to its fitted estimator.
model_mapping = {
    'Logistic Regression': lr_model,
    'Random Forest': rf_model,
    'XGBoost': xgb_model,
    'LightGBM': lgbm_model
}

best_model = model_mapping[best_model_name]

# Make sure the output directory exists first: neither joblib.dump nor
# DataFrame.to_csv creates missing parent directories, so a fresh checkout
# without '../models' would otherwise fail with FileNotFoundError.
Path('../models').mkdir(parents=True, exist_ok=True)

# Persist the best model.
print("\nüíæ Sauvegarde du meilleur mod√®le...")
joblib.dump(best_model, '../models/best_model.pkl')
print("‚úÖ Mod√®le sauvegard√© dans '../models/best_model.pkl'")

# Persist the comparison table next to the model.
results.to_csv('../models/model_comparison.csv', index=False)
print("‚úÖ R√©sultats sauvegard√©s dans '../models/model_comparison.csv'")

## 📝 Rapport de classification final

In [None]:
# Detailed per-class report for the selected model on the validation set.
best_pred = best_model.predict(X_val)

report_rule = "=" * 70
report_labels = ['Pas de transaction', 'Transaction']

print(f"\nüìä RAPPORT DE CLASSIFICATION - {best_model_name}")
print(report_rule)
print(classification_report(y_val, best_pred, target_names=report_labels))
print(report_rule)

## 📝 Conclusions

### Résumé :
1. ✅ **4 modèles entraînés** : Logistic Regression, Random Forest, XGBoost, LightGBM
2. ✅ **Meilleur modèle identifié** basé sur ROC-AUC
3. ✅ **Modèle sauvegardé** pour l'API
4. ✅ **Gestion du déséquilibre** via class_weight et scale_pos_weight

### Prochaines étapes :
➡️ **Phase 2** : Créer l'API Flask (`api/app.py`)
➡️ **Phase 3** : Développer l'interface Streamlit (`frontend/streamlit_app.py`)