# 🔧 Preprocessing et Feature Engineering
## Santander Customer Transaction Prediction

---

### Objectifs de ce notebook :
1. Nettoyer et préparer les données
2. Feature engineering
3. Feature selection
4. Normalisation des données
5. Gestion du déséquilibre des classes
6. Préparation des données pour le modeling

In [None]:
# Import des bibliothèques
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.preprocessing import StandardScaler, RobustScaler

# Seed NumPy's global random generator so repeated runs draw the same numbers.
np.random.seed(seed=42)

# Install a catch-all filter: every warning category is suppressed from here on.
warnings.filterwarnings(action='ignore')

print("‚úÖ Biblioth√®ques import√©es")

## 1️⃣ Chargement des données

In [None]:
# Read the train and test CSV files from the sibling data directory.
print("üì• Chargement des donn√©es...")
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

print(f"‚úÖ Train: {train.shape}, Test: {test.shape}")

# The label is the 'target' column; the features are every column except
# 'ID_code' and 'target' (the test file has no 'target' column to drop).
y_train = train['target']
X_train = train.drop(columns=['ID_code', 'target'])
X_test = test.drop(columns=['ID_code'])

# One line per object, same layout as three separate prints.
print("\n".join(
    f"   {label}: {data.shape}"
    for label, data in (('X_train', X_train), ('y_train', y_train), ('X_test', X_test))
))

## 2️⃣ Feature Engineering

In [None]:
# Create new row-wise statistical features (progress message for the cell).
print("üî® Cr√©ation de nouvelles features...")

def add_statistical_features(df):
    """Return a copy of ``df`` extended with per-row summary statistics.

    Every statistic is computed across the columns of the *input* frame
    (``axis=1``), so the newly added columns never feed into one another.
    The input frame itself is left untouched.

    Columns appended, in this order: ``mean``, ``std``, ``min``, ``max``,
    ``median``, ``skew``, ``kurt``, ``range`` (max - min), ``q1`` (25th
    percentile), ``q3`` (75th percentile) and ``iqr`` (q3 - q1).

    Args:
        df: DataFrame whose columns are the values to summarise per row.

    Returns:
        A new DataFrame holding the original columns followed by the
        eleven statistical columns listed above.
    """
    # Quantities reused by a derived feature are computed once up front.
    row_min = df.min(axis=1)
    row_max = df.max(axis=1)
    lower_quartile = df.quantile(0.25, axis=1)
    upper_quartile = df.quantile(0.75, axis=1)

    # Insertion order of this mapping fixes the order of the new columns.
    row_stats = {
        'mean': df.mean(axis=1),
        'std': df.std(axis=1),
        'min': row_min,
        'max': row_max,
        'median': df.median(axis=1),
        'skew': df.skew(axis=1),
        'kurt': df.kurtosis(axis=1),
        'range': row_max - row_min,
        'q1': lower_quartile,
        'q3': upper_quartile,
        'iqr': upper_quartile - lower_quartile,
    }

    # assign() works on a copy, so the caller's frame is not modified.
    return df.assign(**row_stats)

# Run the same feature builder on both splits so they end up with the same
# set of added columns.
X_train_engineered, X_test_engineered = (
    add_statistical_features(split) for split in (X_train, X_test)
)

# Report the column count before and after, then list the added columns.
print(f"‚úÖ Nouvelles features cr√©√©es")
print(f"   Avant: {len(X_train.columns)} features")
print(f"   Apr√®s: {len(X_train_engineered.columns)} features")
print(f"\n   Nouvelles features: {[name for name in X_train_engineered.columns if name not in X_train.columns]}")

## 3️⃣ Feature Selection

In [None]:
# Method 1: rank features by their absolute correlation with the target.
print("üîç Feature selection par corr√©lation...")

# Correlate every feature column with y_train; the sign is dropped so that
# strong negative and strong positive relationships rank alike.
correlations = X_train_engineered.corrwith(y_train).abs()
correlations = correlations.sort_values(ascending=False)

print("\nTop 20 features par corr√©lation:")
print(correlations.iloc[:20])

In [None]:
# Horizontal bar chart of the 30 features with the highest absolute correlation.
fig, ax = plt.subplots(figsize=(12, 6))
correlations.iloc[:30].plot.barh(ax=ax, color='steelblue')
ax.set_title('Top 30 Features par Corr√©lation avec la Cible', fontsize=14, fontweight='bold')
ax.set(xlabel='Corr√©lation Absolue')
fig.tight_layout()
plt.show()

In [None]:
# Method 2: univariate selection with SelectKBest, scored by the ANOVA F-value.
print("üîç Feature selection avec SelectKBest...")

k_best = 100  # number of top-scoring features to keep

# fit() returns the selector itself, so construction and fitting can be chained.
selector = SelectKBest(score_func=f_classif, k=k_best).fit(X_train_engineered, y_train)

# Pair each feature name with its F-score, highest score first.
scores = pd.DataFrame({
    'feature': X_train_engineered.columns,
    'score': selector.scores_,
})
scores = scores.sort_values(by='score', ascending=False)

print(f"\nTop 10 features par F-score:")
print(scores.iloc[:10])

In [None]:
# Keep the names of the k_best highest-scoring features (scores is already
# sorted by descending F-score).
selected_features = scores['feature'].iloc[:k_best].tolist()

# Restrict both splits to those columns, in the same order.
X_train_selected = X_train_engineered.loc[:, selected_features]
X_test_selected = X_test_engineered.loc[:, selected_features]

print(f"‚úÖ Features s√©lectionn√©es: {len(selected_features)}")
print(f"   Shape: {X_train_selected.shape}")

## 4️⃣ Normalisation des données

In [None]:
# Z-score normalisation with StandardScaler.
print("‚öñÔ∏è Normalisation des donn√©es avec StandardScaler...")

# The scaler is fitted on the training split only; the test split is
# transformed with the statistics learned from train.
scaler = StandardScaler().fit(X_train_selected)

# Wrap the scaled arrays back into DataFrames so column names and row
# indices are preserved.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train_selected),
    index=X_train_selected.index,
    columns=selected_features,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_selected),
    index=X_test_selected.index,
    columns=selected_features,
)

print("‚úÖ Normalisation effectu√©e")
print(f"\nStatistiques apr√®s normalisation:")
print(X_train_scaled.describe())

In [None]:
# Side-by-side histograms of the first selected feature, before and after scaling.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# Left panel: raw values.
X_train_selected.iloc[:, 0].hist(ax=axes[0], bins=50, color='blue', alpha=0.7)
axes[0].set_title(f'Avant normalisation - {selected_features[0]}', fontweight='bold')
axes[0].set(xlabel='Valeur', ylabel='Fr√©quence')

# Right panel: the same column after StandardScaler.
X_train_scaled.iloc[:, 0].hist(ax=axes[1], bins=50, color='green', alpha=0.7)
axes[1].set_title(f'Apr√®s normalisation - {selected_features[0]}', fontweight='bold')
axes[1].set(xlabel='Valeur normalis√©e', ylabel='Fr√©quence')

fig.tight_layout()
plt.show()

## 5️⃣ Analyse de la réduction de dimensionnalité (PCA)

In [None]:
# Principal component analysis of the scaled training features.
print("üî¨ Analyse en Composantes Principales (PCA)...")

pca = PCA(n_components=50)
X_train_pca = pca.fit_transform(X_train_scaled)

# Per-component share of variance and its running total.
explained_variance = pca.explained_variance_ratio_
cumulative_variance = explained_variance.cumsum()

print(f"‚úÖ PCA effectu√©e")
# Indices 9 and 49 are the 10th and 50th entries of the running total.
print(f"   Variance expliqu√©e par les 10 premi√®res composantes: {cumulative_variance[9]:.2%}")
print(f"   Variance expliqu√©e par les 50 composantes: {cumulative_variance[49]:.2%}")

In [None]:
# Plot the explained-variance profile of the fitted PCA: per component on
# the left, cumulative on the right.
fig, axes = plt.subplots(1, 2, figsize=(16, 5))

# Derive the x positions from the data instead of hard-coding 1..50, so both
# plots keep working (no x/y length mismatch) if the number of PCA
# components is changed in the cell that fits the PCA.
component_numbers = range(1, len(explained_variance) + 1)

# Left panel: share of variance carried by each individual component.
axes[0].bar(component_numbers, explained_variance, alpha=0.7, color='steelblue')
axes[0].set_title('Variance Expliqu√©e par Composante', fontweight='bold')
axes[0].set_xlabel('Composante')
axes[0].set_ylabel('Variance Expliqu√©e')
axes[0].grid(True, alpha=0.3)

# Right panel: running total, with a dashed reference line at 95%.
axes[1].plot(component_numbers, cumulative_variance, marker='o', linestyle='-', color='red')
axes[1].axhline(y=0.95, color='green', linestyle='--', label='95% variance')
axes[1].set_title('Variance Cumul√©e', fontweight='bold')
axes[1].set_xlabel('Nombre de Composantes')
axes[1].set_ylabel('Variance Cumul√©e')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Project the scaled training data onto its first two principal components
# and draw one scatter layer per class.
pca_2d = PCA(n_components=2)
X_train_2d = pca_2d.fit_transform(X_train_scaled)

plt.figure(figsize=(12, 8))

# Rows with target == 0; transposing the masked rows yields (x values, y values).
plt.scatter(
    *X_train_2d[(y_train == 0).to_numpy()].T,
    s=10, c='blue', alpha=0.3, label='Pas de transaction',
)
# Rows with target == 1, drawn on top with larger, more opaque markers.
plt.scatter(
    *X_train_2d[(y_train == 1).to_numpy()].T,
    s=20, c='red', alpha=0.5, label='Transaction',
)

plt.title('Projection PCA 2D des donn√©es', fontsize=14, fontweight='bold')
# Axis labels show the share of variance explained by each component.
plt.xlabel(f'PC1 ({pca_2d.explained_variance_ratio_[0]:.2%} variance)')
plt.ylabel(f'PC2 ({pca_2d.explained_variance_ratio_[1]:.2%} variance)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 6️⃣ Sauvegarde des données préprocessées

In [None]:
# Persist the fitted scaler and the list of selected feature names.

# Both writes below target '../models'; create it first so they do not fail
# with FileNotFoundError when the directory does not exist yet.
os.makedirs('../models', exist_ok=True)

# Save the fitted scaler.
print("üíæ Sauvegarde du scaler...")
joblib.dump(scaler, '../models/scaler.pkl')
print("‚úÖ Scaler sauvegard√© dans '../models/scaler.pkl'")

# Save the selected feature names, one per line, in selection order.
# The encoding is pinned so the file does not depend on the platform default.
with open('../models/selected_features.txt', 'w', encoding='utf-8') as f:
    for feature in selected_features:
        f.write(f"{feature}\n")
print("‚úÖ Liste des features sauvegard√©e dans '../models/selected_features.txt'")

In [None]:
# Collect the headline numbers of the preprocessing run in one mapping.
preprocessing_summary = dict(
    original_features=X_train.shape[1],
    engineered_features=X_train_engineered.shape[1],
    selected_features=len(selected_features),
    scaling_method='StandardScaler',
    train_samples=X_train_scaled.shape[0],
    test_samples=X_test_scaled.shape[0],
    class_distribution=y_train.value_counts().to_dict(),
)

# Print a banner, then one "key.......... value" line per entry; keys are
# left-aligned and padded with dots to 30 characters.
print("\n" + "="*50)
print("üìä R√âSUM√â DU PREPROCESSING")
print("="*50)
for key in preprocessing_summary:
    value = preprocessing_summary[key]
    print(f"{key:.<30} {value}")
print("="*50)

## 📝 Conclusions du Preprocessing

### Transformations effectuées :
1. ✅ **Feature Engineering** : Ajout de 11 features statistiques
2. ✅ **Feature Selection** : Réduction à 100 features les plus pertinentes
3. ✅ **Normalisation** : StandardScaler appliqué
4. ✅ **PCA** : Analyse de la variance (95% avec ~40 composantes)

### Prochaine étape :
➡️ **Notebook 03_modeling.ipynb** : Entraînement des modèles ML