In [None]:
# %% [markdown]
# # ML Baseline Paris - Application des 7 Méthodes du Cours
# 
# **Objectif** : Tester les 7 méthodes sur le dataset Paris pré-traité
# 
# Les données sont déjà :
# - Nettoyées et normalisées
# - Encodées (one-hot, amenities, etc.)
# - Avec target_class créée (quartiles de prix)

# %%
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import umap.umap_ as umap
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.utils import resample
warnings.filterwarnings('ignore')

# Notebook-wide display settings: seaborn grid style, default figure size,
# and no limit on the number of DataFrame columns shown.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})
pd.options.display.max_columns = None

print("‚úÖ Biblioth√®ques import√©es")

# %% [markdown]
# ## 1. Chargement des Données (Déjà pré-traitées)

# %%
# Load the final Paris dataset from its gzip-compressed CSV.
df = pd.read_csv('dataset_paris_processed.csv.gz', compression='gzip')

print(f"üìä Dataset charg√© : {df.shape}")
print("\nüîç Aper√ßu des colonnes :")
print(list(df.columns))

# Report the class balance when the target column is present; otherwise only warn.
if 'target_class' not in df.columns:
    print("‚ö†Ô∏è Attention : 'target_class' non trouv√©e dans le dataset")
else:
    target_column = df['target_class']
    print(f"\n‚úÖ Target trouv√©e : {target_column.nunique()} classes")
    print(target_column.value_counts().sort_index())

# Preview of the first rows.
print("\nüìã Aper√ßu des 5 premi√®res lignes :")
print(df.head())

# %% [markdown]
# ## 2. Préparation X/y et Train/Test Split

# %%
print("=" * 60)
print("PR√âPARATION : S√©paration X/y et Train/Test Split")
print("=" * 60)

# Target vector, and feature matrix with the target column removed.
y = df['target_class']
X = df.drop(columns=['target_class'])

# city_label is dropped whenever the file contains it; errors='ignore'
# makes this a no-op when the column is absent.
X = X.drop(columns=['city_label'], errors='ignore')

n_feature_columns = X.shape[1]
print(f"\nFeatures (X) : {n_feature_columns} colonnes")
print(f"Target (y) : {len(y)} valeurs")
print("\nDistribution des classes :")
print(y.value_counts().sort_index())

# 80/20 split, stratified on the target so both parts keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("\n‚úÖ Split effectu√© :")
print(f"  Train : {X_train.shape}")
print(f"  Test  : {X_test.shape}")
for split_name, split_target in (("Train", y_train), ("Test", y_test)):
    print(f"\nDistribution {split_name} :")
    print(split_target.value_counts().sort_index())

# %% [markdown]
# ## 3. MÉTHODE 1 : PCA - Analyse Exploratoire

# %%
print("\n" + "=" * 60)
print("M√âTHODE 1 : PCA (Principal Component Analysis)")
print("=" * 60)

# Standardize before PCA; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler_pca = StandardScaler()
X_train_std = scaler_pca.fit_transform(X_train)
X_test_std = scaler_pca.transform(X_test)

# PCA keeping every component, so the whole variance spectrum is available.
pca_full = PCA()
pca_full.fit(X_train_std)

# Per-component and cumulative explained-variance ratios.
var_exp = pca_full.explained_variance_ratio_
var_cum = var_exp.cumsum()

# argmax on a boolean array gives the position of the first True, i.e. the
# first component whose cumulative ratio reaches the threshold (+1: 1-based count).
n_comp_95 = (var_cum >= 0.95).argmax() + 1
n_comp_90 = (var_cum >= 0.90).argmax() + 1

total_features = X.shape[1]
print("\nüìä R√©sultats PCA :")
print(f"  ‚Ä¢ Composantes pour 90% variance : {n_comp_90}/{total_features}")
print(f"  ‚Ä¢ Composantes pour 95% variance : {n_comp_95}/{total_features}")
print(f"  ‚Ä¢ R√©duction possible : {(1 - n_comp_95/total_features)*100:.1f}%")

# Three panels: scree plot, cumulative variance, 2D projection.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
ax_scree, ax_cum, ax_proj = axes

# Panel 1: variance ratio of the first 20 components (fewer if fewer exist).
scree_values = var_exp[:20]
ax_scree.bar(range(1, len(scree_values) + 1), scree_values, alpha=0.7, edgecolor='black')
ax_scree.set_title("Scree Plot - Variance par Composante")
ax_scree.set_xlabel("Composante")
ax_scree.set_ylabel("Variance Expliqu√©e")
ax_scree.grid(alpha=0.3)

# Panel 2: cumulative variance with the 90% / 95% thresholds marked.
ax_cum.plot(range(1, len(var_cum) + 1), var_cum, marker='o', markersize=3)
ax_cum.axhline(y=0.95, color='r', linestyle='--', label='95%')
ax_cum.axhline(y=0.90, color='orange', linestyle='--', label='90%')
ax_cum.axvline(x=n_comp_95, color='r', linestyle=':', alpha=0.5)
ax_cum.set_title("Variance Cumul√©e")
ax_cum.set_xlabel("Nombre de Composantes")
ax_cum.set_ylabel("Variance Cumul√©e")
ax_cum.legend()
ax_cum.grid(alpha=0.3)

# Panel 3: training points on the first two components, colored by class.
pca_2d = PCA(n_components=2)
X_train_pca_2d = pca_2d.fit_transform(X_train_std)
ratio_2d = pca_2d.explained_variance_ratio_
scatter = ax_proj.scatter(
    X_train_pca_2d[:, 0],
    X_train_pca_2d[:, 1],
    c=y_train,
    cmap='viridis',
    s=5,
    alpha=0.5,
)
ax_proj.set_title(f"Projection 2D Train (var={ratio_2d.sum():.1%})")
ax_proj.set_xlabel(f"PC1 ({ratio_2d[0]:.1%})")
ax_proj.set_ylabel(f"PC2 ({ratio_2d[1]:.1%})")
fig.colorbar(scatter, ax=ax_proj, label='Classe')

fig.tight_layout()
plt.show()

# Features ranked by the absolute value of their loading on the first component.
pc1_loadings = (
    pd.DataFrame({'feature': X.columns, 'loading': np.abs(pca_full.components_[0])})
    .sort_values('loading', ascending=False)
)

print("\nüìå Top 10 Features PC1 :")
print(pc1_loadings.head(10).to_string(index=False))

# %% [markdown]
# ## 4. MÉTHODE 2 : UMAP - Visualisation Non-Linéaire

# %%
print("\n" + "=" * 60)
print("M√âTHODE 2 : UMAP (Uniform Manifold Approximation)")
print("=" * 60)

# UMAP is fitted on the standardized training data, capped at 10,000 rows.
sample_size = min(10000, len(X_train))
if len(X_train) > sample_size:
    print(f"‚ö†Ô∏è Sous-√©chantillonnage pour UMAP : {sample_size} samples")
    # Seeded generator: drawing from the unseeded global np.random state made
    # the subsample (and so the embedding and both plots) differ on every run,
    # even though UMAP itself is seeded below.
    rng_umap = np.random.default_rng(42)
    indices = rng_umap.choice(len(X_train), sample_size, replace=False)
    X_train_sample = X_train_std[indices]
    y_train_sample = y_train.iloc[indices]
else:
    X_train_sample = X_train_std
    y_train_sample = y_train

print("üîÑ Calcul UMAP (30-60 secondes)...")
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42, verbose=False)
embedding_train = reducer.fit_transform(X_train_sample)

# Two panels: points colored by class, and point density.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# One color per class 0..3; this list is reused by the LDA cell further down.
colors_class = ['#3498db', '#2ecc71', '#f39c12', '#e74c3c']
# Plain NumPy labels so the boolean masks index the embedding positionally,
# independent of the pandas index carried by y_train_sample.
sample_labels = np.asarray(y_train_sample)
for classe in range(4):
    mask = sample_labels == classe
    axes[0].scatter(embedding_train[mask, 0], embedding_train[mask, 1],
                    c=colors_class[classe], label=f'Classe {classe}', s=5, alpha=0.6)
axes[0].set_title("UMAP Train - Color√© par Classe")
axes[0].set_xlabel("UMAP 1")
axes[0].set_ylabel("UMAP 2")
axes[0].legend()

# Hexagonal-bin density of the embedded points.
axes[1].hexbin(embedding_train[:, 0], embedding_train[:, 1], gridsize=30, cmap='YlOrRd')
axes[1].set_title("UMAP - Densit√©")
axes[1].set_xlabel("UMAP 1")
axes[1].set_ylabel("UMAP 2")

plt.tight_layout()
plt.show()

print("‚úÖ UMAP r√©v√®le la structure non-lin√©aire des donn√©es")

# %% [markdown]
# ## 5. MÉTHODE 3 : LDA - Linear Discriminant Analysis

# %%
print("\n" + "=" * 60)
print("M√âTHODE 3 : LDA (Linear Discriminant Analysis)")
print("=" * 60)

# Supervised projection onto 3 discriminant axes (the maximum for 4 classes),
# fitted on the training split and applied to the test split.
lda = LinearDiscriminantAnalysis(n_components=3)
X_train_lda = lda.fit_transform(X_train, y_train)
X_test_lda = lda.transform(X_test)

n_features_in, n_axes_out = X_train.shape[1], X_train_lda.shape[1]
lda_ratio = lda.explained_variance_ratio_
print(f"Dimensions r√©duites : {n_features_in} ‚Üí {n_axes_out}")
print(f"Variance expliqu√©e : {lda_ratio}")

# First two discriminant axes, one panel per split, colored by class.
# colors_class is the palette defined in the UMAP cell.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
lda_panels = (
    (axes[0], X_train_lda, y_train, 'Train'),
    (axes[1], X_test_lda, y_test, 'Test'),
)
for ax, projected, split_labels, split_title in lda_panels:
    for class_id in range(4):
        in_class = split_labels == class_id
        ax.scatter(
            projected[in_class, 0],
            projected[in_class, 1],
            c=colors_class[class_id],
            label=f'Classe {class_id}',
            alpha=0.5,
            s=10,
        )

    ax.set_title(f"LDA - {split_title}")
    ax.set_xlabel(f"LD1 ({lda_ratio[0]:.1%})")
    ax.set_ylabel(f"LD2 ({lda_ratio[1]:.1%})")
    ax.legend()
    ax.grid(alpha=0.3)

fig.tight_layout()
plt.show()

print("‚úÖ LDA trouve les axes discriminants")

# %% [markdown]
# ## 6. MÉTHODE 4 : CART - Classification Tree

# %%
print("\n" + "=" * 60)
print("M√âTHODE 4 : CART (Classification Tree)")
print("=" * 60)

# Single shallow tree, limited by depth and minimum split / leaf sizes.
cart = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=100,
    min_samples_leaf=50,
    random_state=42
)

print("üå≥ Entra√Ænement CART...")
cart.fit(X_train, y_train)

# Predictions on the held-out test split.
y_pred_cart = cart.predict(X_test)

# Per-class precision / recall / F1.
print("\nüìä Performance CART :")
print(classification_report(y_test, y_pred_cart,
                            target_names=['Bas', 'Moyen-Bas', 'Moyen-Haut', 'Haut'],
                            zero_division=0))

# Confusion matrix (rows: actual class, columns: predicted class).
cm_cart = confusion_matrix(y_test, y_pred_cart)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_cart, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Bas', 'MB', 'MH', 'Haut'],
            yticklabels=['Bas', 'MB', 'MH', 'Haut'])
plt.title("Matrice de Confusion - CART")
plt.ylabel("R√©el")
plt.xlabel("Pr√©dit")
plt.tight_layout()
plt.show()

# Impurity-based feature importances, most important first.
importance_cart = pd.DataFrame({
    'feature': X.columns,
    'importance': cart.feature_importances_
}).sort_values('importance', ascending=False)

print("\nüìå Top 15 Features CART :")
print(importance_cart.head(15).to_string(index=False))

# DataFrame.plot opens its own figure unless an Axes is passed in. The previous
# plt.figure(figsize=(10, 6)) therefore rendered as a blank figure and the
# chart ignored the requested size; drawing on an explicit Axes fixes both.
fig_imp, ax_imp = plt.subplots(figsize=(10, 6))
importance_cart.head(15).plot(kind='barh', x='feature', y='importance', legend=False, ax=ax_imp)
ax_imp.set_title("Feature Importance - CART")
ax_imp.set_xlabel("Importance")
ax_imp.invert_yaxis()
fig_imp.tight_layout()
plt.show()

# Tree drawing, truncated to the top 3 levels for readability.
# feature_names is passed as a plain list: some scikit-learn releases reject a
# pandas Index for this parameter.
plt.figure(figsize=(25, 12))
plot_tree(cart, feature_names=list(X.columns), class_names=['0', '1', '2', '3'],
          filled=True, fontsize=9, max_depth=3)
plt.title("Arbre de D√©cision (3 niveaux)")
plt.tight_layout()
plt.show()

acc_cart = accuracy_score(y_test, y_pred_cart)
print(f"\nüéØ Accuracy CART : {acc_cart:.4f}")

# %% [markdown]
# ## 7. MÉTHODE 5 : Bootstrap - Estimation de Variance

# %%
print("\n" + "=" * 60)
print("M√âTHODE 5 : Bootstrap (Estimation de Variance)")
print("=" * 60)

n_iterations = 100
scores_bootstrap = []

# Same hyperparameters as the CART tree of the previous section. The earlier
# version omitted min_samples_leaf=50, so the "Bootstrap (CART)" figures
# described a different model from the CART they are compared against.
cart_boot_params = dict(
    max_depth=5,
    min_samples_split=100,
    min_samples_leaf=50,
    random_state=42,
)

print(f"üîÑ {n_iterations} it√©rations bootstrap...")

for i in range(n_iterations):
    # Resample the training split with replacement (seeded per iteration),
    # refit the tree, and score it on the fixed test split.
    X_boot, y_boot = resample(X_train, y_train, random_state=i)
    cart_boot = DecisionTreeClassifier(**cart_boot_params)
    cart_boot.fit(X_boot, y_boot)
    y_pred = cart_boot.predict(X_test)
    scores_bootstrap.append(accuracy_score(y_test, y_pred))

    if (i+1) % 20 == 0:
        print(f"  ‚úì {i+1}/{n_iterations}")

# Summary statistics; the 95% interval is the 2.5th-97.5th percentile range.
mean_acc = np.mean(scores_bootstrap)
std_acc = np.std(scores_bootstrap)
ci_low = np.percentile(scores_bootstrap, 2.5)
ci_high = np.percentile(scores_bootstrap, 97.5)

print(f"\nüìä R√©sultats Bootstrap :")
print(f"  ‚Ä¢ Accuracy : {mean_acc:.4f} ¬± {std_acc:.4f}")
print(f"  ‚Ä¢ IC 95%   : [{ci_low:.4f}, {ci_high:.4f}]")
print(f"  ‚Ä¢ Min/Max  : {min(scores_bootstrap):.4f} / {max(scores_bootstrap):.4f}")

# Histogram of the bootstrap accuracies with the mean and interval bounds.
plt.figure(figsize=(10, 5))
plt.hist(scores_bootstrap, bins=30, edgecolor='black', alpha=0.7, color='skyblue')
plt.axvline(mean_acc, color='red', linestyle='--', linewidth=2, label=f'Moyenne: {mean_acc:.4f}')
plt.axvline(ci_low, color='orange', linestyle=':', linewidth=2, label='IC 95%')
plt.axvline(ci_high, color='orange', linestyle=':', linewidth=2)
plt.title(f"Distribution Bootstrap ({n_iterations} it√©rations)")
plt.xlabel("Accuracy")
plt.ylabel("Fr√©quence")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

print("‚úÖ Bootstrap estime la stabilit√© du mod√®le")

# %% [markdown]
# ## 8. MÉTHODE 6 : Bagging

# %%
print("\n" + "=" * 60)
print("M√âTHODE 6 : Bagging")
print("=" * 60)

# 50 depth-10 trees, each trained on 80% of the rows and 80% of the features.
bagging_base_tree = DecisionTreeClassifier(max_depth=10)
bagging = BaggingClassifier(
    estimator=bagging_base_tree,
    n_estimators=50,
    max_samples=0.8,
    max_features=0.8,
    random_state=42,
    n_jobs=-1,
)

print("üîÑ Entra√Ænement Bagging (50 arbres)...")
y_pred_bagging = bagging.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
bagging_report = classification_report(
    y_test,
    y_pred_bagging,
    target_names=['Bas', 'Moyen-Bas', 'Moyen-Haut', 'Haut'],
    zero_division=0,
)
print("\nüìä Performance Bagging :")
print(bagging_report)

# Confusion matrix heatmap (rows: actual class, columns: predicted class).
cm_bagging = confusion_matrix(y_test, y_pred_bagging)
bagging_tick_labels = ['Bas', 'MB', 'MH', 'Haut']
fig_cm_bagging, ax_cm_bagging = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_bagging,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=bagging_tick_labels,
    yticklabels=bagging_tick_labels,
    ax=ax_cm_bagging,
)
ax_cm_bagging.set_title("Matrice de Confusion - Bagging")
ax_cm_bagging.set_ylabel("R√©el")
ax_cm_bagging.set_xlabel("Pr√©dit")
fig_cm_bagging.tight_layout()
plt.show()

# Test accuracy and its difference from the single CART tree, in points.
acc_bagging = accuracy_score(y_test, y_pred_bagging)
gain_bagging_points = (acc_bagging - acc_cart)*100
print(f"\nüéØ Accuracy Bagging : {acc_bagging:.4f}")
print(f"üìà Gain vs CART : {gain_bagging_points:+.2f} points")

# %% [markdown]
# ## 9. MÉTHODE 7 : Random Forest

# %%
print("\n" + "=" * 60)
print("M√âTHODE 7 : Random Forest")
print("=" * 60)

# 100 trees, sqrt(n_features) candidate features per split, with depth and
# minimum split / leaf sizes bounded.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    min_samples_split=20,
    min_samples_leaf=10,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)

print("üîÑ Entra√Ænement Random Forest (100 arbres)...")
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print("\nüìä Performance Random Forest :")
print(classification_report(y_test, y_pred_rf,
                            target_names=['Bas', 'Moyen-Bas', 'Moyen-Haut', 'Haut'],
                            zero_division=0))

# Confusion matrix (rows: actual class, columns: predicted class).
cm_rf = confusion_matrix(y_test, y_pred_rf)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Oranges',
            xticklabels=['Bas', 'MB', 'MH', 'Haut'],
            yticklabels=['Bas', 'MB', 'MH', 'Haut'])
plt.title("Matrice de Confusion - Random Forest")
plt.ylabel("R√©el")
plt.xlabel("Pr√©dit")
plt.tight_layout()
plt.show()

# Impurity-based feature importances, most important first.
importance_rf = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print("\nüìå Top 20 Features Random Forest :")
print(importance_rf.head(20).to_string(index=False))

# DataFrame.plot opens its own figure unless an Axes is passed in. The previous
# plt.figure(figsize=(10, 8)) therefore rendered as a blank figure and the
# chart ignored the requested size; drawing on an explicit Axes fixes both.
fig_imp_rf, ax_imp_rf = plt.subplots(figsize=(10, 8))
importance_rf.head(20).plot(kind='barh', x='feature', y='importance', legend=False,
                            color='coral', ax=ax_imp_rf)
ax_imp_rf.set_title("Feature Importance - Random Forest")
ax_imp_rf.set_xlabel("Importance")
ax_imp_rf.invert_yaxis()
fig_imp_rf.tight_layout()
plt.show()

# Test accuracy and its difference from the single CART tree, in points.
acc_rf = accuracy_score(y_test, y_pred_rf)
print(f"\nüéØ Accuracy Random Forest : {acc_rf:.4f}")
print(f"üìà Gain vs CART : {(acc_rf - acc_cart)*100:+.2f} points")

# %% [markdown]
# ## 10. Comparaison Finale

# %%
print("\n" + "=" * 60)
print("COMPARAISON FINALE")
print("=" * 60)

# One row per model: test accuracy, bootstrap spread where available, and the
# accuracy difference from the single CART tree (multiplied by 100).
model_names = ['CART', 'Bootstrap (CART)', 'Bagging', 'Random Forest']
model_accuracies = [acc_cart, mean_acc, acc_bagging, acc_rf]
gains_vs_cart = [0, *((acc - acc_cart)*100 for acc in model_accuracies[1:])]
results = pd.DataFrame({
    'Mod√®le': model_names,
    'Accuracy': model_accuracies,
    'Std': ['-', f'¬±{std_acc:.4f}', '-', '-'],
    'Gain vs CART (%)': gains_vs_cart,
})

print("\n" + results.to_string(index=False))

# Left panel: accuracy per model; right panel: gain relative to CART.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax_acc, ax_gain = axes

colors_models = ['#3498db', '#e74c3c', '#f39c12', '#2ecc71']
bars = ax_acc.bar(
    results['Mod√®le'],
    results['Accuracy'],
    color=colors_models,
    alpha=0.7,
    edgecolor='black',
)
# Reference line at 0.25, labelled as the random baseline.
ax_acc.axhline(y=0.25, color='gray', linestyle='--', alpha=0.5, label='Al√©atoire')
ax_acc.set_title("Comparaison des Performances", fontsize=14, fontweight='bold')
ax_acc.set_ylabel("Accuracy")
ax_acc.set_ylim(0, 1)
ax_acc.legend()
ax_acc.grid(axis='y', alpha=0.3)
plt.setp(ax_acc.xaxis.get_majorticklabels(), rotation=15, ha='right')

# Write each accuracy just above its bar.
for model_bar, model_acc in zip(bars, results['Accuracy']):
    label_x = model_bar.get_x() + model_bar.get_width()/2
    label_y = model_bar.get_height() + 0.02
    ax_acc.text(label_x, label_y, f'{model_acc:.4f}', ha='center', fontweight='bold')

ax_gain.plot(
    results['Mod√®le'],
    results['Gain vs CART (%)'],
    marker='o',
    linewidth=2,
    markersize=10,
    color='green',
)
ax_gain.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
ax_gain.set_title("Gains vs CART", fontsize=14, fontweight='bold')
ax_gain.set_ylabel("Gain (%)")
ax_gain.grid(alpha=0.3)
plt.setp(ax_gain.xaxis.get_majorticklabels(), rotation=15, ha='right')

fig.tight_layout()
plt.show()

# Name of the row with the highest accuracy.
best_row = results['Accuracy'].idxmax()
best_model_name = results.loc[best_row, 'Mod√®le']
print(f"\nüèÜ Meilleur mod√®le : {best_model_name}")
print("‚úÖ Baseline Paris √©tablie !")

# %% [markdown]
# ## 11. Sauvegarde

# %%
# Persist the comparison table (CSV) and the fitted Random Forest (joblib pickle).
# joblib is imported with the other dependencies in the import cell at the top
# of the notebook instead of here, so all dependencies are declared in one place.
results.to_csv('results_baseline_paris.csv', index=False)
joblib.dump(rf, 'model_rf_paris.pkl')
print("üíæ R√©sultats et mod√®le sauvegard√©s")