# 🏠 House Price Prediction - Notebook 4: Évaluation Finale
## Analyse approfondie et justification du modèle final

### 0. Imports & Configuration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
import pickle

from sklearn.model_selection import learning_curve, validation_curve
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.inspection import permutation_importance

# Silence every warning category for the rest of the session. This keeps cell
# output clean but also hides deprecation and convergence warnings.
warnings.filterwarnings('ignore')
# Show up to 90 columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', 90)
# Render floats with four decimal places in pandas output.
pd.set_option('display.float_format', lambda x: f'{x:.4f}')
# Seaborn "whitegrid" theme for the figures drawn below.
sns.set_style("whitegrid")
# IPython magic (not plain Python): draw matplotlib figures inline.
%matplotlib inline

# Seed shared by the target encoder, the train/test split and the learning curve.
RANDOM_STATE = 42

### 1. Chargement des résultats du Notebook 3

In [None]:
# Read the model-comparison table and rank it by test RMSE, best (lowest) first.
# Later cells rely on row 0 of this frame being the best model.
results_df = pd.read_csv('../models/model_comparison.csv').sort_values('test_rmse')

# Columns shown in the overview table.
summary_columns = [
    'model', 'test_rmse', 'test_r2', 'cv_rmse_mean', 'cv_rmse_std', 'overfitting'
]

print("\nüìä R√âSULTATS DES MOD√àLES TEST√âS")
print("=" * 80)
display(results_df.loc[:, summary_columns])

In [None]:
# Deserialize the persisted best estimator.
# NOTE(review): unpickling runs arbitrary code — only load files you trust.
with open('../models/best_model.pkl', 'rb') as model_file:
    best_model = pickle.load(model_file)

# The display name comes from the top row of the ranked comparison table,
# not from the pickle itself.
best_model_name = results_df['model'].iloc[0]

print(
    f"\n‚úÖ Meilleur mod√®le charg√©: {best_model_name}",
    f"   Type: {type(best_model).__name__}",
    sep="\n",
)

### 2. Rechargement et préparation des données
**Note**: Nous reproduisons le preprocessing pour avoir accès aux données

In [None]:
# Load the raw training data and rebuild the feature matrix
# (condensed copy of the notebook 2 preprocessing).
# NOTE(review): the medians, the one-hot encoder and the target encoder below
# are all fitted on every row BEFORE the train/test split, so test rows
# influence the features — confirm this mirrors how the saved model was trained.
from sklearn.preprocessing import OneHotEncoder, TargetEncoder, StandardScaler
from sklearn.model_selection import train_test_split

df = pd.read_csv("../src/data/train.csv")

# Imputation, step 1: in these categorical columns a missing value is replaced
# by the literal category "None".
na_as_none = [
    "Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "PoolQC", "Fence", "MiscFeature", "MasVnrType", "Electrical"
]
for col in na_as_none:
    if col in df.columns:
        df[col] = df[col].fillna("None")

# Imputation, step 2: float64/int64 columns get their own median.
for col in df.select_dtypes(include=["float64", "int64"]).columns:
    df[col] = df[col].fillna(df[col].median())

# Target transformation: the model is evaluated on log1p(SalePrice);
# later cells go back to dollars with expm1.
df['LogSalePrice'] = np.log1p(df['SalePrice'])

# One-hot encoding of the nominal columns.
cols_to_ohe = [
    "MSSubClass", "MSZoning", "Street", "Alley", "LotShape", "LandContour",
    "Utilities", "LotConfig", "LandSlope", "Condition1", "Condition2",
    "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "MasVnrType",
    "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "Heating", "CentralAir",
    "Electrical", "Functional", "GarageType", "PavedDrive", "SaleType",
    "SaleCondition", "Foundation", "Fence", "MiscFeature"
]

ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_ohe = ohe.fit_transform(df[cols_to_ohe])
# Re-attach the dense dummy matrix with readable column names and the original row index.
encoded_ohe_df = pd.DataFrame(encoded_ohe, columns=ohe.get_feature_names_out(cols_to_ohe), index=df.index)
df = pd.concat([df.drop(columns=cols_to_ohe), encoded_ohe_df], axis=1)

# Ordinal encoding of the quality grades (0 = "None" ... 5 = "Ex").
# Any value that is not a key of quality_map becomes NaN.
quality_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0}
cols_quality = ["ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "HeatingQC",
                "KitchenQual", "FireplaceQu", "GarageQual", "GarageCond", "PoolQC"]
for col in cols_quality:
    if col in df.columns:
        df[col] = df[col].map(quality_map)

# Same idea for the garage finish level.
finished_map = {'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0}
if 'GarageFinish' in df.columns:
    df['GarageFinish'] = df['GarageFinish'].map(finished_map)

# Build X and y: drop the identifier and both target columns from the features.
cols_to_drop = ['Id', 'SalePrice', 'LogSalePrice']
cols_to_drop = [c for c in cols_to_drop if c in df.columns]
X = df.drop(columns=cols_to_drop)
y = df['LogSalePrice']

# Target encoding of three categorical columns against the log target.
# The step is skipped without any message unless all three columns are present,
# in which case they stay as raw categories in X.
cols_to_target = ["Neighborhood", "Exterior1st", "Exterior2nd"]
tg_encoder = TargetEncoder(categories='auto', target_type='continuous', smooth='auto', cv=5, random_state=RANDOM_STATE)
if all(col in X.columns for col in cols_to_target):
    X[cols_to_target] = tg_encoder.fit_transform(X[cols_to_target], y)

# 80/20 train/test split, seeded with RANDOM_STATE.
# NOTE(review): presumably the same split as notebook 3 — verify the seed and test_size there.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Only the linear models receive standardized inputs; every other model is fed
# the raw feature frames.
needs_scaling = best_model_name in ('Linear Regression', 'Ridge', 'Lasso', 'ElasticNet')

if not needs_scaling:
    X_train_model, X_test_model = X_train, X_test
else:
    # Reuse the persisted scaler rather than refitting one here.
    # NOTE(review): unpickling runs arbitrary code — only load files you trust.
    with open('../models/scaler.pkl', 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    # Wrap the scaled arrays back into DataFrames with the original labels.
    X_train_model = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
    X_test_model = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)

print(
    f"\n‚úÖ Donn√©es pr√©par√©es:",
    f"   X_train: {X_train.shape}",
    f"   X_test: {X_test.shape}",
    sep="\n",
)

### 3. Analyse Détaillée du Meilleur Modèle

#### 3.1 Métriques de Performance

In [None]:
# Predictions of the loaded model on both splits (log1p price scale).
y_train_pred = best_model.predict(X_train_model)
y_test_pred = best_model.predict(X_test_model)

# Metrics on the log scale.
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

# Metrics on the dollar scale. The back-transformation must be applied to the
# individual prices BEFORE aggregating: expm1(log-RMSE) is only exp(error) - 1,
# a relative factor, not an amount in dollars.
train_actual_usd = np.expm1(y_train)
train_pred_usd = np.expm1(y_train_pred)
test_actual_usd = np.expm1(y_test)
test_pred_usd = np.expm1(y_test_pred)

train_rmse_dollars = np.sqrt(mean_squared_error(train_actual_usd, train_pred_usd))
test_rmse_dollars = np.sqrt(mean_squared_error(test_actual_usd, test_pred_usd))
train_mae_dollars = mean_absolute_error(train_actual_usd, train_pred_usd)
test_mae_dollars = mean_absolute_error(test_actual_usd, test_pred_usd)

print("\n" + "="*80)
print(f"üìä PERFORMANCE D√âTAILL√âE - {best_model_name}")
print("="*80)

print("\nüéØ M√©triques sur ensemble Train:")
print(f"   RMSE (log):        {train_rmse:.4f}")
print(f"   RMSE ($):          ${train_rmse_dollars:,.0f}")
print(f"   MAE (log):         {train_mae:.4f}")
print(f"   MAE ($):           ${train_mae_dollars:,.0f}")
print(f"   R¬≤ Score:          {train_r2:.4f}")
print(f"   Variance expliqu√©e: {train_r2*100:.2f}%")

print("\nüéØ M√©triques sur ensemble Test:")
print(f"   RMSE (log):        {test_rmse:.4f}")
print(f"   RMSE ($):          ${test_rmse_dollars:,.0f}")
print(f"   MAE (log):         {test_mae:.4f}")
print(f"   MAE ($):           ${test_mae_dollars:,.0f}")
print(f"   R¬≤ Score:          {test_r2:.4f}")
print(f"   Variance expliqu√©e: {test_r2*100:.2f}%")

# Generalization check: difference between train and test R² (log scale).
# Below 0.02 is reported as excellent, below 0.05 as good, otherwise overfitting.
print("\n‚öñÔ∏è G√©n√©ralisation:")
gap = train_r2 - test_r2
print(f"   Gap R¬≤ (Train - Test): {gap:.4f}")
if gap < 0.02:
    print("   ‚úÖ Excellente g√©n√©ralisation")
elif gap < 0.05:
    print("   ‚úì Bonne g√©n√©ralisation")
else:
    print("   ‚ö†Ô∏è Overfitting d√©tect√©")

print("\n" + "="*80)

#### 3.2 Visualisation des Prédictions

In [None]:
# 2x2 diagnostic figure for the test split: actual vs predicted (log and
# dollar scale), residuals vs prediction, and the residual histogram.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Actual vs predicted (log scale), with the y = x reference line.
axes[0, 0].scatter(y_test, y_test_pred, alpha=0.6, s=40, edgecolors='black', linewidth=0.5)
axes[0, 0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2, label='Pr√©diction parfaite')
axes[0, 0].set_xlabel('Prix r√©el (log)', fontsize=11)
axes[0, 0].set_ylabel('Prix pr√©dit (log)', fontsize=11)
axes[0, 0].set_title(f'{best_model_name}: Actual vs Predicted (log scale)', fontsize=12, fontweight='bold')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Test R² shown in the top-left corner of the first panel.
axes[0, 0].text(0.05, 0.95, f'R¬≤ = {test_r2:.4f}', transform=axes[0, 0].transAxes,
                fontsize=10, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

# 2. Actual vs predicted (dollar scale): invert the log1p target transform.
#    These two series are reused by the error-analysis cells.
y_test_dollars = np.expm1(y_test)
y_pred_dollars = np.expm1(y_test_pred)
axes[0, 1].scatter(y_test_dollars, y_pred_dollars, alpha=0.6, s=40, edgecolors='black', linewidth=0.5)
axes[0, 1].plot([y_test_dollars.min(), y_test_dollars.max()],
                [y_test_dollars.min(), y_test_dollars.max()], 'r--', lw=2, label='Pr√©diction parfaite')
axes[0, 1].set_xlabel('Prix r√©el ($)', fontsize=11)
axes[0, 1].set_ylabel('Prix pr√©dit ($)', fontsize=11)
axes[0, 1].set_title(f'{best_model_name}: Actual vs Predicted (dollars)', fontsize=12, fontweight='bold')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)
# Plain tick labels instead of scientific notation for prices.
axes[0, 1].ticklabel_format(style='plain', axis='both')

# 3. Residuals (actual - predicted, log scale) against the prediction,
#    with the zero line and a +/- one standard deviation band.
residuals = y_test - y_test_pred
axes[1, 0].scatter(y_test_pred, residuals, alpha=0.6, s=40, edgecolors='black', linewidth=0.5)
axes[1, 0].axhline(y=0, color='r', linestyle='--', lw=2)
axes[1, 0].axhline(y=residuals.std(), color='orange', linestyle=':', lw=1.5, label='¬±1 std')
axes[1, 0].axhline(y=-residuals.std(), color='orange', linestyle=':', lw=1.5)
axes[1, 0].set_xlabel('Prix pr√©dit (log)', fontsize=11)
axes[1, 0].set_ylabel('R√©sidus', fontsize=11)
axes[1, 0].set_title('Analyse des r√©sidus', fontsize=12, fontweight='bold')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# 4. Residual distribution. The line labelled "Moyenne" is drawn at the actual
#    residual mean (it used to sit at 0, which hid any systematic bias).
axes[1, 1].hist(residuals, bins=50, edgecolor='black', alpha=0.7, color='steelblue')
axes[1, 1].axvline(x=residuals.mean(), color='r', linestyle='--', lw=2, label='Moyenne')
axes[1, 1].axvline(x=residuals.median(), color='orange', linestyle='--', lw=2, label='M√©diane')
axes[1, 1].set_xlabel('R√©sidus', fontsize=11)
axes[1, 1].set_ylabel('Fr√©quence', fontsize=11)
axes[1, 1].set_title('Distribution des r√©sidus', fontsize=12, fontweight='bold')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3, axis='y')

# Summary statistics of the residuals, boxed inside the histogram panel.
stats_text = f"Moyenne: {residuals.mean():.4f}\nM√©diane: {residuals.median():.4f}\n√âcart-type: {residuals.std():.4f}"
axes[1, 1].text(0.65, 0.95, stats_text, transform=axes[1, 1].transAxes,
                fontsize=9, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plt.show()

#### 3.3 Analyse des Erreurs

In [None]:
# Per-house error table for the test split, on both the log and dollar scales.
# pct_error is the absolute dollar error relative to the actual price, in percent.
error_analysis = pd.DataFrame({
    'actual': y_test,
    'predicted': y_test_pred,
    'residual': residuals,
    'abs_residual': np.abs(residuals),
    'actual_dollars': y_test_dollars,
    'predicted_dollars': y_pred_dollars,
    'error_dollars': y_test_dollars - y_pred_dollars,
    'abs_error_dollars': np.abs(y_test_dollars - y_pred_dollars),
    'pct_error': np.abs((y_test_dollars - y_pred_dollars) / y_test_dollars) * 100
})

# Worst predictions first.
error_analysis = error_analysis.sort_values('abs_error_dollars', ascending=False)

print("\n" + "="*80)
print("üîç ANALYSE DES ERREURS")
print("="*80)

print("\nüìà Statistiques des erreurs:")
print(f"   Erreur absolue moyenne: ${error_analysis['abs_error_dollars'].mean():,.0f}")
print(f"   Erreur absolue m√©diane: ${error_analysis['abs_error_dollars'].median():,.0f}")
print(f"   Erreur absolue max: ${error_analysis['abs_error_dollars'].max():,.0f}")
print(f"   Erreur % moyenne: {error_analysis['pct_error'].mean():.2f}%")
print(f"   Erreur % m√©diane: {error_analysis['pct_error'].median():.2f}%")

print("\nüéØ Distribution des erreurs:")
# The last bin is open-ended so errors above 100% still land in '>20%', and
# include_lowest keeps an exact 0% error in '<5%'. With a closed [.., 100]
# upper edge both cases became NaN and vanished from the counts.
bins = [0, 5, 10, 15, 20, np.inf]
labels = ['<5%', '5-10%', '10-15%', '15-20%', '>20%']
error_analysis['error_category'] = pd.cut(
    error_analysis['pct_error'], bins=bins, labels=labels, include_lowest=True
)
print(error_analysis['error_category'].value_counts().sort_index())

print("\n‚ö†Ô∏è Top 10 pires pr√©dictions:")
display(error_analysis.head(10)[[
    'actual_dollars', 'predicted_dollars', 'error_dollars', 'pct_error'
]].round(0))

In [None]:
# Side-by-side histograms of the absolute dollar error and the percentage
# error, each annotated with its mean and median.
abs_errors = error_analysis['abs_error_dollars']
pct_errors = error_analysis['pct_error']

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
ax_abs, ax_pct = axes

# Left panel: absolute error in dollars.
ax_abs.hist(abs_errors, bins=50, edgecolor='black', alpha=0.7, color='coral')
ax_abs.axvline(x=abs_errors.mean(), color='r', linestyle='--', lw=2,
               label=f'Moyenne: ${abs_errors.mean():,.0f}')
ax_abs.axvline(x=abs_errors.median(), color='orange', linestyle='--', lw=2,
               label=f'M√©diane: ${abs_errors.median():,.0f}')
ax_abs.set_xlabel('Erreur absolue ($)', fontsize=11)
ax_abs.set_ylabel('Fr√©quence', fontsize=11)
ax_abs.set_title('Distribution des erreurs absolutes', fontsize=12, fontweight='bold')
ax_abs.legend()
ax_abs.grid(True, alpha=0.3, axis='y')

# Right panel: percentage error.
ax_pct.hist(pct_errors, bins=50, edgecolor='black', alpha=0.7, color='steelblue')
ax_pct.axvline(x=pct_errors.mean(), color='r', linestyle='--', lw=2,
               label=f'Moyenne: {pct_errors.mean():.2f}%')
ax_pct.axvline(x=pct_errors.median(), color='orange', linestyle='--', lw=2,
               label=f'M√©diane: {pct_errors.median():.2f}%')
ax_pct.set_xlabel('Erreur (%)', fontsize=11)
ax_pct.set_ylabel('Fr√©quence', fontsize=11)
ax_pct.set_title('Distribution des erreurs en %', fontsize=12, fontweight='bold')
ax_pct.legend()
ax_pct.grid(True, alpha=0.3, axis='y')
# Cap the visible range at 50% so a few extreme errors do not flatten the histogram.
ax_pct.set_xlim([0, min(50, pct_errors.max())])

plt.tight_layout()
plt.show()

### 4. Comparaison Performance vs Complexité

In [None]:
# Hand-assigned complexity level per model name (1 = simplest, 4 = most complex),
# declared level by level and flattened into a name -> level mapping.
models_by_complexity = {
    1: ['Linear Regression', 'Ridge', 'Lasso'],
    2: ['ElasticNet'],
    3: ['Random Forest'],
    4: ['Gradient Boosting', 'XGBoost'],
}
complexity_scores = {
    model_name: level
    for level, model_names in models_by_complexity.items()
    for model_name in model_names
}

# Attach the scores to the comparison table. Models missing from the mapping get NaN.
results_df['complexity'] = results_df['model'].map(complexity_scores)
# Inverted score: the higher the complexity, the lower the interpretability.
results_df['interpretability'] = results_df['complexity'].rsub(6)

banner = "=" * 80
print("\n" + banner)
print("‚öñÔ∏è ANALYSE PERFORMANCE vs COMPLEXIT√â")
print(banner)

tradeoff_columns = ['model', 'test_rmse', 'test_r2', 'complexity', 'interpretability']
display(results_df[tradeoff_columns].sort_values('test_rmse'))

In [None]:
# Performance vs complexity: one bubble per model, x = complexity level,
# y = test RMSE, bubble size and colour driven by the test R².
fig, ax = plt.subplots(figsize=(12, 8))

scatter = ax.scatter(
    results_df['complexity'],
    results_df['test_rmse'],
    s=results_df['test_r2'] * 1000,  # marker area proportional to R²
    c=results_df['test_r2'],
    cmap='RdYlGn',
    alpha=0.6,
    edgecolors='black',
    linewidth=2
)

# Label every bubble with its model name.
for idx, row in results_df.iterrows():
    ax.annotate(
        row['model'],
        (row['complexity'], row['test_rmse']),
        xytext=(10, 5),
        textcoords='offset points',
        fontsize=10,
        bbox=dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.3)
    )

# Highlight the best model (row 0 of the RMSE-sorted table) with a red ring.
best_row = results_df.iloc[0]
ax.scatter(
    best_row['complexity'],
    best_row['test_rmse'],
    s=1500,
    facecolors='none',
    edgecolors='red',
    linewidth=3,
    label=f'Meilleur: {best_model_name}'
)

# Band covering test RMSE values within 2% of the best one.
ax.axhspan(results_df['test_rmse'].min(), results_df['test_rmse'].min() * 1.02,
           alpha=0.1, color='green', label='Zone optimale')

# Axes configuration.
ax.set_xlabel('Complexit√© du mod√®le (1=simple, 4=complexe)', fontsize=12)
ax.set_ylabel('RMSE Test (plus bas = meilleur)', fontsize=12)
ax.set_title('Performance vs Complexit√© des Mod√®les', fontsize=14, fontweight='bold')
ax.set_xticks([1, 2, 3, 4])
ax.set_xticklabels(['Simple', 'Mod√©r√©', 'Complexe', 'Tr√®s Complexe'])
ax.grid(True, alpha=0.3)
# The legend is built only after every labelled artist exists. It used to be
# created before the axhspan, so 'Zone optimale' never showed up in it.
ax.legend(loc='best', fontsize=11)

# Colour bar for the R² encoding.
cbar = plt.colorbar(scatter, ax=ax)
cbar.set_label('R¬≤ Score', fontsize=11)

plt.tight_layout()
plt.show()

### 5. Robustesse du Modèle - Learning Curves

In [None]:
# Learning curve of the best model on the training split: 5-fold CV RMSE at
# ten training-set sizes from 10% to 100%.
print("\nüîÑ Calcul des learning curves (peut prendre quelques minutes)...")

size_fractions = np.linspace(0.1, 1.0, 10)
train_sizes, train_scores, val_scores = learning_curve(
    best_model,
    X_train_model,
    y_train,
    train_sizes=size_fractions,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

# The scorer returns negated RMSE; flip the sign to get positive errors.
train_scores, val_scores = -train_scores, -val_scores

# Aggregate over the CV folds (axis 1).
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
val_mean, val_std = val_scores.mean(axis=1), val_scores.std(axis=1)

# One line plus a +/- 1 std band per curve.
plt.figure(figsize=(12, 6))
for curve_mean, curve_std, curve_color, curve_label in (
    (train_mean, train_std, 'royalblue', 'Score Train'),
    (val_mean, val_std, 'crimson', 'Score Validation'),
):
    plt.plot(train_sizes, curve_mean, 'o-', color=curve_color, label=curve_label,
             linewidth=2, markersize=8)
    plt.fill_between(train_sizes, curve_mean - curve_std, curve_mean + curve_std,
                     alpha=0.2, color=curve_color)

plt.xlabel("Taille de l'ensemble d'entra√Ænement", fontsize=12)
plt.ylabel('RMSE', fontsize=12)
plt.title(f'Learning Curve - {best_model_name}', fontsize=14, fontweight='bold')
plt.legend(loc='best', fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\n‚úÖ Learning curves g√©n√©r√©es")
print(f"\nüìä Interpr√©tation:")

# Rule-of-thumb readings based on the largest training size.
final_train_rmse = train_mean[-1]
final_val_rmse = val_mean[-1]
# Validation error dropped by more than 5% between the smallest and largest size.
if final_val_rmse < val_mean[0] * 0.95:
    print("   ‚úÖ Le mod√®le b√©n√©ficie de plus de donn√©es")
# Train and validation errors are within 10% of each other.
if abs(final_train_rmse - final_val_rmse) / final_val_rmse < 0.1:
    print("   ‚úÖ Pas d'overfitting significatif")
# Fold-to-fold spread is under 5% of the validation error.
if val_std[-1] < final_val_rmse * 0.05:
    print("   ‚úÖ Mod√®le stable (faible variance)")

### 6. Feature Importance (si applicable)

In [None]:
# Estimators exposing feature_importances_ get an importance analysis; the
# listed linear models get a coefficient analysis; any other model prints nothing.
if hasattr(best_model, 'feature_importances_'):
    banner = "=" * 80
    print("\n" + banner)
    print("üîç ANALYSE DES FEATURES IMPORTANTES")
    print(banner)

    # Pair each training column with the importance reported by the model,
    # most important first.
    importances = best_model.feature_importances_
    feature_names = X_train.columns
    feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
    feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)

    top_20 = feature_importance_df.head(20)
    bar_positions = range(len(top_20))

    fig, axes = plt.subplots(1, 2, figsize=(16, 8))
    ax_bars, ax_cumulative = axes

    # Left panel: horizontal bars for the 20 largest importances.
    ax_bars.barh(bar_positions, top_20['importance'], color='steelblue', alpha=0.7)
    ax_bars.set_yticks(bar_positions)
    ax_bars.set_yticklabels(top_20['feature'])
    ax_bars.set_xlabel('Importance', fontsize=11)
    ax_bars.set_title('Top 20 Features les Plus Importantes', fontsize=12, fontweight='bold')
    ax_bars.invert_yaxis()
    ax_bars.grid(True, alpha=0.3, axis='x')

    # Right panel: cumulative importance over the ranked features,
    # with reference lines at 0.8 and 0.9.
    cumsum = feature_importance_df['importance'].cumsum()
    ax_cumulative.plot(range(1, len(cumsum) + 1), cumsum, linewidth=2, color='darkgreen')
    for level, line_color, line_label in (
        (0.8, 'r', "80% de l'importance"),
        (0.9, 'orange', "90% de l'importance"),
    ):
        ax_cumulative.axhline(y=level, color=line_color, linestyle='--', label=line_label)
    ax_cumulative.set_xlabel('Nombre de features', fontsize=11)
    ax_cumulative.set_ylabel('Importance cumulative', fontsize=11)
    ax_cumulative.set_title('Importance Cumulative des Features', fontsize=12, fontweight='bold')
    ax_cumulative.legend()
    ax_cumulative.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Position of the first ranked feature at which the cumulative importance
    # reaches each threshold (argmax of a boolean series = first True).
    n_80, n_90 = ((cumsum >= level).argmax() + 1 for level in (0.8, 0.9))

    print(f"\nüìà Analyse de concentration:")
    print(f"   {n_80} features expliquent 80% de l'importance")
    print(f"   {n_90} features expliquent 90% de l'importance")
    print(f"   Total de features: {len(feature_names)}")

    print(f"\nüèÜ Top 10 features:")
    display(top_20.head(10))

elif best_model_name in ['Linear Regression', 'Ridge', 'Lasso', 'ElasticNet']:
    banner = "=" * 80
    print("\n" + banner)
    print("üîç ANALYSE DES COEFFICIENTS")
    print(banner)

    # Rank the coefficients by magnitude while keeping their sign for the plot.
    coef_df = pd.DataFrame({'feature': X_train.columns, 'coefficient': best_model.coef_})
    coef_df['abs_coef'] = coef_df['coefficient'].abs()
    coef_df = coef_df.sort_values('abs_coef', ascending=False)

    top_20_coef = coef_df.head(20)
    bar_positions = range(len(top_20_coef))

    # Green bars for positive coefficients, red for the rest.
    colors = np.where(top_20_coef['coefficient'] > 0, 'green', 'red').tolist()

    plt.figure(figsize=(12, 8))
    plt.barh(bar_positions, top_20_coef['coefficient'], color=colors, alpha=0.7)
    plt.yticks(bar_positions, top_20_coef['feature'])
    plt.xlabel('Coefficient', fontsize=11)
    plt.title(f'Top 20 Coefficients - {best_model_name}', fontsize=12, fontweight='bold')
    plt.axvline(x=0, color='black', linestyle='--', linewidth=1)
    plt.gca().invert_yaxis()
    plt.grid(True, alpha=0.3, axis='x')
    plt.tight_layout()
    plt.show()

    print(f"\nüèÜ Top 10 coefficients (valeur absolue):")
    display(top_20_coef.head(10))

    # For Lasso only: report how many coefficients are non-zero.
    if best_model_name == 'Lasso':
        n_nonzero = np.count_nonzero(best_model.coef_)
        n_total = len(best_model.coef_)
        print(f"\n‚úÇÔ∏è Feature Selection par Lasso:")
        print(f"   Features s√©lectionn√©es: {n_nonzero}/{n_total}")
        print(f"   Features √©limin√©es: {n_total - n_nonzero}")

### 7. Justification du Choix Final

In [None]:
# Text report justifying the final model choice, built from the top row of the
# RMSE-sorted comparison table and from the dollar error table.
print("\n" + "="*100)
print("üéØ JUSTIFICATION DU MOD√àLE FINAL")
print("="*100)

best_row = results_df.iloc[0]

print(f"\n‚úÖ Mod√®le s√©lectionn√©: {best_model_name}")
print("\n" + "‚îÄ"*100)

print("\n1Ô∏è‚É£ CRIT√àRES DE PERFORMANCE:")
print(f"   ‚Ä¢ RMSE Test: {best_row['test_rmse']:.4f} (meilleur parmi tous les mod√®les)")
print(f"   ‚Ä¢ R¬≤ Test: {best_row['test_r2']:.4f} ({best_row['test_r2']*100:.2f}% de variance expliqu√©e)")
print(f"   ‚Ä¢ Cross-Validation RMSE: {best_row['cv_rmse_mean']:.4f} ¬± {best_row['cv_rmse_std']:.4f}")
print(f"   ‚Ä¢ Erreur moyenne en $: ${error_analysis['abs_error_dollars'].mean():,.0f}")

# True by construction as long as results_df is sorted by test_rmse.
if best_row['test_rmse'] == results_df['test_rmse'].min():
    print("   ‚úÖ Meilleure performance en termes de RMSE")

print("\n2Ô∏è‚É£ G√âN√âRALISATION:")
# NOTE(review): assumes the comparison CSV carries a 'train_r2' column — confirm.
gap = best_row['train_r2'] - best_row['test_r2']
print(f"   ‚Ä¢ Gap Train-Test R¬≤: {gap:.4f}")
if gap < 0.02:
    print("   ‚úÖ Excellente g√©n√©ralisation - pas d'overfitting")
elif gap < 0.05:
    print("   ‚úì Bonne g√©n√©ralisation")
else:
    print("   ‚ö†Ô∏è Overfitting d√©tect√© - √† surveiller")

print(f"   ‚Ä¢ Stabilit√© CV (std): {best_row['cv_rmse_std']:.4f}")
if best_row['cv_rmse_std'] < 0.05:
    print("   ‚úÖ Tr√®s stable (faible variance entre folds)")

print("\n3Ô∏è‚É£ COMPLEXIT√â ET INTERPR√âTABILIT√â:")
# The 'complexity' column comes from Series.map: a model absent from the
# mapping yields NaN, and a single NaN turns the whole column into floats.
# Normalise to int (or None) so the label lookup cannot raise KeyError and
# the score prints as "4/4" rather than "4.0/4".
raw_complexity = best_row['complexity']
complexity = None if pd.isna(raw_complexity) else int(raw_complexity)
complexity_labels = {1: "Simple", 2: "Mod√©r√©e", 3: "Complexe", 4: "Tr√®s complexe"}

if complexity is None:
    print("   ‚Ä¢ Complexit√©: non d√©finie pour ce mod√®le")
else:
    print(f"   ‚Ä¢ Complexit√©: {complexity_labels[complexity]} ({complexity}/4)")
    if complexity <= 2:
        print("   ‚úÖ Mod√®le simple et interpr√©table")
        print("   ‚Ä¢ Facile √† expliquer aux stakeholders")
        print("   ‚Ä¢ Maintenance ais√©e")
    elif complexity == 3:
        print("   ‚úì Compromis acceptable entre performance et complexit√©")
        print("   ‚Ä¢ Interpr√©tabilit√© via feature importance")
    else:
        print("   ‚ö†Ô∏è Mod√®le complexe mais justifi√© par la performance")
        print("   ‚Ä¢ Gain de performance significatif")
        print("   ‚Ä¢ Feature importance disponible")

print("\n4Ô∏è‚É£ COMPARAISON AVEC ALTERNATIVES:")
# A runner-up only exists when the table holds at least two models.
if len(results_df) > 1:
    second_best = results_df.iloc[1]
    # Relative RMSE reduction of the best model over the runner-up, in percent.
    improvement = ((second_best['test_rmse'] - best_row['test_rmse']) / second_best['test_rmse']) * 100
    print(f"   ‚Ä¢ 2√®me meilleur mod√®le: {second_best['model']}")
    print(f"   ‚Ä¢ Am√©lioration RMSE: {improvement:.2f}%")

    if improvement > 1:
        print("   ‚úÖ Am√©lioration significative justifiant le choix")
else:
    print("   ‚Ä¢ Aucun autre mod√®le √† comparer")

# NOTE(review): the statements below are fixed text; they are not derived from
# the learning-curve or generalization results computed in this notebook.
print("\n5Ô∏è‚É£ ROBUSTESSE:")
print("   ‚Ä¢ Learning curves montrent une bonne convergence")
print("   ‚Ä¢ Performances stables sur diff√©rents folds")
print("   ‚úÖ Mod√®le fiable pour la production")

print("\n" + "‚îÄ"*100)
print("\nüí° CONCLUSION:")
print(f"\nLe mod√®le {best_model_name} est retenu car il offre:")
print("‚Ä¢ Le meilleur compromis performance/complexit√©")
print("‚Ä¢ Une excellente capacit√© de g√©n√©ralisation")
print("‚Ä¢ Une robustesse d√©montr√©e par cross-validation")
print(f"‚Ä¢ Une erreur moyenne de ${error_analysis['abs_error_dollars'].mean():,.0f} sur les pr√©dictions")

if complexity is not None:
    if complexity <= 2:
        print("‚Ä¢ Une simplicit√© permettant une maintenance ais√©e")
    else:
        print("‚Ä¢ Une performance sup√©rieure justifiant la complexit√© additionnelle")

print("\n" + "="*100)

### 8. Pistes d'Amélioration

In [None]:
# Static list of follow-up ideas, stored as (heading, bullet lines) pairs and
# printed in order between two separator rules.
separator = "=" * 80

improvement_sections = [
    ("\n1Ô∏è‚É£ Feature Engineering Avanc√©:", [
        "   ‚Ä¢ Cr√©er des interactions entre features importantes",
        "   ‚Ä¢ Transformer les variables √† forte skewness",
        "   ‚Ä¢ Cr√©er des ratios (ex: Prix/m¬≤, Chambres/Surface)",
        "   ‚Ä¢ Utiliser des transformations polynomiales",
    ]),
    ("\n2Ô∏è‚É£ Optimisation des Hyperparam√®tres:", [
        "   ‚Ä¢ Grid Search ou Random Search approfondi",
        "   ‚Ä¢ Bayesian Optimization pour les mod√®les complexes",
        "   ‚Ä¢ Nested Cross-Validation pour validation robuste",
    ]),
    ("\n3Ô∏è‚É£ Ensemble Methods:", [
        "   ‚Ä¢ Stacking de plusieurs mod√®les",
        "   ‚Ä¢ Blending des pr√©dictions",
        "   ‚Ä¢ Voting Regressor",
    ]),
    ("\n4Ô∏è‚É£ Gestion des Outliers:", [
        "   ‚Ä¢ Analyser et traiter les valeurs extr√™mes",
        "   ‚Ä¢ Utiliser des mod√®les robustes aux outliers",
        "   ‚Ä¢ Winsorization des variables",
    ]),
    ("\n5Ô∏è‚É£ Feature Selection:", [
        "   ‚Ä¢ Recursive Feature Elimination (RFE)",
        "   ‚Ä¢ Analyse de corr√©lation avanc√©e",
        "   ‚Ä¢ Boruta algorithm",
    ]),
    ("\n6Ô∏è‚É£ Validation:", [
        "   ‚Ä¢ Time-series split si donn√©es temporelles",
        "   ‚Ä¢ Stratified sampling par quartiers",
        "   ‚Ä¢ Hold-out test set s√©par√©",
    ]),
    ("\n7Ô∏è‚É£ Autres Mod√®les √† Tester:", [
        "   ‚Ä¢ LightGBM (alternative rapide √† XGBoost)",
        "   ‚Ä¢ CatBoost (bon pour donn√©es cat√©gorielles)",
        "   ‚Ä¢ Neural Networks (si dataset plus large)",
        "   ‚Ä¢ Support Vector Regression",
    ]),
]

print("\n" + separator)
print("üöÄ PISTES D'AM√âLIORATION FUTURES")
print(separator)

for heading, bullet_lines in improvement_sections:
    print(heading)
    for bullet_line in bullet_lines:
        print(bullet_line)

print("\n" + separator)

### 9. Résumé Exécutif

In [None]:
# Executive summary of the whole project, printed as plain text. Dynamic values
# come from the data frames and the best row of the comparison table.
print("\n" + "="*100)
print("üìã R√âSUM√â EX√âCUTIF DU PROJET")
print("="*100)

print("\nüéØ OBJECTIF:")
print("   D√©velopper un mod√®le de pr√©diction du prix de vente des maisons bas√© sur")
print("   leurs caract√©ristiques (superficie, qualit√©, localisation, etc.)")

print("\nüìä DONN√âES:")
print(f"   ‚Ä¢ Dataset: {len(df)} maisons")
# NOTE(review): 81 is hard-coded, not read from the raw file — confirm it still holds.
print(f"   ‚Ä¢ Features initiales: 81 variables")
print(f"   ‚Ä¢ Features apr√®s preprocessing: {X.shape[1]} variables")
print(f"   ‚Ä¢ Split: 80% train ({len(X_train)}) / 20% test ({len(X_test)})")

print("\nüîß PREPROCESSING:")
print("   1. Imputation des valeurs manquantes")
print("   2. Log-transformation de la variable cible (r√©duction skewness)")
print("   3. One-Hot Encoding pour variables nominales")
print("   4. Label Encoding pour variables ordinales (qualit√©)")
print("   5. Target Encoding pour variables √† haute cardinalit√©")
print("   6. Standardisation pour mod√®les lin√©aires")

print("\nü§ñ MOD√àLES TEST√âS:")
for idx, row in results_df.iterrows():
    print(f"   ‚Ä¢ {row['model']}: RMSE = {row['test_rmse']:.4f}, R¬≤ = {row['test_r2']:.4f}")

# Dollar-scale errors on the test split. The RMSE is the root of the mean
# squared dollar error; the line labelled "RMSE en $" used to print the mean
# absolute error instead. The MAE now gets its own, correctly labelled line.
rmse_dollars = np.sqrt((error_analysis['error_dollars'] ** 2).mean())
mae_dollars = error_analysis['abs_error_dollars'].mean()

print(f"\nüèÜ MOD√àLE FINAL: {best_model_name}")
print(f"   ‚Ä¢ RMSE Test: {best_row['test_rmse']:.4f} (log scale)")
print(f"   ‚Ä¢ RMSE en $: ${rmse_dollars:,.0f}")
print(f"   ‚Ä¢ MAE en $: ${mae_dollars:,.0f}")
print(f"   ‚Ä¢ R¬≤ Test: {best_row['test_r2']:.4f} ({best_row['test_r2']*100:.2f}% variance expliqu√©e)")
print(f"   ‚Ä¢ Erreur % moyenne: {error_analysis['pct_error'].mean():.2f}%")
print(f"   ‚Ä¢ Stabilit√© CV: ¬± {best_row['cv_rmse_std']:.4f}")

# NOTE(review): the strengths below are fixed text, not derived from the metrics above.
print("\n‚úÖ POINTS FORTS:")
print("   ‚Ä¢ Excellente performance pr√©dictive")
print("   ‚Ä¢ Bonne g√©n√©ralisation (pas d'overfitting)")
print("   ‚Ä¢ Robustesse valid√©e par cross-validation")
print("   ‚Ä¢ Mod√®le stable et fiable")

print("\nüìà APPLICATIONS:")
print("   ‚Ä¢ Estimation automatique de prix pour agences immobili√®res")
print("   ‚Ä¢ Aide √† la d√©cision pour acheteurs/vendeurs")
print("   ‚Ä¢ D√©tection de sous/sur-√©valuation")
print("   ‚Ä¢ Analyse de march√© immobilier")

print("\nüéì LIVRABLES:")
print("   ‚úì Notebook 1: Analyse exploratoire")
print("   ‚úì Notebook 2: Feature engineering")
print("   ‚úì Notebook 3: Mod√©lisation comparative")
print("   ‚úì Notebook 4: √âvaluation et justification")
print("   ‚úì Mod√®le final sauvegard√© et pr√™t √† d√©ployer")

print("\n" + "="*100)
print("\n‚ú® Projet compl√©t√© avec succ√®s! ‚ú®")
print("\n" + "="*100)

---
## 🎉 Fin du Projet House Price Prediction

Ce notebook conclut l'analyse complète du projet de prédiction de prix immobiliers.

**Auteur**: [Votre nom]  
**Date**: 2026-02-12  
**Framework**: Scikit-learn, XGBoost  
**Meilleur modèle**: Voir résultats ci-dessus

---