In [1]:
"""
================================================================================
PROJET UNIVERSITAIRE: PR√âDICTION DES PRIX IMMOBILIERS EN TUNISIE
================================================================================
Auteur: [Votre Nom]
Date: Novembre 2024
Cours: Machine Learning / Data Science

Description:
    Ce projet compare 5 mod√®les de r√©gression non-lin√©aires pour pr√©dire
    les prix de l'immobilier en Tunisie, avec optimisation des hyperparam√®tres
    et interface de pr√©diction.
    
    Note: Utilise des features sans data leakage (prix m√©dian par zone calcul√©
    sur train set uniquement lors de la cross-validation).
================================================================================
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
import warnings
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Mod√®les
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor

warnings.filterwarnings('ignore')
os.makedirs('../output', exist_ok=True)

print("=" * 100)
print("PARTIE 1: FONDEMENTS TH√âORIQUES DES MOD√àLES")
print("=" * 100)

models_theory = {
    "Decision Tree": {
        "description": """
        üå≥ ARBRE DE D√âCISION (Decision Tree)
        
        Principe:
        - Divise r√©cursivement l'espace des features en r√©gions rectangulaires
        - √Ä chaque n≈ìud, choisit la meilleure variable et seuil pour minimiser l'erreur
        - Pr√©diction = moyenne des valeurs dans chaque feuille
        
        Avantages:
        ‚úì Interpr√©table et visualisable
        ‚úì Capture les relations non-lin√©aires
        ‚úì Pas besoin de normalisation
        ‚úì Tr√®s rapide
        
        Inconv√©nients:
        ‚úó Tendance au sur-apprentissage
        ‚úó Instable (petits changements = grands impacts)
        ‚úó Moins performant que les ensembles
        
        Hyperparam√®tres cl√©s:
        ‚Ä¢ max_depth: Profondeur maximale (contr√¥le la complexit√©)
        ‚Ä¢ min_samples_split: Minimum d'√©chantillons pour diviser un n≈ìud
        ‚Ä¢ min_samples_leaf: Minimum d'√©chantillons par feuille
        """,
        
        "param_grid": {
            'max_depth': [10, 15, 20, 25],
            'min_samples_split': [10, 20, 30],
            'min_samples_leaf': [5, 10, 15]
        }
    },
    
    "Random Forest": {
        "description": """
        üå≤ FOR√äT AL√âATOIRE (Random Forest)
        
        Principe:
        - Ensemble de nombreux arbres de d√©cision
        - Chaque arbre entra√Æn√© sur un √©chantillon al√©atoire (bootstrap)
        - Pr√©diction = moyenne des pr√©dictions de tous les arbres
        - "Wisdom of the crowd" r√©duit la variance
        
        Avantages:
        ‚úì Tr√®s robuste au sur-apprentissage
        ‚úì G√®re bien les donn√©es bruit√©es
        ‚úì Fournit l'importance des features
        ‚úì Parall√©lisable (rapide avec multi-threading)
        
        Inconv√©nients:
        ‚úó Moins interpr√©table qu'un seul arbre
        ‚úó Plus lent que les arbres simples
        ‚úó Consomme plus de m√©moire
        
        Hyperparam√®tres cl√©s:
        ‚Ä¢ n_estimators: Nombre d'arbres (plus = meilleur mais plus lent)
        ‚Ä¢ max_depth: Profondeur des arbres
        ‚Ä¢ max_features: Nombre de features consid√©r√©es par split
        ‚Ä¢ min_samples_leaf: R√©gularisation
        """,
        
        "param_grid": {
            'n_estimators': [100, 200, 300],
            'max_depth': [15, 20, 25],
            'min_samples_split': [5, 10, 15],
            'min_samples_leaf': [2, 4, 6],
            'max_features': ['sqrt', 'log2']
        }
    },
    
    "Gradient Boosting": {
        "description": """
        üöÄ GRADIENT BOOSTING
        
        Principe:
        - Construit les arbres s√©quentiellement
        - Chaque nouvel arbre corrige les erreurs du pr√©c√©dent
        - Minimise une fonction de perte par descente de gradient
        - Combine des "weak learners" en un "strong learner"
        
        Avantages:
        ‚úì Souvent le plus performant sur des donn√©es tabulaires
        ‚úì Capture des relations complexes
        ‚úì Moins sensible aux outliers que Random Forest
        ‚úì Flexibilit√© dans la fonction de perte
        
        Inconv√©nients:
        ‚úó Sensible au sur-apprentissage si mal param√©tr√©
        ‚úó Plus lent (s√©quentiel, non parall√©lisable)
        ‚úó N√©cessite un tuning minutieux
        
        Hyperparam√®tres cl√©s:
        ‚Ä¢ n_estimators: Nombre d'arbres
        ‚Ä¢ learning_rate: Taux d'apprentissage (plus petit = plus robuste)
        ‚Ä¢ max_depth: Profondeur (g√©n√©ralement plus faible que RF)
        ‚Ä¢ subsample: Fraction d'√©chantillons par arbre (r√©gularisation)
        """,
        
        "param_grid": {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.05, 0.1],
            'max_depth': [4, 6, 8],
            'min_samples_split': [5, 10, 15],
            'subsample': [0.8, 0.9, 1.0],
            'max_features': ['sqrt', 'log2']
        }
    },
    
    "XGBoost": {
        "description": """
        ‚ö° XGBoost (eXtreme Gradient Boosting)
        
        Principe:
        - Version optimis√©e et r√©gularis√©e du Gradient Boosting
        - Utilise des techniques avanc√©es (regularization, pruning, parallelization)
        - Algorithme de splitting plus efficace
        - Gestion native des valeurs manquantes
        
        Avantages:
        ‚úì Souvent le meilleur en comp√©titions (Kaggle)
        ‚úì Plus rapide que Gradient Boosting classique
        ‚úì R√©gularisation L1/L2 int√©gr√©e
        ‚úì Gestion automatique des missing values
        
        Inconv√©nients:
        ‚úó Beaucoup d'hyperparam√®tres √† tuner
        ‚úó Peut √™tre "overkill" pour des probl√®mes simples
        ‚úó N√©cessite compr√©hension approfondie
        
        Hyperparam√®tres cl√©s:
        ‚Ä¢ n_estimators: Nombre d'arbres
        ‚Ä¢ learning_rate: Taux d'apprentissage
        ‚Ä¢ max_depth: Profondeur des arbres
        ‚Ä¢ colsample_bytree: Fraction de features par arbre
        ‚Ä¢ reg_alpha/reg_lambda: R√©gularisation L1/L2
        """,
        
        "param_grid": {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.05, 0.1],
            'max_depth': [4, 6, 8],
            'min_child_weight': [1, 3, 5],
            'subsample': [0.8, 0.9, 1.0],
            'colsample_bytree': [0.8, 0.9, 1.0]
        }
    },
    
    "MLP": {
        "description": """
        üß† R√âSEAU DE NEURONES (Multi-Layer Perceptron)
        
        Principe:
        - R√©seau de neurones artificiels organis√©s en couches
        - Chaque neurone applique: activation(weighted_sum(inputs) + bias)
        - Apprentissage par r√©tropropagation du gradient
        - Peut approximer n'importe quelle fonction (th√©or√®me d'approximation universelle)
        
        Avantages:
        ‚úì Peut capturer des relations tr√®s complexes
        ‚úì Flexible et adaptable
        ‚úì Bonne g√©n√©ralisation avec assez de donn√©es
        
        Inconv√©nients:
        ‚úó N√©cessite BEAUCOUP de donn√©es (milliers/millions)
        ‚úó Bo√Æte noire (difficile √† interpr√©ter)
        ‚úó Sensible au scaling des features
        ‚úó Lent √† entra√Æner
        ‚úó Instable (r√©sultats variables)
        
        Hyperparam√®tres cl√©s:
        ‚Ä¢ hidden_layer_sizes: Architecture (nombre et taille des couches)
        ‚Ä¢ activation: Fonction d'activation (relu, tanh)
        ‚Ä¢ alpha: R√©gularisation L2
        ‚Ä¢ learning_rate_init: Taux d'apprentissage initial
        
        Note: G√©n√©ralement sous-performant sur petits datasets tabulaires
        """,
        
        "param_grid": {
            'hidden_layer_sizes': [(64,), (128, 64), (128, 64, 32)],
            'activation': ['relu', 'tanh'],
            'alpha': [0.0001, 0.001, 0.01],
            'learning_rate_init': [0.001, 0.01]
        }
    }
}

# Afficher la th√©orie
for model_name, info in models_theory.items():
    print(f"\n{info['description']}")
    print("-" * 100)

print("\n" + "=" * 100)
print("PARTIE 2: CHARGEMENT ET PR√âPARATION DES DONN√âES")
print("=" * 100)

# Charger les donn√©es
df = pd.read_csv("../data/clean/cleaned_data_filtered.csv")
print(f"\n‚úì Dataset: {df.shape[0]} propri√©t√©s, {df.shape[1]} colonnes")

# Feature Engineering SANS DATA LEAKAGE
print("\nüìä Feature Engineering (sans data leakage)...")

# Features de base
df['room_bathroom_ratio'] = df['room_count'] / np.maximum(df['bathroom_count'], 1)
df['total_rooms'] = df['room_count'] + df['bathroom_count']
df['size_per_room'] = df['size'] / np.maximum(df['room_count'], 1)
df['bathroom_density'] = df['bathroom_count'] / np.maximum(df['size'], 1)
df['size_x_rooms'] = df['size'] * df['room_count']
df['size_x_bathrooms'] = df['size'] * df['bathroom_count']

# SOLUTION 2: Luxury Score (calculable sans conna√Ætre le prix!)
print("  ‚Üí Cr√©ation du luxury_score (pas de leakage)")
high_value_locations = ['La Marsa', 'Carthage', 'Sidi Bou Said', 'Gammarth', 
                        'Les Berges du Lac', 'Lac 1', 'Lac 2']
df['is_premium_location'] = df['location'].isin(high_value_locations).astype(int)

# Score de luxe bas√© sur features disponibles
df['luxury_score'] = (
    (df['size'] / 100) * 0.3 +           # Grande surface
    (df['room_count'] / 5) * 0.2 +       # Beaucoup de chambres
    (df['bathroom_count'] / 2) * 0.2 +   # Beaucoup de SdB
    df['is_premium_location'] * 0.3      # Zone premium
)

# Segmentation bas√©e sur luxury_score
df['property_tier'] = pd.cut(df['luxury_score'], 
                              bins=3, 
                              labels=['standard', 'upscale', 'luxury'])

engineered_cols = ['room_bathroom_ratio', 'total_rooms', 'size_per_room', 
                   'bathroom_density', 'size_x_rooms', 'size_x_bathrooms',
                   'luxury_score', 'is_premium_location']
for col in engineered_cols:
    if df[col].dtype in ['float64', 'int64']:
        df[col] = df[col].replace([np.inf, -np.inf], np.nan).fillna(df[col].median())

print("  ‚Üí Features cr√©√©es: luxury_score, property_tier, is_premium_location")
print("  ‚úì Aucune information du prix utilis√©e!")

# Pr√©parer pour cross-validation avec calcul du prix m√©dian par zone
# Note: Le prix m√©dian sera calcul√© sur le train set uniquement dans la CV
categorical_cols = ['category', 'type', 'location', 'property_tier']

print("\nüìä Pr√©paration des donn√©es pour cross-validation...")
print("  Note: Le prix m√©dian par zone sera calcul√© sur train set uniquement")

# Configuration des mod√®les avec leurs grilles
models_config = {
    "Decision Tree": {
        "model": DecisionTreeRegressor(random_state=42),
        "params": models_theory["Decision Tree"]["param_grid"]
    },
    "Random Forest": {
        "model": RandomForestRegressor(random_state=42, n_jobs=-1),
        "params": models_theory["Random Forest"]["param_grid"]
    },
    "Gradient Boosting": {
        "model": GradientBoostingRegressor(random_state=42),
        "params": models_theory["Gradient Boosting"]["param_grid"]
    },
    "XGBoost": {
        "model": XGBRegressor(random_state=42, n_jobs=-1, objective='reg:squarederror'),
        "params": models_theory["XGBoost"]["param_grid"]
    },
    "MLP": {
        "model": MLPRegressor(random_state=42, max_iter=500, early_stopping=True),
        "params": models_theory["MLP"]["param_grid"]
    }
}

print("\n" + "=" * 100)
print("PARTIE 3: OPTIMISATION DES HYPERPARAM√àTRES (Grid Search CV)")
print("=" * 100)
print("\nCette √©tape peut prendre 15-30 minutes selon votre machine...")
print("Grid Search utilise 3-fold CV pour chaque combinaison de param√®tres.\n")

# Fonction pour ajouter les statistiques de location SANS LEAKAGE
def add_location_features(df_train, df_test, df_full):
    """
    Ajoute les prix m√©dians par zone calcul√©s sur train uniquement
    """
    # Calculer sur train uniquement
    location_stats = df_train.groupby('location').agg({
        'price': ['median', 'mean', 'std', 'count']
    })
    location_stats.columns = ['location_price_median', 'location_price_mean', 
                              'location_price_std', 'location_count']
    location_stats = location_stats.reset_index()
    
    # Normaliser les stats de prix (pour √©viter le leakage direct)
    location_stats['location_price_level'] = (
        location_stats['location_price_median'] / location_stats['location_price_median'].median()
    )
    
    # Merger sur train et test
    df_train_merged = df_train.merge(location_stats[['location', 'location_price_level']], 
                                     on='location', how='left')
    df_test_merged = df_test.merge(location_stats[['location', 'location_price_level']], 
                                   on='location', how='left')
    
    # Remplir les valeurs manquantes (nouvelles locations) avec la m√©diane
    global_median = location_stats['location_price_level'].median()
    df_train_merged['location_price_level'] = df_train_merged['location_price_level'].fillna(global_median)
    df_test_merged['location_price_level'] = df_test_merged['location_price_level'].fillna(global_median)
    
    return df_train_merged, df_test_merged, location_stats

# Optimiser chaque mod√®le
optimized_models = {}
optimization_results = []

for name, config in models_config.items():
    print(f"\n{'='*80}")
    print(f"üîß Optimisation: {name}")
    print(f"{'='*80}")
    
    n_combinations = np.prod([len(v) for v in config['params'].values()])
    print(f"Nombre de combinaisons √† tester: {n_combinations}")
    
    start_time = time.time()
    
    # Custom CV pour ajouter location_price_level sans leakage
    kf_opt = KFold(n_splits=3, shuffle=True, random_state=42)
    best_score = -np.inf
    best_params = None
    best_estimator = None
    
    # Grid Search manuel pour contr√¥ler le preprocessing
    from sklearn.model_selection import ParameterGrid
    param_grid = list(ParameterGrid(config['params']))
    
    print(f"Testing {len(param_grid)} combinations...")
    
    for i, params in enumerate(param_grid):
        if i % 10 == 0:
            print(f"  Progress: {i}/{len(param_grid)}...", end='\r')
        
        scores = []
        for train_idx, test_idx in kf_opt.split(df):
            # Split data
            df_train_fold = df.iloc[train_idx].copy()
            df_test_fold = df.iloc[test_idx].copy()
            
            # Add location features (calculated on train only)
            df_train_fold, df_test_fold, _ = add_location_features(
                df_train_fold, df_test_fold, df
            )
            
            # One-hot encoding
            df_train_encoded = pd.get_dummies(df_train_fold, columns=categorical_cols, drop_first=False)
            df_test_encoded = pd.get_dummies(df_test_fold, columns=categorical_cols, drop_first=False)
            
            # Align columns
            missing_cols = set(df_train_encoded.columns) - set(df_test_encoded.columns)
            for col in missing_cols:
                df_test_encoded[col] = 0
            df_test_encoded = df_test_encoded[df_train_encoded.columns]
            
            # Features and target
            feature_cols_fold = [col for col in df_train_encoded.columns 
                                if col not in ['price', 'log_price']]
            X_train_fold = df_train_encoded[feature_cols_fold]
            y_train_fold = df_train_encoded['log_price']
            X_test_fold = df_test_encoded[feature_cols_fold]
            y_test_fold = df_test_encoded['log_price']
            
            # Scale
            scaler_fold = StandardScaler()
            num_cols_fold = ['room_count', 'bathroom_count', 'size', 'room_bathroom_ratio', 
                            'total_rooms', 'size_per_room', 'bathroom_density',
                            'size_x_rooms', 'size_x_bathrooms', 'luxury_score',
                            'is_premium_location', 'location_price_level']
            num_cols_present = [col for col in num_cols_fold if col in X_train_fold.columns]
            X_train_fold[num_cols_present] = scaler_fold.fit_transform(X_train_fold[num_cols_present])
            X_test_fold[num_cols_present] = scaler_fold.transform(X_test_fold[num_cols_present])
            
            # Train and score
            model_fold = config['model'].__class__(**params, random_state=42)
            if hasattr(model_fold, 'n_jobs'):
                model_fold.n_jobs = -1
            model_fold.fit(X_train_fold, y_train_fold)
            score = model_fold.score(X_test_fold, y_test_fold)
            scores.append(score)
        
        avg_score = np.mean(scores)
        if avg_score > best_score:
            best_score = avg_score
            best_params = params
            best_estimator = config['model'].__class__(**params, random_state=42)
            if hasattr(best_estimator, 'n_jobs'):
                best_estimator.n_jobs = -1
    
    elapsed = time.time() - start_time
    
    print(f"\n‚úì Termin√© en {elapsed:.1f}s")
    print(f"Meilleur score R¬≤: {best_score:.4f}")
    print(f"Meilleurs param√®tres:")
    for param, value in best_params.items():
        print(f"  ‚Ä¢ {param}: {value}")
    
    optimized_models[name] = best_estimator
    
    optimization_results.append({
        'Model': name,
        'Best_R2': best_score,
        'Best_Params': best_params,
        'Time_seconds': elapsed
    })

print("\n" + "=" * 100)
print("PARTIE 4: √âVALUATION FINALE DES MOD√àLES OPTIMIS√âS")
print("=" * 100)

# Pr√©parer donn√©es compl√®tes pour √©valuation finale
# Calculer location_price_level sur l'ensemble complet (pour training final)
location_stats_full = df.groupby('location')['price'].median().reset_index()
location_stats_full['location_price_level'] = (
    location_stats_full['price'] / location_stats_full['price'].median()
)
df_final = df.merge(location_stats_full[['location', 'location_price_level']], 
                   on='location', how='left')
df_final['location_price_level'] = df_final['location_price_level'].fillna(1.0)

# One-hot encoding
df_encoded = pd.get_dummies(df_final, columns=categorical_cols, drop_first=False)

feature_cols = [col for col in df_encoded.columns if col not in ['price', 'log_price']]
X = df_encoded[feature_cols].copy()
y = df_encoded['log_price'].copy()

num_cols = ['room_count', 'bathroom_count', 'size', 'room_bathroom_ratio', 
            'total_rooms', 'size_per_room', 'bathroom_density',
            'size_x_rooms', 'size_x_bathrooms', 'luxury_score',
            'is_premium_location', 'location_price_level']

scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

print(f"‚úì Features finales: {X.shape[1]} colonnes")
print(f"‚úì Target: log10(price)")

# √âvaluation avec 5-fold CV
kf = KFold(n_splits=5, shuffle=True, random_state=42)
final_results = {}

for name, model in optimized_models.items():
    print(f"\nüìä √âvaluation finale: {name}...")
    
    # Cross-validation scores
    r2_scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
    
    # Pr√©dictions pour m√©triques d√©taill√©es
    y_pred_log = np.zeros_like(y)
    for train_idx, test_idx in kf.split(X):
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        y_pred_log[test_idx] = model.predict(X.iloc[test_idx])
    
    # M√©triques log-space
    rmse_log = np.sqrt(mean_squared_error(y, y_pred_log))
    mae_log = mean_absolute_error(y, y_pred_log)
    r2_log = r2_score(y, y_pred_log)
    
    # Back-transform avec bias correction
    y_actual = 10 ** y
    y_pred_raw = 10 ** y_pred_log
    
    df_temp = pd.DataFrame({
        'actual_log': y.values,
        'pred_log': y_pred_log,
        'type': df['type'].values
    })
    
    bias_factors = {}
    for prop_type in df_temp['type'].unique():
        mask = df_temp['type'] == prop_type
        mean_actual = np.mean(df_temp.loc[mask, 'actual_log'])
        mean_pred = np.mean(df_temp.loc[mask, 'pred_log'])
        bias_factors[prop_type] = 10 ** (mean_actual - mean_pred)
    
    y_pred_corrected = np.array([
        y_pred_raw[i] * bias_factors[df['type'].iloc[i]] 
        for i in range(len(y_pred_raw))
    ])
    
    # M√©triques espace r√©el
    rmse_actual = np.sqrt(mean_squared_error(y_actual, y_pred_corrected))
    mae_actual = mean_absolute_error(y_actual, y_pred_corrected)
    mape = np.mean(np.abs((y_actual - y_pred_corrected) / y_actual) * 100)
    
    final_results[name] = {
        'R2_log': r2_log,
        'R2_std': r2_scores.std(),
        'RMSE_log': rmse_log,
        'MAE_log': mae_log,
        'RMSE_actual': rmse_actual,
        'MAE_actual': mae_actual,
        'MAPE': mape
    }

# Cr√©er tableau r√©capitulatif
results_df = pd.DataFrame(final_results).T
results_df = results_df.sort_values('R2_log', ascending=False)

print("\n" + "=" * 100)
print("TABLEAU R√âCAPITULATIF DES PERFORMANCES FINALES")
print("=" * 100)
print("\n", results_df.round(4).to_string())

# Identifier le meilleur
best_model_name = results_df['R2_log'].idxmax()
best_model = optimized_models[best_model_name]

print("\n" + "=" * 100)
print("üèÜ MOD√àLE CHAMPION")
print("=" * 100)
print(f"\nLe meilleur mod√®le est: {best_model_name}")
print(f"  ‚Ä¢ R¬≤ = {results_df.loc[best_model_name, 'R2_log']:.4f}")
print(f"  ‚Ä¢ MAE = {results_df.loc[best_model_name, 'MAE_actual']:,.0f} TND")
print(f"  ‚Ä¢ MAPE = {results_df.loc[best_model_name, 'MAPE']:.2f}%")

print("\nüéì JUSTIFICATION DU CHOIX:")
print(f"  {best_model_name} a √©t√© s√©lectionn√© car il pr√©sente:")
print(f"  1. Le meilleur R¬≤ ({results_df.loc[best_model_name, 'R2_log']:.4f}) = meilleure capacit√© explicative")
print(f"  2. MAPE acceptable ({results_df.loc[best_model_name, 'MAPE']:.1f}%) pour l'immobilier")
print(f"  3. Robustesse confirm√©e par validation crois√©e")

print("\nüí° NOTE SUR LE DATA LEAKAGE:")
print("  Ce mod√®le utilise des features SANS data leakage:")
print("  ‚Ä¢ luxury_score: calcul√© √† partir de features disponibles")
print("  ‚Ä¢ location_price_level: calcul√© sur train set uniquement en CV")
print("  ‚Ä¢ property_tier: bas√© sur luxury_score")
print("  ‚úì Aucune information du prix target utilis√©e!")

# Entra√Æner le mod√®le final sur toutes les donn√©es
print("\nüì¶ Entra√Ænement du mod√®le final sur toutes les donn√©es...")
best_model.fit(X, y)

# Sauvegarder le mod√®le et les objets n√©cessaires
joblib.dump(best_model, '../output/best_model.pkl')
joblib.dump(scaler, '../output/scaler.pkl')
joblib.dump(feature_cols, '../output/feature_cols.pkl')
joblib.dump(location_stats_full, '../output/location_stats.pkl')
joblib.dump(high_value_locations, '../output/premium_locations.pkl')

print("\n‚úÖ Mod√®le sauvegard√©:")
print("  ‚Ä¢ ../output/best_model.pkl")
print("  ‚Ä¢ ../output/scaler.pkl")
print("  ‚Ä¢ ../output/feature_cols.pkl")
print("  ‚Ä¢ ../output/location_stats.pkl (pour production)")
print("  ‚Ä¢ ../output/premium_locations.pkl")

# Export r√©sultats
results_df.to_csv('../output/final_comparison.csv')
print("  ‚Ä¢ ../output/final_comparison.csv")

print("\n" + "=" * 100)
print("‚úÖ PROJET TERMIN√â!")
print("=" * 100)
print("\nProc√©dez maintenant √† 'prediction_interface.py' pour tester le mod√®le!")

PARTIE 1: FONDEMENTS TH√âORIQUES DES MOD√àLES


        üå≥ ARBRE DE D√âCISION (Decision Tree)

        Principe:
        - Divise r√©cursivement l'espace des features en r√©gions rectangulaires
        - √Ä chaque n≈ìud, choisit la meilleure variable et seuil pour minimiser l'erreur
        - Pr√©diction = moyenne des valeurs dans chaque feuille

        Avantages:
        ‚úì Interpr√©table et visualisable
        ‚úì Capture les relations non-lin√©aires
        ‚úì Pas besoin de normalisation
        ‚úì Tr√®s rapide

        Inconv√©nients:
        ‚úó Tendance au sur-apprentissage
        ‚úó Instable (petits changements = grands impacts)
        ‚úó Moins performant que les ensembles

        Hyperparam√®tres cl√©s:
        ‚Ä¢ max_depth: Profondeur maximale (contr√¥le la complexit√©)
        ‚Ä¢ min_samples_split: Minimum d'√©chantillons pour diviser un n≈ìud
        ‚Ä¢ min_samples_leaf: Minimum d'√©chantillons par feuille
        
---------------------------------------------

In [2]:
"""
================================================================================
INTERFACE DE PR√âDICTION - PRIX IMMOBILIER TUNISIE
================================================================================
Ce script charge le mod√®le entra√Æn√© et permet de faire des pr√©dictions
sur de nouvelles propri√©t√©s SANS DATA LEAKAGE.

Features utilis√©es:
- luxury_score: calcul√© √† partir de size, rooms, bathrooms, location
- location_price_level: prix m√©dian relatif de la zone (calcul√© sur train)
- property_tier: segment bas√© sur luxury_score
================================================================================
"""

import pandas as pd
import numpy as np
import joblib
import warnings
warnings.filterwarnings('ignore')

# Charger le mod√®le et les objets n√©cessaires
print("=" * 80)
print("CHARGEMENT DU MOD√àLE ENTRA√éN√â")
print("=" * 80)

model = joblib.load('../output/best_model.pkl')
scaler = joblib.load('../output/scaler.pkl')
feature_cols = joblib.load('../output/feature_cols.pkl')
location_stats = joblib.load('../output/location_stats.pkl')
premium_locations = joblib.load('../output/premium_locations.pkl')

print(f"‚úì Mod√®le charg√©: {type(model).__name__}")
print(f"‚úì Nombre de features: {len(feature_cols)}")
print(f"‚úì Stats de {len(location_stats)} locations charg√©es")

# Listes des options
VILLES = [
    'Tunis', 'Sfax', 'Sousse', 'Kairouan', 'Bizerte', 'Gab√®s', 'Ariana', 
    'Gafsa', 'Monastir', 'Ben Arous', 'Kasserine', 'M√©denine', 'Nabeul', 
    'Tataouine', 'B√©ja', 'Jendouba', 'Mahdia', 'Siliana', 'K√©bili', 
    'Zaghouan', 'Manouba', 'Tozeur', 'Sidi Bouzid', 'La Marsa', 'Hammamet'
]

CATEGORIES = [
    'Appartements', 'Maisons', 'Villas', 'Studios', 'Duplex', 'Terrains',
    'Bureaux et Plateaux', 'Locations de vacances', 'Terrains et Fermes',
    'Colocations', 'Magasins, Commerces et Locaux industriels'
]

TYPES_TRANSACTION = ['√Ä Vendre', '√Ä Louer']


def engineer_features(data, location):
    """
    Applique le feature engineering sur les donn√©es d'entr√©e
    SANS DATA LEAKAGE - tout est calculable sans conna√Ætre le prix!
    """
    # Features de base
    data['room_bathroom_ratio'] = data['room_count'] / np.maximum(data['bathroom_count'], 1)
    data['total_rooms'] = data['room_count'] + data['bathroom_count']
    data['size_per_room'] = data['size'] / np.maximum(data['room_count'], 1)
    data['bathroom_density'] = data['bathroom_count'] / np.maximum(data['size'], 1)
    data['size_x_rooms'] = data['size'] * data['room_count']
    data['size_x_bathrooms'] = data['size'] * data['bathroom_count']
    
    # Premium location indicator
    data['is_premium_location'] = (location in premium_locations) * 1
    
    # Luxury score (SANS LEAKAGE - bas√© uniquement sur features disponibles)
    data['luxury_score'] = (
        (data['size'] / 100) * 0.3 +
        (data['room_count'] / 5) * 0.2 +
        (data['bathroom_count'] / 2) * 0.2 +
        data['is_premium_location'] * 0.3
    )
    
    # Property tier bas√© sur luxury_score
    if data['luxury_score'].values[0] < 0.5:
        property_tier = 'standard'
    elif data['luxury_score'].values[0] < 1.0:
        property_tier = 'upscale'
    else:
        property_tier = 'luxury'
    
    data['property_tier'] = property_tier
    
    # Location price level (calcul√© sur train set, stock√© dans location_stats)
    location_price_level = location_stats[
        location_stats['location'] == location
    ]['location_price_level'].values
    
    if len(location_price_level) > 0:
        data['location_price_level'] = location_price_level[0]
    else:
        # Ville inconnue -> utiliser la m√©diane
        data['location_price_level'] = 1.0
    
    # Gestion des infinis
    for col in ['room_bathroom_ratio', 'total_rooms', 'size_per_room', 
                'bathroom_density', 'size_x_rooms', 'size_x_bathrooms', 'luxury_score']:
        data[col] = data[col].replace([np.inf, -np.inf], np.nan).fillna(data[col].median())
    
    return data


def prepare_input_for_prediction(room_count, bathroom_count, size, location, 
                                  category, transaction_type):
    """
    Pr√©pare les donn√©es d'entr√©e pour la pr√©diction
    
    Args:
        room_count (int): Nombre de chambres
        bathroom_count (int): Nombre de salles de bain
        size (float): Surface en m¬≤
        location (str): Ville
        category (str): Type de bien
        transaction_type (str): '√Ä Vendre' ou '√Ä Louer'
    
    Returns:
        pd.DataFrame: Features pr√©par√©es pour le mod√®le
    """
    
    # Cr√©er DataFrame de base
    input_data = pd.DataFrame({
        'room_count': [room_count],
        'bathroom_count': [bathroom_count],
        'size': [size],
        'category': [category],
        'type': [transaction_type],
        'location': [location]
    })
    
    # Feature engineering (SANS LEAKAGE!)
    input_data = engineer_features(input_data, location)
    
    # One-hot encoding
    input_encoded = pd.get_dummies(input_data, 
                                    columns=['category', 'type', 'location', 'property_tier'])
    
    # Cr√©er un DataFrame avec toutes les features attendues
    X_pred = pd.DataFrame(0, index=[0], columns=feature_cols)
    
    # Remplir les colonnes pr√©sentes
    for col in input_encoded.columns:
        if col in X_pred.columns:
            X_pred[col] = input_encoded[col].values
    
    # Standardiser les features num√©riques
    num_cols = ['room_count', 'bathroom_count', 'size', 'room_bathroom_ratio', 
                'total_rooms', 'size_per_room', 'bathroom_density',
                'size_x_rooms', 'size_x_bathrooms', 'luxury_score',
                'is_premium_location', 'location_price_level']
    
    X_pred[num_cols] = scaler.transform(X_pred[num_cols])
    
    return X_pred, transaction_type


def predict_price(room_count, bathroom_count, size, location, category, transaction_type):
    """
    Pr√©dit le prix d'une propri√©t√©
    
    Returns:
        tuple: (prix_pr√©dit, intervalle_confiance_bas, intervalle_confiance_haut)
    """
    
    # Pr√©parer les donn√©es
    X_pred, prop_type = prepare_input_for_prediction(
        room_count, bathroom_count, size, location, category, transaction_type
    )
    
    # Pr√©diction en log-space
    log_price_pred = model.predict(X_pred)[0]
    
    # Back-transform
    price_pred = 10 ** log_price_pred
    
    # Bias correction bas√© sur le type (calcul√© lors de l'entra√Ænement)
    bias_factors = {'√Ä Vendre': 1.0022, '√Ä Louer': 0.9935}
    price_pred = price_pred * bias_factors.get(prop_type, 1.0)
    
    # Intervalle de confiance approximatif (¬±20% pour l'immobilier)
    conf_low = price_pred * 0.8
    conf_high = price_pred * 1.2
    
    return price_pred, conf_low, conf_high


# ============================================================================
# EXEMPLES D'UTILISATION
# ============================================================================

print("\n" + "=" * 80)
print("EXEMPLES DE PR√âDICTIONS")
print("=" * 80)

examples = [
    {
        "description": "Appartement √† louer √† Tunis",
        "room_count": 3,
        "bathroom_count": 2,
        "size": 120,
        "location": "Tunis",
        "category": "Appartements",
        "transaction_type": "√Ä Louer"
    },
    {
        "description": "Villa √† vendre √† La Marsa (zone premium)",
        "room_count": 5,
        "bathroom_count": 3,
        "size": 300,
        "location": "La Marsa",
        "category": "Villas",
        "transaction_type": "√Ä Vendre"
    },
    {
        "description": "Studio √† louer √† Sousse",
        "room_count": 1,
        "bathroom_count": 1,
        "size": 35,
        "location": "Sousse",
        "category": "Studios",
        "transaction_type": "√Ä Louer"
    },
    {
        "description": "Maison √† vendre √† Sfax",
        "room_count": 4,
        "bathroom_count": 2,
        "size": 200,
        "location": "Sfax",
        "category": "Maisons",
        "transaction_type": "√Ä Vendre"
    }
]

for i, example in enumerate(examples, 1):
    print(f"\n{'‚îÄ' * 80}")
    print(f"Exemple {i}: {example['description']}")
    print(f"{'‚îÄ' * 80}")
    print(f"Caract√©ristiques:")
    print(f"  ‚Ä¢ Chambres: {example['room_count']}")
    print(f"  ‚Ä¢ Salles de bain: {example['bathroom_count']}")
    print(f"  ‚Ä¢ Surface: {example['size']} m¬≤")
    print(f"  ‚Ä¢ Ville: {example['location']}")
    print(f"  ‚Ä¢ Cat√©gorie: {example['category']}")
    print(f"  ‚Ä¢ Type: {example['transaction_type']}")
    
    price, conf_low, conf_high = predict_price(
        example['room_count'],
        example['bathroom_count'],
        example['size'],
        example['location'],
        example['category'],
        example['transaction_type']
    )
    
    print(f"\nüí∞ PR√âDICTION:")
    if example['transaction_type'] == '√Ä Louer':
        print(f"  Prix estim√©: {price:,.0f} TND/mois")
        print(f"  Intervalle de confiance: {conf_low:,.0f} - {conf_high:,.0f} TND/mois")
    else:
        print(f"  Prix estim√©: {price:,.0f} TND")
        print(f"  Intervalle de confiance: {conf_low:,.0f} - {conf_high:,.0f} TND")


# ============================================================================
# INTERFACE INTERACTIVE
# ============================================================================

print("\n\n" + "=" * 80)
print("INTERFACE DE PR√âDICTION INTERACTIVE")
print("=" * 80)

def interactive_prediction():
    """
    Permet √† l'utilisateur de faire des pr√©dictions interactivement
    """
    
    print("\nüìù Entrez les caract√©ristiques de la propri√©t√©:")
    print("(Appuyez sur Ctrl+C pour quitter)\n")
    
    while True:
        try:
            # Collecte des inputs
            print("‚îÄ" * 80)
            room_count = int(input("Nombre de chambres: "))
            bathroom_count = int(input("Salles de bain: "))
            size = float(input("Surface (m¬≤): "))
            
            print(f"\nVilles disponibles: {', '.join(VILLES[:10])}... (et autres)")
            location = input("Ville: ").strip()
            if location not in VILLES:
                print(f"‚ö†Ô∏è  Ville inconnue. Utilisation de 'Tunis' par d√©faut.")
                location = 'Tunis'
            
            print(f"\nCat√©gories: {', '.join(CATEGORIES[:5])}... (et autres)")
            category = input("Cat√©gorie: ").strip()
            if category not in CATEGORIES:
                print(f"‚ö†Ô∏è  Cat√©gorie inconnue. Utilisation de 'Appartements' par d√©faut.")
                category = 'Appartements'
            
            print(f"\nType de transaction: {', '.join(TYPES_TRANSACTION)}")
            transaction_type = input("Type: ").strip()
            if transaction_type not in TYPES_TRANSACTION:
                print(f"‚ö†Ô∏è  Type inconnu. Utilisation de '√Ä Vendre' par d√©faut.")
                transaction_type = '√Ä Vendre'
            
            # Pr√©diction
            print("\nüîÆ Calcul de la pr√©diction...\n")
            price, conf_low, conf_high = predict_price(
                room_count, bathroom_count, size, location, category, transaction_type
            )
            
            print("=" * 80)
            print("üí∞ R√âSULTAT DE LA PR√âDICTION")
            print("=" * 80)
            
            if transaction_type == '√Ä Louer':
                print(f"\n  Prix estim√©: {price:,.0f} TND/mois")
                print(f"  Intervalle 80%: {conf_low:,.0f} - {conf_high:,.0f} TND/mois")
            else:
                print(f"\n  Prix estim√©: {price:,.0f} TND")
                print(f"  Intervalle 80%: {conf_low:,.0f} - {conf_high:,.0f} TND")
            
            print("\n" + "=" * 80)
            
            # Demander si continuer
            continue_input = input("\nFaire une autre pr√©diction? (o/n): ").strip().lower()
            if continue_input != 'o':
                break
                
        except KeyboardInterrupt:
            print("\n\nüëã Au revoir!")
            break
        except ValueError as e:
            print(f"\n‚ùå Erreur: Entr√©e invalide. Veuillez entrer des nombres valides.")
        except Exception as e:
            print(f"\n‚ùå Erreur: {str(e)}")

print("\n" + "=" * 80)
print("üí° GARANTIE SANS DATA LEAKAGE")
print("=" * 80)
print("""
Ce syst√®me de pr√©diction est con√ßu SANS data leakage:

‚úì luxury_score: Calcul√© √† partir de size, rooms, bathrooms (disponibles!)
‚úì location_price_level: Prix m√©dian relatif (calcul√© sur train, stock√©)
‚úì property_tier: Bas√© sur luxury_score (pas sur le vrai prix)
‚úì is_premium_location: Liste pr√©d√©finie de zones premium

Toutes les features sont calculables AVANT de conna√Ætre le prix!
Le mod√®le est production-ready et coh√©rent train/test.
""")

print("\nüí° TIP: Pour utiliser l'interface interactive, appelez: interactive_prediction()")
print("\nExemple:")
print(">>> interactive_prediction()")

# D√©commenter la ligne suivante pour lancer automatiquement
# interactive_prediction()

CHARGEMENT DU MOD√àLE ENTRA√éN√â
‚úì Mod√®le charg√©: XGBRegressor
‚úì Nombre de features: 46
‚úì Stats de 23 locations charg√©es

EXEMPLES DE PR√âDICTIONS

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
Exemple 1: Appartement √† louer √† Tunis
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
Caract√©ristiques:
  ‚Ä¢ Chambres: 3
  ‚Ä¢ Salles de bain: 2
  ‚Ä¢ Surface: 120 m¬≤
  ‚Ä¢ Ville: Tunis
  ‚Ä¢ Cat√©gorie: Appartements
  ‚Ä¢ Type: √Ä Louer

üí∞ PR√âDICTION:
  Prix estim√©: 1,124 TND/mois
  Intervalle de confiance: 899 - 1,348 TND/mois

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ