# HomeVista: Final Model Evaluation & Interpretation

**Goal:** Evaluate the final ensemble model on the test set and interpret its predictions using SHAP values to derive business insights.

## 1. Setup & Imports

In [None]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# Make the `src/` directory that sits next to this notebook's folder importable.
# (It is the parent's `src` sub-directory that is added, not the parent itself.)
src_dir = Path.cwd().parent / 'src'
sys.path.append(str(src_dir))

# Project-local modules: these only resolve after the sys.path change above,
# so they must stay below it.
from ml import model_evaluation
from ml import feature_engineering
import config

# Plot defaults: seaborn theme first, then the default figure size on top of it.
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

## 2. Load Models & Data

In [None]:
# Load the persisted models, their ensemble weights and the feature engineer.
models, weights, engineer = model_evaluation.load_model_suite()

print("Ensemble Weights:")
for name, weight in weights.items():
    print(f"  {name}: {weight:.3f}")

# Load the analytical dataset and turn it into model features.
df = pd.read_csv(config.FILE_ANALYTICAL_DATASET)
# NOTE(review): `fit_transform` re-fits the loaded engineer on ALL rows (including
# the rows that end up in the test set). If the engineer returned by
# load_model_suite() is already fitted, a transform-only call is presumably what
# is wanted here -- confirm against ml.feature_engineering before changing.
X, y, feature_names = engineer.fit_transform(df)

# Reproduce the train / validation / test split: 30% is held out, then halved,
# giving a 70 / 15 / 15 split. The seed and proportions must match the ones used
# at training time, otherwise "test" rows may have been seen during training.
SPLIT_SEED = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=SPLIT_SEED
)

# Sanity check: the three partitions must cover every row exactly once.
assert len(X_train) + len(X_val) + len(X_test) == len(X), \
    "train/val/test split sizes do not add up to the dataset size"

print(f"Test Set Size: {len(X_test)} samples")

## 3. Ensemble Evaluation

In [None]:
# Run the ensemble evaluation on the held-out test split, passing the loaded
# models and their weights. The return value is kept as `ensemble_pred`; the
# error-analysis cell below plots it against `y_test` and subtracts it from
# `y_test`, i.e. it is used as one prediction per test sample.
# NOTE(review): evaluate_ensemble presumably also reports test metrics --
# confirm in ml.model_evaluation.
ensemble_pred = model_evaluation.evaluate_ensemble(models, weights, X_test, y_test)

## 4. Error Analysis

Plotting predicted against actual rents, and the distribution of the residuals, to check for systematic bias.

In [None]:
# Actual vs. predicted rent. Points on the dashed red diagonal are perfect
# predictions; the diagonal spans the observed range of actual rents.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, ensemble_pred, alpha=0.5)
rent_lo, rent_hi = y_test.min(), y_test.max()
ax.plot([rent_lo, rent_hi], [rent_lo, rent_hi], 'r--', lw=2)
ax.set_xlabel('Actual Rent (AED)')
ax.set_ylabel('Predicted Rent (AED)')
ax.set_title('Actual vs. Predicted Rent')
plt.show()

# Residual distribution. Residual = actual - predicted, so positive values mean
# the ensemble under-predicted the rent.
residuals = y_test - ensemble_pred
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(residuals, kde=True, ax=ax)
ax.set_title('Residual Distribution')
ax.set_xlabel('Error (AED)')
plt.show()

## 5. Model Interpretation (SHAP)

Understanding which features drive the predicted rental prices, using SHAP values computed for the highest-weighted model in the ensemble.

In [None]:
# Interpret a single model rather than the whole ensemble: the one with the
# largest ensemble weight is taken as the "best" model (usually Random Forest
# or XGBoost).
best_model_name = max(weights, key=weights.get)
best_model = models[best_model_name]
print(f"Interpreting Best Model: {best_model_name}")

# Calculate SHAP values on a random subsample for speed. The sample size is
# capped at the number of test rows, because `.sample(n)` without replacement
# raises ValueError when n exceeds the number of available rows.
shap_sample_size = min(1000, len(X_test))
X_sample = X_test.sample(shap_sample_size, random_state=42)
explainer, shap_values = model_evaluation.generate_shap_values(best_model, X_sample)

# Summary Plot
model_evaluation.plot_shap_summary(shap_values, X_sample)