In [11]:
# =========================================================
# V3 STEP 4: MODEL INSIGHTS (The Ultimate Edition + Trend Lines) 🩺
# =========================================================
# Goal: 
# 1. Analyze Model Accuracy.
# 2. Visualize Raw Macro Correlations with RED TREND LINES.
# 3. Use Robust SHAP (Method B) to explain Model Logic.

import pandas as pd
import numpy as np
import xgboost as xgb
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import os

# 1. SETUP
# All paths are relative: BASE_DIR points one level above the current
# working directory, and both the input CSV and the image output folder
# hang off it.
BASE_DIR = ".."
OUTPUT_DIR = os.path.join(BASE_DIR, "output")
DATA_FILE = os.path.join(BASE_DIR, "data/processed", "hybrid_v3_dataset.csv")

# Create the destination for the generated images up front; exist_ok makes
# re-running the cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("ü©∫ Initializing Ultimate Model Analysis...")

# 2. LOAD & PREP DATA
try:
    # ---------------------------------------------------------
    # 2. LOAD & PREP DATA
    # ---------------------------------------------------------
    # Fail fast with an explicit message when the dataset is missing.
    if not os.path.exists(DATA_FILE):
        raise FileNotFoundError(f"Data file not found at {DATA_FILE}")

    target = 'Average rent ($)'

    df = pd.read_csv(DATA_FILE)
    # Rows without a target value cannot be used for training or evaluation.
    df = df.dropna(subset=[target])
    print(f"   ‚úÖ Loaded Data: {len(df)} rows")

    # Feature Engineering: one-hot encode the categorical location columns.
    print("   ‚öôÔ∏è  Encoding Features...")
    df_encoded = pd.get_dummies(df, columns=['City', 'Province', 'Region_Map'], drop_first=True)

    exclude_cols = [target, 'Turnover_Rate', 'Total_Units', 'Buy_Price', 'Intl_Students_Prov']

    # Keep every numeric / boolean column that is not explicitly excluded,
    # preserving the column order of the encoded frame.
    feature_cols = [
        c for c in df_encoded.columns
        if c not in exclude_cols
        and (pd.api.types.is_numeric_dtype(df_encoded[c]) or df_encoded[c].dtype == bool)
    ]

    # FORCE FLOAT so boolean dummy columns and numeric columns share one dtype.
    X = df_encoded[feature_cols].astype(float)
    y = df_encoded[target]

    # Time-based split (Train < 2023, Test >= 2023). Both masks are computed
    # once and kept as explicit comparisons (not a negation) so rows with a
    # missing Year fall into neither set.
    is_train = df_encoded['Year'] < 2023
    is_test = df_encoded['Year'] >= 2023
    X_train, y_train = X[is_train], y[is_train]
    X_test, y_test = X[is_test], y[is_test]

    print(f"   Train: {len(X_train)} | Test: {len(X_test)}")

    # ---------------------------------------------------------
    # 3. RETRAIN MODEL
    # ---------------------------------------------------------
    print("   üî• Retraining Fresh XGBoost Model...")
    model = xgb.XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        n_jobs=-1,
        random_state=42,
    )
    model.fit(X_train, y_train, verbose=False)
    print("   ‚úÖ Retraining Complete.")

    # ---------------------------------------------------------
    # IMAGE 1: ACCURACY PLOT (actual vs predicted on the test period)
    # ---------------------------------------------------------
    print("   üé® Generating Image 1: Accuracy Plot...")
    preds = model.predict(X_test)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(y_test, preds, alpha=0.5, color='blue', label='Predicted')
    # Diagonal y = x: points on this line are predicted exactly.
    rent_range = [y_test.min(), y_test.max()]
    ax.plot(rent_range, rent_range, 'r--', lw=2, label='Perfect Fit')
    ax.set_xlabel('Actual Rent ($)')
    ax.set_ylabel('Predicted Rent ($)')
    ax.set_title('V3 Model Accuracy: Actual vs Predicted (2023-2025)')
    ax.legend()
    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, 'v3_accuracy_plot.png'))
    plt.close(fig)
    print("      ‚úÖ Saved: v3_accuracy_plot.png")

    # ---------------------------------------------------------
    # IMAGE 2: MACRO CORRELATION GRID (With RED LINES)
    # ---------------------------------------------------------
    print("   üé® Generating Image 2: Macro Correlation Grid...")

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('Raw Data: How Macro Factors Correlate with Rent', fontsize=20)

    def plot_macro(ax, col, title):
        """Draw one grid panel: raw rent vs. `col`, plus a red trend line.

        Args:
            ax: Matplotlib axes to draw on.
            col: Name of the column in `df` used for the x-axis.
            title: Title shown above the panel.
        """
        # 1. The Dots (Scatter), coloured by year.
        sns.scatterplot(ax=ax, data=df, x=col, y=target, hue='Year', palette='viridis', alpha=0.6)

        # 2. The Red Line (Regression fit only; scatter=False avoids redrawing the dots).
        sns.regplot(ax=ax, data=df, x=col, y=target, scatter=False, color='red', line_kws={'lw': 2})

        ax.set_title(title, fontsize=14)
        ax.set_ylabel('Average Rent ($)')

    # Panels are filled row by row: top-left, top-right, bottom-left, bottom-right.
    macro_panels = [
        ('Pop_Growth_Pct', 'Rent vs Population Growth'),
        ('Interest_Rate', 'Rent vs Interest Rates'),
        ('Unemployment_Rate', 'Rent vs Unemployment'),
        ('GDP_Growth_Pct', 'Rent vs GDP Growth'),
    ]
    for panel_ax, (macro_col, panel_title) in zip(axes.flat, macro_panels):
        plot_macro(panel_ax, macro_col, panel_title)

    # Leave room at the top of the figure for the suptitle.
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    fig.savefig(os.path.join(OUTPUT_DIR, 'v3_macro_grid.png'))
    plt.close(fig)
    print("      ‚úÖ Saved: v3_macro_grid.png")

    # ---------------------------------------------------------
    # SHAP ANALYSIS (Method B: Robust Black Box)
    # ---------------------------------------------------------
    print("   üßÆ Calculating SHAP Values (Method B: Robust)...")

    # 1. Create Background (reproducible sample of up to 100 training rows)
    background = X_train.sample(n=min(100, len(X_train)), random_state=42)

    # 2. Define Explainer (Using PREDICT function to avoid file errors)
    explainer = shap.Explainer(model.predict, background)

    # 3. Calculate on Test Sample (first 200 rows of the test set, for speed)
    sample_size = min(200, len(X_test))
    X_test_sample = X_test.iloc[:sample_size]
    print(f"      Analyzing top {sample_size} test scenarios...")

    shap_values = explainer(X_test_sample)

    # ---------------------------------------------------------
    # IMAGE 3: SHAP SUMMARY
    # ---------------------------------------------------------
    print("   üé® Generating Image 3: SHAP Summary...")
    plt.figure(figsize=(12, 8))
    shap.summary_plot(shap_values, X_test_sample, max_display=15, show=False)
    plt.title("Top 15 Drivers of Rent (2023-2025)", fontsize=16)
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, 'shap_summary.png'))
    plt.close()
    print("      ‚úÖ Saved: shap_summary.png")

    # ---------------------------------------------------------
    # IMAGE 4 & 5: DEPENDENCE PLOTS
    # ---------------------------------------------------------
    print("   üé® Generating Images 4 & 5: Dependence Curves...")

    # Get values safely: accept either an Explanation object or a raw array.
    shap_vals_data = shap_values.values if hasattr(shap_values, "values") else shap_values

    # (feature column, plot title, output file name)
    dependence_specs = [
        ("Pop_Growth_Pct", "Impact of Population Growth on Rent", 'shap_population.png'),
        ("Interest_Rate", "Impact of Interest Rates on Rent", 'shap_interest_rates.png'),
    ]
    for feature, dep_title, filename in dependence_specs:
        # Skip features that did not make it into the model inputs.
        if feature not in X_test_sample.columns:
            continue

        # Hand SHAP an explicit axes. Previously a 10x6 figure was opened with
        # plt.figure() but never drawn on, and plt.close() did not close it,
        # so each plot left an empty "<Figure size 1000x600 with 0 Axes>" open.
        fig, ax = plt.subplots(figsize=(10, 6))
        shap.dependence_plot(
            feature,
            shap_vals_data,
            X_test_sample,
            interaction_index=None,
            ax=ax,
            show=False,
        )
        ax.set_title(dep_title, fontsize=14)
        fig.tight_layout()
        fig.savefig(os.path.join(OUTPUT_DIR, filename))
        plt.close(fig)
        print(f"      ‚úÖ Saved: {filename}")

    print(f"\n‚ú® COMPLETE! Check '{OUTPUT_DIR}' for all 5 images.")

except Exception as e:
    # Top-level boundary for this cell: report the failure without re-raising,
    # but keep the traceback so the failing step can actually be located.
    import traceback

    print(f"‚ùå FATAL ERROR: {e}")
    traceback.print_exc()

ü©∫ Initializing Ultimate Model Analysis...
   ‚úÖ Loaded Data: 2189 rows
   ‚öôÔ∏è  Encoding Features...
   Train: 1586 | Test: 603
   üî• Retraining Fresh XGBoost Model...
   ‚úÖ Retraining Complete.
   üé® Generating Image 1: Accuracy Plot...
      ‚úÖ Saved: v3_accuracy_plot.png
   üé® Generating Image 2: Macro Correlation Grid...
      ‚úÖ Saved: v3_macro_grid.png
   üßÆ Calculating SHAP Values (Method B: Robust)...
      Analyzing top 200 test scenarios...


PermutationExplainer explainer: 201it [00:16,  5.48it/s]                         
  shap.summary_plot(shap_values, X_test_sample, max_display=15, show=False)


   üé® Generating Image 3: SHAP Summary...
      ‚úÖ Saved: shap_summary.png
   üé® Generating Images 4 & 5: Dependence Curves...
      ‚úÖ Saved: shap_population.png
      ‚úÖ Saved: shap_interest_rates.png

‚ú® COMPLETE! Check '../output' for all 5 images.


<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

# 📊 V3 Model Analysis: Interpreting the Results

This section explains the 5 key visualizations generated by the V3 Model "X-Ray."

## 1. Model Accuracy (`v3_accuracy_plot.png`)
**What is it?** A scatter plot comparing the **Actual Rent** (X-axis) vs. what the Model **Predicted** (Y-axis) for the test period (2023-2025).

* **The Red Line:** Represents a "Perfect Prediction." If every dot landed exactly on this line, the model would be 100% accurate.
* **The Blue Dots:** Represent individual predictions for specific cities and years.
* **How to read it:**
    * **Tight Cluster:** If the blue dots hug the red line tightly, the model is highly accurate.
    * **Spread:** A wide spread indicates the model is struggling with outliers.
    * **Bias:** If most dots are *below* the red line, the model is consistently "under-predicting" rent.

## 2. Macro Correlations (`v3_macro_grid.png`)
**What is it?**
A raw data visualization showing the relationship between Rent and 4 key Macro Factors, with a **Red Trend Line** added for clarity.

* **Pop Growth:** Does the red line go UP? (More people = Higher Rent).
* **Interest Rates:** Does the red line go UP? (Higher Rates = Higher Rent/Mortgage pass-through).
* **Unemployment:** Does the red line go DOWN? (Job losses = Lower Rent demand).
* **GDP:** Shows the economic health connection.

## 3. The "Brain" of the Model (`shap_summary.png`)
**What is it?**
A "Beeswarm" plot that ranks every feature by how much it influences rent prices.

* **Vertical Order:** Features at the top are the **Most Important** drivers.
* **Color (Red vs. Blue):**
    * 🔴 **Red** = High Value of that feature (e.g., High Interest Rate).
    * 🔵 **Blue** = Low Value of that feature (e.g., Low Interest Rate).
* **Horizontal Position:**
    * **Right of Center (0):** Increases Rent.
    * **Left of Center (0):** Decreases Rent.
* **Example Reading:** If "Pop_Growth_Pct" has **Red dots on the Right**, it means that high population growth pushes the model's rent prediction up (i.e., "High Population Growth leads to Higher Predicted Rent").

## 4. The Immigration Curve (`shap_population.png`)
**What is it?**
A deep dive into exactly *how* Population Growth impacts rent.

* **X-Axis:** The Population Growth % (e.g., 1%, 2%, 3%).
* **Y-Axis:** The impact on Rent in dollars.
* **The Insight:** Look for the **"Hockey Stick"**. Does the cloud of dots stay flat at 1% growth but shoot up steeply at 2.5%? That specific point is the market's **Saturation Point**.

## 5. The Interest Rate Trap (`shap_interest_rates.png`)
**What is it?**
A deep dive into the impact of Bank of Canada interest rates.

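* **The Insight:** This chart shows which side of the debate the model has learned from the data: "Do high rates lower prices (by crushing demand) or raise prices (by increasing landlord costs)?" Note that it reflects the model's learned association, not proof of causation.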
    * **Upward Slope:** Landlords are successfully passing mortgage costs to tenants.
    * **Downward Slope:** High rates are crashing the rental economy.