In [1]:
# Real Estate Price Prediction: XGBoost Model Visualization
# --------------------------------------------------------

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
import os
import warnings
import matplotlib as mpl
from matplotlib.colors import LinearSegmentedColormap
from mpl_toolkits.axes_grid1 import make_axes_locatable

# If XGBoost_core.py is located one folder above this file,
# add the parent directory to sys.path so the import below resolves.
# (Adjust as needed for your actual folder structure)
# The membership check keeps this cell idempotent: re-running it does not
# append a duplicate entry to sys.path each time.
_parent_dir = os.path.dirname(os.getcwd())
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

# Import your custom XGBoost module
from XGBoost_core import XGBoostModel

# Configure plot styling for scientific paper quality
plt.style.use('seaborn-v0_8-whitegrid')
mpl.rcParams.update({
    # Figure geometry and resolution
    'figure.figsize': (12, 8),
    'figure.dpi': 150,
    # Font sizes, from base text up to the figure-level title
    'font.size': 12,
    'axes.labelsize': 14,
    'axes.titlesize': 16,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
    'figure.titlesize': 20,
})

# Suppress warnings
# NOTE(review): with no category argument this filter matches every warning
# raised from this point on, including deprecation notices; narrow it to
# specific categories/modules if those should stay visible.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility (seeds NumPy's global random state)
np.random.seed(42)

# %% [markdown]
# ## 1. Data Loading and Model Training

# %%
# File paths - update these paths to match your actual files.
# Relative paths are resolved against the current working directory.
DATA_PATH = "../../prepared_data.csv"   # Up two folders, then find prepared_data.csv
PREDICT_PATH = "../../to_predict.csv"   # Same idea for to_predict.csv
OUTPUT_DIR = "visualizations"       # Directory to save visualizations

# Create output directory if it doesn't exist.
# parents=True also creates missing intermediate folders, so OUTPUT_DIR can be
# changed to a nested path (e.g. "results/figures") without raising
# FileNotFoundError; exist_ok=True keeps re-runs of this cell harmless.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# Initialize the XGBoost model
model = XGBoostModel(random_state=42)

# Load the dataset and split it into train/test parts
# ('Price' is the target column; test_size=0.2 is passed to the loader)
print("Loading data...")
X_train, X_test, y_train, y_test = model.load_data(
    DATA_PATH,
    target_column='Price',
    test_size=0.2,
)

# Train the model
print("Training XGBoost model...")
model.train(X_train, y_train, early_stopping_rounds=50)

# Evaluate the model on the held-out split; `metrics` and `y_pred` are kept at
# module level because the plotting cells further down read them.
print("Evaluating model...")
metrics, y_pred = model.evaluate(X_test, y_test)

# Print metrics (each value is formatted with four decimal places)
print("\nModel Evaluation:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")


  from .autonotebook import tqdm as notebook_tqdm
2025-04-01 03:48:46,084 - XGBoost_Core - INFO - XGBoost model initialized


Loading data...


2025-04-01 03:48:46,840 - XGBoost_Core - INFO - Data loaded successfully from ../../prepared_data.csv
2025-04-01 03:48:46,841 - XGBoost_Core - INFO - Data shape: (5555, 12)
2025-04-01 03:48:46,867 - XGBoost_Core - INFO - Data split: 4444 training samples, 1111 test samples


Training XGBoost model...
[0]	eval-rmse:357655.39868
[1]	eval-rmse:349392.18772
[2]	eval-rmse:340409.13942
[3]	eval-rmse:332331.41804
[4]	eval-rmse:328730.21079
[5]	eval-rmse:321992.19308
[6]	eval-rmse:319320.68720
[7]	eval-rmse:313274.34028
[8]	eval-rmse:308089.04971
[9]	eval-rmse:303453.49704
[10]	eval-rmse:298457.70467
[11]	eval-rmse:294657.10577
[12]	eval-rmse:290743.97527
[13]	eval-rmse:288317.74987
[14]	eval-rmse:284911.95666
[15]	eval-rmse:281208.21301
[16]	eval-rmse:278086.56856
[17]	eval-rmse:276274.12605
[18]	eval-rmse:273768.46615
[19]	eval-rmse:270585.82169
[20]	eval-rmse:269903.25521
[21]	eval-rmse:267707.75953
[22]	eval-rmse:265499.81683
[23]	eval-rmse:264233.85614
[24]	eval-rmse:263009.32706
[25]	eval-rmse:261732.95220
[26]	eval-rmse:260911.49244
[27]	eval-rmse:259822.76178
[28]	eval-rmse:258710.94283
[29]	eval-rmse:257935.91918
[30]	eval-rmse:256836.94485
[31]	eval-rmse:256250.97889
[32]	eval-rmse:255579.90477
[33]	eval-rmse:255459.32224
[34]	eval-rmse:255083.20933
[35]

2025-04-01 03:48:49,906 - XGBoost_Core - INFO - Best iteration: 491
2025-04-01 03:48:49,906 - XGBoost_Core - INFO - Training model using XGBoost
2025-04-01 03:48:56,285 - XGBoost_Core - INFO - SHAP values calculated
2025-04-01 03:48:56,286 - XGBoost_Core - INFO - Model training completed
2025-04-01 03:48:56,300 - XGBoost_Core - INFO - Predictions made for 1111 samples
2025-04-01 03:48:56,307 - XGBoost_Core - INFO - Model evaluation results:
2025-04-01 03:48:56,308 - XGBoost_Core - INFO -     MAE: 108177.7403
2025-04-01 03:48:56,310 - XGBoost_Core - INFO -     MSE: 34673414298.7303
2025-04-01 03:48:56,311 - XGBoost_Core - INFO -     RMSE: 186207.9867
2025-04-01 03:48:56,311 - XGBoost_Core - INFO -     R2: 0.6487
2025-04-01 03:48:56,312 - XGBoost_Core - INFO -     Mean Percentage Error: 19.4984
2025-04-01 03:48:56,315 - XGBoost_Core - INFO -     Median Percentage Error: 13.5553


Evaluating model...

Model Evaluation:
MAE: 108177.7403
MSE: 34673414298.7303
RMSE: 186207.9867
R2: 0.6487
Mean Percentage Error: 19.4984
Median Percentage Error: 13.5553


# ## 2. Model Performance Visualization

In [None]:
# %%
def create_multiplot_performance():
    """Generate a 4-panel visualization of model performance metrics"""
    fig, axs = plt.subplots(2, 2, figsize=(16, 14))
    fig.suptitle('XGBoost Model Performance Evaluation', fontweight='bold', y=0.98)
    
    # Panel 1: Actual vs Predicted
    ax1 = axs[0, 0]
    ax1.scatter(y_test, y_pred, alpha=0.5, color='#2077B4', edgecolor='k', linewidth=0.5)
    
    # Add the perfect prediction line
    min_val = min(y_test.min(), y_pred.min())
    max_val = max(y_test.max(), y_pred.max())
    ax1.plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=2)
    
    ax1.set_xlabel('Actual Prices')
    ax1.set_ylabel('Predicted Prices')
    ax1.set_title('Actual vs Predicted Prices')
    ax1.annotate(f'R² = {metrics["R2"]:.4f}', xy=(0.05, 0.95), xycoords='axes fraction',
                bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))
    
    # Panel 2: Residuals
    ax2 = axs[0, 1]
    residuals = y_test - y_pred
    ax2.scatter(y_pred, residuals, alpha=0.5, color='#2077B4', edgecolor='k', linewidth=0.5)
    ax2.axhline(y=0, color='r', linestyle='--', linewidth=2)
    
    ax2.set_xlabel('Predicted Prices')
    ax2.set_ylabel('Residuals')
    ax2.set_title('Residual Plot')
    
    # Add best fit line to residuals
    z = np.polyfit(y_pred, residuals, 1)
    p = np.poly1d(z)
    ax2.plot(np.sort(y_pred), p(np.sort(y_pred)), "k--", linewidth=1.5, 
             label=f"Trend: y={z[0]:.2e}x{z[1]:+.2e}")
    ax2.legend()