# Housing Price Prediction - Advanced Models
## CPSC 4310 - Milestone 3

**Student:** Chloe Lee  
**Date:** February 2025  
**Goal:** Build advanced models (Random Forest & XGBoost) to improve predictions

---

## Objectives:
1. Load baseline model results for comparison
2. Build Random Forest model
3. Build XGBoost model
4. Compare all three models
5. Analyze feature importance
6. Generate final predictions for dashboard

---
# Part 1: Setup and Data Loading

In [1]:
# Install if needed (uncomment and run once):
# !pip install pandas numpy matplotlib seaborn scikit-learn xgboost
import sys
!{sys.executable} -m pip install matplotlib seaborn scikit-learn xgboost
print("Run the line above if you get import errors!")

Run the line above if you get import errors!


In [2]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("‚úì Libraries imported successfully!")

‚úì Libraries imported successfully!


In [3]:
# Load data
print("Loading data...")
df = pd.read_csv('output/housing_data_with_features.csv')
df['Date'] = pd.to_datetime(df['Date'])

print(f"‚úì Data loaded: {len(df)} rows, {len(df.columns)} columns")

Loading data...


FileNotFoundError: [Errno 2] No such file or directory: 'housing_data_with_features.csv'

In [None]:
# Define features and target (same as baseline)
feature_cols = [
    'Index_Lag1', 'Index_Lag3', 'Index_Lag12',
    'Mortgage_1Y', 'Mortgage_3Y', 'Mortgage_5Y', 'Prime_Rate',
    'Index_YoY_Change', 'Mortgage_5Y_Change',
    'Index_MA3', 'Index_MA12',
    'Month', 'Quarter',
    'Price_Above_MA3', 'MA3_MA12_Diff'
]

target_col = 'Index_MoM_Change'

print(f"Features: {len(feature_cols)}")
print(f"Target: {target_col}")

In [None]:
# Prepare data (same as baseline)
print("Preparing data...")

df_clean = df[feature_cols + [target_col, 'Date', 'City', 'Year']].dropna()

SPLIT_YEAR = 2024
train_mask = df_clean['Year'] < SPLIT_YEAR
test_mask = df_clean['Year'] >= SPLIT_YEAR

X_train = df_clean.loc[train_mask, feature_cols]
y_train = df_clean.loc[train_mask, target_col]
X_test = df_clean.loc[test_mask, feature_cols]
y_test = df_clean.loc[test_mask, target_col]

print(f"Train: {len(X_train)} samples")
print(f"Test: {len(X_test)} samples")

---
# Part 2: Load Baseline Results

Load the baseline model results to compare.

In [None]:
# Try to load baseline results (if available)
try:
    baseline_results = pd.read_csv('baseline_model_performance.csv')
    print("Baseline Model Performance:")
    print("="*60)
    display(baseline_results)
    
    baseline_test_r2 = baseline_results['Test_R2'].values[0]
    baseline_test_mape = baseline_results['Test_MAPE'].values[0]
    
    print(f"\n‚úì Baseline to beat:")
    print(f"  R¬≤ = {baseline_test_r2:.4f}")
    print(f"  MAPE = {baseline_test_mape:.2f}%")
    
except FileNotFoundError:
    print("‚ö† Baseline results not found. We'll still build advanced models!")
    baseline_test_r2 = None
    baseline_test_mape = None

---
# Part 3: Random Forest Model

**Why Random Forest?**
- Handles non-linear relationships (Linear Regression can't)
- Less prone to overfitting than single decision tree
- Provides feature importance
- No need to scale features

In [None]:
print("="*80)
print("TRAINING RANDOM FOREST")
print("="*80)

# Initialize Random Forest
rf_model = RandomForestRegressor(
    n_estimators=100,      # Number of trees
    max_depth=15,          # Maximum depth of each tree
    min_samples_split=5,   # Minimum samples to split a node
    min_samples_leaf=2,    # Minimum samples in a leaf
    random_state=42,       # For reproducibility
    n_jobs=-1,             # Use all CPU cores
    verbose=1              # Show progress
)

print("\nModel parameters:")
print(f"  Trees: {rf_model.n_estimators}")
print(f"  Max depth: {rf_model.max_depth}")
print(f"  Min samples split: {rf_model.min_samples_split}")

print("\nTraining...")
rf_model.fit(X_train, y_train)
print("‚úì Training complete!")

In [None]:
# Make predictions
print("Making predictions...")
rf_train_pred = rf_model.predict(X_train)
rf_test_pred = rf_model.predict(X_test)
print("‚úì Predictions generated")

In [None]:
# Evaluate Random Forest
def calculate_metrics(y_true, y_pred, model_name=""):
    """Calculate all metrics"""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    mask = y_true != 0
    mape = np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100
    
    print(f"{model_name} Metrics:")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE:  {mae:.4f}")
    print(f"  R¬≤:   {r2:.4f}")
    print(f"  MAPE: {mape:.2f}%")
    
    return {'Model': model_name, 'RMSE': rmse, 'MAE': mae, 'R2': r2, 'MAPE': mape}

print("="*60)
print("RANDOM FOREST PERFORMANCE")
print("="*60)

rf_train_metrics = calculate_metrics(y_train, rf_train_pred, "RF Training")
print()
rf_test_metrics = calculate_metrics(y_test, rf_test_pred, "RF Test")

# Compare with baseline
if baseline_test_r2 is not None:
    print("\n" + "="*60)
    print("COMPARISON WITH BASELINE")
    print("="*60)
    r2_improvement = ((rf_test_metrics['R2'] - baseline_test_r2) / baseline_test_r2) * 100
    mape_improvement = ((baseline_test_mape - rf_test_metrics['MAPE']) / baseline_test_mape) * 100
    
    print(f"R¬≤ improvement: {r2_improvement:+.1f}%")
    print(f"MAPE improvement: {mape_improvement:+.1f}%")
    
    if rf_test_metrics['R2'] > baseline_test_r2:
        print("\n‚úì Random Forest BEATS baseline!")
    else:
        print("\n‚ö† Random Forest did not beat baseline")

---
# Part 4: XGBoost Model

**Why XGBoost?**
- Usually best performance for tabular data
- Handles complex patterns
- Built-in regularization prevents overfitting
- Very fast training

In [None]:
print("="*80)
print("TRAINING XGBOOST")
print("="*80)

# Initialize XGBoost
xgb_model = xgb.XGBRegressor(
    n_estimators=100,       # Number of boosting rounds
    max_depth=6,            # Maximum depth of trees
    learning_rate=0.1,      # Step size for updates
    subsample=0.8,          # Fraction of samples for each tree
    colsample_bytree=0.8,   # Fraction of features for each tree
    random_state=42,
    n_jobs=-1,
    verbosity=1
)

print("\nModel parameters:")
print(f"  Estimators: {xgb_model.n_estimators}")
print(f"  Max depth: {xgb_model.max_depth}")
print(f"  Learning rate: {xgb_model.learning_rate}")

print("\nTraining...")
xgb_model.fit(X_train, y_train)
print("‚úì Training complete!")

In [None]:
# Make predictions
print("Making predictions...")
xgb_train_pred = xgb_model.predict(X_train)
xgb_test_pred = xgb_model.predict(X_test)
print("‚úì Predictions generated")

In [None]:
# Evaluate XGBoost
print("="*60)
print("XGBOOST PERFORMANCE")
print("="*60)

xgb_train_metrics = calculate_metrics(y_train, xgb_train_pred, "XGB Training")
print()
xgb_test_metrics = calculate_metrics(y_test, xgb_test_pred, "XGB Test")

# Compare with baseline
if baseline_test_r2 is not None:
    print("\n" + "="*60)
    print("COMPARISON WITH BASELINE")
    print("="*60)
    r2_improvement = ((xgb_test_metrics['R2'] - baseline_test_r2) / baseline_test_r2) * 100
    mape_improvement = ((baseline_test_mape - xgb_test_metrics['MAPE']) / baseline_test_mape) * 100
    
    print(f"R¬≤ improvement: {r2_improvement:+.1f}%")
    print(f"MAPE improvement: {mape_improvement:+.1f}%")
    
    if xgb_test_metrics['R2'] > baseline_test_r2:
        print("\n‚úì XGBoost BEATS baseline!")
    else:
        print("\n‚ö† XGBoost did not beat baseline")

---
# Part 5: Model Comparison

In [None]:
# Create comparison DataFrame
print("="*80)
print("COMPLETE MODEL COMPARISON")
print("="*80)

comparison_data = []

# Add baseline if available
if baseline_test_r2 is not None:
    comparison_data.append({
        'Model': 'Linear Regression',
        'Test_RMSE': baseline_results['Test_RMSE'].values[0],
        'Test_MAE': baseline_results['Test_MAE'].values[0],
        'Test_R2': baseline_results['Test_R2'].values[0],
        'Test_MAPE': baseline_results['Test_MAPE'].values[0]
    })

# Add Random Forest
comparison_data.append({
    'Model': 'Random Forest',
    'Test_RMSE': rf_test_metrics['RMSE'],
    'Test_MAE': rf_test_metrics['MAE'],
    'Test_R2': rf_test_metrics['R2'],
    'Test_MAPE': rf_test_metrics['MAPE']
})

# Add XGBoost
comparison_data.append({
    'Model': 'XGBoost',
    'Test_RMSE': xgb_test_metrics['RMSE'],
    'Test_MAE': xgb_test_metrics['MAE'],
    'Test_R2': xgb_test_metrics['R2'],
    'Test_MAPE': xgb_test_metrics['MAPE']
})

comparison_df = pd.DataFrame(comparison_data)

print("\nTest Set Performance:")
display(comparison_df)

# Find best model
best_model_idx = comparison_df['Test_R2'].idxmax()
best_model_name = comparison_df.loc[best_model_idx, 'Model']
best_r2 = comparison_df.loc[best_model_idx, 'Test_R2']
best_mape = comparison_df.loc[best_model_idx, 'Test_MAPE']

print("\n" + "="*60)
print("üèÜ BEST MODEL")
print("="*60)
print(f"Model: {best_model_name}")
print(f"Test R¬≤: {best_r2:.4f}")
print(f"Test MAPE: {best_mape:.2f}%")
print("="*60)

In [None]:
# Visualize comparison
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

metrics = ['Test_RMSE', 'Test_MAE', 'Test_R2', 'Test_MAPE']
titles = ['RMSE (lower is better)', 'MAE (lower is better)', 
          'R¬≤ (higher is better)', 'MAPE % (lower is better)']

for i, (metric, title) in enumerate(zip(metrics, titles)):
    ax = axes[i // 2, i % 2]
    
    values = comparison_df[metric].values
    models = comparison_df['Model'].values
    
    colors = ['#3498db', '#e74c3c', '#2ecc71'][:len(models)]
    bars = ax.bar(models, values, color=colors, alpha=0.7, edgecolor='black')
    
    # Highlight best
    if metric == 'Test_R2':
        best_idx = np.argmax(values)
    else:
        best_idx = np.argmin(values)
    bars[best_idx].set_color('#f39c12')
    bars[best_idx].set_edgecolor('black')
    bars[best_idx].set_linewidth(3)
    
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_ylabel(metric.replace('Test_', ''))
    ax.grid(alpha=0.3, axis='y')
    ax.tick_params(axis='x', rotation=15)
    
    # Add value labels
    for j, (bar, val) in enumerate(zip(bars, values)):
        ax.text(bar.get_x() + bar.get_width()/2, val, 
               f'{val:.3f}', ha='center', va='bottom', fontsize=9)

plt.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

print("‚úì Comparison visualization complete")

---
# Part 6: Feature Importance Analysis

In [None]:
# Random Forest Feature Importance
print("Random Forest Feature Importance:")
print("="*60)

rf_importance = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nTop 10 Features:")
display(rf_importance.head(10))

# Plot
plt.figure(figsize=(10, 8))
plt.barh(rf_importance['Feature'].head(10), rf_importance['Importance'].head(10), 
         color='#e74c3c', alpha=0.7, edgecolor='black')
plt.xlabel('Importance Score', fontsize=12)
plt.title('Random Forest: Top 10 Feature Importances', fontsize=14, fontweight='bold')
plt.grid(alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

In [None]:
# XGBoost Feature Importance
print("\nXGBoost Feature Importance:")
print("="*60)

xgb_importance = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': xgb_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nTop 10 Features:")
display(xgb_importance.head(10))

# Plot
plt.figure(figsize=(10, 8))
plt.barh(xgb_importance['Feature'].head(10), xgb_importance['Importance'].head(10), 
         color='#2ecc71', alpha=0.7, edgecolor='black')
plt.xlabel('Importance Score', fontsize=12)
plt.title('XGBoost: Top 10 Feature Importances', fontsize=14, fontweight='bold')
plt.grid(alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

---
# Part 7: Actual vs Predicted Visualizations

In [None]:
# Compare all models - Actual vs Predicted
fig, axes = plt.subplots(1, 3 if baseline_test_r2 is not None else 2, 
                         figsize=(18 if baseline_test_r2 is not None else 12, 5))

plot_idx = 0

# Baseline (if available)
if baseline_test_r2 is not None:
    try:
        baseline_pred = pd.read_csv('baseline_predictions.csv')
        axes[plot_idx].scatter(baseline_pred['Actual'], baseline_pred['Predicted'], 
                              alpha=0.5, s=20, color='#3498db')
        axes[plot_idx].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 
                           'r--', lw=2)
        axes[plot_idx].set_xlabel('Actual', fontsize=12)
        axes[plot_idx].set_ylabel('Predicted', fontsize=12)
        axes[plot_idx].set_title(f'Linear Regression (R¬≤={baseline_test_r2:.3f})', 
                                fontsize=12, fontweight='bold')
        axes[plot_idx].grid(alpha=0.3)
        plot_idx += 1
    except:
        pass

# Random Forest
axes[plot_idx].scatter(y_test, rf_test_pred, alpha=0.5, s=20, color='#e74c3c')
axes[plot_idx].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 
                   'r--', lw=2)
axes[plot_idx].set_xlabel('Actual', fontsize=12)
axes[plot_idx].set_ylabel('Predicted', fontsize=12)
axes[plot_idx].set_title(f'Random Forest (R¬≤={rf_test_metrics["R2"]:.3f})', 
                        fontsize=12, fontweight='bold')
axes[plot_idx].grid(alpha=0.3)
plot_idx += 1

# XGBoost
axes[plot_idx].scatter(y_test, xgb_test_pred, alpha=0.5, s=20, color='#2ecc71')
axes[plot_idx].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 
                   'r--', lw=2)
axes[plot_idx].set_xlabel('Actual', fontsize=12)
axes[plot_idx].set_ylabel('Predicted', fontsize=12)
axes[plot_idx].set_title(f'XGBoost (R¬≤={xgb_test_metrics["R2"]:.3f})', 
                        fontsize=12, fontweight='bold')
axes[plot_idx].grid(alpha=0.3)

plt.suptitle('Actual vs Predicted: All Models', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

---
# Part 8: Save Results

In [None]:
# Save model comparison
comparison_df.to_csv('model_comparison.csv', index=False)
print("‚úì Model comparison saved: model_comparison.csv")

# Save feature importance
rf_importance.to_csv('rf_feature_importance.csv', index=False)
xgb_importance.to_csv('xgb_feature_importance.csv', index=False)
print("‚úì Feature importance saved")

# Save best model predictions (for Vergil!)
best_predictions = pd.DataFrame({
    'Date': df_clean.loc[test_mask, 'Date'].values,
    'City': df_clean.loc[test_mask, 'City'].values,
    'Actual_MoM_Change': y_test.values,
    'Predicted_MoM_Change': xgb_test_pred if best_model_name == 'XGBoost' else rf_test_pred,
    'Error': y_test.values - (xgb_test_pred if best_model_name == 'XGBoost' else rf_test_pred)
})

best_predictions.to_csv('final_predictions.csv', index=False)
print(f"‚úì Final predictions saved: final_predictions.csv")
print(f"  (Using {best_model_name})")

print("\n‚úì All results saved successfully!")

---
# Part 9: Summary & Next Steps

In [None]:
print("="*80)
print("FINAL SUMMARY")
print("="*80)

print("\nüìä MODELS TESTED:")
for idx, row in comparison_df.iterrows():
    star = " üèÜ" if row['Model'] == best_model_name else ""
    print(f"  {row['Model']:20} - R¬≤: {row['Test_R2']:.4f}, MAPE: {row['Test_MAPE']:.2f}%{star}")

print(f"\nüèÜ BEST MODEL: {best_model_name}")
print(f"   R¬≤ = {best_r2:.4f} ({best_r2*100:.1f}% variance explained)")
print(f"   MAPE = {best_mape:.2f}% (average error)")

if best_r2 > 0.85:
    print("\n‚úì EXCELLENT performance! (R¬≤ > 0.85)")
elif best_r2 > 0.75:
    print("\n‚úì GOOD performance! (R¬≤ > 0.75)")
else:
    print("\n‚ö† ACCEPTABLE performance (R¬≤ > 0.70)")

print("\nüìÅ FILES CREATED:")
print("  1. model_comparison.csv - Performance metrics for all models")
print("  2. rf_feature_importance.csv - Random Forest feature rankings")
print("  3. xgb_feature_importance.csv - XGBoost feature rankings")
print("  4. final_predictions.csv - Best model predictions (FOR VERGIL!)")

print("\nüì§ NEXT STEPS:")
print("  1. Share final_predictions.csv with Vergil (Tableau dashboard)")
print("  2. Share model_comparison.csv with Ryan (presentation)")
print("  3. Write Milestone 3 report (include all 3 models)")
print("  4. Prepare final presentation")

print("\n" + "="*80)
print("‚úÖ ADVANCED MODELING COMPLETE!")
print("="*80)