# 🤖 Machine Learning & Predictive Modeling - Zillow Home Value Index

This notebook demonstrates the creation and evaluation of real estate price prediction models using advanced machine learning techniques.

## Objectives:
- Develop and compare multiple ML algorithms
- Predict future real estate prices
- Analyze feature importance and drivers
- Evaluate model performance with cross-validation
- Create production-ready predictive models


In [None]:
# Import standard libraries
import sys
import warnings
from pathlib import Path

# Import third-party libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Import custom modules (project code is looked up in ../src)
sys.path.append('../src')

from data_processing import ZillowDataProcessor
from visualization import ZillowVisualizer
from models import ZillowPredictor
# NOTE(review): this silences every warning category, including deprecation
# notices from pandas/matplotlib/seaborn; narrow the filter if those matter.
warnings.filterwarnings('ignore')

# Configuration
# The 'seaborn-v0_8' style name only exists in matplotlib >= 3.6; older releases
# expose the same style sheet as 'seaborn'. Pick whichever name is available so
# the cell does not fail with an unknown-style error on older installs.
PLOT_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(PLOT_STYLE)
sns.set_palette("husl")
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)


## 1. Data Preparation for Machine Learning


In [None]:
# Load and prepare data for modeling
# The processor is pointed at the sample Zillow Home Value Index CSV (relative path).
processor = ZillowDataProcessor('../data/Zillow_Home_Value_Index_sample.csv')
# Raw load; `df` is kept for inspection and is not referenced again in the cells shown here.
df = processor.load_data()
# clean_data() is not given a DataFrame, so it presumably works on the data the
# processor holds from load_data() -- TODO confirm in data_processing.py.
# NOTE(review): min_non_null_ratio=0.8 looks like a completeness threshold;
# confirm its exact semantics before changing it.
df_clean = processor.clean_data(fill_method='interpolate', min_non_null_ratio=0.8)
# Growth metrics are computed from the cleaned frame; df_metrics is the input
# to the feature-preparation cell that follows.
df_metrics = processor.calculate_growth_metrics(df_clean)

# Sanity output: number of rows (reported as regions) and the full column list.
print(f"✅ Data prepared for modeling: {len(df_metrics)} regions")
print(f"📊 Available columns: {list(df_metrics.columns)}")


## 2. Feature Engineering & Model Training


In [None]:
# Initialize the predictor and prepare features
predictor = ZillowPredictor()

# Get price columns (all date columns)
# Date columns are recognised by their "20" prefix. str() keeps the check from
# raising AttributeError if a column label is not a string (e.g. an integer).
price_columns = [col for col in df_metrics.columns if str(col).startswith('20')]
# Fail fast with a clear message instead of handing an empty column list to
# prepare_features(), whose behavior on empty input is not visible here.
assert price_columns, "No date columns (labels starting with '20') found in df_metrics"
print(f"📅 Price columns found: {len(price_columns)} months")

# Prepare features for latest price prediction
# X and y are reused by the training cell, so their names must stay as they are.
X, y = predictor.prepare_features(df_metrics, price_columns, target_period='latest')

# Display feature information
print(f"\n📊 Feature Matrix Shape: {X.shape}")
print(f"🎯 Target Variable Shape: {y.shape}")
print(f"📈 Features: {list(X.columns)}")


In [None]:
# Train the predictor's models on X / y (test_size=0.2, fixed random_state so
# reruns use the same seed) and keep the per-model results for the summary below.
results = predictor.train_models(X, y, test_size=0.2, random_state=42)

# Banner followed by one formatted line of R² / RMSE / MAE per model.
print("\n🏆 Model Performance Summary:", "=" * 50, sep="\n")
for model_name in results:
    result = results[model_name]
    print(f"{model_name:20} | R²: {result['r2']:.4f} | RMSE: {result['rmse']:.0f} | MAE: {result['mae']:.0f}")


## 3. Model Evaluation & Visualization


In [None]:
# Create comprehensive model comparison visualizations
MODEL_COMPARISON_PATH = '../results/visualizations/model_comparison.png'
# Create the output folder up front: plot_model_comparison() is defined outside
# this notebook, so it may not create missing directories itself, and a fresh
# checkout might not contain ../results/visualizations yet. No-op if it exists.
Path(MODEL_COMPARISON_PATH).parent.mkdir(parents=True, exist_ok=True)
predictor.plot_model_comparison(save_path=MODEL_COMPARISON_PATH)


In [None]:
# Analyze feature importance for the best performing model
FEATURE_IMPORTANCE_PATH = '../results/visualizations/feature_importance.png'
# Create the output folder up front: plot_feature_importance() is defined
# outside this notebook, so it may not create missing directories itself, and a
# fresh checkout might not contain ../results/visualizations yet. No-op if it exists.
Path(FEATURE_IMPORTANCE_PATH).parent.mkdir(parents=True, exist_ok=True)
predictor.plot_feature_importance(save_path=FEATURE_IMPORTANCE_PATH)
