# Module 03 — Housing Price Predictions
## CSE 450 Machine Learning | Team 8

**Team Members**: Dawson, Peter, Tanner

**Objective**: Predict King County house prices using XGBoost regression

---

## 1. Setup & Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import (mean_squared_error, mean_absolute_error,
                             r2_score, mean_absolute_percentage_error)
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
%matplotlib inline

print('All imports loaded successfully.')

## 2. Load Data

In [None]:
# Download the three course datasets: training data plus the two holdout sets.
df = pd.read_csv(
    'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv'
)
holdout = pd.read_csv(
    'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing_holdout_test.csv'
)
mini_holdout = pd.read_csv(
    'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing_holdout_test_mini.csv'
)

# Report the shape of each dataset; labels are padded to 14 characters so the
# numbers line up.
for dataset_label, dataset in (
    ('Training:', df),
    ('Holdout:', holdout),
    ('Mini holdout:', mini_holdout),
):
    row_count, col_count = dataset.shape
    print(f'{dataset_label.ljust(14)}{row_count:,} rows x {col_count} cols')

## 3. Data Exploration

In [None]:
# Print the training frame's structural summary via DataFrame.info().
df.info()

In [None]:
# Summary statistics from describe(), rounded to two decimals.
# This is the cell's final bare expression, so the notebook displays the result.
df.describe().round(2)

In [None]:
# Count nulls per column and list only the columns that actually contain any.
missing = df.isnull().sum()
columns_with_gaps = missing[missing > 0]

print('Missing values:')
if columns_with_gaps.empty:
    print('None found \u2014 dataset is clean.')
else:
    print(columns_with_gaps)

In [None]:
# Distribution of the target: raw prices on the left, log1p(prices) on the right.
price = df['price']
hist_style = dict(bins=50, edgecolor='black', alpha=0.7, color='steelblue')

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
raw_ax, log_ax = axes

# Left panel: raw prices with the mean and median marked by dashed lines.
raw_ax.hist(price, **hist_style)
raw_ax.set_title('House Price Distribution', fontsize=14)
raw_ax.set_xlabel('Price ($)')
for stat_name, stat_value, line_color in (
    ('Mean', price.mean(), 'red'),
    ('Median', price.median(), 'orange'),
):
    raw_ax.axvline(stat_value, color=line_color, linestyle='--',
                   label=f'{stat_name}: ${stat_value:,.0f}')
raw_ax.legend()

# Right panel: the same data after a log1p transform.
log_ax.hist(np.log1p(price), **hist_style)
log_ax.set_title('Log(Price) Distribution', fontsize=14)
log_ax.set_xlabel('Log(Price)')

plt.tight_layout()
plt.show()

print(f'Price range: ${price.min():,.0f} to ${price.max():,.0f}')
print(f'Skewness: {price.skew():.2f}')

In [None]:
# Correlation of every numeric column with price, strongest positive first.
numeric_columns = df.select_dtypes(include=[np.number])
corr = numeric_columns.corr()['price'].sort_values(ascending=False)

print('Correlation with price:', corr.to_string(), sep='\n')

In [None]:
# Correlation heatmap of the 12 columns with the largest absolute correlation
# to price. `corr` includes price's correlation with itself, so price is one
# of the 12.
correlation_strength = corr.abs().sort_values(ascending=False)
top_feats = correlation_strength.head(12).index
top_corr_matrix = df[top_feats].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    top_corr_matrix,
    ax=ax,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    square=True,
)
ax.set_title('Correlation Matrix \u2014 Top 12 Features', fontsize=14)
plt.tight_layout()
plt.show()

## 4. Preprocessing & Feature Engineering

In [None]:
def preprocess(data, is_training=True, train_columns=None):
    """
    Build the model feature table from a raw housing frame.

    The same steps run for training and holdout data; the input frame is
    copied and never modified. Columns that are not touched here are passed
    through unchanged (the original authors note that keeping zipcode, lat
    and long together outperformed using a subset of them).

    Parameters:
        data: raw DataFrame containing at least date, yr_built, yr_renovated,
            bedrooms, bathrooms, sqft_living, sqft_lot, sqft_above and
            sqft_basement (plus price when is_training is True).
        is_training: when True, split off the price column as the target.
        train_columns: optional list of feature names used only when
            is_training is False; the result is aligned to exactly these
            columns in this order, with absent columns filled with 0.

    Returns:
        (features, target) when is_training is True, otherwise features only.
    """
    frame = data.copy()

    # --- Sale date parts (the date column itself is dropped below) ---
    sold_on = pd.to_datetime(frame['date'])
    frame['year_sold'] = sold_on.dt.year
    frame['month_sold'] = sold_on.dt.month

    # --- Age features ---
    # yr_renovated == 0 is treated as "never renovated"; in that case the
    # years since renovation fall back to the age of the house.
    was_renovated = frame['yr_renovated'] > 0
    frame['age'] = frame['year_sold'] - frame['yr_built']
    frame['renovated'] = was_renovated.astype(int)
    frame['years_since_renovation'] = np.where(
        was_renovated,
        frame['year_sold'] - frame['yr_renovated'],
        frame['age'],
    )

    # --- Derived features ---
    def ratio(numerator, denominator):
        # Denominators are floored at 1 so a zero never causes a division by zero.
        return numerator / denominator.clip(lower=1)

    frame['has_basement'] = (frame['sqft_basement'] > 0).astype(int)
    frame['total_rooms'] = frame['bedrooms'] + frame['bathrooms']
    frame['living_lot_ratio'] = ratio(frame['sqft_living'], frame['sqft_lot'])
    frame['sqft_per_room'] = ratio(frame['sqft_living'], frame['total_rooms'])
    frame['above_ground_ratio'] = ratio(frame['sqft_above'], frame['sqft_living'])

    # --- Drop raw columns that the engineered features replace ---
    frame = frame.drop(
        columns=['id', 'date', 'yr_built', 'yr_renovated'],
        errors='ignore',
    )

    # --- Return ---
    if is_training:
        target = frame.pop('price')
        return frame, target

    # Holdout data may or may not carry a price column; never return it.
    frame = frame.drop(columns=['price'], errors='ignore')
    if train_columns is None:
        return frame

    # Align to the training layout: same columns, same order, 0 for any
    # column the holdout frame does not have.
    return frame.reindex(columns=train_columns, fill_value=0)

print('Preprocessing function defined.')

In [None]:
# Build the feature matrix X and target y from the training data.
X, y = preprocess(df, is_training=True)
feature_names = list(X.columns)

print(f'Features: {len(feature_names)} columns')
print(f'Target: {len(y):,} values')
print(f'\nFeature list:\n{feature_names}')

In [None]:
# Sanity checks: the engineered features should contain no nulls or infinities.
miss = X.isnull().sum()
feature_gaps = miss[miss > 0]

print('Missing values after preprocessing:')
if feature_gaps.empty:
    print('None')
else:
    print(feature_gaps)

# Count +/-inf across the numeric feature columns.
numeric_features = X.select_dtypes(include=[np.number])
infinite_count = np.isinf(numeric_features).sum().sum()
print(f'\nInfinite values: {infinite_count}')

## 5. Train/Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Labels are padded to 14 characters so the row counts line up.
for split_label, split_features in (('Training set:', X_train), ('Test set:', X_test)):
    print(f'{split_label.ljust(14)}{len(split_features):,} rows')

In [None]:
def evaluate(model, X_test, y_test, name='Model'):
    """Score a fitted model on a test set, print a short report, return the metrics.

    Parameters:
        model: any object with a predict(X) method.
        X_test: features passed to model.predict.
        y_test: true target values to compare the predictions against.
        name: label used in the printed report and stored under 'model'.

    Returns:
        dict with keys 'model', 'rmse', 'mae', 'r2', 'mape' (in percent)
        and 'preds' (the raw predictions).
    """
    preds = model.predict(X_test)
    scores = {
        'rmse': np.sqrt(mean_squared_error(y_test, preds)),
        'mae': mean_absolute_error(y_test, preds),
        'r2': r2_score(y_test, preds),
        # sklearn returns MAPE as a fraction; report it as a percentage.
        'mape': mean_absolute_percentage_error(y_test, preds) * 100,
    }

    # The trailing empty entry produces the blank line after the report.
    report_lines = [
        f'{name}:',
        f'  RMSE:  ${scores["rmse"]:,.2f}',
        f'  MAE:   ${scores["mae"]:,.2f}',
        f'  R2:    {scores["r2"]:.4f}',
        f'  MAPE:  {scores["mape"]:.2f}%',
        '',
    ]
    print('\n'.join(report_lines))

    return {'model': name, **scores, 'preds': preds}

# Metrics for every model trained below, keyed by a short model id.
results = {}
print('Evaluation function defined.')

## 6. Model Training

In [None]:
# Baseline: always predict the mean price seen during fitting
class MeanBaseline:
    """Reference model that predicts one constant, the mean of the fitted target."""

    def fit(self, X, y):
        """Store the mean of ``y`` in ``mean_`` and return ``self``; ``X`` is unused."""
        self.mean_ = y.mean()
        return self

    def predict(self, X):
        """Return an array with one copy of the stored mean per row of ``X``."""
        row_count = len(X)
        return np.full(shape=row_count, fill_value=self.mean_)

# Fit the constant baseline on the training target and score it on the test set.
baseline = MeanBaseline()
baseline.fit(X_train, y_train)
results['Baseline'] = evaluate(baseline, X_test, y_test, name='Baseline (Mean)')

In [None]:
# Ordinary least-squares linear regression on all engineered features.
lr = LinearRegression().fit(X_train, y_train)
results['Linear'] = evaluate(lr, X_test, y_test, name='Linear Regression')

In [None]:
# Random forest: 200 trees with no depth limit, trained with n_jobs=-1.
rf_params = {
    'n_estimators': 200,
    'max_depth': None,
    'random_state': 42,
    'n_jobs': -1,
}
rf = RandomForestRegressor(**rf_params).fit(X_train, y_train)
results['RF'] = evaluate(rf, X_test, y_test, name='Random Forest')

In [None]:
# scikit-learn gradient boosting: 200 trees of depth 5, learning rate 0.1.
gb_params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 5,
    'random_state': 42,
}
gb = GradientBoostingRegressor(**gb_params).fit(X_train, y_train)
results['GB'] = evaluate(gb, X_test, y_test, name='Gradient Boosting')

In [None]:
# XGBoost before tuning. These hyperparameters are set explicitly here rather
# than searched; the model is labelled "default" in the results table to
# distinguish it from the grid-searched model.
xgb_default_params = dict(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)
xgb_default = XGBRegressor(**xgb_default_params)
xgb_default.fit(X_train, y_train)
results['XGB_default'] = evaluate(xgb_default, X_test, y_test, name='XGBoost (default)')

## 7. Hyperparameter Tuning (XGBoost)

Use `GridSearchCV` to search for the best XGBoost hyperparameters. This may take a few minutes.

In [None]:
# Number of cross-validation folds. Defined once so the progress message and
# the search itself cannot drift apart.
CV_FOLDS = 3

# Candidate hyperparameters; every combination is tried.
param_grid = {
    'n_estimators': [300, 500, 800],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'min_child_weight': [3, 5],
}

n_combinations = np.prod([len(values) for values in param_grid.values()])
print(f'Grid search: {n_combinations} combinations x {CV_FOLDS} folds')
print('Training...')

# Parallelise at ONE level only. GridSearchCV already fans the candidate fits
# out across all cores (n_jobs=-1), so each XGBoost fit runs single-threaded
# (n_jobs=1). Requesting all cores at both levels makes the workers compete
# for the same CPUs, which the XGBoost documentation warns slows both down.
# Note: best_estimator_ therefore also carries n_jobs=1.
xgb_grid = GridSearchCV(
    XGBRegressor(random_state=42, n_jobs=1),
    param_grid,
    cv=CV_FOLDS,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)
xgb_grid.fit(X_train, y_train)

# best_score_ is the negated mean squared error, so flip the sign before the
# square root to report an RMSE in dollars.
print(f'\nBest parameters: {xgb_grid.best_params_}')
print(f'Best CV RMSE: ${np.sqrt(-xgb_grid.best_score_):,.2f}')

In [None]:
# Take the best estimator found by the grid search and score it on the
# held-out test set.
best_xgb = xgb_grid.best_estimator_
tuned_metrics = evaluate(best_xgb, X_test, y_test, name='XGBoost (tuned)')
results['XGB_tuned'] = tuned_metrics

## 8. Cross-Validation

In [None]:
# 5-fold cross-validation of the tuned model on the full data set (X, y).
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    best_xgb, X, y,
    cv=kf,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
# Scores are negated MSE values; convert each fold to an RMSE.
cv_rmse = np.sqrt(-cv_scores)

fold_labels = [f'${fold_rmse:,.0f}' for fold_rmse in cv_rmse]
rmse_mean = cv_rmse.mean()
rmse_std = cv_rmse.std()

print('5-Fold Cross-Validation Results:')
print(f'  RMSE scores: {fold_labels}')
print(f'  Mean RMSE:   ${rmse_mean:,.2f}')
print(f'  Std RMSE:    ${rmse_std:,.2f}')
# "CV" on this line is the coefficient of variation (std / mean), in percent.
print(f'  CV:          {rmse_std/rmse_mean*100:.1f}%')

## 9. Feature Importance

In [None]:
# Feature importances reported by the tuned XGBoost model, labelled by column name.
importances = pd.Series(best_xgb.feature_importances_, index=X.columns)
top_n = 15
top_features = importances.nlargest(top_n)

# Sort ascending before plotting so the most important feature ends up at the
# top of the horizontal bar chart.
fig, ax = plt.subplots(figsize=(10, 8))
top_features.sort_values().plot.barh(ax=ax, color='steelblue', edgecolor='black')
ax.set_xlabel('Feature Importance (Gain)', fontsize=12)
ax.set_title(f'Top {top_n} Price Drivers in House Price Predictions', fontsize=14)
ax.tick_params(axis='y', labelsize=11)
plt.tight_layout()
plt.show()

print('\nTop 7 Price Drivers:')
for rank, (feature_name, importance) in enumerate(importances.nlargest(7).items(), start=1):
    print(f'  {rank}. {feature_name}: importance = {importance:.4f}')

## 10. Model Comparison

In [None]:
# Side-by-side metrics table: one row per model, without the raw predictions.
metric_rows = []
for model_result in results.values():
    metrics_only = dict(model_result)
    metrics_only.pop('preds', None)
    metric_rows.append(metrics_only)
comparison = pd.DataFrame(metric_rows).set_index('model')

print(comparison.to_string())
print()

# The best model is the one with the lowest test-set RMSE.
best_model_name = comparison['rmse'].idxmin()
print(f'Best model: {best_model_name} (lowest RMSE)')

In [None]:
# Model comparison chart: one horizontal bar panel per metric.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
models = comparison.index.tolist()
colors = plt.cm.Set2(np.linspace(0, 1, len(models)))

# (metric column, x-axis label, panel title)
metric_panels = (
    ('rmse', 'RMSE ($)', 'RMSE (lower = better)'),
    ('mae', 'MAE ($)', 'MAE (lower = better)'),
    ('r2', 'R-squared', 'R-squared (higher = better)'),
)
for panel_ax, (metric, axis_label, panel_title) in zip(axes, metric_panels):
    panel_ax.barh(models, comparison[metric], color=colors, edgecolor='black')
    panel_ax.set_xlabel(axis_label)
    panel_ax.set_title(panel_title)

# Only the R-squared panel gets a fixed 0-1 x-axis.
axes[2].set_xlim(0, 1)

plt.suptitle('Model Performance Comparison', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

## 11. Visualizations

In [None]:
# Predicted vs actual prices for the tuned model on the test set.
preds_test = best_xgb.predict(X_test)
r2_val = r2_score(y_test, preds_test)
rmse_val = np.sqrt(mean_squared_error(y_test, preds_test))

# Both axes share one range (in millions of dollars, padded by 5%) so the
# dashed diagonal marks a perfect prediction.
axis_max = max(y_test.max(), preds_test.max()) / 1e6 * 1.05
lims = [0, axis_max]

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(y_test / 1e6, preds_test / 1e6, alpha=0.3, s=8, color='#2E86AB')
ax.plot(lims, lims, 'r--', linewidth=2, alpha=0.8, label='Perfect Prediction')
ax.set_xlabel('Actual Price (Millions $)', fontsize=12)
ax.set_ylabel('Predicted Price (Millions $)', fontsize=12)
ax.set_title('Predicted vs Actual House Prices', fontsize=14, fontweight='bold')
ax.set_xlim(lims)
ax.set_ylim(lims)
ax.set_aspect('equal')
ax.legend(fontsize=11)
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

# Summary box in the upper-left corner (axes-relative coordinates).
summary_text = f'R-squared = {r2_val:.4f}\nRMSE = ${rmse_val:,.0f}'
ax.text(0.05, 0.92, summary_text,
        transform=ax.transAxes, fontsize=12,
        bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.9))

plt.tight_layout()
plt.show()

In [None]:
# Residual distribution: actual minus predicted, so positive values are
# under-predictions.
residuals = y_test - preds_test
residual_summary = (
    f'Mean error: ${residuals.mean():,.0f}\n'
    f'Median error: ${residuals.median():,.0f}'
)

fig, ax = plt.subplots(figsize=(10, 5))
# Errors are plotted in thousands of dollars; the dashed line marks zero error.
ax.hist(residuals / 1000, bins=60, edgecolor='black', alpha=0.7, color='#2E86AB')
ax.axvline(0, color='red', linestyle='--', linewidth=2)
ax.set_xlabel('Prediction Error (Thousands $)', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_title('Distribution of Prediction Errors', fontsize=14, fontweight='bold')
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.text(0.72, 0.85, residual_summary,
        transform=ax.transAxes, fontsize=11,
        bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.9))

plt.tight_layout()
plt.show()

## 12. Holdout Predictions

**CRITICAL**: Filename must be `team8-module3-predictions.csv` (DASHES, not underscores!)

In [None]:
# Run the holdout set through the same preprocessing and align it to the
# training feature columns (same names, same order).
train_columns = list(X.columns)
holdout_X = preprocess(holdout, is_training=False, train_columns=train_columns)
columns_match = holdout_X.columns.tolist() == train_columns

print(f'Holdout features: {holdout_X.shape}')
print(f'Training features: {X.shape}')
print(f'Columns match: {columns_match}')

In [None]:
# Predict the holdout set and write the single-column submission file.
submission_path = 'team8-module3-predictions.csv'
holdout_preds = best_xgb.predict(holdout_X)

submission = pd.DataFrame({'price': holdout_preds})
submission.to_csv(submission_path, index=False)

# VERIFY: read the file back from disk and summarise what was written.
check = pd.read_csv(submission_path)
written_prices = check['price']
all_identical = written_prices.nunique() == 1

print('=== HOLDOUT VERIFICATION ===')
print(f'Filename:        {submission_path}')
print(f'Columns:         {list(check.columns)}')
print(f'Rows:            {len(check)}')
print(f'All same value?  {all_identical}  (should be False!)')
print(f'Min prediction:  ${written_prices.min():,.2f}')
print(f'Max prediction:  ${written_prices.max():,.2f}')
print(f'Mean prediction: ${written_prices.mean():,.2f}')
print()
print('First 10 predictions:')
print(check.head(10).to_string())

## 13. Mini Holdout Check (PM Checkin)

In [None]:
# Mini holdout: preprocess, predict, save, then read the file back as a check.
mini_path = 'team8-module3-mini-predictions.csv'

# Same preprocessing and column alignment as the full holdout set.
mini_X = preprocess(mini_holdout, is_training=False, train_columns=train_columns)
print(f'Mini holdout features: {mini_X.shape}')

# Predict and write the single-column prediction file.
mini_preds = best_xgb.predict(mini_X)
mini_sub = pd.DataFrame({'price': mini_preds})
mini_sub.to_csv(mini_path, index=False)

# Re-read from disk to confirm what was actually written.
check_mini = pd.read_csv(mini_path)
sample_predictions = check_mini['price'].head().tolist()
print(f'Saved: {mini_path}')
print(f'Rows: {len(check_mini)}')
print(f'Sample predictions: {sample_predictions}')