# Phase 3: Linear Regression Model Building

**Project**: House Price Prediction and Analysis Using King County Housing Data

**Team**: Ashwin, Ashwath, Namrata Mane

**Course**: DA 591 - Final Semester Project

In this phase, we build a Linear Regression model using the features selected from our EDA (Phase 2).

## Step 1: Load Libraries and Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Load the cleaned dataset
df = pd.read_csv('cleaned_house_data.csv')
print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

# Load selected features from EDA.
# Decode the JSON file explicitly as UTF-8 instead of relying on the
# platform's locale-dependent default encoding.
with open('selected_features.json', 'r', encoding='utf-8') as f:
    selected_features = json.load(f)

# Fail early, with a readable message, if the feature list and the dataset
# do not agree; otherwise the mismatch only shows up when the columns are
# selected from the DataFrame.
missing_features = [name for name in selected_features if name not in df.columns]
if missing_features:
    raise KeyError(
        "Features listed in 'selected_features.json' are missing from "
        f"'cleaned_house_data.csv': {missing_features}"
    )

print(f"\nSelected features ({len(selected_features)}):")
for i, feat in enumerate(selected_features, 1):
    print(f"  {i}. {feat}")

## Step 2: Prepare Data for Training

In [None]:
# Target (y) is the 'price' column; features (X) are the EDA-selected columns
y = df['price']
X = df[selected_features]

for part_name, part in (("Features", X), ("Target", y)):
    print(f"{part_name} shape: {part.shape}")

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"\nTraining set: {len(X_train)} samples")
print(f"Testing set:  {len(X_test)} samples")

## Step 3: Train Linear Regression Model

In [None]:
# Build the estimator and fit it on the training split in one step
# (fit() returns the fitted estimator itself)
model = LinearRegression().fit(X_train, y_train)

print("Model trained successfully!")
intercept = model.intercept_
print(f"Intercept: ${intercept:,.2f}")

## Step 4: Evaluate Model Performance

In [None]:
# Predict prices for the held-out test rows
y_pred = model.predict(X_test)

# Score the predictions: RMSE is the square root of the mean squared error,
# reported alongside the mean absolute error and R²
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

report_lines = [
    "Model Performance on Test Set:",
    "=" * 40,
    f"R² Score:  {r2:.4f}  ({r2*100:.1f}% of price variation explained)",
    f"MAE:       ${mae:,.2f}  (average prediction is off by this much)",
    f"RMSE:      ${rmse:,.2f}  (penalizes large errors more)",
]
print("\n".join(report_lines))

# Put the MAE in context by comparing it with the mean test-set price
avg_price = y_test.mean()
print(f"\nAverage house price: ${avg_price:,.2f}")
print(f"MAE as % of avg price: {mae/avg_price*100:.1f}%")

## Step 5: Visualize Results

In [None]:
# Two side-by-side diagnostics: predicted-vs-actual scatter and the
# distribution of prediction errors
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Plot 1: Actual vs Predicted
axes[0].scatter(y_test, y_pred, alpha=0.3, s=10)
# Draw the y = x diagonal (perfect predictions would fall on this line).
# A linear model can predict values below zero, so the lower end follows the
# smallest actual/predicted value instead of being pinned at 0; including 0
# in min() keeps the line unchanged when every value is non-negative.
min_val = min(0, y_test.min(), y_pred.min())
max_val = max(y_test.max(), y_pred.max())
axes[0].plot([min_val, max_val], [min_val, max_val], 'r--', label='Perfect prediction')
axes[0].set_xlabel('Actual Price ($)')
axes[0].set_ylabel('Predicted Price ($)')
axes[0].set_title('Actual vs Predicted Prices', fontweight='bold')
axes[0].legend()

# Plot 2: Residuals (prediction errors), computed as actual minus predicted,
# so positive values mean the model under-predicted
residuals = y_test - y_pred
axes[1].hist(residuals, bins=50, edgecolor='black', alpha=0.7)
axes[1].axvline(0, color='red', linestyle='--', label='Zero error')
axes[1].set_xlabel('Prediction Error ($)')
axes[1].set_ylabel('Count')
axes[1].set_title('Distribution of Prediction Errors', fontweight='bold')
axes[1].legend()

plt.tight_layout()
plt.show()

# 75% of the absolute errors are at or below this value
print(f"Most predictions are within ${np.percentile(np.abs(residuals), 75):,.0f} of the actual price (75th percentile)")

## Step 6: Feature Coefficients

In [None]:
# How much each feature affects the predicted price.
# Rows are ordered by absolute coefficient so the largest effects (in either
# direction) come first.
# NOTE(review): no feature scaling is visible in this notebook, so each
# coefficient is per one unit of its own feature; magnitudes may not be
# directly comparable across features with different units — confirm.
coefficients = pd.DataFrame({
    'Feature': selected_features,
    'Coefficient': model.coef_
}).sort_values('Coefficient', key=abs, ascending=False)

print("Feature Coefficients (how much each feature affects price):")
print("=" * 55)
for feature, coef in zip(coefficients['Feature'], coefficients['Coefficient']):
    # A coefficient of exactly zero neither raises nor lowers the prediction,
    # so it must not be reported as a decrease.
    if coef > 0:
        direction = 'increases'
    elif coef < 0:
        direction = 'decreases'
    else:
        direction = 'does not change'
    print(f"  {feature:20s}: ${coef:>12,.2f}  ({direction} price)")

# Visualize: green bars for positive coefficients, red otherwise
# (a zero coefficient has a zero-length bar, so its colour is never visible)
plt.figure(figsize=(10, 6))
colors = ['green' if c > 0 else 'red' for c in coefficients['Coefficient']]
plt.barh(coefficients['Feature'], coefficients['Coefficient'], color=colors, edgecolor='black', alpha=0.7)
plt.xlabel('Coefficient Value ($)')
plt.title('How Each Feature Affects House Price', fontweight='bold')
plt.axvline(0, color='black', linewidth=0.5)
plt.tight_layout()
plt.show()

## Step 7: Save the Model

In [None]:
# Persist the fitted model to disk with joblib
joblib.dump(model, 'linear_regression_model.pkl')
print("Model saved to 'linear_regression_model.pkl'")

# Recap of the run: model type, data sizes and test-set metrics.
# The leading empty entry reproduces the blank line before the banner.
banner = "=" * 50
summary_lines = [
    "",
    banner,
    "MODEL BUILDING SUMMARY",
    banner,
    "Model: Linear Regression",
    f"Features used: {len(selected_features)}",
    f"Training samples: {X_train.shape[0]}",
    f"Testing samples: {X_test.shape[0]}",
    f"R² Score: {r2:.4f}",
    f"MAE: ${mae:,.2f}",
    f"RMSE: ${rmse:,.2f}",
]
print("\n".join(summary_lines))