In [3]:
# House Price Prediction Model
# Model Building Notebook

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import warnings
warnings.filterwarnings('ignore')

# 1. LOAD DATASET
# Read the raw housing table from 'Housing.csv' (resolved against the current
# working directory) and report its dimensions.
print("Loading dataset...")
df = pd.read_csv('Housing.csv')
print(f"Dataset shape: {df.shape}")

# 2. FEATURE SELECTION
# Selecting 6 features from the recommended 9
target = 'SalePrice'
selected_features = [
    'OverallQual',
    'GrLivArea',
    'TotalBsmtSF',
    'GarageCars',
    'YearBuilt',
    'Neighborhood',
]

# Work on an independent copy limited to the chosen predictors plus the
# target, so the preprocessing below never modifies the raw frame `df`.
data = df.loc[:, [*selected_features, target]].copy()
print(f"\nSelected features: {selected_features}")
print(f"Working dataset shape: {data.shape}")

# 3. DATA PREPROCESSING

# 3a. Handle Missing Values
# Impute gaps in place of dropping rows: the median for the numeric columns
# listed below and the most frequent value for Neighborhood.
# NOTE(review): the fill statistics are computed on the full dataset, i.e.
# before the train/test split further down, so test rows influence them.
print("\n--- Handling Missing Values ---")
print(f"Missing values before:\n{data.isnull().sum()}")

# Fill numeric columns with median.
# The filled column is assigned back to `data` rather than calling
# `data[col].fillna(..., inplace=True)`: that form operates on a selection of
# the frame, raises a FutureWarning in pandas 2.x and leaves `data` unchanged
# once Copy-on-Write is active (pandas 3.0).
numeric_cols = ['TotalBsmtSF', 'GarageCars']
for col in numeric_cols:
    if data[col].isnull().any():
        data[col] = data[col].fillna(data[col].median())

# Fill categorical with mode (first mode if several values tie).
if data['Neighborhood'].isnull().any():
    data['Neighborhood'] = data['Neighborhood'].fillna(
        data['Neighborhood'].mode()[0]
    )

print(f"\nMissing values after:\n{data.isnull().sum()}")

# 3b. Encode Categorical Variables
# Replace each Neighborhood name with an integer code. The fitted encoder
# stays available as `le` so the same name-to-code mapping can be reused.
# NOTE(review): LabelEncoder assigns codes in sorted order of the names, so
# the numeric order carries no meaning of its own.
print("\n--- Encoding Categorical Variables ---")
le = LabelEncoder()
le.fit(data['Neighborhood'])
data['Neighborhood_Encoded'] = le.transform(data['Neighborhood'])
print(f"Neighborhood encoded: {len(le.classes_)} unique categories")

# The text column is no longer needed once its encoded twin exists.
del data['Neighborhood']

# 3c. Feature Scaling
# Intentionally skipped: Random Forest is tree-based and does not require scaled features.
# If you switch to Linear Regression or SVR, uncomment the scaling code below:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# feature_cols = [col for col in data.columns if col != target]
# data[feature_cols] = scaler.fit_transform(data[feature_cols])

# 4. PREPARE DATA FOR TRAINING
# Separate the target column from the predictors, then hold out part of the
# rows for evaluation.
print("\n--- Preparing Data for Training ---")
y = data[target]
X = data.drop(columns=[target])

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature columns: {list(X.columns)}")

# Split data: 20% of the rows go to the test set; the fixed seed makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# 5. TRAIN MODEL
# Fit a random forest regressor on the training split only.
print("\n--- Training Random Forest Model ---")

# Hyperparameters gathered in one place so they are easy to inspect or tweak.
rf_params = {
    'n_estimators': 100,
    'max_depth': 20,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,  # fixed seed for repeatable forests
    'n_jobs': -1,        # use every available CPU core
}
model = RandomForestRegressor(**rf_params)

model.fit(X_train, y_train)
print("Model training completed!")

# 6. MODEL EVALUATION
# Score the fitted model on both splits; comparing the training and test
# figures shows how much the forest overfits.
print("\n--- Model Evaluation ---")

# Training split: predictions and error metrics
y_train_pred = model.predict(X_train)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, y_train_pred)

# Test split: predictions and error metrics
y_test_pred = model.predict(X_test)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)

# Display results.
# MAE and RMSE are in the target's units; MSE is in squared units, but the
# `$` prefix is kept so the printed report stays unchanged.
print("\nTraining Set Metrics:")
print(f"  MAE:  ${train_mae:,.2f}")
print(f"  MSE:  ${train_mse:,.2f}")
print(f"  RMSE: ${train_rmse:,.2f}")
print(f"  R²:   {train_r2:.4f}")

print("\nTest Set Metrics:")
print(f"  MAE:  ${test_mae:,.2f}")
print(f"  MSE:  ${test_mse:,.2f}")
print(f"  RMSE: ${test_rmse:,.2f}")
print(f"  R²:   {test_r2:.4f}")

# Feature importance
# Pair every predictor column with the importance score reported by the
# fitted forest, then rank from most to least important.
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)

print("\n--- Feature Importance ---")
print(feature_importance)

# 7. SAVE MODEL
print("\n--- Saving Model ---")

# Bundle the fitted model with what is needed to reuse it: the Neighborhood
# encoder, the feature column order used for training, and the test-set
# scores for reference.
test_metrics = {
    'test_mae': test_mae,
    'test_rmse': test_rmse,
    'test_r2': test_r2,
}
model_data = {
    'model': model,
    'label_encoder': le,
    'feature_columns': X.columns.tolist(),
    'metrics': test_metrics,
}

# Written to the current working directory.
joblib.dump(model_data, 'house_price_model.pkl')
print("Model saved successfully as 'house_price_model.pkl'")

# 8. VERIFY MODEL LOADING
# Round-trip check: reload the artifact that was just written and predict
# with it. joblib files are pickle-based, so only load files you trust.
print("\n--- Verifying Model Load ---")
loaded_data = joblib.load('house_price_model.pkl')
loaded_model = loaded_data['model']

# Test prediction on the first five held-out rows (positional slice).
sample_prediction = loaded_model.predict(X_test.iloc[:5])
print("\nSample predictions from loaded model:")
sample_actuals = y_test.iloc[:5]
for house_no, (pred, actual) in enumerate(zip(sample_prediction, sample_actuals), start=1):
    print(f"  House {house_no}: Predicted=${pred:,.2f}, Actual=${actual:,.2f}")

print("\n✓ Model development completed successfully!")

Loading dataset...
Dataset shape: (1460, 81)

Selected features: ['OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars', 'YearBuilt', 'Neighborhood']
Working dataset shape: (1460, 7)

--- Handling Missing Values ---
Missing values before:
OverallQual     0
GrLivArea       0
TotalBsmtSF     0
GarageCars      0
YearBuilt       0
Neighborhood    0
SalePrice       0
dtype: int64

Missing values after:
OverallQual     0
GrLivArea       0
TotalBsmtSF     0
GarageCars      0
YearBuilt       0
Neighborhood    0
SalePrice       0
dtype: int64

--- Encoding Categorical Variables ---
Neighborhood encoded: 25 unique categories

--- Preparing Data for Training ---
Features shape: (1460, 6)
Target shape: (1460,)

Feature columns: ['OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars', 'YearBuilt', 'Neighborhood_Encoded']

Training set size: 1168
Test set size: 292

--- Training Random Forest Model ---
Model training completed!

--- Model Evaluation ---

Training Set Metrics:
  MAE:  $10,408.87
 