# House Price Prediction System - Model Development

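This notebook loads the housing dataset, preprocesses six selected features, trains and evaluates a Random Forest regressor, and saves the trained model together with its label encoders.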

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import os

# Seed NumPy's global random generator so NumPy-driven randomness repeats between runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## 1. Load Dataset

In [None]:
# Read the training data from train.csv, resolved relative to the current working directory.
# Note: Download the dataset from Kaggle and place train.csv in the model directory
csv_path = 'train.csv'
df = pd.read_csv(csv_path)
print(f"Dataset loaded successfully. Shape: {df.shape}")

# Trailing expression: the notebook renders the first rows as the cell output.
df.head()

## 2. Data Preprocessing

In [None]:
# The six model inputs (five numeric columns plus the categorical Neighborhood)
# and the column the model learns to predict.
selected_features = [
    'OverallQual',
    'GrLivArea',
    'TotalBsmtSF',
    'GarageCars',
    'YearBuilt',
    'Neighborhood',
]
target = 'SalePrice'

# Work on an independent copy so later edits to `data` never touch `df`.
data = df.loc[:, [*selected_features, target]].copy()
print(f"Selected features: {selected_features}")
print(f"Data shape: {data.shape}")

In [None]:
# Count the null entries in each selected feature column.
missing_counts = data[selected_features].isna().sum()
print("Missing values:")
print(missing_counts)

In [None]:
# Handle missing values.
#
# Each filled column is assigned back to `data` rather than calling
# `data[feature].fillna(..., inplace=True)`: the in-place form operates on the
# Series returned by `data[feature]`, which emits a FutureWarning on pandas 2.x
# and leaves `data` unchanged when Copy-on-Write is active.
#
# NOTE(review): the median/mode are computed on the full dataset, i.e. before
# the train/test split performed later in this notebook, so test rows
# contribute to the fill values - confirm this is acceptable.

# For numerical features, use median
numerical_features = ['OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars', 'YearBuilt']
for feature in numerical_features:
    if data[feature].isnull().any():
        data[feature] = data[feature].fillna(data[feature].median())

# For categorical features, use mode (first value when several are tied)
categorical_features = ['Neighborhood']
for feature in categorical_features:
    if data[feature].isnull().any():
        data[feature] = data[feature].fillna(data[feature].mode()[0])

print("Missing values after handling:")
print(data[selected_features].isnull().sum())

In [None]:
# Turn each categorical column into integer codes, keeping the fitted encoder
# per column so the same mapping can be reapplied later.
# NOTE: this cell overwrites the column in `data`; running it a second time
# without re-running the cells above would fit the encoder on the
# already-encoded integers (cast to strings) instead of the original labels.
label_encoders = {}
for feature in categorical_features:
    encoder = LabelEncoder()
    raw_labels = data[feature].astype(str)
    data[feature] = encoder.fit_transform(raw_labels)
    label_encoders[feature] = encoder
    print(f"Encoded {feature}: {len(encoder.classes_)} unique values")

# Write the {column name: fitted encoder} mapping to disk for later use.
joblib.dump(label_encoders, 'house_price_model_encoders.pkl')
print("\nLabel encoders saved.")

In [None]:
# Split the prepared frame into the feature matrix and the target vector.
X = data[selected_features]
y = data[target]

# Keep only rows where every feature and the target are present.
mask = X.notna().all(axis=1) & y.notna()
X = X.loc[mask]
y = y.loc[mask]

print(f"Final data shape: X={X.shape}, y={y.shape}")

## 3. Train Model

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report how many samples landed in each partition.
for split_label, split_features in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_label} set: {split_features.shape[0]} samples")

In [None]:
# Random Forest hyperparameters, gathered in one place for easy tuning.
rf_params = {
    "n_estimators": 100,
    "max_depth": 20,
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "random_state": 42,
    "n_jobs": -1,
}

# Build the regressor and fit it on the training partition.
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)
print("Model trained successfully!")

## 4. Evaluate Model

In [None]:
# Predict on both partitions so training and testing error can be compared.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Training-set metrics
train_mae = mean_absolute_error(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, y_train_pred)

# Testing-set metrics
test_mae = mean_absolute_error(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)

# Print one identically formatted section per partition.
banner = "=" * 50
print(banner)
print("MODEL EVALUATION")
print(banner)
for section_name, section_mae, section_mse, section_rmse, section_r2 in (
    ("Training", train_mae, train_mse, train_rmse, train_r2),
    ("Testing", test_mae, test_mse, test_rmse, test_r2),
):
    print(f"\n{section_name} Set Metrics:")
    print(f"  MAE (Mean Absolute Error):  ${section_mae:,.2f}")
    print(f"  MSE (Mean Squared Error):   ${section_mse:,.2f}")
    print(f"  RMSE (Root Mean Squared Error): ${section_rmse:,.2f}")
    print(f"  R² (R-squared):             {section_r2:.4f}")
print(banner)

## 5. Save Model

In [None]:
# Write the fitted model and the label encoders to disk with joblib.
model_path = 'house_price_model.pkl'
encoders_path = 'house_price_model_encoders.pkl'

joblib.dump(model, model_path)
joblib.dump(label_encoders, encoders_path)

print(f"Model saved successfully to: {model_path}")
print(f"Label encoders saved successfully to: {encoders_path}")
print("\nModel development completed! You can now use this model in the web application.")