# House Price Prediction Model Development

This notebook builds a Random Forest Regressor model to predict house prices using selected features from the House Prices dataset.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import os
import warnings

warnings.filterwarnings('ignore')

## 2. Load Dataset

In [None]:
# Read the training data into a DataFrame. The path is relative to the
# notebook's current working directory.
df = pd.read_csv('house_prices_train.csv')

# Quick sanity checks: dimensions, a sample of rows, and the column summary.
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()

## 3. Data Preprocessing

### 3.1 Select Features
Six of the nine recommended features are used: `OverallQual`, `GrLivArea`, `TotalBsmtSF`, `GarageCars`, `FullBath`, and `YearBuilt`.

In [None]:
# The model uses six of the nine recommended predictors; SalePrice is the
# regression target.
target = 'SalePrice'
selected_features = [
    'OverallQual',
    'GrLivArea',
    'TotalBsmtSF',
    'GarageCars',
    'FullBath',
    'YearBuilt',
]

print(f"Selected features: {selected_features}")
print(f"Target: {target}")

# Count the NaNs in every column the model will touch, before any cleaning.
missing_before = df[[*selected_features, target]].isnull().sum()
print(f"\nMissing values before handling:")
print(missing_before)

### 3.2 Handle Missing Values

In [None]:
# Restrict the frame to the modelling columns and discard every row that has
# a NaN in any of them (feature or target). The result is a new DataFrame, so
# the raw `df` is left untouched.
model_columns = selected_features + [target]
df_processed = df.loc[:, model_columns].dropna(axis=0, how='any')

print(f"Dataset shape after removing missing values: {df_processed.shape}")
print(f"\nMissing values after handling:")
remaining_missing = df_processed.isnull().sum()
print(remaining_missing)

### 3.3 Separate Features and Target

In [None]:
# Split the cleaned frame into the predictor matrix (X) and the target
# vector (y).
X, y = df_processed[selected_features], df_processed[target]

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# Summary statistics of the target give a feel for the price scale.
target_summary = y.describe()
print(f"\nTarget statistics:")
print(target_summary)

### 3.4 Train/Test Split (BEFORE SCALING - CRITICAL TO AVOID DATA LEAKAGE)

In [None]:
# Hold out 20% of the rows as the test set. The split is done before any
# scaling so that test-set statistics can never influence the fitted scaler
# (data leakage). The fixed random_state keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the size of each partition.
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Test target shape: {y_test.shape}")

### 3.5 Feature Scaling (scaler fitted ONLY on training data)

In [None]:
# Learn the per-feature scaling statistics from the training rows only.
scaler = StandardScaler().fit(X_train)

# Standardize both partitions with those training-set statistics; the test
# rows are only transformed, never used for fitting.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Scaler fitted on training data")
print(f"X_train_scaled shape: {X_train_scaled.shape}")
print(f"X_test_scaled shape: {X_test_scaled.shape}")

# Show the fitted statistics, one entry per selected feature.
print(f"\nScaler mean (fitted on training): {scaler.mean_}")
print(f"Scaler scale (fitted on training): {scaler.scale_}")

## 4. Model Training

In [None]:
# Hyperparameters for the forest, gathered in one place so they are easy to
# read and tweak.
rf_params = {
    'n_estimators': 100,
    'max_depth': 15,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,  # fixed seed for a reproducible forest
    'n_jobs': -1,        # -1 asks scikit-learn to use all available processors
}

# Build the regressor and fit it on the scaled training features.
model = RandomForestRegressor(**rf_params).fit(X_train_scaled, y_train)

print("Model training completed!")

## 5. Model Evaluation

In [None]:
def summarize_regression(y_true, y_pred):
    """Compute the regression metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Model predictions aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2)``, where ``rmse`` is the square root of ``mse``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        mse,
        np.sqrt(mse),
        r2_score(y_true, y_pred),
    )


def report_metrics(heading, mae, mse, rmse, r2):
    """Print one labelled block of regression metrics.

    MAE and RMSE are in the target's units (dollars) and get a ``$`` prefix.
    MSE is in squared dollars, so it is printed without the currency sign.
    """
    print(f"\n{heading}:")
    print(f"  MAE:  ${mae:,.2f}")
    print(f"  MSE:  {mse:,.2f} (dollars squared)")
    print(f"  RMSE: ${rmse:,.2f}")
    print(f"  R²:   {r2:.4f}")


# Predict on both partitions: the training scores show how well the model fits
# the data it saw, the test scores estimate how well it generalizes.
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

# Keep every metric under its own name; later cells compare against test_r2.
train_mae, train_mse, train_rmse, train_r2 = summarize_regression(y_train, y_train_pred)
test_mae, test_mse, test_rmse, test_r2 = summarize_regression(y_test, y_test_pred)

print("="*60)
print("MODEL EVALUATION METRICS")
print("="*60)
report_metrics("TRAINING METRICS", train_mae, train_mse, train_rmse, train_r2)
report_metrics("TEST METRICS", test_mae, test_mse, test_rmse, test_r2)
print("="*60)

### Feature Importance

In [None]:
# Pair each feature name with the importance the fitted forest assigned to it.
importance_table = pd.DataFrame({
    'feature': selected_features,
    'importance': model.feature_importances_,
})

# Rank from most to least important.
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("\nFEATURE IMPORTANCE:")
print(feature_importance.to_string(index=False))

## 6. Save Model and Artifacts

In [None]:
# Artifacts are written to the current working directory; makedirs is a no-op
# when the directory already exists.
model_dir = './'
os.makedirs(model_dir, exist_ok=True)

# Resolve every output location up front.
model_path = os.path.join(model_dir, 'house_price_model.pkl')
scaler_path = os.path.join(model_dir, 'scaler.pkl')
features_path = os.path.join(model_dir, 'selected_features.pkl')

# Persist the fitted forest.
joblib.dump(model, model_path)
print(f"Model saved to: {model_path}")

# Persist the fitted scaler so new inputs can be scaled the same way.
joblib.dump(scaler, scaler_path)
print(f"Scaler saved to: {scaler_path}")

# Persist the ordered list of feature names used for training.
joblib.dump(selected_features, features_path)
print(f"Selected features saved to: {features_path}")

## 7. Verify Model Reloading

In [None]:
# Read the three artifacts back from disk, in the order they were written.
loaded_model, loaded_scaler, loaded_features = (
    joblib.load(artifact_path)
    for artifact_path in (model_path, scaler_path, features_path)
)

# Re-run the test-set prediction using only the reloaded objects and score it.
X_test_rescaled = loaded_scaler.transform(X_test)
y_test_pred_reloaded = loaded_model.predict(X_test_rescaled)
reloaded_r2 = r2_score(y_test, y_test_pred_reloaded)

# The round trip is considered good when the reloaded pipeline reproduces the
# original test R² to within 1e-6.
print("Model reloading verification:")
print(f"  Model reloaded successfully: {loaded_model is not None}")
print(f"  Scaler reloaded successfully: {loaded_scaler is not None}")
print(f"  Features reloaded successfully: {loaded_features == selected_features}")
print(f"  R² score with reloaded model: {reloaded_r2:.4f}")
print(f"  Matches original R²: {abs(reloaded_r2 - test_r2) < 1e-6}")