# 🏠 House Price Prediction - Model Comparison
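This notebook trains several regression models (Linear Regression, Random Forest, Gradient Boosting and, when installed, XGBoost) on the housing dataset in `Housing.csv` and compares their RMSE and R² on a held-out 20% test split.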


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import joblib

# Read the raw housing table from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="Housing.csv")

# Preview the first five rows as this cell's output
df.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
# Preprocess the data
# One-hot encode the categorical columns (first level of each is dropped).
# The encoded frame gets its own name so `df` keeps the raw data shown by
# df.head() above, and so this cell can be re-run: calling get_dummies again
# on an already-encoded frame would raise KeyError for the missing columns.
cat_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Separate features and target
X = df_encoded.drop("price", axis=1)
y = df_encoded["price"]

# Train-test split (80/20, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optionally scale features (can help with some models).
# The scaler is fit on the training split only and then applied to the test
# split, so no test-set statistics are used when fitting it.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler; as the last expression of the cell, the list of
# written file names returned by joblib.dump is displayed as the cell output.
joblib.dump(scaler, "scaler.joblib")


['scaler.joblib']

In [6]:
# Helper function to evaluate and save models
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    """Fit a model, score it on the test split, report the scores and save it.

    Parameters
    ----------
    name : str
        Display name of the model. It is printed, stored in the returned
        record, and (lower-cased, spaces replaced by underscores) used to
        build the output file name.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, X_test : array-like
        Feature matrices for fitting and for evaluation.
    y_train, y_test : array-like
        Target values matching ``X_train`` and ``X_test``.

    Returns
    -------
    dict
        ``{"Model": name, "RMSE": <root mean squared error>, "R2": <R² score>}``
        computed on the test split.

    Side effects
    ------------
    Prints one summary line and writes the fitted model with ``joblib.dump``
    to ``<name with underscores>_model.pkl`` in the working directory.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # RMSE is the square root of the mean squared error on the test split
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    print(f"{name} -> RMSE: {rmse:.2f}, R²: {r2:.2f}")

    # e.g. "Random Forest" -> "random_forest_model.pkl"
    file_stem = name.lower().replace(' ', '_')
    artifact_path = f"{file_stem}_model.pkl"
    joblib.dump(model, artifact_path)

    return {"Model": name, "RMSE": rmse, "R2": r2}


In [8]:
# Train and evaluate multiple models
results = []

# (name, estimator, train features, test features) for each model that is
# always available. Linear Regression is given the standardized arrays; the
# Random Forest and Gradient Boosting models are given the unscaled frames.
model_specs = [
    ("Linear Regression", LinearRegression(), X_train_scaled, X_test_scaled),
    ("Random Forest", RandomForestRegressor(random_state=42), X_train, X_test),
    ("Gradient Boosting", GradientBoostingRegressor(random_state=42), X_train, X_test),
]
for model_name, estimator, features_train, features_test in model_specs:
    results.append(evaluate_model(model_name, estimator, features_train, features_test, y_train, y_test))

# Optional: XGBoost (if installed)
# Only the import sits inside the try block, so an ImportError raised while
# training, predicting or saving is not misreported as "not installed".
try:
    from xgboost import XGBRegressor
except ImportError:
    print("XGBoost not installed. Skipping...")
else:
    results.append(evaluate_model("XGBoost", XGBRegressor(random_state=42), X_train, X_test, y_train, y_test))

# Show all results (one row per evaluated model)
pd.DataFrame(results)


Linear Regression -> RMSE: 1324506.96, R²: 0.65
Random Forest -> RMSE: 1400565.97, R²: 0.61
Gradient Boosting -> RMSE: 1299385.98, R²: 0.67
XGBoost -> RMSE: 1448271.52, R²: 0.59


Unnamed: 0,Model,RMSE,R2
0,Linear Regression,1324507.0,0.652924
1,Random Forest,1400566.0,0.611919
2,Gradient Boosting,1299386.0,0.665965
3,XGBoost,1448272.0,0.585031
