# HDB Resale Flat Price Prediction (2017 onwards)

This notebook follows a complete ML pipeline:
- Data loading & inspection
- EDA
- Preprocessing & feature engineering
- Base models
- Hyperparameter tuning
- Final model comparison

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Load dataset
# Read the whole CSV, then keep only the first 1000 rows in file order.
# NOTE(review): if the file is sorted (e.g. by registration month), this
# subset is not a representative sample of the data — confirm.
csv_path = "ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv"
df = pd.read_csv(csv_path).iloc[:1000]
df.head()

In [None]:
# Basic information
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print("\nMissing values:")
print(df.isnull().sum())
print("\nBasic statistics:")
df.describe()

In [None]:
# Feature engineering
# Build the features on a copy so the loaded `df` is left untouched and this
# cell can be re-run without failing on columns it has already dropped.
df_features = df.copy()

# Keep the first run of digits of the lease text as an integer feature
# (presumably the "years" part — confirm against the raw values).
df_features['remaining_lease'] = (
    df_features['remaining_lease'].str.extract(r'(\d+)', expand=False).astype(int)
)

# Average every number found in the storey range, so a value such as
# "10 TO 12" becomes 11.0. Taking only the first match would yield the lower
# bound of the range rather than its average.
storey_numbers = df_features['storey_range'].str.extractall(r'(\d+)')[0].astype(int)
df_features['avg_storey'] = storey_numbers.groupby(level=0).mean()
assert df_features['avg_storey'].notna().all(), "storey_range value without any digits"

# Drop columns not useful for prediction. `remaining_lease` is deliberately
# kept: it was parsed above so that it can be used as a numeric feature.
df_features = df_features.drop(columns=['block', 'street_name', 'storey_range'])

# Encode categorical features (one-hot, dropping the first level of each)
df_encoded = pd.get_dummies(df_features, drop_first=True)

# Train-test split: 80% train / 20% test with a fixed seed for reproducibility
X = df_encoded.drop('resale_price', axis=1)
y = df_encoded['resale_price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling: fit the scaler on the training split only, then apply the
# same transformation to the test split so no test statistics leak into it.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Base Models

In [None]:
# Linear Regression
# Fit on the standardized training features and predict the held-out split.
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
lr_pred = lr.predict(X_test_scaled)

# mean_squared_error returns the MSE; take the square root so the value shown
# under the "RMSE" label really is a root-mean-squared error.
print("Linear Regression RMSE:", np.sqrt(mean_squared_error(y_test, lr_pred)))
print("Linear Regression R2:", r2_score(y_test, lr_pred))

In [None]:
# Decision Tree
# Fit on the unscaled training features and predict the held-out split.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)

# mean_squared_error returns the MSE; take the square root so the value shown
# under the "RMSE" label really is a root-mean-squared error.
print("Decision Tree RMSE:", np.sqrt(mean_squared_error(y_test, dt_pred)))
print("Decision Tree R2:", r2_score(y_test, dt_pred))

In [None]:
# Random Forest
# Fit with default hyperparameters on the unscaled training features.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# mean_squared_error returns the MSE; take the square root so the value shown
# under the "RMSE" label really is a root-mean-squared error.
print("Random Forest RMSE:", np.sqrt(mean_squared_error(y_test, rf_pred)))
print("Random Forest R2:", r2_score(y_test, rf_pred))

In [None]:
# Gradient Boosting
# Fit with default hyperparameters on the unscaled training features.
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, y_train)
gb_pred = gb.predict(X_test)

# mean_squared_error returns the MSE; take the square root so the value shown
# under the "RMSE" label really is a root-mean-squared error.
print("Gradient Boosting RMSE:", np.sqrt(mean_squared_error(y_test, gb_pred)))
print("Gradient Boosting R2:", r2_score(y_test, gb_pred))

## Hyperparameter Tuning

In [None]:
# Random Forest Hyperparameter Tuning
# Candidate values searched exhaustively (2 * 3 * 2 = 12 combinations).
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

# 3-fold cross-validated grid search on the training split, selected by R2.
grid_rf = GridSearchCV(RandomForestRegressor(random_state=42), param_grid_rf, cv=3, scoring='r2')
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_
best_rf_pred = best_rf.predict(X_test)

print("Best RF params:", grid_rf.best_params_)
# mean_squared_error returns the MSE; take the square root so the value shown
# under the "RMSE" label really is a root-mean-squared error.
print("Tuned RF RMSE:", np.sqrt(mean_squared_error(y_test, best_rf_pred)))
print("Tuned RF R2:", r2_score(y_test, best_rf_pred))

In [None]:
# Gradient Boosting Hyperparameter Tuning
# Candidate values searched exhaustively (2 * 2 * 2 = 8 combinations).
param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 6]
}

# 3-fold cross-validated grid search on the training split, selected by R2.
grid_gb = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid_gb, cv=3, scoring='r2')
grid_gb.fit(X_train, y_train)
best_gb = grid_gb.best_estimator_
best_gb_pred = best_gb.predict(X_test)

print("Best GB params:", grid_gb.best_params_)
# mean_squared_error returns the MSE; take the square root so the value shown
# under the "RMSE" label really is a root-mean-squared error.
print("Tuned GB RMSE:", np.sqrt(mean_squared_error(y_test, best_gb_pred)))
print("Tuned GB R2:", r2_score(y_test, best_gb_pred))

## Final Model Comparison

In [None]:
# Compare all models
# Each entry maps a display name to (test predictions, RMSE, R2).
# The RMSE is the square root of mean_squared_error, which on its own returns
# the MSE and would not match the "RMSE" label printed below.
models = {
    'Linear Regression': (lr_pred, np.sqrt(mean_squared_error(y_test, lr_pred)), r2_score(y_test, lr_pred)),
    'Decision Tree': (dt_pred, np.sqrt(mean_squared_error(y_test, dt_pred)), r2_score(y_test, dt_pred)),
    'Random Forest': (rf_pred, np.sqrt(mean_squared_error(y_test, rf_pred)), r2_score(y_test, rf_pred)),
    'Gradient Boosting': (gb_pred, np.sqrt(mean_squared_error(y_test, gb_pred)), r2_score(y_test, gb_pred)),
    'Tuned Random Forest': (best_rf_pred, np.sqrt(mean_squared_error(y_test, best_rf_pred)), r2_score(y_test, best_rf_pred)),
    'Tuned Gradient Boosting': (best_gb_pred, np.sqrt(mean_squared_error(y_test, best_gb_pred)), r2_score(y_test, best_gb_pred))
}

print("Model Performance Comparison:")
# The stored predictions are not needed for the printed summary.
for name, (_, rmse, r2) in models.items():
    print(f"{name} - RMSE: {rmse:.2f}, R2: {r2:.4f}")

In [None]:
# Final Linear Regression with scaled features
# NOTE(review): this repeats the baseline Linear Regression fit on the same
# scaled split, so it is expected to reproduce the baseline scores.
lr_final = LinearRegression()
lr_final.fit(X_train_scaled, y_train)
lr_final_pred = lr_final.predict(X_test_scaled)

# mean_squared_error returns the MSE; take the square root so the value shown
# under the "RMSE" label really is a root-mean-squared error.
print("Final Linear Regression RMSE:", np.sqrt(mean_squared_error(y_test, lr_final_pred)))
print("Final Linear Regression R2:", r2_score(y_test, lr_final_pred))

## Conclusion

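- Linear Regression serves as the baseline that the other models are compared against
- A single Decision Tree is prone to overfitting (high variance); compare its train and test scores to confirm this here
- Random Forest and Gradient Boosting can capture non-linear relationships
- Hyperparameter tuning can improve model performance; see the model comparison output for the actual gain
- Likely key features: floor area, town, flat type, lease commence date (verify with the fitted models' feature importances)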