# HDB Resale Flat Price Prediction (2017 onwards)

This notebook follows a complete ML pipeline:
- Data loading & inspection
- EDA
- Preprocessing & feature engineering
- Base models
- Hyperparameter tuning
- Final model comparison

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load dataset
df = pd.read_csv("ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv")
df = df.head(1000)
df.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price
0,2017-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979,61 years 04 months,232000.0
1,2017-01,ANG MO KIO,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978,60 years 07 months,250000.0
2,2017-01,ANG MO KIO,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,262000.0
3,2017-01,ANG MO KIO,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1980,62 years 01 month,265000.0
4,2017-01,ANG MO KIO,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,265000.0


In [3]:
# Basic information
print("Dataset Info:")
print(df.info())
print("\nMissing values:")
print(df.isnull().sum())
print("\nBasic statistics:")
df.describe()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   month                1000 non-null   object 
 1   town                 1000 non-null   object 
 2   flat_type            1000 non-null   object 
 3   block                1000 non-null   object 
 4   street_name          1000 non-null   object 
 5   storey_range         1000 non-null   object 
 6   floor_area_sqm       1000 non-null   float64
 7   flat_model           1000 non-null   object 
 8   lease_commence_date  1000 non-null   int64  
 9   remaining_lease      1000 non-null   object 
 10  resale_price         1000 non-null   float64
dtypes: float64(2), int64(1), object(8)
memory usage: 86.1+ KB
None

Missing values:
month                  0
town                   0
flat_type              0
block                  0
street_name            0
storey_range        

Unnamed: 0,floor_area_sqm,lease_commence_date,resale_price
count,1000.0,1000.0,1000.0
mean,95.27,1991.162,433647.6
std,23.843146,11.723361,134283.5
min,34.0,1966.0,205000.0
25%,73.0,1983.0,338000.0
50%,95.0,1989.0,412000.0
75%,110.0,2000.0,495000.0
max,165.0,2013.0,1108000.0


In [4]:
# Feature engineering
df['remaining_lease'] = df['remaining_lease'].str.extract(r'(\d+)').astype(int)
df['avg_storey'] = df['storey_range'].str.extract(r'(\d+)').astype(int)

# Drop columns not useful for prediction
df = df.drop(columns=['block', 'street_name', 'storey_range', 'remaining_lease'])

# Encode categorical features
df_encoded = pd.get_dummies(df, drop_first=True)

# Train-test split
X = df_encoded.drop('resale_price', axis=1)
y = df_encoded['resale_price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


## Base Models

In [5]:
# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

print("Linear Regression RMSE:", mean_squared_error(y_test, lr_pred))
print("Linear Regression R2:", r2_score(y_test, lr_pred))

Linear Regression RMSE: 3930201952.124735
Linear Regression R2: 0.8075296729460539


In [6]:
# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)

print("Decision Tree RMSE:", mean_squared_error(y_test, dt_pred))
print("Decision Tree R2:", r2_score(y_test, dt_pred))

Decision Tree RMSE: 6737349437.166021
Decision Tree R2: 0.6700577055723501


In [7]:
# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

print("Random Forest RMSE:", mean_squared_error(y_test, rf_pred))
print("Random Forest R2:", r2_score(y_test, rf_pred))

Random Forest RMSE: 5681936954.242235
Random Forest R2: 0.7217434937938281


In [8]:
# Gradient Boosting
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, y_train)
gb_pred = gb.predict(X_test)

print("Gradient Boosting RMSE:", mean_squared_error(y_test, gb_pred))
print("Gradient Boosting R2:", r2_score(y_test, gb_pred))

Gradient Boosting RMSE: 4932885942.583146
Gradient Boosting R2: 0.758426110857863


## Hyperparameter Tuning

In [None]:
# Random Forest Hyperparameter Tuning
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

grid_rf = GridSearchCV(RandomForestRegressor(random_state=42), param_grid_rf, cv=3, scoring='r2')
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_
best_rf_pred = best_rf.predict(X_test)

print("Best RF params:", grid_rf.best_params_)
print("Tuned RF RMSE:", mean_squared_error(y_test, best_rf_pred))
print("Tuned RF R2:", r2_score(y_test, best_rf_pred))

Best RF params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Tuned RF RMSE: 4992682938.752244
Tuned RF R2: 0.7554977251032104


In [None]:
# Gradient Boosting Hyperparameter Tuning
param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 6]
}

grid_gb = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid_gb, cv=3, scoring='r2')
grid_gb.fit(X_train, y_train)
best_gb = grid_gb.best_estimator_
best_gb_pred = best_gb.predict(X_test)

print("Best GB params:", grid_gb.best_params_)
print("Tuned GB RMSE:", mean_squared_error(y_test, best_gb_pred))
print("Tuned GB R2:", r2_score(y_test, best_gb_pred))

Best GB params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Tuned GB RMSE: 3560804822.6590624
Tuned GB R2: 0.825619834008289


## Final Model Comparison

In [None]:
# Compare all models
models = {
    'Linear Regression': (lr_pred, mean_squared_error(y_test, lr_pred), r2_score(y_test, lr_pred)),
    'Decision Tree': (dt_pred, mean_squared_error(y_test, dt_pred), r2_score(y_test, dt_pred)),
    'Random Forest': (rf_pred, mean_squared_error(y_test, rf_pred), r2_score(y_test, rf_pred)),
    'Gradient Boosting': (gb_pred, mean_squared_error(y_test, gb_pred), r2_score(y_test, gb_pred)),
    'Tuned Random Forest': (best_rf_pred, mean_squared_error(y_test, best_rf_pred), r2_score(y_test, best_rf_pred)),
    'Tuned Gradient Boosting': (best_gb_pred, mean_squared_error(y_test, best_gb_pred), r2_score(y_test, best_gb_pred))
}

print("Model Performance Comparison:")
for name, (pred, rmse, r2) in models.items():
    print(f"{name} - RMSE: {rmse:.2f}, R2: {r2:.4f}")

Model Performance Comparison:
Linear Regression - RMSE: 3091808267.47, R2: 0.8486
Decision Tree - RMSE: 8003827304.48, R2: 0.6080
Random Forest - RMSE: 4941282914.87, R2: 0.7580
Gradient Boosting - RMSE: 4259753474.01, R2: 0.7914
Tuned Random Forest - RMSE: 4992682938.75, R2: 0.7555
Tuned Gradient Boosting - RMSE: 3560804822.66, R2: 0.8256


In [None]:
# Final Linear Regression with best features
lr_final = LinearRegression()
lr_final.fit(X_train_scaled, y_train)
lr_final_pred = lr_final.predict(X_test_scaled)

print("Final Linear Regression RMSE:", mean_squared_error(y_test, lr_final_pred))
print("Final Linear Regression R2:", r2_score(y_test, lr_final_pred))

Final Linear Regression RMSE: 3091808267.4725366
Final Linear Regression R2: 0.8485875902364706


## Conclusion

- Linear Regression provides a solid baseline
- Decision Tree shows overfitting (high variance)
- Random Forest and Gradient Boosting capture non-linear relationships
- Hyperparameter tuning improves model performance
- Key features: floor area, town, flat type, lease commence date