In [32]:
import pandas as pd
import numpy as np
import joblib

In [15]:
hdb_resale_data = pd.read_csv('data/processed/hdb_resale_data_final.csv', index_col=0)

In [16]:
hdb_resale_data.shape

(249857, 13)

In [17]:
hdb_resale_data.head()

Unnamed: 0_level_0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,num_mrts_within_1km,min_dist_to_mrt_km
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,2015-01,ANG MO KIO,3 ROOM,174,ANG MO KIO AVE 4,07 TO 09,60.0,Improved,1986,70,255000.0,1,0.352915
2,2015-01,ANG MO KIO,3 ROOM,541,ANG MO KIO AVE 10,01 TO 03,68.0,New Generation,1981,65,275000.0,0,0.816023
3,2015-01,ANG MO KIO,3 ROOM,163,ANG MO KIO AVE 4,01 TO 03,69.0,New Generation,1980,64,285000.0,1,0.229604
4,2015-01,ANG MO KIO,3 ROOM,446,ANG MO KIO AVE 10,01 TO 03,68.0,New Generation,1979,63,290000.0,1,0.664472
5,2015-01,ANG MO KIO,3 ROOM,557,ANG MO KIO AVE 10,07 TO 09,68.0,New Generation,1980,64,290000.0,0,0.923402


In [7]:
# Drop unnecessary columns
#hdb_resale_data = hdb_resale_data.drop(['month','block','street_name','lease_commence_date', 'num_mrts_within_1km'], axis=1, errors='ignore')

In this preprocessing step, we drop unnecessary columns. We remove lease_commence_date since its information is already reflected in remaining_lease. We also drop num_mrts_within_1km because it is less informative than min_dist_to_mrt_km. Most people, especially senior citizens, tend to care primarily about the shortest distance to the nearest MRT rather than the total number of MRT stations in the area.

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [19]:
def create_preprocessor(numerical_cols, categorical_cols):
    numerical_transformer = Pipeline(steps=[
        ("scaler", StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )
    
    return preprocessor

<h2> Basic Linear Regression

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

In [21]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [22]:
y = np.log(hdb_resale_data['resale_price'])  

In [23]:
numerical_columns_linreg = hdb_resale_data.select_dtypes(include=['int64', 'float64']).columns.drop(['resale_price', 'lease_commence_date', 'num_mrts_within_1km'])
categorical_columns_linreg = hdb_resale_data.select_dtypes(include=['object', 'category']).columns.drop(['month', 'block'])

preprocessor_linreg = create_preprocessor(numerical_columns_linreg, categorical_columns_linreg)

X_linreg = hdb_resale_data.drop(['month', 'block', 'resale_price', 'lease_commence_date', 'num_mrts_within_1km'], axis=1)

X_processed_linreg = preprocessor_linreg.fit_transform(X_linreg)

linreg_model = LinearRegression()

GridSearchCV(linreg_model, param_grid=[], cv=kf, scoring='r2')

scores = cross_val_score(linreg_model, X_processed_linreg, y, cv=kf, scoring='r2') 

rmse_scores = cross_val_score(linreg_model, X_processed_linreg,  y, cv=kf,
                              scoring=make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))))

print("Average R2 score:", np.mean(scores))
print("Average RMSE:", np.mean(rmse_scores))

Average R2 score: 0.7575195793383349
Average RMSE: 0.16793482289514844


<h2> Ridge Regression Model

In [24]:
from sklearn.linear_model import Ridge

In [25]:
numerical_columns_ridge = hdb_resale_data.select_dtypes(include=['int64', 'float64']).columns.drop(['resale_price'])
categorical_columns_ridge = hdb_resale_data.select_dtypes(include=['object', 'category']).columns.drop(['month', 'block'])

preprocessor_ridge = create_preprocessor(numerical_columns_ridge, categorical_columns_ridge)

X_ridge = hdb_resale_data.drop(['month', 'block'], axis=1)

X_processed_ridge = preprocessor_ridge.fit_transform(X_ridge)

params_ridge = {'alpha': [0.01, 0.1, 1, 10, 100]}
ridge_model = Ridge()

grid_ridge = GridSearchCV(ridge_model, params_ridge, cv=5, scoring='r2')
grid_ridge.fit(X_processed_ridge, y)

print("Best alpha:", grid_ridge.best_params_)
print("Best R²:", grid_ridge.best_score_)

Best alpha: {'alpha': 100}
Best R²: 0.7457609188504882


<h2> Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# ------------------------------------------------------------------------------
# Training Random Forest Regressor
# ------------------------------------------------------------------------------

'''
X_processed_rf = X_processed_linreg

params_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2]
}

rf_model = RandomForestRegressor(random_state=42)

grid_rf = GridSearchCV(
    rf_model, 
    param_grid=params_rf,
    cv=kf,
    scoring='r2',
    n_jobs=1, 
    verbose=2)

grid_rf.fit(X_processed_rf, y)

print("Best parameters:", grid_rf.best_params_)
print("Best R²:", grid_rf.best_score_)

# Save
joblib.dump(grid_rf, "models/random_forest_gridsearch.pkl")
'''

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=15.7min
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=15.2min
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=15.3min
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=15.0min
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=14.5min
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=29.1min
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=29.0min
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=29.5min
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=29.1min
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=28.4min
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time=10.3min
[CV] END max_depth=10, min_sa

In [37]:
# Load Random Forest model
grid_rf = joblib.load("models/random_forest_gridsearch.pkl")
print("Best parameters:", grid_rf.best_estimator_)

Best parameters: RandomForestRegressor(n_estimators=200, random_state=42)


In [38]:
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

X_processed_rf = X_processed_linreg

# Instead of retraining, just grab the trained best model
best_rf = grid_rf.best_estimator_

# Split into train and test for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X_processed_rf, y, test_size=0.2, random_state=42
)

# Check R²
train_r2 = r2_score(y_train, best_rf.predict(X_train))
test_r2 = r2_score(y_test, best_rf.predict(X_test))

print(f"Train R²: {train_r2:.4f}")
print(f"Test R² : {test_r2:.4f}")

# --- Learning curve ---
train_sizes, train_scores, test_scores = learning_curve(
    best_rf, X_processed_rf, y, cv=5, n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 6),
    scoring="r2"
)

# Compute mean/std
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Plot
plt.figure(figsize=(8,6))
plt.plot(train_sizes, train_mean, 'o-', label="Training R²")
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.2)

plt.plot(train_sizes, test_mean, 'o-', label="Validation R²")
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.2)

plt.xlabel("Training Set Size")
plt.ylabel("R² Score")
plt.title("Learning Curve for Random Forest (best model)")
plt.legend()
plt.grid(True)
plt.show()


Train R²: 0.9934
Test R² : 0.9932


KeyboardInterrupt: 

<h2> XGBoost Model

In [None]:
from xgboost import XGBRegressor

In [None]:
# ------------------------------------------------------------------------------
# Training Random Forest Regressor
# ------------------------------------------------------------------------------

'''
X_processed_xgb = X_processed_linreg

param_grid_xgb = {
    'n_estimators': [200, 500],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.1],
    'subsample': [0.8],
    'colsample_bytree': [0.8]
}

xgb_model = XGBRegressor(random_state=42)

grid_xgb = GridSearchCV(
    xgb_model, 
    param_grid=param_grid_xgb, 
    cv=5, 
    scoring='r2', 
    n_jobs=2,
    verbose=2)

grid_xgb.fit(X_processed_xgb, y)

print("Best parameters:", grid_xgb.best_params_)
print("Best R²:", grid_xgb.best_score_)

joblib.dump(grid_xgb, "models/xgb_gridsearch.pkl")
'''

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 500, 'subsample': 0.8}
Best R²: 0.8005152323989645


In [42]:
# Load XGB model
grid_xgb = joblib.load("models/xgb_gridsearch.pkl")
print("Best parameters:", grid_xgb.best_estimator_)

Best parameters: XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.1, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=8,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=500,
             n_jobs=None, num_parallel_tree=None, ...)
