<a href="https://colab.research.google.com/github/Biscuitkru/CS5228_Project/blob/main/CS5228_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade -q numpy
!pip install --upgrade -q scikit-learn
!pip install --upgrade -q xgboost
!pip install -q tqdm
!pip install -q joblib

In [None]:
import os, re, math, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import math
import json

from pathlib import Path
from datetime import datetime
from typing import Optional, Tuple, Dict, List, Any
from collections import Counter

# for modelling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

from tqdm.auto import tqdm
from lightgbm import LGBMRegressor

import joblib

In [None]:
# Mount Google Drive at /content/drive; the data paths in the config cell live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
########################################
# Configs
########################################

# Every input file lives under one Drive folder; the individual paths are derived from it.
data_root = "/content/drive/MyDrive/CS Masters/CS5228"
aux_root = data_root + "/auxiliary-data"

# Main
train_path = data_root + "/train.csv"
test_path = data_root + "/test.csv"

# Auxiliary data
hdb_data_path = aux_root + "/sg-hdb-block-details.csv"
gov_hawkers_path = aux_root + "/sg-gov-hawkers.csv"
mrt_stations_path = aux_root + "/sg-mrt-stations.csv"
pri_schools_path = aux_root + "/sg-primary-schools.csv"
sec_schools_path = aux_root + "/sg-secondary-schools.csv"
shopping_malls_path = aux_root + "/sg-shopping-malls.csv"

# Modeling controls: a fixed seed and a NumPy generator built from it.
seed = 777
rng = np.random.default_rng(seed)

In [None]:
################################
# configs for local
################################

# local_path = r"C:\Users\admin\Desktop\school\MComp_AI\CS5228_Knowledge_Discovery_and_Data_Mining\Project\cs5228\CS5228_Project\data"


# # Main
# train_path = local_path + "/train.csv"
# test_path = local_path + "/test.csv"

# # Auxiliary data
# hdb_data_path = local_path + "/auxiliary-data/sg-hdb-block-details.csv"
# gov_hawkers_path = local_path + "/auxiliary-data/sg-gov-hawkers.csv"
# mrt_stations_path = local_path + "/auxiliary-data/sg-mrt-stations.csv"
# pri_schools_path = local_path + "/auxiliary-data/sg-primary-schools.csv"
# sec_schools_path = local_path + "/auxiliary-data/sg-secondary-schools.csv"
# shopping_malls_path = local_path + "/auxiliary-data/sg-shopping-malls.csv"

# # Modeling controls
# seed = 777
# np.random.seed(seed)

## Data Preprocessing

In [None]:
# Read the raw train and test tables from the configured CSV paths.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Lower-case the column headers of both frames so they share one naming scheme.
for frame in (train_df, test_df):
    frame.columns = frame.columns.str.lower()

In [None]:
# Summary statistics of the training frame; the `count` row is used to check for missing values.
train_df.describe() # No NA values at all

Unnamed: 0,floor_area_sqm,lease_commence_data,resale_price
count,162691.0,162691.0,162691.0
mean,96.89125,1996.358993,518843.0
std,24.030547,14.24592,183244.2
min,31.0,1966.0,150000.0
25%,82.0,1985.0,382000.0
50%,93.0,1996.0,488000.0
75%,112.0,2011.0,622000.0
max,366.7,2022.0,1658888.0


In [None]:
# Date Manipulation
def process_month_column(df):
    """Parse the 'month' column ("YYYY-MM") and derive calendar features.

    The frame is modified in place and also returned. Added columns:
    sale_year, sale_month and sale_quarter (the quarter is kept to capture
    potential seasonality in resale prices).
    """
    df['month'] = pd.to_datetime(df['month'], format="%Y-%m")
    derived = (
        ('sale_year', 'year'),
        ('sale_month', 'month'),
        ('sale_quarter', 'quarter'),
    )
    for new_col, dt_attr in derived:
        df[new_col] = getattr(df['month'].dt, dt_attr)
    return df

train_df = process_month_column(train_df)
test_df = process_month_column(test_df)

# additional month_index to capture trend:
# months elapsed since the earliest sale month in the TRAINING data (same origin for both frames)
abs_month_train = 12 * train_df["sale_year"] + train_df["sale_month"]
abs_month_test = 12 * test_df["sale_year"] + test_df["sale_month"]
month_origin = abs_month_train.min()
train_df["month_index"] = abs_month_train.sub(month_origin).astype(float)
test_df["month_index"] = abs_month_test.sub(month_origin).astype(float)

In [None]:
# Trim leading/trailing whitespace and collapse internal whitespace runs to a single space
def norm_text_basic(s):
    """Return `s` trimmed with whitespace runs collapsed; non-strings pass through unchanged."""
    if isinstance(s, str):
        return re.sub(r"\s+", " ", s.strip())
    return s

# Normalise + change case (upper by default), mainly for the str values
def norm_key(s, case="upper"):
    """Normalise `s` via norm_text_basic, then upper-case it (or casefold it when case != "upper").

    Non-string values are returned untouched.
    """
    cleaned = norm_text_basic(s)
    if not isinstance(cleaned, str):
        return cleaned
    if case == "upper":
        return cleaned.upper()
    return cleaned.casefold()

In [None]:
# Apply the key normalisation to every free-text column that exists in each frame.
for frame in (train_df, test_df):
    for col in ["town","flat_type", "block", "street", "flat_model"]:
        if col not in frame.columns:
            continue
        frame[col] = frame[col].map(norm_key)

In [None]:
# Standardize flat type
def standardise_flat_type(val: str) -> str:
    """Canonicalise a flat-type label, e.g. "4-ROOM", "4_ROOM" or "4ROOM" -> "4 ROOM".

    Underscores and hyphens become spaces, a space is forced in front of
    "ROOM", and whitespace runs are collapsed. Matching on "ROOM" is
    case-sensitive, so the value should already be upper-cased.

    Non-string values (e.g. NaN / None) are returned unchanged instead of
    raising, matching how norm_text_basic / norm_key treat them.
    """
    if not isinstance(val, str):
        return val
    v = re.sub(r"[_-]", " ", val)
    v = v.replace("ROOM", " ROOM")
    # A single whitespace collapse covers both the separators replaced above and the inserted space.
    return re.sub(r"\s+", " ", v).strip()

In [None]:
# Canonicalise flat_type in both frames, then list the distinct training labels as a sanity check.
train_df['flat_type'] = train_df['flat_type'].map(standardise_flat_type)
test_df['flat_type'] = test_df['flat_type'].map(standardise_flat_type)
print(train_df['flat_type'].unique())

['4 ROOM' '5 ROOM' '3 ROOM' 'EXECUTIVE' '2 ROOM' '1 ROOM'
 'MULTI GENERATION']


In [None]:
# General Price hierarchy for HDB flats (position in the list = ordinal rank)
flat_type_order = ["1 ROOM","2 ROOM","3 ROOM","4 ROOM","5 ROOM","MULTI GENERATION","EXECUTIVE"]
order_map = dict(zip(flat_type_order, range(len(flat_type_order))))

def _flat_type_rank(value):
    """Ordinal rank of a flat type; NaN for labels outside flat_type_order."""
    return order_map.get(str(value).upper(), np.nan)

for frame in (train_df, test_df):
    frame["flat_type_price_hier"] = frame["flat_type"].map(_flat_type_rank)
    # For Decision trees: explicit 0/1 flags for the two non-"N ROOM" types
    for col, val in (("is_exec","EXECUTIVE"), ("is_multigen","MULTI GENERATION")):
        frame[col] = frame["flat_type"].str.upper().eq(val).astype(int)

In [None]:
# No. of Bedrooms (estimate looked up from the flat type; unmapped labels become NaN)
BEDROOMS_MAP = {"1 ROOM":0,"2 ROOM":1,"3 ROOM":2,"4 ROOM":3,"5 ROOM":4,"EXECUTIVE":3,"MULTI GENERATION":4}
for frame in (train_df, test_df):
    frame["bedrooms_est"] = frame["flat_type"].str.upper().map(BEDROOMS_MAP)

In [None]:
# Floor Range, just going to set as the middle number
def parse_floor_mid(floor_range) -> float:
    """Return the midpoint of a floor range such as "01 TO 03" (-> 2.0).

    Parameters
    ----------
    floor_range : str or number
        A textual range containing at least two integers (low first), or an
        already-numeric value, which is returned as a float.

    Returns
    -------
    float
        (low + high) / 2, or NaN when the value is missing, is not a string
        or number, has fewer than two integers, or has low > high.
    """
    # Numeric input (including NaN and NumPy scalars) is passed through as a float.
    if isinstance(floor_range, (int, float, np.integer, np.floating)):
        return float(floor_range)
    # Anything else that is not text (None, pd.NA, ...) cannot be parsed.
    if not isinstance(floor_range, str):
        return np.nan
    # \d+ keeps multi-digit floors intact (a fixed 1-2 digit match would split "100" into "10" and "0").
    bounds = re.findall(r"\d+", floor_range)
    if len(bounds) >= 2:
        lo, hi = int(bounds[0]), int(bounds[1])
        if lo <= hi:
            return (lo + hi) / 2.0
    return np.nan

In [None]:
# Replace the textual floor range with its numeric midpoint and rename the column accordingly.
floor_rename = {"floor_range": "floor_mid"}

train_df['floor_range'] = train_df['floor_range'].map(parse_floor_mid).astype(float)
train_df = train_df.rename(columns=floor_rename)

test_df['floor_range'] = test_df['floor_range'].map(parse_floor_mid).astype(float)
test_df = test_df.rename(columns=floor_rename)

# Sanity check: the distinct midpoints found in each frame, in ascending order.
for frame in (train_df, test_df):
    print(np.sort(frame['floor_mid'].unique()))

[ 2.  5.  8. 11. 14. 17. 20. 23. 26. 29. 32. 35. 38. 41. 44. 47. 50.]
[ 2.  5.  8. 11. 14. 17. 20. 23. 26. 29. 32. 35. 38. 41. 44. 47. 50.]


In [None]:
# Show the distinct eco_category values (train first, then test) before discarding the column.
for frame in (train_df, test_df):
    print(frame['eco_category'].unique())

# Current dataset is taken directly from the Kaggle scoreboard which is missing eco_category (Might need to double check whether this is true)
train_df = train_df.drop(columns='eco_category')
test_df = test_df.drop(columns='eco_category')

['uncategorized']
['uncategorized']


In [None]:
# Flat age, years left from 99
# The source column is spelled "lease_commence_data"; rename it before use.
lease_col_fix = {"lease_commence_data": "lease_commence_date"}
train_df = train_df.rename(columns=lease_col_fix)
test_df = test_df.rename(columns=lease_col_fix)

# lease_left = 99 - (years elapsed between lease commencement and the sale year)
for frame in (train_df, test_df):
    frame["lease_left"] = 99 - (frame["sale_year"] - frame["lease_commence_date"])

In [None]:
# HDB price drops faster as the remaining lease falls, so add non-linear transforms of lease_left
# and an approximate leasehold-relativity curve (aka “Bala”).
# URA’s table pegs 99y ≈ 96%, 60y ≈ 80%, 30y ≈ 60% of freehold, https://www.ura.gov.sg/-/media/Corporate/Guidelines/Development-control/Circulars/2022/Jul/dc22-08-Appendix2.pdf
# A curved transform helps models learn this faster than a raw linear lease_left

# Bala-style approximation: anchor points (years left -> share of freehold value),
# linearly interpolated; np.interp holds the end values outside the 30-99 range.
anchors_years = np.array([30, 60, 99], dtype=float)
anchors_rel   = np.array([0.60, 0.80, 0.96], dtype=float)

for df in (train_df, test_df):
    years_left = df["lease_left"]
    df["lease_left_sq"] = years_left.pow(2)
    df["lease_left_log"] = np.log1p(years_left.clip(lower=0))
    df["lease_rel_approx"] = np.interp(years_left.clip(0, 99), anchors_years, anchors_rel)

In [None]:
# Loading Aux Data: short key -> CSV path, each read into a DataFrame
aux_paths = {
    "hdb": hdb_data_path,
    "mrt": mrt_stations_path,
    "pri": pri_schools_path,
    "sec": sec_schools_path,
    "malls": shopping_malls_path,
    "hawkers": gov_hawkers_path,
}
aux_dict = {name: pd.read_csv(path) for name, path in aux_paths.items()}

In [None]:
# HDB Data: work on a copy, lower-case the headers, normalise the text key columns
hdb = aux_dict["hdb"].copy()
hdb.columns = hdb.columns.str.lower()

for col in ["town","block", "address", "subzone", "planning_area", "region"]:
    if col not in hdb.columns:
        continue
    hdb[col] = hdb[col].map(norm_key)

In [None]:
# (town, block) is the join key between the sales data and the HDB block table.
block_key = ["town", "block"]
hdb_pairs = hdb[block_key].drop_duplicates()
train_pairs = train_df[block_key]
test_pairs = test_df[block_key]

In [None]:
# Left-join each frame's keys onto the HDB keys; `_merge == "both"` marks rows with a matching block.
train_cov = train_pairs.merge(hdb_pairs, how="left", on=["town","block"], indicator=True)
test_cov = test_pairs.merge(hdb_pairs, how="left", on=["town","block"], indicator=True)

train_match_n = (train_cov["_merge"] == "both").sum()
test_match_n = (test_cov["_merge"] == "both").sum()

train_match_share = train_match_n/len(train_cov)
test_match_share = test_match_n/len(test_cov)

print(f"[COVERAGE] Train matches: {train_match_n}/{len(train_cov)} = {train_match_share}")
print(f"[COVERAGE] TEST matches: {test_match_n}/{len(test_cov)} = {test_match_share}")

[COVERAGE] Train matches: 162691/162691 = 1.0
[COVERAGE] TEST matches: 50000/50000 = 1.0


In [None]:
# One row per (town, block) so the left joins below cannot duplicate sales rows.
hdb_unique = hdb.drop_duplicates(subset=["town","block"])

# Block attributes to attach; keep only those actually present in the HDB table (guard).
cols_to_add = [
    c for c in ["latitude","longitude","max_floor","subzone","planning_area","region"]
    if c in hdb_unique.columns
]

hdb_lookup = hdb_unique[["town","block", *cols_to_add]]
train_df = train_df.merge(hdb_lookup, on=["town","block"], how="left")
test_df = test_df.merge(hdb_lookup, on=["town","block"], how="left")

In [None]:
# Floor effects (relative position in block)
# Higher floors typically command a premium, and “how high” depends on the block’s max floor
# Relative floor and an “is high floor” flag. https://ideas.repec.org/a/taf/apeclt/v26y2019i6p436-439.html

for df in (train_df, test_df):
    floor_share = df["floor_mid"].div(df["max_floor"])
    # A zero max_floor would yield +/-inf; treat that as missing.
    df["rel_floor"] = floor_share.replace([np.inf, -np.inf], np.nan)
    # Flag flats in the top 30% of their block (missing rel_floor -> 0).
    df["is_high_floor"] = df["rel_floor"].ge(0.7).astype(int)

In [None]:
# MRT Data: work on a copy, lower-case the headers, normalise the text columns
mrt = aux_dict["mrt"].copy()
mrt.columns = mrt.columns.str.lower()

for col in ["code","name", "status", "subzone", "planning_area", "region"]:
    if col not in mrt.columns:
        continue
    mrt[col] = mrt[col].map(norm_key)

In [None]:
print(mrt['status'].unique())
# Open and Planned MRT proximity features

from sklearn.neighbors import BallTree

# Split the stations by status; each subset gets its own spatial index.
mrt_open = mrt.loc[mrt["status"] == "OPEN"].copy()
mrt_planned = mrt.loc[mrt["status"] == "PLANNED"].copy()

# Radius used to convert haversine distances (radians) to metres.
R = 6371000.0

def to_rad(df):
    """Return the frame's (latitude, longitude) columns as a float array in radians."""
    degrees = df[["latitude", "longitude"]].to_numpy(dtype=float)
    return np.deg2rad(degrees)

tree_open = BallTree(to_rad(mrt_open), metric="haversine")
tree_planned = BallTree(to_rad(mrt_planned), metric="haversine")

['OPEN' 'PLANNED']


In [None]:
def add_mrt_features(df, radii_open=(500,1000,2000), radii_plan=(500,1000,2000)):
    """Add MRT proximity features to `df` (modified in place and returned).

    Uses the module-level `tree_open` / `tree_planned` indexes and radius `R`.
    For each status ("open", "plan") the nearest-station distance in metres
    (`mrt_<status>_nearest_m`) and station counts within each radius
    (`mrt_<status>_within_<r>m`) are written for rows that have coordinates.
    Also adds `mrt_any_nearest_m` and the nullable flag
    `mrt_plan_closer_than_open`. Counts for rows without coordinates become 0.
    """
    located = df[["latitude","longitude"]].dropna().index
    coords_rad = np.deg2rad(df.loc[located, ["latitude","longitude"]].to_numpy(dtype=float))

    # Same recipe for OPEN and PLANNED stations: nearest distance + counts per radius.
    for tag, tree, radii in (("open", tree_open, radii_open), ("plan", tree_planned, radii_plan)):
        nearest_col = f"mrt_{tag}_nearest_m"
        if tree is not None and len(located):
            nearest, _ = tree.query(coords_rad, k=1)
            df.loc[located, nearest_col] = nearest[:, 0] * R
            for radius in radii:
                df.loc[located, f"mrt_{tag}_within_{radius}m"] = tree.query_radius(
                    coords_rad, radius / R, count_only=True
                )
        else:
            df[nearest_col] = np.nan
            for radius in radii:
                df[f"mrt_{tag}_within_{radius}m"] = 0

    # Comparative signals
    open_m = df["mrt_open_nearest_m"]
    plan_m = df["mrt_plan_nearest_m"]
    both = np.vstack([open_m.to_numpy(dtype=float), plan_m.to_numpy(dtype=float)])
    df["mrt_any_nearest_m"] = np.nanmin(both, axis=0)
    df["mrt_plan_closer_than_open"] = (plan_m < open_m).astype("Int64")

    # fill NA counts for rows without coords
    count_cols = [c for c in df.columns if c.startswith(("mrt_open_within_","mrt_plan_within_"))]
    for c in count_cols:
        df[c] = df[c].fillna(0).astype(int)

    return df

# Attach the MRT proximity features to both frames using the default radii.
train_df = add_mrt_features(train_df)
test_df = add_mrt_features(test_df)

In [None]:
# School Data: copies of the primary and secondary school tables, prepared identically
pri = aux_dict["pri"].copy()
sec = aux_dict["sec"].copy()

for schools in (pri, sec):
    schools.columns = schools.columns.str.lower()

for col in ["name","street", "subzone", "planning_area", "region"]:
    for schools in (pri, sec):
        if col in schools.columns:
            schools[col] = schools[col].map(norm_key)

In [None]:
# Distance to school
# MOE’s P1 registration use home-school distance categories of <1 km, 1–2 km, >2 km for priority, so those should capture a policy-driven price signal.
# For secondary school I'm not sure, maybe if they still live there a sec school nearby is a boon?

def add_proximity_to_school(df, tree, prefix, radii=(1000, 2000)):
    """Add nearest-distance and within-radius count features to `df`.

    Parameters
    ----------
    df : DataFrame with "latitude" / "longitude" columns in degrees;
        modified in place and returned.
    tree : spatial index exposing `query` and `query_radius` over points in
        radians (built with the haversine metric).
    prefix : str, used to name the output columns.
    radii : iterable of radii in metres.

    Adds `<prefix>_nearest_m` (metres, NaN for rows without coordinates) and
    `<prefix>_within_<r>m` (int counts, 0 for rows without coordinates).
    Relies on the module-level radius `R` to convert radians to metres.
    """
    nearest_col = f"{prefix}_nearest_m"
    count_cols = [f"{prefix}_within_{r}m" for r in radii]

    idx = df[["latitude","longitude"]].dropna().index
    # No row has coordinates: the tree cannot be queried with an empty array,
    # so emit the columns with their "missing" values and stop.
    if len(idx) == 0:
        df[nearest_col] = np.nan
        for col in count_cols:
            df[col] = 0
        return df

    X = np.deg2rad(df.loc[idx, ["latitude","longitude"]].to_numpy(dtype=float))
    # nearest distance (meters)
    dist, _ = tree.query(X, k=1)
    df.loc[idx, nearest_col] = dist[:,0] * R
    # counts within radii (meters)
    for r, col in zip(radii, count_cols):
        df.loc[idx, col] = tree.query_radius(X, r / R, count_only=True)
    # fill rows without coords
    for col in count_cols:
        df[col] = df[col].fillna(0).astype(int)
    return df

In [None]:
# Primary schools
# Build a haversine BallTree over the school coordinates, then add "pri_*" features to both frames.
tree_pri = BallTree(to_rad(pri), metric="haversine")
train_df = add_proximity_to_school(train_df, tree_pri, "pri", radii=(1000, 2000))
test_df = add_proximity_to_school(test_df, tree_pri, "pri", radii=(1000, 2000))

# Secondary schools
# Same procedure with the "sec" prefix.
tree_sec = BallTree(to_rad(sec), metric="haversine")
train_df = add_proximity_to_school(train_df, tree_sec, "sec", radii=(1000, 2000))
test_df = add_proximity_to_school(test_df, tree_sec, "sec", radii=(1000, 2000))

In [None]:
# Shopping-mall data: work on a copy and lower-case the headers.
mall = aux_dict["malls"].copy()
mall.columns = mall.columns.str.lower()
# Normalise the MALL table's own columns (norm_key leaves non-string values untouched,
# so numeric columns in the list pass through unchanged).
for col in ["name", "street", "postal_code", "latitude", "longitude", "subzone", "planning_area", "region"]:
    if col in mall.columns: mall[col] = mall[col].map(norm_key)

In [None]:
def add_proximity_to_mall(df, tree, prefix, radii=(1000, 2000)):
    """Add nearest-distance and within-radius count features to `df`.

    Parameters
    ----------
    df : DataFrame with "latitude" / "longitude" columns in degrees;
        modified in place and returned.
    tree : spatial index exposing `query` and `query_radius` over points in
        radians (built with the haversine metric).
    prefix : str, used to name the output columns.
    radii : iterable of radii in metres.

    Adds `<prefix>_nearest_m` (metres, NaN for rows without coordinates) and
    `<prefix>_within_<r>m` (int counts, 0 for rows without coordinates).
    Relies on the module-level radius `R` to convert radians to metres.
    """
    nearest_col = f"{prefix}_nearest_m"
    count_cols = [f"{prefix}_within_{r}m" for r in radii]

    idx = df[["latitude","longitude"]].dropna().index
    # No row has coordinates: the tree cannot be queried with an empty array,
    # so emit the columns with their "missing" values and stop.
    if len(idx) == 0:
        df[nearest_col] = np.nan
        for col in count_cols:
            df[col] = 0
        return df

    X = np.deg2rad(df.loc[idx, ["latitude","longitude"]].to_numpy(dtype=float))
    # nearest distance (meters)
    dist, _ = tree.query(X, k=1)
    df.loc[idx, nearest_col] = dist[:,0] * R
    # counts within radii (meters)
    for r, col in zip(radii, count_cols):
        df.loc[idx, col] = tree.query_radius(X, r / R, count_only=True)
    # fill rows without coords
    for col in count_cols:
        df[col] = df[col].fillna(0).astype(int)
    return df

In [None]:
# Build a haversine BallTree over the mall coordinates, then add "mall_*" features to both frames.
tree_mall = BallTree(to_rad(mall), metric="haversine")
train_df = add_proximity_to_mall(train_df, tree_mall, "mall", radii=(1000, 2000))
test_df = add_proximity_to_mall(test_df, tree_mall, "mall", radii=(1000, 2000))

In [None]:
# Hawker-centre data: work on a copy and lower-case the headers.
hawker = aux_dict["hawkers"].copy()
hawker.columns = hawker.columns.str.lower()

In [None]:
def add_proximity_to_hawker(df, tree, prefix, radii=(500, 1000)):
    """Add hawker-centre proximity features to `df` (modified in place and returned).

    Parameters
    ----------
    df : DataFrame with "latitude" / "longitude" columns in degrees.
    tree : spatial index exposing `query` and `query_radius` over points in
        radians. Its point order must match the row order of the module-level
        `hawker` frame, because neighbour indices are used to look up
        `hawker['number_of_stalls']`.
    prefix : str, used to name the output columns.
    radii : iterable of radii in metres.

    Adds `<prefix>_nearest_m` (metres, NaN without coordinates) and, per
    radius, `<prefix>_within_<r>m` (centre count) and
    `<prefix>_stalls_within_<r>m` (total stalls), both int and 0 for rows
    without coordinates. Relies on the module-level radius `R`.
    """
    idx = df[["latitude","longitude"]].dropna().index
    # nothing to do if no points with coords
    if len(idx) == 0:
        # ensure columns exist for downstream code; every count is 0 because no row can be located
        for r in radii:
            df[f"{prefix}_within_{r}m"] = 0
            df[f"{prefix}_stalls_within_{r}m"] = 0
        df[f"{prefix}_nearest_m"] = np.nan
        return df

    X = np.deg2rad(df.loc[idx, ["latitude","longitude"]].to_numpy(dtype=float))
    # nearest distance (meters)
    dist, _ = tree.query(X, k=1)
    df.loc[idx, f"{prefix}_nearest_m"] = dist[:,0] * R

    # counts within radii (meters) and sum of stalls within radii
    stalls_col = 'number_of_stalls'
    # Pull the stall counts out once; missing counts contribute 0 to the sums.
    stalls = hawker[stalls_col].fillna(0).to_numpy()
    for r in radii:
        inds = tree.query_radius(X, r / R)
        df.loc[idx, f"{prefix}_within_{r}m"] = np.array([len(a) for a in inds])
        df.loc[idx, f"{prefix}_stalls_within_{r}m"] = np.array([stalls[a].sum() for a in inds])

    # fill rows without coords and ensure integer types for counts
    for r in radii:
        for col in (f"{prefix}_within_{r}m", f"{prefix}_stalls_within_{r}m"):
            df[col] = df[col].fillna(0).astype(int)
    return df

In [None]:
# Build a haversine BallTree over the hawker-centre coordinates, then add "hawker_*" features to both frames.
tree_hawker = BallTree(to_rad(hawker), metric="haversine")
train_df = add_proximity_to_hawker(train_df, tree_hawker, "hawker", radii=(500, 1000))
test_df = add_proximity_to_hawker(test_df, tree_hawker, "hawker", radii=(500, 1000))

In [None]:
# Number of training rows for each count of hawker centres within 500 m.
train_df.groupby("hawker_within_500m").size()

Unnamed: 0_level_0,0
hawker_within_500m,Unnamed: 1_level_1
0,117100
1,29737
2,12248
3,2956
4,582
5,68


### Let's add the flat's age in days, using the sale month

In [None]:
def add_flat_age_days(train_df, test_df):
    """Return copies of both frames with a `flat_age_days` column added.

    `flat_age_days` is the number of days between 1 January of
    `lease_commence_date` (a year) and the sale `month` (a datetime column).

    The input frames are left untouched: no temporary column is written
    into them, and the new column exists only on the returned frames.

    Returns
    -------
    (train_df, test_df) : tuple of DataFrames
    """
    def _with_age(frame):
        # Treat the lease as starting on 1 January of the commencement year.
        lease_start = pd.to_datetime(frame['lease_commence_date'].astype(str) + '-01-01')
        return frame.assign(flat_age_days=(frame['month'] - lease_start).dt.days)

    return _with_age(train_df), _with_age(test_df)

# Rebind both names to the frames that carry flat_age_days, then display the training frame.
train_df, test_df = add_flat_age_days(train_df, test_df)
train_df

Unnamed: 0,month,town,flat_type,block,street,floor_mid,floor_area_sqm,flat_model,lease_commence_date,resale_price,...,sec_within_2000m,mall_nearest_m,mall_within_1000m,mall_within_2000m,hawker_nearest_m,hawker_within_500m,hawker_stalls_within_500m,hawker_within_1000m,hawker_stalls_within_1000m,flat_age_days
0,2020-10-01,WOODLANDS,4 ROOM,681B,WOODLANDS DRIVE 62,8.0,102.0,PREMIUM APARTMENT,2000,420000.0,...,8,1994.701707,0,1,2683.955240,0,0,0,0,7579
1,2021-07-01,BISHAN,4 ROOM,264,BISHAN STREET 24,8.0,104.0,MODEL A,1992,585000.0,...,11,1176.138670,0,3,916.936357,0,0,4,340,10774
2,2021-05-01,BUKIT PANJANG,4 ROOM,520,JELAPANG ROAD,20.0,102.0,MODEL A,1998,450000.0,...,5,980.543397,1,1,5364.143377,0,0,0,0,8521
3,2021-08-01,PUNGGOL,4 ROOM,121B,EDGEDALE PLAINS,17.0,93.0,MODEL A,2017,465000.0,...,7,463.143097,1,4,4563.650903,0,0,0,0,1673
4,2023-05-01,HOUGANG,5 ROOM,997B,BUANGKOK CRESCENT,11.0,113.0,IMPROVED,2018,710000.0,...,7,851.672489,1,2,2921.355463,0,0,0,0,1946
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162686,2017-07-01,HOUGANG,4 ROOM,708,HOUGANG AVENUE 2,2.0,91.0,NEW GENERATION,1985,335000.0,...,10,828.902514,2,2,911.095177,0,0,1,186,11869
162687,2020-09-01,PASIR RIS,4 ROOM,634,PASIR RIS DRIVE 1,5.0,104.0,MODEL A,1995,388000.0,...,1,1670.498928,0,1,3608.220549,0,0,0,0,9375
162688,2017-10-01,GEYLANG,4 ROOM,319,UBI AVENUE 1,11.0,84.0,SIMPLIFIED,1985,373000.0,...,3,2534.264925,0,0,886.171063,0,0,1,110,11961
162689,2020-08-01,SENGKANG,5 ROOM,290B,COMPASSVALE CRESCENT,2.0,110.0,IMPROVED,2001,420000.0,...,9,594.746802,1,4,4392.346046,0,0,0,0,7152


In [None]:
# One-hot for coarse region, and target/mean encoding for higher-cardinality planning_area/subzone

# Region one-hot
region_dum = pd.get_dummies(train_df["region"], prefix="region", dummy_na=False)
region_dum_test = pd.get_dummies(test_df["region"], prefix="region", dummy_na=False)
train_df = pd.concat([train_df, region_dum], axis=1)
test_df = pd.concat([test_df, region_dum_test], axis=1)
# align after dummies: test takes exactly the training column set, in training order.
# Columns present only in train are created in test filled with 0; columns present only in test are dropped.
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

# KFold mean encoding for planning_area
from sklearn.model_selection import KFold
def kfold_target_mean_encode(tr, te, col, y="resale_price", n_splits=5, seed=777):
    """Mean-encode categorical `col` with the target `y`; returns (train, test) copies.

    Training rows receive out-of-fold means: each row's category mean is
    computed from the other K-1 folds only. Test rows receive the category
    mean over the full training frame. Categories without a mean fall back
    to the overall training mean. The encoded column is named `<col>_te`.
    """
    tr, te = tr.copy(), te.copy()
    overall_mean = tr[y].mean()

    oof = pd.Series(index=tr.index, dtype=float)
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for fit_pos, hold_pos in splitter.split(tr):
        fold_means = tr.iloc[fit_pos].groupby(col)[y].mean()
        oof.iloc[hold_pos] = tr.iloc[hold_pos][col].map(fold_means).fillna(overall_mean)

    full_means = tr.groupby(col)[y].mean()
    tr[f"{col}_te"] = oof
    te[f"{col}_te"] = te[col].map(full_means).fillna(overall_mean)
    return tr, te

# Add planning_area_te and subzone_te (K-fold target-mean encodings) to both frames.
train_df, test_df = kfold_target_mean_encode(train_df, test_df, "planning_area")
train_df, test_df = kfold_target_mean_encode(train_df, test_df, "subzone")

In [None]:
# Improving upon flat-type price hierarchy

# Centre floor area so the interaction terms measure deviation from a typical flat.
# The centre is computed from the training data only and reused for test.
# NOTE(review): the training-set mean is assumed to be the intended centre — confirm.
A_mean = train_df["floor_area_sqm"].mean()

# Hedonic housing interaction
for df in (train_df, test_df):
    df["area_centered"] = df["floor_area_sqm"] - A_mean
    df["exec_x_area"] = df["is_exec"] * df["area_centered"]
    df["multigen_x_area"] = df["is_multigen"] * df["area_centered"]

# coarse one-hot for model (limit top N categories to avoid sparsity)
top_models = train_df["flat_model"].value_counts().head(8).index
for df in (train_df, test_df):
    # Anything outside the 8 most frequent training flat models is bucketed as "OTHER".
    df["flat_model_top"] = df["flat_model"].where(df["flat_model"].isin(top_models), "OTHER")
model_dum = pd.get_dummies(train_df["flat_model_top"], prefix="model")
train_df = pd.concat([train_df, model_dum], axis=1)
test_df  = pd.concat([test_df, pd.get_dummies(test_df["flat_model_top"], prefix="model")], axis=1)
# Align test to the training column set (missing columns are created filled with 0).
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)


### baseline models

linear regression

In [None]:
# Baseline: ordinary least-squares regression on scaled numeric + one-hot categorical features.
target = 'resale_price'

# Define columns that should NOT be used as features
# block and street are in lat lon
# month and lease_commence_date are in flat_age_days
drop_cols = [target, 'month', 'block', 'street', 'lease_commence_date']

# Feature lists by dtype, minus the excluded columns.
numerical_features = (
    train_df.select_dtypes(include=np.number).columns
    .drop(drop_cols, errors='ignore')
    .tolist()
)
categorical_features = (
    train_df.select_dtypes(include=['object', 'category']).columns
    .drop(drop_cols, errors='ignore')
    .tolist()
)

# Numeric: median imputation, then standardisation.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical: mode imputation, then one-hot (categories unseen at fit time are ignored).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Columns in neither list are passed through untransformed.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])

X, y = train_df.drop(columns=drop_cols), train_df[target]

# Create a validation split (80% train, 20% validation)
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model_pipeline.fit(X_train_split, y_train_split)
val_predictions = model_pipeline.predict(X_val)

# Calculate RMSE
validation_mse = mean_squared_error(y_val, val_predictions)
validation_rmse = np.sqrt(validation_mse)
print(f"Validation RMSE: {validation_rmse:.4f}")

# retraining on all data
model_pipeline.fit(X, y)

# Save the entire pipeline (preprocessor + model) to a file
model_filename = 'linear_regression.joblib'
joblib.dump(model_pipeline, model_filename)


Validation RMSE: 56649.0188


polynomial degree 2 (stop at 2 because this takes 40+mins already)

In [None]:
# Baseline: linear regression on degree-2 polynomial expansions of the preprocessed features.
target = 'resale_price'

# Excluded from the features: the target and raw columns already covered by engineered ones.
drop_cols = [target, 'month', 'block', 'street', 'lease_commence_date']

# Feature lists by dtype, minus the excluded columns.
numerical_features = (
    train_df.select_dtypes(include=np.number).columns
    .drop(drop_cols, errors='ignore')
    .tolist()
)
categorical_features = (
    train_df.select_dtypes(include=['object', 'category']).columns
    .drop(drop_cols, errors='ignore')
    .tolist()
)

# Numeric: median imputation, then standardisation.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical: mode imputation, then one-hot (categories unseen at fit time are ignored).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Columns in neither list are passed through untransformed.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),  # Apply polynomial transform
    ('model', LinearRegression()),
])

X, y = train_df.drop(columns=drop_cols), train_df[target]

# 80% train / 20% validation
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model_pipeline.fit(X_train_split, y_train_split)
val_predictions = model_pipeline.predict(X_val)

# Calculate RMSE
validation_mse = mean_squared_error(y_val, val_predictions)
validation_rmse = np.sqrt(validation_mse)
print(f"Validation RMSE: {validation_rmse:.4f}")

# Refit on all training data, then persist the whole pipeline.
model_pipeline.fit(X, y)

model_filename = 'polynomial_regression.joblib'
joblib.dump(model_pipeline, model_filename)


Validation RMSE: 32251.4299


decision tree regression

In [None]:
# Baseline: a single decision-tree regressor with default hyper-parameters.
target = 'resale_price'

# Excluded from the features: the target and raw columns already covered by engineered ones.
drop_cols = [target, 'month', 'block', 'street', 'lease_commence_date']

# Feature lists by dtype, minus the excluded columns.
numerical_features = (
    train_df.select_dtypes(include=np.number).columns
    .drop(drop_cols, errors='ignore')
    .tolist()
)
categorical_features = (
    train_df.select_dtypes(include=['object', 'category']).columns
    .drop(drop_cols, errors='ignore')
    .tolist()
)

# Numeric: median imputation, then standardisation.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical: mode imputation, then one-hot (categories unseen at fit time are ignored).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Columns in neither list are passed through untransformed.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(random_state=42)),  # Apply Decision Tree model
])

X, y = train_df.drop(columns=drop_cols), train_df[target]

# 80% train / 20% validation
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model_pipeline.fit(X_train_split, y_train_split)
val_predictions = model_pipeline.predict(X_val)

# Calculate RMSE
validation_mse = mean_squared_error(y_val, val_predictions)
validation_rmse = np.sqrt(validation_mse)
print(f"Validation RMSE: {validation_rmse:.4f}")

# Refit on all training data, then persist the whole pipeline.
model_pipeline.fit(X, y)

model_filename = 'decision_tree.joblib'
joblib.dump(model_pipeline, model_filename)

Validation RMSE: 42299.4341


bagging

In [None]:
# Baseline: bagging ensemble with default hyper-parameters.
target = 'resale_price'

# Excluded from the features: the target and raw columns already covered by engineered ones.
drop_cols = [target, 'month', 'block', 'street', 'lease_commence_date']

# Feature lists by dtype, minus the excluded columns.
numerical_features = (
    train_df.select_dtypes(include=np.number).columns
    .drop(drop_cols, errors='ignore')
    .tolist()
)
categorical_features = (
    train_df.select_dtypes(include=['object', 'category']).columns
    .drop(drop_cols, errors='ignore')
    .tolist()
)

# Numeric: median imputation, then standardisation.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical: mode imputation, then one-hot (categories unseen at fit time are ignored).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Columns in neither list are passed through untransformed.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', BaggingRegressor(random_state=42)),  # Apply Bagging Regression model
])

X, y = train_df.drop(columns=drop_cols), train_df[target]

# 80% train / 20% validation
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model_pipeline.fit(X_train_split, y_train_split)
val_predictions = model_pipeline.predict(X_val)

# Calculate RMSE
validation_mse = mean_squared_error(y_val, val_predictions)
validation_rmse = np.sqrt(validation_mse)
print(f"Validation RMSE: {validation_rmse:.4f}")

# Refit on all training data, then persist the whole pipeline.
model_pipeline.fit(X, y)

model_filename = 'bagging.joblib'
joblib.dump(model_pipeline, model_filename)

Validation RMSE: 31659.2727


random forest

In [None]:
# Baseline: random forest with default hyper-parameters.
target = 'resale_price'

# Excluded from the features: the target and raw columns already covered by engineered ones.
drop_cols = [target, 'month', 'block', 'street', 'lease_commence_date']

# Feature lists by dtype, minus the excluded columns.
numerical_features = (
    train_df.select_dtypes(include=np.number).columns
    .drop(drop_cols, errors='ignore')
    .tolist()
)
categorical_features = (
    train_df.select_dtypes(include=['object', 'category']).columns
    .drop(drop_cols, errors='ignore')
    .tolist()
)

# Numeric: median imputation, then standardisation.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical: mode imputation, then one-hot (categories unseen at fit time are ignored).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Columns in neither list are passed through untransformed.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42)),  # Apply Random Forest Regression model
])

X, y = train_df.drop(columns=drop_cols), train_df[target]

# 80% train / 20% validation
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model_pipeline.fit(X_train_split, y_train_split)
val_predictions = model_pipeline.predict(X_val)

# Calculate RMSE
validation_mse = mean_squared_error(y_val, val_predictions)
validation_rmse = np.sqrt(validation_mse)
print(f"Validation RMSE: {validation_rmse:.4f}")

# Refit on all training data, then persist the whole pipeline.
model_pipeline.fit(X, y)

model_filename = 'random_forest.joblib'
joblib.dump(model_pipeline, model_filename)

# Recorded output of a previous run:
# Validation RMSE: 29868.7537
# ['random_forest.joblib']

lightgbm

In [None]:
# Baseline: LightGBM regressor with default hyper-parameters.
target = 'resale_price'

# Excluded from the features: the target and raw columns already covered by engineered ones.
drop_cols = [target, 'month', 'block', 'street', 'lease_commence_date']

# Feature lists by dtype, minus the excluded columns.
numerical_features = (
    train_df.select_dtypes(include=np.number).columns
    .drop(drop_cols, errors='ignore')
    .tolist()
)
categorical_features = (
    train_df.select_dtypes(include=['object', 'category']).columns
    .drop(drop_cols, errors='ignore')
    .tolist()
)

# Numeric: median imputation, then standardisation.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical: mode imputation, then one-hot (categories unseen at fit time are ignored).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Columns in neither list are passed through untransformed.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(random_state=42)),  # Apply LightGBM Regression model
])

X, y = train_df.drop(columns=drop_cols), train_df[target]

# 80% train / 20% validation
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model_pipeline.fit(X_train_split, y_train_split)
val_predictions = model_pipeline.predict(X_val)

# Calculate RMSE
validation_mse = mean_squared_error(y_val, val_predictions)
validation_rmse = np.sqrt(validation_mse)
print(f"Validation RMSE: {validation_rmse:.4f}")

# Refit on all training data, then persist the whole pipeline.
model_pipeline.fit(X, y)

model_filename = 'lightgbm.joblib'
joblib.dump(model_pipeline, model_filename)

Output from LGBM

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020437 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3777
[LightGBM] [Info] Number of data points in the train set: 130152, number of used features: 277
[LightGBM] [Info] Start training from score 518727.087977
c:\Users\admin\Desktop\school\MComp_AI\CS5228_Knowledge_Discovery_and_Data_Mining\.venv\lib\site-packages\sklearn\utils\validation.py:2739: UserWarning: X does not have valid feature names, but LGBMRegressor was fitted with feature names
  warnings.warn(
Validation RMSE: 34479.3284
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008916 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3784
[LightGBM] [Info] Number of data points in the train set: 162691, number of used features: 280
[LightGBM] [Info] Start training from score 518843.001477
['lightgbm.joblib']

XGBoost

In [None]:
def train_xgboost_model(
    train_df,
    target: str = "resale_price",
    drop_cols: Optional[List[str]] = None,
    numerical_features: Optional[List[str]] = None,
    categorical_features: Optional[List[str]] = None,
    val_size: float = 0.2,
    random_state: int = 777,
    use_gpu: bool = True,
    n_trials: int = 30,
    early_stopping_rounds: int = 200,
    max_boost_rounds: int = 5000,
    save_dir: Optional[str] = None,
) -> Dict[str, Any]:
    """Random-search XGBoost hyperparameters, then refit the best setting on all rows.

    The target is modelled on the ``log1p`` scale; validation RMSE is reported on
    the original scale (predictions are passed through ``expm1`` before scoring).

    Args:
        train_df: DataFrame holding the feature columns and the ``target`` column.
        target: Name of the label column.
        drop_cols: Columns to exclude from the features. ``None`` uses
            ``["month", "block", "street", "lease_commence_date"]``. The target
            is always excluded, whether or not it is listed here.
        numerical_features: Columns to median-impute. ``None`` selects every
            numeric column left after dropping.
        categorical_features: Columns to mode-impute and one-hot encode. ``None``
            selects every object/category column left after dropping.
        val_size: Fraction of rows held out for early stopping and scoring.
        random_state: Seed for the train/validation split, the hyperparameter
            sampler and XGBoost itself.
        use_gpu: If True, request ``device="cuda"``; otherwise train on CPU.
        n_trials: Number of random hyperparameter draws (must be >= 1).
        early_stopping_rounds: Rounds without validation improvement before a
            trial stops.
        max_boost_rounds: Upper bound on boosting rounds per trial.
        save_dir: If given, directory that receives ``xgb_model.joblib`` (the
            fitted *preprocessor*; file name kept for compatibility),
            ``xgb_model.json`` (the booster) and ``best_summary.json``.

    Returns:
        Dict with keys ``preprocessor`` (fitted ColumnTransformer), ``model``
        (adapter whose ``predict`` takes a transformed matrix and returns
        log1p-scale predictions), ``best_params``, ``best_iteration`` (0-based)
        and ``val_rmse``.

    Raises:
        ValueError: If ``n_trials`` < 1 or no feature columns remain.
        RuntimeError: If no trial produced a finite validation RMSE.
    """
    if n_trials < 1:
        raise ValueError(f"n_trials must be >= 1, got {n_trials}.")

    # Column prep: honour the caller's drop list instead of overwriting it, and
    # always drop the target so it can never leak into the features.
    if drop_cols is None:
        drop_cols = ["month", "block", "street", "lease_commence_date"]
    drop_cols = list(dict.fromkeys([target, *drop_cols]))

    X = train_df.drop(columns=drop_cols, errors="ignore")
    y = train_df[target].to_numpy()

    if numerical_features is None:
        numerical_features = list(X.select_dtypes(include=np.number).columns)
    if categorical_features is None:
        categorical_features = list(X.select_dtypes(include=["object", "category"]).columns)

    if not numerical_features and not categorical_features:
        raise ValueError("No features found. Check drop_cols and dtypes.")

    # Local, seeded sampler: the search no longer depends on a global `rng`
    # left in the kernel by some other cell.
    rng = np.random.RandomState(random_state)

    # Preprocessor
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", Pipeline([("imp", SimpleImputer(strategy="median"))]), numerical_features),
            ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")), ("ohe", ohe)]), categorical_features),
        ],
        remainder="drop",
    )

    X_train, X_val, y_train_raw, y_val_raw = train_test_split(
        X, y, test_size=val_size, random_state=random_state
    )

    # Fit on the training split only so the validation score is leak-free.
    X_train_t = preprocessor.fit_transform(X_train)
    X_val_t = preprocessor.transform(X_val)

    # Log-transform target for more Gaussian residuals, this should help with outliers
    y_train = np.log1p(y_train_raw)
    y_val = np.log1p(y_val_raw)

    def make_params(overrides: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Return the base XGBoost params with ``overrides`` applied on top."""
        params = {
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "max_depth": 8,
            "min_child_weight": 2.0,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "alpha": 0.0, # L1
            "lambda": 1.0, # L2
            "gamma": 0.0,
            "grow_policy": "lossguide",
            "tree_method": "hist",
        }
        if random_state is not None:
            params["seed"] = int(random_state)
        if use_gpu:
            params["device"] = "cuda"
        if overrides:
            overrides = dict(overrides)  # copy so the caller's dict is not mutated
            lr = overrides.pop("learning_rate", None)
            if lr is not None:
                params["eta"] = lr  # native API name for the learning rate
            params.update(overrides)
        return params

    # Random sampling
    def sample_params() -> Dict[str, Any]:
        """Draw one hyperparameter setting (log-uniform for scale-like params)."""
        return {
            "learning_rate": float(10 ** rng.uniform(-2.3, -0.7)), # ~0.005..0.2
            "max_depth": int(rng.randint(4, 13)), # 4..12
            "min_child_weight": float(10 ** rng.uniform(-0.3, 1.0)), # ~0.5..10
            "subsample": float(rng.uniform(0.6, 1.0)),
            "colsample_bytree": float(rng.uniform(0.6, 1.0)),
            "alpha": float(10 ** rng.uniform(-3, 1)), # 0.001..10
            "lambda": float(10 ** rng.uniform(-2, 1)), # 0.01..10
            "gamma": float(10 ** rng.uniform(-3, 0)), # 0.001..1
            "max_leaves": int(rng.randint(16, 512)), # 16..511
        }

    # Helpers to use "best trees"
    def best_ntrees(booster: xgb.Booster) -> int:
        """Number of trees to use: early-stopping optimum if known, else the cap."""
        if hasattr(booster, "best_iteration") and booster.best_iteration is not None:
            return int(booster.best_iteration) + 1
        if hasattr(booster, "best_ntree_limit") and booster.best_ntree_limit:
            return int(booster.best_ntree_limit)
        return max_boost_rounds

    def predict_best(booster: xgb.Booster, dmat: xgb.DMatrix) -> np.ndarray:
        """Predict with only the trees up to the early-stopping optimum."""
        nbest = best_ntrees(booster)
        try:
            return booster.predict(dmat, iteration_range=(0, nbest))
        except TypeError:
            # Fallback for XGBoost versions that only accept `ntree_limit`.
            return booster.predict(dmat, ntree_limit=nbest)

    dtrain = xgb.DMatrix(X_train_t, label=y_train)
    dval = xgb.DMatrix(X_val_t, label=y_val)

    # Hyperparameter search
    best: Dict[str, Any] = {"rmse": math.inf, "params": None, "best_iteration": None}

    pbar = tqdm(range(n_trials), unit="trial", desc="XGB hyperparam search")
    for _ in pbar:
        params = make_params(sample_params())
        booster = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=max_boost_rounds,
            evals=[(dval, "val")],
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False,
        )
        # score on original scale
        val_pred = np.expm1(predict_best(booster, dval))
        rmse = float(np.sqrt(mean_squared_error(y_val_raw, val_pred)))

        if rmse < best["rmse"]:
            best.update({
                "rmse": rmse,
                "params": params,
                "best_iteration": best_ntrees(booster) - 1,  # store 0-based
            })
        pbar.set_postfix(rmse=f"{rmse:.2f}", best=f"{best['rmse']:.2f}")

    if best["params"] is None:
        # Every trial scored NaN/inf, so there is nothing valid to refit.
        raise RuntimeError("No trial produced a finite validation RMSE.")

    # Refit with best params on all rows. The preprocessor is refitted too, so
    # categories and imputation statistics from the validation rows are part of
    # the final model and of the preprocessor returned for test-time scoring.
    X_full_t = preprocessor.fit_transform(X)
    y_full = np.log1p(y)
    dfull = xgb.DMatrix(X_full_t, label=y_full)

    final_params = dict(best["params"])
    final_rounds = int(best["best_iteration"]) + 1

    bst_final = xgb.train(
        params=final_params,
        dtrain=dfull,
        num_boost_round=final_rounds,
        verbose_eval=False,
    )

    # Adapter for the prediction
    class BoosterAdapter:
        """Expose ``predict(matrix)`` on a raw Booster, limited to ``ntrees`` trees.

        Defined locally, so instances cannot be pickled; persist the booster
        itself (``save_model``) instead.
        """

        def __init__(self, booster: xgb.Booster, ntrees: int):
            self.booster = booster
            self.best_iteration = ntrees - 1

        def predict(self, Xmat) -> np.ndarray:
            dmat = xgb.DMatrix(Xmat)
            try:
                return self.booster.predict(dmat, iteration_range=(0, self.best_iteration + 1))
            except TypeError:
                return self.booster.predict(dmat, ntree_limit=self.best_iteration + 1)

        def __repr__(self) -> str:
            return f"BoosterAdapter(best_iteration={self.best_iteration})"

    model_adapter = BoosterAdapter(bst_final, final_rounds)

    if save_dir:
        path = Path(save_dir)
        path.mkdir(parents=True, exist_ok=True)
        # NOTE: despite its name this file holds the fitted preprocessor; the
        # booster itself is in xgb_model.json. Names kept for compatibility.
        joblib.dump(preprocessor, path / "xgb_model.joblib")
        bst_final.save_model(str(path / "xgb_model.json"))
        summary_keys = [
            "eta", "max_depth", "max_leaves", "min_child_weight", "subsample",
            "colsample_bytree", "alpha", "lambda", "gamma",
            "grow_policy", "device", "tree_method", "seed",
        ]
        with (path / "best_summary.json").open("w") as f:
            json.dump(
                {
                    "val_rmse": best["rmse"],
                    "best_iteration": int(best["best_iteration"]),
                    "num_boost_round": final_rounds,
                    "best_params": {k: final_params[k] for k in summary_keys if k in final_params},
                },
                f,
                indent=2,
            )

    return {
        "preprocessor": preprocessor,
        "model": model_adapter,
        "best_params": final_params,
        "best_iteration": int(best["best_iteration"]),
        "val_rmse": float(best["rmse"]),
    }

In [None]:
# Run the XGBoost random search for 40 trials and write the artefacts to
# models/xgb_best. use_gpu=True makes the function request device "cuda";
# pass use_gpu=False on a runtime without a GPU.
result = train_xgboost_model(
    train_df,
    target="resale_price",
    drop_cols=["resale_price", "month", "block", "street", "lease_commence_date"],
    save_dir="models/xgb_best",
    n_trials=40,
    use_gpu=True,
)
# val_rmse is on the original price scale: the function applies expm1 to the
# log1p-scale predictions before scoring.
print(f"Best val RMSE: {result['val_rmse']:.2f}")

# Best val RMSE: 25566.88

### Prediction

In [None]:
# Score the test set with the XGBoost model held in `result` and write the
# submission file. The fitted preprocessor and booster are taken from memory;
# train_xgboost_model also saved copies under its save_dir (the preprocessor
# as xgb_model.joblib, the booster as xgb_model.json), so nothing needs to be
# loaded from the working directory here.
# NOTE(review): `target` comes from an earlier cell; presumably "resale_price".
drop_cols = [target, 'month', 'block', 'street', 'lease_commence_date']
X_sub = test_df.drop(columns=drop_cols, errors="ignore")
X_sub_t = result["preprocessor"].transform(X_sub)
# The model predicts log1p(price); expm1 maps predictions back to prices.
final_predictions = np.expm1(result["model"].predict(X_sub_t))
results_df = pd.DataFrame({'id': test_df.index,
                           'Predicted': final_predictions})
# index=False: otherwise pandas writes the row index as an extra unnamed first column.
results_df.to_csv("submission.csv", index=False)

In [None]:
# NOTE(review): random_forest.joblib is read from the working directory and is
# presumably written by an earlier training cell — confirm it exists on a fresh run.
loaded_model = joblib.load("random_forest.joblib") # load saved model

# Select the same columns, in the same order, as the training feature frame `X`.
# NOTE(review): `X` is not defined in this cell; it is whatever an earlier
# training cell left in the kernel — confirm it matches this model's features.
X_submission_test = test_df[X.columns]

final_predictions = loaded_model.predict(X_submission_test)

# Pair each prediction with the test-set index as its id.
results_df = pd.DataFrame({
    'id': test_df.index,
    'predicted_resale_price': final_predictions
})

results_df

Unnamed: 0,id,predicted_resale_price
0,0,521208.96
1,1,611036.64
2,2,402893.04
3,3,440106.64
4,4,514306.64
...,...,...
49995,49995,343973.00
49996,49996,427465.52
49997,49997,519874.25
49998,49998,582948.88
