In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from category_encoders import TargetEncoder

In [2]:
train_set = pd.read_csv('train.csv')

train_set.head()

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,monthly_rent
0,2021-09,jurong east,257,Jurong East Street 24,3 room,new generation,67.0,yes,1983,1.344518,103.73863,0.0,yuhua east,jurong east,west region,1600
1,2022-05,bedok,119,bedok north road,4-room,new generation,92.0,yes,1978,1.330186,103.938717,0.0,bedok north,bedok,east region,2250
2,2022-10,toa payoh,157,lorong 1 toa payoh,3-room,improved,67.0,yes,1971,1.332242,103.845643,0.0,toa payoh central,toa payoh,central region,1900
3,2021-08,pasir ris,250,Pasir Ris Street 21,executive,apartment,149.0,yes,1993,1.370239,103.962894,0.0,pasir ris drive,pasir ris,east region,2850
4,2022-11,kallang/whampoa,34,Whampoa West,3-room,improved,68.0,yes,1972,1.320502,103.863341,0.0,bendemeer,kallang,central region,2100


### data cleaning

In [11]:
train_set_cleaned.head(10)

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,monthly_rent,rent_approval_year,rent_approval_month,lease_date_cat
0,2021-09,jurong east,257,jurong east street 24,3-room,new generation,67.0,yes,1983,1.344518,103.73863,0.0,yuhua east,jurong east,west region,1600,2021,9,80s
1,2022-05,bedok,119,bedok north road,4-room,new generation,92.0,yes,1978,1.330186,103.938717,0.0,bedok north,bedok,east region,2250,2022,5,70s
2,2022-10,toa payoh,157,lorong 1 toa payoh,3-room,improved,67.0,yes,1971,1.332242,103.845643,0.0,toa payoh central,toa payoh,central region,1900,2022,10,70s
3,2021-08,pasir ris,250,pasir ris street 21,executive,apartment,149.0,yes,1993,1.370239,103.962894,0.0,pasir ris drive,pasir ris,east region,2850,2021,8,90s
4,2022-11,kallang/whampoa,34,whampoa west,3-room,improved,68.0,yes,1972,1.320502,103.863341,0.0,bendemeer,kallang,central region,2100,2022,11,70s
5,2023-04,bukit panjang,654,senja road,executive,premium apartment,130.0,yes,2001,1.387847,103.764249,0.0,saujana,bukit panjang,west region,2300,2023,4,00s
6,2021-01,sengkang,407b,fernvale road,5-room,premium apartment,110.0,yes,2005,1.388997,103.875148,0.0,fernvale,sengkang,north-east region,2100,2021,1,00s
7,2022-06,ang mo kio,223,ang mo kio avenue 1,3-room,new generation,67.0,yes,1978,1.366048,103.838123,0.0,shangri-la,ang mo kio,north-east region,2300,2022,6,70s
8,2021-10,bishan,149,bishan street 11,4-room,simplified,84.0,yes,1987,1.344279,103.855556,0.0,bishan east,bishan,central region,2100,2021,10,80s
9,2021-04,punggol,133,edgedale plains,5-room,premium apartment,112.0,yes,2003,1.392832,103.91062,0.0,punggol field,punggol,north-east region,2100,2021,4,00s


In [7]:
train_set_cleaned['rent_approval_year'].value_counts()

2021    24909
2022    21399
2023    13692
Name: rent_approval_year, dtype: int64

In [3]:
# get copy

train_set_cleaned = train_set.copy()

In [5]:
# add two new attributes, split rent_approval_date into month, year

train_set_cleaned[['rent_approval_year','rent_approval_month']] = train_set_cleaned['rent_approval_date'].str.split('-',expand=True)

# train_set_cleaned['rent_approval_month'] = pd.to_datetime(train_set_cleaned['rent_approval_month'], format='%m').dt.month
# train_set_cleaned['rent_approval_year'] = pd.to_datetime(train_set_cleaned['rent_approval_year'], format='%y').dt.year

In [8]:
# street name to lower case

train_set_cleaned['street_name'] = train_set_cleaned['street_name'].apply(str.lower)

In [9]:
# replace blank space with hyphen in flat_type (e.g. '2 room' to '2-room')

train_set_cleaned['flat_type'] = train_set_cleaned['flat_type'].apply(lambda x: x.replace(' ', '-'))

In [10]:
def date_filter_condition(x):
    
    if x < 1970:
        return 'before 70s'
    elif x >= 1970 and x < 1980:
        return '70s'
    elif x >= 1980 and x < 1990:
        return '80s'
    elif x >= 1990 and x < 2000:
        return '90s'
    elif x >= 2000 and x < 2010:
        return '00s'
    elif x >= 2010 and x < 2020:
        return '10s'
    else:
        return 'others'

# categorize lease commence date by decades    
train_set_cleaned['lease_date_cat'] = train_set_cleaned['lease_commence_date'].apply(date_filter_condition)

### encoding

In [12]:
# create a new dataframe
encoded_train_set = train_set_cleaned[['monthly_rent']].copy()

#### spatial information

In [13]:
# encode regions with the mean of target variable for each region

region_encoder = TargetEncoder()
encoded_train_set['region_encoded'] = region_encoder.fit_transform(train_set_cleaned['region'], train_set_cleaned['monthly_rent'])

In [14]:
# encode planning areas with target variable information

planning_area_encoder = TargetEncoder()
encoded_train_set['planning_area_encoded'] = planning_area_encoder.fit_transform(train_set_cleaned['planning_area'], train_set_cleaned['monthly_rent'])

In [15]:
# encode subzones with target variable information

subzone_encoder = TargetEncoder()
encoded_train_set['subzone_encoded'] = subzone_encoder.fit_transform(train_set_cleaned['subzone'], train_set_cleaned['monthly_rent'])

In [16]:
# encode regions with target variable information

street_encoder = TargetEncoder()
encoded_train_set['street_encoded'] = street_encoder.fit_transform(train_set_cleaned['street_name'], train_set_cleaned['monthly_rent'])

In [17]:
# spatial hierarchical correlation

region_planning_corr = encoded_train_set['region_encoded'].corr(encoded_train_set['planning_area_encoded'])
planning_subzone_corr = encoded_train_set['planning_area_encoded'].corr(encoded_train_set['subzone_encoded'])
subzone_street_corr = encoded_train_set['subzone_encoded'].corr(encoded_train_set['street_encoded'])

print(f'The correlation between region and planning area is {region_planning_corr:3f}')
print(f'The correlation between planning area and subzone is {planning_subzone_corr:3f}')
print(f'The correlation between subzone and street is {subzone_street_corr:3f}')

The correlation between region and planning area is 0.579100
The correlation between planning area and subzone is 0.669665
The correlation between subzone and street is 0.828036


In [18]:
# spatial information correlation with monthly rent

region_rent_corr = encoded_train_set['region_encoded'].corr(encoded_train_set['monthly_rent'])
planning_rent_corr = encoded_train_set['planning_area_encoded'].corr(encoded_train_set['monthly_rent'])
subzone_rent_corr = encoded_train_set['subzone_encoded'].corr(encoded_train_set['monthly_rent'])
street_rent_corr = encoded_train_set['street_encoded'].corr(encoded_train_set['monthly_rent'])

print(f'Region and Monthly Rental correlation is {region_rent_corr:3f}')
print(f'Planning Area and Monthly Rental correlation is {planning_rent_corr:3f}')
print(f'Subzone and Monthly Rental correlation is {subzone_rent_corr:3f}')
print(f'Street and Monthly Rental correlation is {street_rent_corr:3f}')

Region and Monthly Rental correlation is 0.124776
Planning Area and Monthly Rental correlation is 0.215462
Subzone and Monthly Rental correlation is 0.320711
Street and Monthly Rental correlation is 0.374731


#### rent date & lease date

In [19]:
# rental approval date

rental_encoder = TargetEncoder()
encoded_train_set['rental_approval_date_encoded'] = rental_encoder.fit_transform(train_set_cleaned['rent_approval_date'].astype(str), train_set_cleaned['monthly_rent'])

rental_date_rent_corr = encoded_train_set['rental_approval_date_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Rental Approval Date and Monthly Rental correlation is {rental_date_rent_corr:3f}')

Rental Approval Date and Monthly Rental correlation is 0.546074


In [20]:
# rental approval year

rental_year_encoder = TargetEncoder()
encoded_train_set['rental_approval_year_encoded'] = rental_year_encoder.fit_transform(train_set_cleaned['rent_approval_year'].astype(str), train_set_cleaned['monthly_rent'])

rental_year_rent_corr = encoded_train_set['rental_approval_year_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Rental Approval Year and Monthly Rental correlation is {rental_year_rent_corr:3f}')

Rental Approval Year and Monthly Rental correlation is 0.504737


In [21]:
# rental approval month

rental_month_encoder = TargetEncoder()
encoded_train_set['rental_approval_month_encoded'] = rental_month_encoder.fit_transform(train_set_cleaned['rent_approval_month'].astype(str), train_set_cleaned['monthly_rent'])

rental_month_rent_corr = encoded_train_set['rental_approval_month_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Rental Approval Month and Monthly Rental correlation is {rental_month_rent_corr:3f}')

Rental Approval Month and Monthly Rental correlation is 0.106482


In [22]:
# lease commence year (in decade)

lease_encoder = TargetEncoder()
encoded_train_set['lease_commence_date_encoded'] = lease_encoder.fit_transform(train_set_cleaned['lease_date_cat'], train_set_cleaned['monthly_rent'])

lease_date_rent_corr = encoded_train_set['lease_commence_date_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Lease Commence Date and Monthly Rental correlation is {lease_date_rent_corr:3f}')

Lease Commence Date and Monthly Rental correlation is 0.223300


#### flat_type and flat_model

In [9]:
train_set_cleaned['flat_type'].value_counts()

4-room       21808
3-room       18747
5-room       14724
executive     3525
2-room         923
Name: flat_type, dtype: int64

In [24]:
# flat type

flat_type_encoder = TargetEncoder()
encoded_train_set['flat_type_encoded'] = flat_type_encoder.fit_transform(train_set_cleaned['flat_type'], train_set_cleaned['monthly_rent'])

flat_type_rent_corr = encoded_train_set['flat_type_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Flat Type and Monthly Rental correlation is {flat_type_rent_corr:3f}')

Flat Type and Monthly Rental correlation is 0.346146


In [25]:
# flat model

flat_model_encoder = TargetEncoder()
encoded_train_set['flat_model_encoded'] = flat_model_encoder.fit_transform(train_set_cleaned['flat_model'], train_set_cleaned['monthly_rent'])

flat_model_rent_corr = encoded_train_set['flat_model_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Flat Model and Monthly Rental correlation is {flat_model_rent_corr:3f}')

Flat Model and Monthly Rental correlation is 0.236876


#### floor area

In [26]:
# floor area sqm
encoded_train_set['floor_area_sqm'] = train_set_cleaned['floor_area_sqm'].copy()

### normalization

In [27]:
from sklearn.preprocessing import StandardScaler

In [28]:
encoded_train_set

Unnamed: 0,monthly_rent,region_encoded,planning_area_encoded,subzone_encoded,street_encoded,rental_approval_date_encoded,rental_approval_year_encoded,rental_approval_month_encoded,lease_commence_date_encoded,flat_type_encoded,flat_model_encoded,floor_area_sqm
0,1600,2569.167537,2595.146199,2542.158516,2312.179832,2233.926780,2225.773817,2489.108495,2479.803864,2276.033233,2369.965462,67.0
1,2250,2570.667785,2438.227223,2360.371046,2404.212860,2517.128874,2651.014066,2618.130520,2421.705462,2692.359176,2369.965462,92.0
2,1900,2737.201353,2516.680515,2808.893871,2403.464419,2928.483245,2651.014066,2563.328013,2421.705462,2276.033233,2636.211052,67.0
3,2850,2570.667785,2686.857477,2610.338573,2757.834101,2249.901768,2225.773817,2470.895522,2700.899570,2892.857143,2878.725962,149.0
4,2100,2737.201353,2702.635659,2793.525180,2407.998266,2986.739659,2651.014066,2611.993243,2421.705462,2276.033233,2636.211052,68.0
...,...,...,...,...,...,...,...,...,...,...,...,...
59995,2200,2558.822710,2416.700057,2390.887097,2336.012658,2233.926780,2225.773817,2489.108495,2421.705462,2276.033233,2369.965462,67.0
59996,4100,2737.201353,2904.113924,2694.936709,2763.731680,3178.128128,3158.694858,2608.904470,2880.707364,2692.359176,2612.031305,83.0
59997,2250,2570.667785,2638.489123,2602.823315,2509.602223,2582.606383,2651.014066,2658.595055,2479.803864,2815.593875,2636.211052,122.0
59998,4700,2570.667785,2438.227223,2434.379786,2591.443246,3069.581639,3158.694858,2490.553580,2421.705462,2815.593875,2444.223986,123.0


In [29]:
# initialize the feature scaler
scaler = StandardScaler()

normalized_features = scaler.fit_transform(encoded_train_set.iloc[:,1:])

In [30]:
normalized_features.shape

(60000, 11)

In [31]:
# normalize target variable
y_scaler = StandardScaler()

target_variable = y_scaler.fit_transform(encoded_train_set['monthly_rent'].to_numpy().reshape(-1, 1))
target_variable = target_variable.reshape(-1)

In [32]:
target_variable.shape

(60000,)

### training

In [33]:
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

#### lasso regression model

In [34]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Lasso Regression with cross-validation
lasso = LassoCV(cv=kfold, random_state=42)
lasso.fit(normalized_features, target_variable)

LassoCV(cv=KFold(n_splits=5, random_state=42, shuffle=True), random_state=42)

In [35]:
# evaluate the model

print("Optimal alpha:", lasso.alpha_)
print("Mean cross-validated score of the best estimator:", lasso.score(normalized_features, target_variable))

Optimal alpha: 0.0022045081649800934
Mean cross-validated score of the best estimator: 0.5026472684759563


In [37]:
# making predictions 
y_pred = lasso.predict(normalized_features)

# evaluating the model
mse = mean_squared_error(target_variable, y_pred)
print(f"Lasso Regression Mean Squared Error: {mse}")
print(f"Model Coefficients: \n{lasso.coef_}")

Lasso Regression Mean Squared Error: 0.4973527315240436
Model Coefficients: 
[ 1.07341824e-01  8.02923488e-04  3.84111326e-02  1.94728342e-01
  5.31511063e-01  1.09286102e-02 -2.66962319e-04 -0.00000000e+00
  2.56129575e-01  0.00000000e+00  5.47936350e-02]


#### graident boosting tree

In [38]:
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# implement gradient boosting tree with cross validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(gbr, normalized_features, target_variable, cv=cv, scoring='neg_mean_squared_error')

gbr.fit(normalized_features, target_variable)

GradientBoostingRegressor(random_state=42)

In [39]:
mean_mse = np.mean(-cross_val_scores)
print(f"Gradient Boosting Regressor Mean Squared Error: {mean_mse}")

Gradient Boosting Regressor Mean Squared Error: 0.46812061212081335


In [40]:
print("Mean cross-validated score of the best regressor:", gbr.score(normalized_features, target_variable))

Mean cross-validated score of the best regressor: 0.5393054798182728


### test set evaluation

In [41]:
test_set = pd.read_csv('test.csv')

test_set.head()

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region
0,2023-01,hougang,245,hougang street 22,5-room,improved,121.0,yes,1984,1.358411,103.891722,0.0,lorong ah soo,hougang,north-east region
1,2022-09,sembawang,316,sembawang vista,4-room,model a,100.0,yes,1999,1.446343,103.820817,0.0,sembawang central,sembawang,north region
2,2023-07,clementi,708,Clementi West Street 2,4-room,new generation,91.0,yes,1980,1.305719,103.762168,0.0,clementi west,clementi,west region
3,2021-08,jurong east,351,Jurong East Street 31,3 room,model a,74.0,yes,1986,1.344832,103.730778,0.0,yuhua west,jurong east,west region
4,2022-03,jurong east,305,jurong east street 32,5-room,improved,121.0,yes,1983,1.345437,103.735241,0.0,yuhua west,jurong east,west region


In [103]:
(test_set.isna().sum(axis=1) > 0).sum()

0

In [42]:
# data preparation

test_set[['rent_approval_year','rent_approval_month']] = test_set['rent_approval_date'].str.split('-',expand=True)

test_set['rent_approval_date'] = test_set['rent_approval_date'].astype('Period[M]')

# street name to lower case
test_set['street_name'] = test_set['street_name'].apply(str.lower)

# replace blank space with hyphen in flat_type (e.g. '2 room' to '2-room')
test_set['flat_type'] = test_set['flat_type'].apply(lambda x: x.replace(' ', '-'))

# categorize lease commence date by decades (remember to date_filter_condition function above first)
test_set['lease_date_cat'] = test_set['lease_commence_date'].apply(date_filter_condition)

In [43]:
# encoding test set

encoded_test_set = pd.DataFrame(index=test_set.index)

encoded_test_set['region_encoded'] = region_encoder.transform(test_set['region'])
encoded_test_set['planning_area_encoded'] = planning_area_encoder.transform(test_set['planning_area'])
encoded_test_set['subzone_encoded'] = subzone_encoder.transform(test_set['subzone'])
encoded_test_set['street_encoded'] = street_encoder.transform(test_set['street_name'])

encoded_test_set['rental_approval_date_encoded'] = rental_encoder.transform(test_set['rent_approval_date'])
encoded_test_set['rental_approval_year_encoded'] = rental_year_encoder.transform(test_set['rent_approval_year'])
encoded_test_set['rental_approval_month_encoded'] = rental_month_encoder.transform(test_set['rent_approval_month'])

encoded_test_set['lease_commence_date_encoded'] = lease_encoder.transform(test_set['lease_date_cat'])

encoded_test_set['flat_type_encoded'] = flat_type_encoder.transform(test_set['flat_type'])
encoded_test_set['flat_model_encoded'] = flat_model_encoder.transform(test_set['flat_model'])

encoded_test_set['floor_area_sqm'] = test_set['floor_area_sqm'].copy()



In [44]:
encoded_test_set.head(10)

Unnamed: 0,region_encoded,planning_area_encoded,subzone_encoded,street_encoded,rental_approval_date_encoded,rental_approval_year_encoded,rental_approval_month_encoded,lease_commence_date_encoded,flat_type_encoded,flat_model_encoded,floor_area_sqm
0,2558.82271,2503.252886,2427.604167,2348.993316,2590.328333,3158.694858,2490.55358,2479.803864,2815.593875,2636.211052,121.0
1,2450.623806,2540.49101,2592.33279,2640.043972,2590.328333,2651.014066,2489.108495,2700.89957,2692.359176,2612.031305,100.0
2,2569.167537,2646.808979,2395.588235,2532.195122,2590.328333,3158.694858,2729.54235,2479.803864,2692.359176,2369.965462,91.0
3,2569.167537,2595.146199,2400.15015,2780.896042,2590.328333,2225.773817,2470.895522,2479.803864,2276.033233,2612.031305,74.0
4,2569.167537,2595.146199,2400.15015,2268.56436,2590.328333,2651.014066,2543.250298,2479.803864,2815.593875,2636.211052,121.0
5,2569.167537,2646.808979,2395.588235,2229.881779,2590.328333,2651.014066,2490.55358,2479.803864,2276.033233,2369.965462,67.0
6,2558.82271,2665.555556,2654.294479,2590.503432,2590.328333,2225.773817,2618.13052,2742.515391,2815.593875,2709.678998,110.0
7,2570.667785,2638.489123,2602.085865,3081.579221,2590.328333,2651.014066,2470.895522,2742.515391,2815.593875,3150.404313,108.0
8,2569.167537,2640.890551,2873.930481,2820.173278,2590.328333,2225.773817,2680.680614,2742.515391,2892.857143,2709.678998,133.0
9,2737.201353,2585.947712,2585.947712,2480.479822,2590.328333,3158.694858,2729.54235,2421.705462,2815.593875,2444.223986,120.0


In [45]:
# normalization

normalized_test_features = scaler.transform(encoded_test_set)

In [53]:
# lasso regressor prediction
lasso_y_pred = lasso.predict(normalized_test_features)
lasso_y_pred = y_scaler.inverse_transform(lasso_y_pred)

In [54]:
lasso_y_pred[:20]

array([2638.07941832, 2584.61883667, 2601.29352993, 2379.70944645,
       2589.78253261, 2081.74073879, 2756.97425016, 3030.36795825,
       3009.71426825, 2879.12616047, 2494.58355694, 2329.07311285,
       2537.64397099, 2757.65927939, 2645.87395555, 2684.60741198,
       2237.91952287, 2989.70775342, 2765.80046726, 2648.36433851])

In [50]:
# XGBoost prediction
gbr_y_pred = gbr.predict(normalized_test_features)
gbr_y_pred = y_scaler.inverse_transform(gbr_y_pred)

In [52]:
gbr_y_pred[:30]

array([2623.72026836, 2591.650405  , 2619.88358809, 2299.96589769,
       2618.31453875, 2240.84711411, 2632.62192082, 3024.28558682,
       2883.93607308, 2947.91970038, 2433.53263796, 2244.79164793,
       2736.96339214, 2660.4886048 , 2606.38332563, 2575.21268777,
       2228.43972603, 3075.60306771, 2737.43612658, 2632.43488934,
       2185.68289684, 2417.72984217, 2674.25231862, 2427.79251577,
       2448.82243865, 2315.37280209, 2810.64536438, 2624.33752873,
       2449.50436559, 2675.58103808])