In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from category_encoders import TargetEncoder

In [2]:
train_set = pd.read_csv('Datasets/train.csv')

train_set.head()

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,monthly_rent
0,2021-09,jurong east,257,Jurong East Street 24,3 room,new generation,67.0,yes,1983,1.344518,103.73863,0.0,yuhua east,jurong east,west region,1600
1,2022-05,bedok,119,bedok north road,4-room,new generation,92.0,yes,1978,1.330186,103.938717,0.0,bedok north,bedok,east region,2250
2,2022-10,toa payoh,157,lorong 1 toa payoh,3-room,improved,67.0,yes,1971,1.332242,103.845643,0.0,toa payoh central,toa payoh,central region,1900
3,2021-08,pasir ris,250,Pasir Ris Street 21,executive,apartment,149.0,yes,1993,1.370239,103.962894,0.0,pasir ris drive,pasir ris,east region,2850
4,2022-11,kallang/whampoa,34,Whampoa West,3-room,improved,68.0,yes,1972,1.320502,103.863341,0.0,bendemeer,kallang,central region,2100


### data cleaning

In [3]:
# get copy

train_set_cleaned = train_set.copy()

In [4]:
# add two new attributes, split rent_approval_date into month, year

train_set_cleaned[['rent_approval_year','rent_approval_month']] = train_set_cleaned['rent_approval_date'].str.split('-',expand=True)

# train_set_cleaned['rent_approval_month'] = pd.to_datetime(train_set_cleaned['rent_approval_month'], format='%m').dt.month
# train_set_cleaned['rent_approval_year'] = pd.to_datetime(train_set_cleaned['rent_approval_year'], format='%y').dt.year

In [5]:
# street name to lower case

train_set_cleaned['street_name'] = train_set_cleaned['street_name'].apply(str.lower)

In [6]:
# replace blank space with hyphen in flat_type (e.g. '2 room' to '2-room')

train_set_cleaned['flat_type'] = train_set_cleaned['flat_type'].apply(lambda x: x.replace(' ', '-'))

In [7]:
def date_filter_condition(x):
    
    if x < 1970:
        return 'before 70s'
    elif x >= 1970 and x < 1980:
        return '70s'
    elif x >= 1980 and x < 1990:
        return '80s'
    elif x >= 1990 and x < 2000:
        return '90s'
    elif x >= 2000 and x < 2010:
        return '00s'
    elif x >= 2010 and x < 2020:
        return '10s'
    else:
        return 'others'

# categorize lease commence date by decades    
train_set_cleaned['lease_date_cat'] = train_set_cleaned['lease_commence_date'].apply(date_filter_condition)

In [8]:
train_set_cleaned.head(10)

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,monthly_rent,rent_approval_year,rent_approval_month,lease_date_cat
0,2021-09,jurong east,257,jurong east street 24,3-room,new generation,67.0,yes,1983,1.344518,103.73863,0.0,yuhua east,jurong east,west region,1600,2021,9,80s
1,2022-05,bedok,119,bedok north road,4-room,new generation,92.0,yes,1978,1.330186,103.938717,0.0,bedok north,bedok,east region,2250,2022,5,70s
2,2022-10,toa payoh,157,lorong 1 toa payoh,3-room,improved,67.0,yes,1971,1.332242,103.845643,0.0,toa payoh central,toa payoh,central region,1900,2022,10,70s
3,2021-08,pasir ris,250,pasir ris street 21,executive,apartment,149.0,yes,1993,1.370239,103.962894,0.0,pasir ris drive,pasir ris,east region,2850,2021,8,90s
4,2022-11,kallang/whampoa,34,whampoa west,3-room,improved,68.0,yes,1972,1.320502,103.863341,0.0,bendemeer,kallang,central region,2100,2022,11,70s
5,2023-04,bukit panjang,654,senja road,executive,premium apartment,130.0,yes,2001,1.387847,103.764249,0.0,saujana,bukit panjang,west region,2300,2023,4,00s
6,2021-01,sengkang,407b,fernvale road,5-room,premium apartment,110.0,yes,2005,1.388997,103.875148,0.0,fernvale,sengkang,north-east region,2100,2021,1,00s
7,2022-06,ang mo kio,223,ang mo kio avenue 1,3-room,new generation,67.0,yes,1978,1.366048,103.838123,0.0,shangri-la,ang mo kio,north-east region,2300,2022,6,70s
8,2021-10,bishan,149,bishan street 11,4-room,simplified,84.0,yes,1987,1.344279,103.855556,0.0,bishan east,bishan,central region,2100,2021,10,80s
9,2021-04,punggol,133,edgedale plains,5-room,premium apartment,112.0,yes,2003,1.392832,103.91062,0.0,punggol field,punggol,north-east region,2100,2021,4,00s


In [9]:
train_set_cleaned['rent_approval_year'].value_counts()

2021    24909
2022    21399
2023    13692
Name: rent_approval_year, dtype: int64

### encoding

In [9]:
# create a new dataframe
encoded_train_set = train_set_cleaned[['monthly_rent']].copy()

#### spatial information

In [10]:
# encode regions with the mean of target variable for each region

region_encoder = TargetEncoder()
encoded_train_set['region_encoded'] = region_encoder.fit_transform(train_set_cleaned['region'], train_set_cleaned['monthly_rent'])

In [11]:
# encode planning areas with target variable information

planning_area_encoder = TargetEncoder()
encoded_train_set['planning_area_encoded'] = planning_area_encoder.fit_transform(train_set_cleaned['planning_area'], train_set_cleaned['monthly_rent'])

In [12]:
# encode subzones with target variable information

subzone_encoder = TargetEncoder()
encoded_train_set['subzone_encoded'] = subzone_encoder.fit_transform(train_set_cleaned['subzone'], train_set_cleaned['monthly_rent'])

In [13]:
# encode streets with target variable information

street_encoder = TargetEncoder()
encoded_train_set['street_encoded'] = street_encoder.fit_transform(train_set_cleaned['street_name'], train_set_cleaned['monthly_rent'])

In [14]:
## encode block (with street name) with target variable information

block_encoder = TargetEncoder()
encoded_train_set['block_encoded'] = block_encoder.fit_transform(train_set_cleaned['street_name']+train_set_cleaned['block'], train_set_cleaned['monthly_rent'])

In [15]:
# spatial hierarchical correlation

region_planning_corr = encoded_train_set['region_encoded'].corr(encoded_train_set['planning_area_encoded'])
planning_subzone_corr = encoded_train_set['planning_area_encoded'].corr(encoded_train_set['subzone_encoded'])
subzone_street_corr = encoded_train_set['subzone_encoded'].corr(encoded_train_set['street_encoded'])

print(f'The correlation between region and planning area is {region_planning_corr:3f}')
print(f'The correlation between planning area and subzone is {planning_subzone_corr:3f}')
print(f'The correlation between subzone and street is {subzone_street_corr:3f}')

The correlation between region and planning area is 0.579100
The correlation between planning area and subzone is 0.669665
The correlation between subzone and street is 0.828036


In [16]:
# spatial information correlation with monthly rent

region_rent_corr = encoded_train_set['region_encoded'].corr(encoded_train_set['monthly_rent'])
planning_rent_corr = encoded_train_set['planning_area_encoded'].corr(encoded_train_set['monthly_rent'])
subzone_rent_corr = encoded_train_set['subzone_encoded'].corr(encoded_train_set['monthly_rent'])
street_rent_corr = encoded_train_set['street_encoded'].corr(encoded_train_set['monthly_rent'])
block_rent_corr = encoded_train_set['block_encoded'].corr(encoded_train_set['monthly_rent'])

print(f'Region and Monthly Rental correlation is {region_rent_corr:3f}')
print(f'Planning Area and Monthly Rental correlation is {planning_rent_corr:3f}')
print(f'Subzone and Monthly Rental correlation is {subzone_rent_corr:3f}')
print(f'Street and Monthly Rental correlation is {street_rent_corr:3f}')
print(f'Block and Monthly Rental correlation is {block_rent_corr:3f}')

Region and Monthly Rental correlation is 0.124776
Planning Area and Monthly Rental correlation is 0.215462
Subzone and Monthly Rental correlation is 0.320711
Street and Monthly Rental correlation is 0.374731
Block and Monthly Rental correlation is 0.521544


#### rent date & lease date

In [17]:
# rental approval date

rental_encoder = TargetEncoder()
encoded_train_set['rental_approval_date_encoded'] = rental_encoder.fit_transform(train_set_cleaned['rent_approval_date'].astype(str), train_set_cleaned['monthly_rent'])

rental_date_rent_corr = encoded_train_set['rental_approval_date_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Rental Approval Date and Monthly Rental correlation is {rental_date_rent_corr:3f}')

Rental Approval Date and Monthly Rental correlation is 0.546074


In [18]:
# rental approval year

rental_year_encoder = TargetEncoder()
encoded_train_set['rental_approval_year_encoded'] = rental_year_encoder.fit_transform(train_set_cleaned['rent_approval_year'].astype(str), train_set_cleaned['monthly_rent'])

rental_year_rent_corr = encoded_train_set['rental_approval_year_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Rental Approval Year and Monthly Rental correlation is {rental_year_rent_corr:3f}')

Rental Approval Year and Monthly Rental correlation is 0.504737


In [19]:
# rental approval month

rental_month_encoder = TargetEncoder()
encoded_train_set['rental_approval_month_encoded'] = rental_month_encoder.fit_transform(train_set_cleaned['rent_approval_month'].astype(str), train_set_cleaned['monthly_rent'])

rental_month_rent_corr = encoded_train_set['rental_approval_month_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Rental Approval Month and Monthly Rental correlation is {rental_month_rent_corr:3f}')

Rental Approval Month and Monthly Rental correlation is 0.106482


In [20]:
# lease commence year (in decade)

lease_encoder = TargetEncoder()
encoded_train_set['lease_commence_date_encoded'] = lease_encoder.fit_transform(train_set_cleaned['lease_date_cat'], train_set_cleaned['monthly_rent'])

lease_date_rent_corr = encoded_train_set['lease_commence_date_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Lease Commence Date and Monthly Rental correlation is {lease_date_rent_corr:3f}')

Lease Commence Date and Monthly Rental correlation is 0.223300


In [21]:
## rental date + type (NOT good for GBR)

type_by_date = TargetEncoder()
encoded_train_set['type_by_date_encoded'] = type_by_date.fit_transform(train_set_cleaned['rent_approval_date']+train_set_cleaned['flat_type'], train_set_cleaned['monthly_rent'])

type_by_date_rent_corr = encoded_train_set['type_by_date_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Flat Type by Rental Date and Monthly Rental correlation is {type_by_date_rent_corr:3f}')

Flat Type by Rental Date and Monthly Rental correlation is 0.655330


#### flat_type and flat_model

In [21]:
train_set_cleaned['flat_type'].value_counts()

4-room       21889
3-room       18897
5-room       14759
executive     3528
2-room         927
Name: flat_type, dtype: int64

In [22]:
# flat type

flat_type_encoder = TargetEncoder()
encoded_train_set['flat_type_encoded'] = flat_type_encoder.fit_transform(train_set_cleaned['flat_type'], train_set_cleaned['monthly_rent'])

flat_type_rent_corr = encoded_train_set['flat_type_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Flat Type and Monthly Rental correlation is {flat_type_rent_corr:3f}')

Flat Type and Monthly Rental correlation is 0.346146


In [23]:
# flat model

flat_model_encoder = TargetEncoder()
encoded_train_set['flat_model_encoded'] = flat_model_encoder.fit_transform(train_set_cleaned['flat_model'], train_set_cleaned['monthly_rent'])

flat_model_rent_corr = encoded_train_set['flat_model_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Flat Model and Monthly Rental correlation is {flat_model_rent_corr:3f}')

Flat Model and Monthly Rental correlation is 0.236876


In [24]:
## street + type

type_by_street = TargetEncoder()
encoded_train_set['type_by_street_encoded'] = type_by_street.fit_transform(train_set_cleaned['street_name']+train_set_cleaned['flat_type'], train_set_cleaned['monthly_rent'])

type_by_street_rent_corr = encoded_train_set['type_by_street_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Flat Type by Street and Monthly Rental correlation is {type_by_street_rent_corr:3f}')

Flat Type by Street and Monthly Rental correlation is 0.481737


#### floor area

In [25]:
# floor area sqm
encoded_train_set['floor_area_sqm'] = train_set_cleaned['floor_area_sqm'].copy()

floor_area_sqm_corr = encoded_train_set['floor_area_sqm'].corr(encoded_train_set['monthly_rent'])
print(f'Floor Area in Sqm and Monthly Rental correlation is {floor_area_sqm_corr:3f}')

Floor Area in Sqm and Monthly Rental correlation is 0.306466


### normalization

In [26]:
from sklearn.preprocessing import StandardScaler

In [63]:
encoded_train_set

Unnamed: 0,monthly_rent,region_encoded,planning_area_encoded,subzone_encoded,street_encoded,block_encoded,rental_approval_date_encoded,rental_approval_year_encoded,rental_approval_month_encoded,lease_commence_date_encoded,flat_type_encoded,flat_model_encoded,type_by_street_encoded,floor_area_sqm
0,1600,2569.167537,2595.146199,2542.158516,2312.179832,2319.910527,2233.926780,2225.773817,2489.108495,2479.803864,2276.033233,2369.965462,2237.215820,67.0
1,2250,2570.667785,2438.227223,2360.371046,2404.212860,2596.971112,2517.128874,2651.014066,2618.130520,2421.705462,2692.359176,2369.965462,2591.822430,92.0
2,1900,2737.201353,2516.680515,2808.893871,2403.464419,2317.549823,2928.483245,2651.014066,2563.328013,2421.705462,2276.033233,2636.211052,2301.216814,67.0
3,2850,2570.667785,2686.857477,2610.338573,2757.834101,2678.043573,2249.901768,2225.773817,2470.895522,2700.899570,2892.857143,2878.725962,2840.416063,149.0
4,2100,2737.201353,2702.635659,2793.525180,2407.998266,2407.998266,2986.739659,2651.014066,2611.993243,2421.705462,2276.033233,2636.211052,2407.998266,68.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,2200,2558.822710,2416.700057,2390.887097,2336.012658,2549.933369,2233.926780,2225.773817,2489.108495,2421.705462,2276.033233,2369.965462,2206.830123,67.0
59996,4100,2737.201353,2904.113924,2694.936709,2763.731680,2784.903851,3178.128128,3158.694858,2608.904470,2880.707364,2692.359176,2612.031305,2745.330132,83.0
59997,2250,2570.667785,2638.489123,2602.823315,2509.602223,2651.586378,2582.606383,2651.014066,2658.595055,2479.803864,2815.593875,2636.211052,2541.557159,122.0
59998,4700,2570.667785,2438.227223,2434.379786,2591.443246,2766.882904,3069.581639,3158.694858,2490.553580,2421.705462,2815.593875,2444.223986,2743.101626,123.0


In [64]:
# initialize the feature scaler
scaler = StandardScaler()

normalized_features = scaler.fit_transform(encoded_train_set.iloc[:,1:])

In [28]:
df_describe = pd.DataFrame(normalized_features)
df_describe.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0
mean,3.504271e-15,3.138378e-15,1.884108e-15,-1.100081e-15,8.51208e-16,-2.717411e-15,-7.782885e-16,1.624649e-15,6.510163e-16,1.377269e-16,4.176437e-16
std,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008
min,-1.566138,-1.248074,-1.997123,-2.29649,-1.136227,-1.010297,-1.568908,-2.255897,-2.991037,-3.995729,-2.511392
25%,-0.3531893,-0.8052347,-0.6691684,-0.678632,-0.8720157,-1.010297,-0.6184322,-0.692344,-1.270077,-0.7507854,-0.8919552
50%,-0.2372201,0.03131269,-0.01654897,-0.1316916,-0.3898313,0.1681796,0.2440221,-0.692344,0.41231,0.1323593,-0.0614746
75%,-0.2204018,0.3667341,0.2406733,0.4403614,1.227626,0.1681796,0.8967736,0.9533255,0.9103054,0.2772886,0.6444339
max,1.646499,6.29796,6.454196,6.316627,1.72115,1.575125,1.82876,1.818984,1.222529,10.3054,5.004457


In [29]:
# normalize target variable
y_scaler = StandardScaler()

target_variable = y_scaler.fit_transform(encoded_train_set['monthly_rent'].to_numpy().reshape(-1, 1))
target_variable = target_variable.reshape(-1)

In [30]:
df_describe = pd.DataFrame(target_variable)
df_describe.describe()

Unnamed: 0,0
count,60000.0
mean,-2.100866e-16
std,1.000008
min,-3.203684
25%,-0.6858655
50%,-0.266229
75%,0.5730439
max,6.098257


### training

In [30]:
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

In [65]:
# train-test split

X_train, X_test, y_train, y_test = train_test_split(normalized_features, target_variable, test_size=0.33, random_state=42)

#### lasso regression model

In [78]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Lasso Regression with cross-validation
lasso = LassoCV(cv=kfold, random_state=42)
lasso.fit(X_train, y_train)

LassoCV(cv=KFold(n_splits=5, random_state=42, shuffle=True), random_state=42)

In [84]:
# evaluate the model

print("Optimal alpha:", lasso.alpha_)
print("Mean cross-validated score of the best estimator:", lasso.score(X_test, y_test))

Optimal alpha: 0.00054675244381139
Mean cross-validated score of the best estimator: 0.5550698993935823


In [85]:
# making predictions 
y_pred = lasso.predict(X_test)

# evaluating the model
mse = mean_squared_error(y_test, y_pred)
print(f"Lasso Regression Mean Squared Error: {mse}")
print(f"Model Coefficients: \n{lasso.coef_}")

Lasso Regression Mean Squared Error: 0.4461977721338544
Model Coefficients: 
[ 0.06919061 -0.00230139 -0.02300714 -0.00522218  0.31699328  0.50549418
  0.0141737  -0.00250901 -0.00958236  0.05598543 -0.01657795  0.15722741
  0.08057235]


#### graident boosting tree

In [83]:
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# implement gradient boosting tree with cross validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(gbr, X_train, y_train, cv=cv, scoring='neg_mean_squared_error')

gbr.fit(X_train, y_train)

GradientBoostingRegressor(random_state=42)

In [86]:
mean_mse = np.mean(-cross_val_scores)
print(f"Gradient Boosting Regressor Mean Squared Error: {mean_mse}")

Gradient Boosting Regressor Mean Squared Error: 0.4008180550041719


In [87]:
print("Mean cross-validated score of the best regressor:", gbr.score(X_test, y_test))

Mean cross-validated score of the best regressor: 0.5972688419230905


### test set evaluation

In [71]:
test_set = pd.read_csv('Datasets/test.csv')

test_set.head()

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region
0,2023-01,hougang,245,hougang street 22,5-room,improved,121.0,yes,1984,1.358411,103.891722,0.0,lorong ah soo,hougang,north-east region
1,2022-09,sembawang,316,sembawang vista,4-room,model a,100.0,yes,1999,1.446343,103.820817,0.0,sembawang central,sembawang,north region
2,2023-07,clementi,708,Clementi West Street 2,4-room,new generation,91.0,yes,1980,1.305719,103.762168,0.0,clementi west,clementi,west region
3,2021-08,jurong east,351,Jurong East Street 31,3 room,model a,74.0,yes,1986,1.344832,103.730778,0.0,yuhua west,jurong east,west region
4,2022-03,jurong east,305,jurong east street 32,5-room,improved,121.0,yes,1983,1.345437,103.735241,0.0,yuhua west,jurong east,west region


In [39]:
(test_set.isna().sum(axis=1) > 0).sum()

0

In [72]:
# data preparation

test_set[['rent_approval_year','rent_approval_month']] = test_set['rent_approval_date'].str.split('-',expand=True)

# test_set['rent_approval_date'] = test_set['rent_approval_date'].astype('Period[M]')

# street name to lower case
test_set['street_name'] = test_set['street_name'].apply(str.lower)

# replace blank space with hyphen in flat_type (e.g. '2 room' to '2-room')
test_set['flat_type'] = test_set['flat_type'].apply(lambda x: x.replace(' ', '-'))

# categorize lease commence date by decades (remember to date_filter_condition function above first)
test_set['lease_date_cat'] = test_set['lease_commence_date'].apply(date_filter_condition)

In [73]:
# encoding test set

encoded_test_set = pd.DataFrame(index=test_set.index)

encoded_test_set['region_encoded'] = region_encoder.transform(test_set['region'])
encoded_test_set['planning_area_encoded'] = planning_area_encoder.transform(test_set['planning_area'])
encoded_test_set['subzone_encoded'] = subzone_encoder.transform(test_set['subzone'])
encoded_test_set['street_encoded'] = street_encoder.transform(test_set['street_name'])
encoded_test_set['block_encoded'] = block_encoder.transform(test_set['street_name']+test_set['block'])

encoded_test_set['rental_approval_date_encoded'] = rental_encoder.transform(test_set['rent_approval_date'])
encoded_test_set['rental_approval_year_encoded'] = rental_year_encoder.transform(test_set['rent_approval_year'])
encoded_test_set['rental_approval_month_encoded'] = rental_month_encoder.transform(test_set['rent_approval_month'])

encoded_test_set['lease_commence_date_encoded'] = lease_encoder.transform(test_set['lease_date_cat'])

encoded_test_set['flat_type_encoded'] = flat_type_encoder.transform(test_set['flat_type'])
encoded_test_set['flat_model_encoded'] = flat_model_encoder.transform(test_set['flat_model'])
encoded_test_set['type_by_street_encoded'] = type_by_street.transform(test_set['street_name']+test_set['flat_type'])

encoded_test_set['floor_area_sqm'] = test_set['floor_area_sqm'].copy()



In [74]:
encoded_test_set.head(10)

Unnamed: 0,region_encoded,planning_area_encoded,subzone_encoded,street_encoded,block_encoded,rental_approval_date_encoded,rental_approval_year_encoded,rental_approval_month_encoded,lease_commence_date_encoded,flat_type_encoded,flat_model_encoded,type_by_street_encoded,floor_area_sqm
0,2558.82271,2503.252886,2427.604167,2348.993316,2556.237503,3069.581639,3158.694858,2490.55358,2479.803864,2815.593875,2636.211052,2556.237503,121.0
1,2450.623806,2540.49101,2592.33279,2640.043972,2509.793159,2812.587413,2651.014066,2489.108495,2700.89957,2692.359176,2612.031305,2614.003638,100.0
2,2569.167537,2646.808979,2395.588235,2532.195122,2626.547123,3262.248898,3158.694858,2729.54235,2479.803864,2692.359176,2369.965462,2792.881615,91.0
3,2569.167537,2595.146199,2400.15015,2780.896042,2540.333848,2249.901768,2225.773817,2470.895522,2479.803864,2276.033233,2612.031305,2565.461091,74.0
4,2569.167537,2595.146199,2400.15015,2268.56436,2542.787517,2449.32243,2651.014066,2543.250298,2479.803864,2815.593875,2636.211052,2631.5261,121.0
5,2569.167537,2646.808979,2395.588235,2229.881779,2629.794822,2351.327643,2651.014066,2490.55358,2479.803864,2276.033233,2369.965462,2176.899154,67.0
6,2558.82271,2665.555556,2654.294479,2590.503432,2620.45403,2216.537301,2225.773817,2618.13052,2742.515391,2815.593875,2709.678998,2616.033755,110.0
7,2570.667785,2638.489123,2602.085865,3081.579221,2730.005494,2755.309735,2651.014066,2470.895522,2742.515391,2815.593875,3150.404313,3224.684165,108.0
8,2569.167537,2640.890551,2873.930481,2820.173278,2649.937911,2339.852217,2225.773817,2680.680614,2742.515391,2892.857143,2709.678998,2671.955887,133.0
9,2737.201353,2585.947712,2585.947712,2480.479822,2808.073641,3262.248898,3158.694858,2729.54235,2421.705462,2815.593875,2444.223986,2781.18844,120.0


In [75]:
# normalization

normalized_test_features = scaler.transform(encoded_test_set)

In [79]:
# lasso regressor prediction
lasso_y_pred = lasso.predict(normalized_test_features).reshape(-1, 1)
lasso_y_pred = y_scaler.inverse_transform(lasso_y_pred)

In [80]:
lasso_y_pred[:20]

array([[3101.27459237],
       [2640.55788636],
       [3406.67241448],
       [2096.86286442],
       [2525.37926205],
       [2212.61720358],
       [2344.57089741],
       [3228.71983629],
       [2580.21397766],
       [3864.95093603],
       [2045.87665011],
       [2922.53073975],
       [2778.00866807],
       [2683.18690409],
       [2369.39367549],
       [2744.77009852],
       [2027.26268721],
       [2732.75795692],
       [2475.0454785 ],
       [2768.96585691]])

In [81]:
df = pd.DataFrame(lasso_y_pred, columns=['Predicted'])
df.to_csv("predictions/updated_main_features_team18_lasso.csv", index_label="Id", header=True, float_format='%.4f')

In [88]:
# Gradient Boosting Regressor prediction
gbr_y_pred = gbr.predict(normalized_test_features).reshape(-1, 1)
gbr_y_pred = y_scaler.inverse_transform(gbr_y_pred)

In [91]:
gbr_y_pred[:20]

array([[2887.52342067],
       [2429.15365717],
       [3412.59216359],
       [2085.2251492 ],
       [2377.32138225],
       [2237.4295433 ],
       [2260.30010829],
       [3481.88231353],
       [2517.24601277],
       [4247.08404924],
       [2041.78764805],
       [2704.26777254],
       [2958.63332327],
       [2711.11517395],
       [2322.25772255],
       [2903.85685426],
       [2094.22310081],
       [2622.45523576],
       [2406.57745076],
       [2541.84882339]])

In [92]:
df = pd.DataFrame(gbr_y_pred, columns=['Predicted'])
df.to_csv("predictions/updated_main_features_team18_gbr.csv", index_label="Id", header=True, float_format='%.4f')

### XGBoost Regression

In [31]:
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

In [32]:
X, y = normalized_features, target_variable

In [52]:
# Split the training data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [66]:
# Create regression matrices
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

#Printing basic information about the DMatrix
print("Number of training samples in DMatrix:", dtrain_reg.num_row())
print("Number of features in Dmatrix:", dtrain_reg.num_col())

Number of training samples in DMatrix: 40200
Number of features in Dmatrix: 13


In [67]:
# Define hyperparameters
params = {
    "objective": "reg:squarederror",
}

n = 100
model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
)

In [68]:
feature_score = model.get_score(importance_type="gain")
print(feature_score)

{'f0': 1.4989417791366577, 'f1': 1.351144552230835, 'f2': 1.381648063659668, 'f3': 1.4058855772018433, 'f4': 19.64893341064453, 'f5': 20.662702560424805, 'f6': 4.909601211547852, 'f7': 1.033408761024475, 'f8': 2.549201011657715, 'f9': 5.570499420166016, 'f10': 2.647329568862915, 'f11': 6.305320739746094, 'f12': 2.9813573360443115}


In [69]:
preds = model.predict(dtest_reg)

In [70]:
# compute and print accuracy score

rmse = mean_squared_error(y_test, preds, squared=False)
r2 = r2_score(y_test, preds)

print(f"RMSE of the base model: {rmse:.3f}")
print(f"R2 score of the base model: {r2:.3f}")

RMSE of the base model: 0.650
R2 score of the base model: 0.578


#### Early stopping
We can test our model at each step and see if adding a new tree/round improves performance. To do so, we define a test dataset and a metric that is used to assess performance at each round. If performance haven’t improved for N rounds (N is defined by the variable early_stopping_round), we stop the training and keep the best number of boosting rounds.

In [40]:
params = {
    "objective": "reg:squarederror",
}
n = 100

# evals = [(dtest_reg, "validation"), (dtrain_reg, "train")]
evals = [(dtest_reg, "validation")]

model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
   verbose_eval=2, # Every ten rounds
   early_stopping_rounds=10
)

[0]	validation-rmse:0.91085
[2]	validation-rmse:0.71653
[4]	validation-rmse:0.65871
[6]	validation-rmse:0.64242
[8]	validation-rmse:0.63760
[10]	validation-rmse:0.63618
[12]	validation-rmse:0.63610
[14]	validation-rmse:0.63606
[16]	validation-rmse:0.63627
[18]	validation-rmse:0.63661
[20]	validation-rmse:0.63651


#### XGBoost Cross Validation

In [41]:
params = {
    "objective": "reg:squarederror",
}
n = 100

results = xgb.cv(
   params, 
   dtrain_reg,
   num_boost_round=n,
   seed=42,
   nfold=5,
   metrics={'rmse'},
   early_stopping_rounds=20
)

In [42]:
best_rmse = results['test-rmse-mean'].min()
print(best_rmse)

0.6347802


In [61]:
results.head()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,0.862514,0.000357,0.864458,0.002161
1,0.782984,0.000722,0.78735,0.00066
2,0.737122,0.001128,0.743983,0.001622
3,0.711005,0.001167,0.720058,0.002688
4,0.695496,0.001333,0.706681,0.003354


#### Optimize XGBoost Model using GridSearch
GridSearch is a method used for hyperparameter tuning where one specifies a subset of possible values for each hyperparameter of interest. The method then exhaustively tries out all possible combinations of these hyperparameters to find the combination that produces the best model performance, according to some metric.

##### max_depth & min_child_weight

In [43]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]
num_boost_round = 100

In [44]:
# Define initial best params and MAE
min_rmse = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain_reg,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))


CV with max_depth=9, min_child_weight=5
	RMSE 0.645585 for 8 rounds
CV with max_depth=9, min_child_weight=6
	RMSE 0.6450234 for 8 rounds
CV with max_depth=9, min_child_weight=7
	RMSE 0.644642 for 8 rounds
CV with max_depth=10, min_child_weight=5
	RMSE 0.653372 for 7 rounds
CV with max_depth=10, min_child_weight=6
	RMSE 0.6522718000000001 for 7 rounds
CV with max_depth=10, min_child_weight=7
	RMSE 0.6530592000000001 for 8 rounds
CV with max_depth=11, min_child_weight=5
	RMSE 0.6597664 for 7 rounds
CV with max_depth=11, min_child_weight=6
	RMSE 0.6590596 for 7 rounds
CV with max_depth=11, min_child_weight=7
	RMSE 0.6595968000000001 for 7 rounds
Best params: 9, 7, RMSE: 0.644642


In [45]:
params['max_depth'] = 9
params['min_child_weight'] = 7

##### gamma

In [46]:
min_rmse = float("Inf")
best_params = None
for gamma in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]:
    print("CV with gamma={}".format(gamma))
    # We update our parameters
    params['gamma'] = gamma
    # Run and time CV
    cv_results = xgb.cv(
        params,
        dtrain_reg,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics=['rmse'],
        early_stopping_rounds=10
    )
    # Update best score
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds\n".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = gamma
print("Best params: {}, rmse: {}".format(best_params, min_rmse))

CV with gamma=0.1
	RMSE 0.6455752 for 8 rounds

CV with gamma=0.2
	RMSE 0.645783 for 8 rounds

CV with gamma=0.3
	RMSE 0.6449376 for 8 rounds

CV with gamma=0.4
	RMSE 0.6443828 for 8 rounds

CV with gamma=0.5
	RMSE 0.6445616 for 9 rounds

CV with gamma=0.6
	RMSE 0.6445616 for 8 rounds

Best params: 0.4, rmse: 0.6443828


In [47]:
params['gamma'] = 0.4

##### subsample & colsample

In [48]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [49]:
min_rmse = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain_reg,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (subsample,colsample)
print("Best params: {}, {}, rmse: {}".format(best_params[0], best_params[1], min_rmse))

CV with subsample=1.0, colsample=1.0
	RMSE 0.6443828 for 8 rounds
CV with subsample=1.0, colsample=0.9
	RMSE 0.646458 for 8 rounds
CV with subsample=1.0, colsample=0.8
	RMSE 0.6473368 for 10 rounds
CV with subsample=1.0, colsample=0.7
	RMSE 0.647791 for 9 rounds
CV with subsample=0.9, colsample=1.0
	RMSE 0.6451836 for 9 rounds
CV with subsample=0.9, colsample=0.9
	RMSE 0.6469123999999999 for 9 rounds
CV with subsample=0.9, colsample=0.8
	RMSE 0.6465626 for 9 rounds
CV with subsample=0.9, colsample=0.7
	RMSE 0.6475848 for 9 rounds
CV with subsample=0.8, colsample=1.0
	RMSE 0.6473587999999999 for 7 rounds
CV with subsample=0.8, colsample=0.9
	RMSE 0.649131 for 8 rounds
CV with subsample=0.8, colsample=0.8
	RMSE 0.6490598000000001 for 9 rounds
CV with subsample=0.8, colsample=0.7
	RMSE 0.6507426 for 9 rounds
CV with subsample=0.7, colsample=1.0
	RMSE 0.6483893999999999 for 8 rounds
CV with subsample=0.7, colsample=0.9
	RMSE 0.6497375999999999 for 9 rounds
CV with subsample=0.7, colsample=

In [53]:
params['subsample'] = 1.0
params['colsample_bytree'] = 1.0

##### alpha & lumbda

In [50]:
gridsearch_params = {
    'reg_alpha': [0.05, 0.1, 1, 2, 3]
}

In [51]:
min_rmse = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for reg_alpha in gridsearch_params['reg_alpha']:
        print("CV with reg_alpha={}".format(
                                 reg_alpha))
        # We update our parameters
        params['reg_alpha'] = reg_alpha
        # Run CV
        cv_results = xgb.cv(
            params,
            dtrain_reg,
            num_boost_round=num_boost_round,
            seed=42,
            nfold=5,
            metrics={'rmse'},
            early_stopping_rounds=10
        )
        # Update best score
        mean_rmse = cv_results['test-rmse-mean'].min()
        boost_rounds = cv_results['test-rmse-mean'].argmin()
        print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
        if mean_rmse < min_rmse:
            min_rmse = mean_rmse
            best_params = reg_alpha
print("Best params: {}, rmse: {}".format(best_params, min_rmse))

CV with reg_alpha=0.05
	RMSE 0.650727 for 9 rounds
CV with reg_alpha=0.1
	RMSE 0.6508632 for 9 rounds
CV with reg_alpha=1
	RMSE 0.6483702 for 10 rounds
CV with reg_alpha=2
	RMSE 0.6458564 for 10 rounds
CV with reg_alpha=3
	RMSE 0.6450126 for 12 rounds
Best params: 3, rmse: 0.6450126


In [52]:
params['alpha'] = 3

In [54]:
%time

min_rmse = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    %time 
    cv_results = xgb.cv(
        params,
        dtrain_reg,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics=['rmse'],
        early_stopping_rounds=10
    )
    # Update best score
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds\n".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = eta
print("Best params: {}, rmse: {}".format(best_params, min_rmse))

Wall time: 0 ns
CV with eta=0.3
Wall time: 0 ns
	RMSE 0.6419296 for 10 rounds

CV with eta=0.2
Wall time: 0 ns
	RMSE 0.6400288 for 16 rounds

CV with eta=0.1
Wall time: 0 ns
	RMSE 0.6391744 for 36 rounds

CV with eta=0.05
Wall time: 0 ns
	RMSE 0.6387166000000001 for 72 rounds

CV with eta=0.01
Wall time: 0 ns
	RMSE 0.7239755999999999 for 99 rounds

CV with eta=0.005
Wall time: 0 ns
	RMSE 0.8496246 for 99 rounds

Best params: 0.05, rmse: 0.6387166000000001


In [55]:
params['eta'] = .05

In [56]:
params

{'objective': 'reg:squarederror',
 'max_depth': 9,
 'min_child_weight': 7,
 'gamma': 0.4,
 'subsample': 1.0,
 'colsample_bytree': 1.0,
 'reg_alpha': 3,
 'alpha': 3,
 'eta': 0.05}

In [57]:
model = xgb.train(
    params,
    dtrain_reg,
    num_boost_round=num_boost_round,
    evals=[(dtest_reg, "Test")],
    early_stopping_rounds=10
)

[0]	Test-rmse:1.08191
[1]	Test-rmse:1.04747
[2]	Test-rmse:1.01538
[3]	Test-rmse:0.98545
[4]	Test-rmse:0.95761
[5]	Test-rmse:0.93169
[6]	Test-rmse:0.90773
[7]	Test-rmse:0.88551
[8]	Test-rmse:0.86491
[9]	Test-rmse:0.84596
[10]	Test-rmse:0.82832
[11]	Test-rmse:0.81218
[12]	Test-rmse:0.79711
[13]	Test-rmse:0.78325
[14]	Test-rmse:0.77053
[15]	Test-rmse:0.75886
[16]	Test-rmse:0.74813
[17]	Test-rmse:0.73820
[18]	Test-rmse:0.72923
[19]	Test-rmse:0.72099
[20]	Test-rmse:0.71340
[21]	Test-rmse:0.70649
[22]	Test-rmse:0.70014
[23]	Test-rmse:0.69427
[24]	Test-rmse:0.68909
[25]	Test-rmse:0.68429
[26]	Test-rmse:0.67997
[27]	Test-rmse:0.67603
[28]	Test-rmse:0.67257
[29]	Test-rmse:0.66931
[30]	Test-rmse:0.66641
[31]	Test-rmse:0.66366
[32]	Test-rmse:0.66120
[33]	Test-rmse:0.65900
[34]	Test-rmse:0.65697
[35]	Test-rmse:0.65511
[36]	Test-rmse:0.65347
[37]	Test-rmse:0.65195
[38]	Test-rmse:0.65061
[39]	Test-rmse:0.64946
[40]	Test-rmse:0.64842
[41]	Test-rmse:0.64742
[42]	Test-rmse:0.64651
[43]	Test-rmse:0.6457

In [58]:
print("Best RMSE: {:.3f} in {} rounds".format(model.best_score, model.best_iteration+1))

Best RMSE: 0.639 in 69 rounds


#### Saving the best model

In [93]:
dtrain = xgb.DMatrix(normalized_features, target_variable, enable_categorical=True)
dtest = xgb.DMatrix(normalized_test_features, enable_categorical=True)

In [94]:
num_boost_round = model.best_iteration + 1
best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest_reg, "Test")]
)

[0]	Test-rmse:0.90956
[1]	Test-rmse:0.78467
[2]	Test-rmse:0.71324
[3]	Test-rmse:0.67310
[4]	Test-rmse:0.65150
[5]	Test-rmse:0.63907
[6]	Test-rmse:0.63199
[7]	Test-rmse:0.62756
[8]	Test-rmse:0.62443
[9]	Test-rmse:0.62265
[10]	Test-rmse:0.62048
[11]	Test-rmse:0.61899
[12]	Test-rmse:0.61820
[13]	Test-rmse:0.61678
[14]	Test-rmse:0.61603
[15]	Test-rmse:0.61496
[16]	Test-rmse:0.61440
[17]	Test-rmse:0.61358
[18]	Test-rmse:0.61299
[19]	Test-rmse:0.61223
[20]	Test-rmse:0.61190
[21]	Test-rmse:0.61139
[22]	Test-rmse:0.61063
[23]	Test-rmse:0.61038
[24]	Test-rmse:0.60979
[25]	Test-rmse:0.60888
[26]	Test-rmse:0.60816
[27]	Test-rmse:0.60793
[28]	Test-rmse:0.60772
[29]	Test-rmse:0.60714
[30]	Test-rmse:0.60658
[31]	Test-rmse:0.60599
[32]	Test-rmse:0.60531
[33]	Test-rmse:0.60485
[34]	Test-rmse:0.60403
[35]	Test-rmse:0.60365
[36]	Test-rmse:0.60326
[37]	Test-rmse:0.60275
[38]	Test-rmse:0.60227
[39]	Test-rmse:0.60185
[40]	Test-rmse:0.60114
[41]	Test-rmse:0.60048
[42]	Test-rmse:0.59976
[43]	Test-rmse:0.5993

In [95]:
submission_test_pred = best_model.predict(dtest).reshape(-1, 1)
submission_test_pred = y_scaler.inverse_transform(submission_test_pred)

In [96]:
df = pd.DataFrame(submission_test_pred, columns=['Predicted'])
df.to_csv("predictions/updated_main_features_team18_xgb.csv", index_label="Id", header=True, float_format='%.4f')