In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from category_encoders import TargetEncoder

In [2]:
train_set = pd.read_csv('Datasets/train.csv')

train_set.head()

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,monthly_rent
0,2021-09,jurong east,257,Jurong East Street 24,3 room,new generation,67.0,yes,1983,1.344518,103.73863,0.0,yuhua east,jurong east,west region,1600
1,2022-05,bedok,119,bedok north road,4-room,new generation,92.0,yes,1978,1.330186,103.938717,0.0,bedok north,bedok,east region,2250
2,2022-10,toa payoh,157,lorong 1 toa payoh,3-room,improved,67.0,yes,1971,1.332242,103.845643,0.0,toa payoh central,toa payoh,central region,1900
3,2021-08,pasir ris,250,Pasir Ris Street 21,executive,apartment,149.0,yes,1993,1.370239,103.962894,0.0,pasir ris drive,pasir ris,east region,2850
4,2022-11,kallang/whampoa,34,Whampoa West,3-room,improved,68.0,yes,1972,1.320502,103.863341,0.0,bendemeer,kallang,central region,2100


### data cleaning

In [3]:
# get copy

train_set_cleaned = train_set.copy()

In [4]:
# add two new attributes, split rent_approval_date into month, year

train_set_cleaned[['rent_approval_year','rent_approval_month']] = train_set_cleaned['rent_approval_date'].str.split('-',expand=True)

# train_set_cleaned['rent_approval_month'] = pd.to_datetime(train_set_cleaned['rent_approval_month'], format='%m').dt.month
# train_set_cleaned['rent_approval_year'] = pd.to_datetime(train_set_cleaned['rent_approval_year'], format='%y').dt.year

In [5]:
# street name to lower case

train_set_cleaned['street_name'] = train_set_cleaned['street_name'].apply(str.lower)

In [6]:
# replace blank space with hyphen in flat_type (e.g. '2 room' to '2-room')

train_set_cleaned['flat_type'] = train_set_cleaned['flat_type'].apply(lambda x: x.replace(' ', '-'))

In [7]:
def date_filter_condition(x):
    
    if x < 1970:
        return 'before 70s'
    elif x >= 1970 and x < 1980:
        return '70s'
    elif x >= 1980 and x < 1990:
        return '80s'
    elif x >= 1990 and x < 2000:
        return '90s'
    elif x >= 2000 and x < 2010:
        return '00s'
    elif x >= 2010 and x < 2020:
        return '10s'
    else:
        return 'others'

# categorize lease commence date by decades    
train_set_cleaned['lease_date_cat'] = train_set_cleaned['lease_commence_date'].apply(date_filter_condition)

In [8]:
train_set_cleaned.head(10)

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,monthly_rent,rent_approval_year,rent_approval_month,lease_date_cat
0,2021-09,jurong east,257,jurong east street 24,3-room,new generation,67.0,yes,1983,1.344518,103.73863,0.0,yuhua east,jurong east,west region,1600,2021,9,80s
1,2022-05,bedok,119,bedok north road,4-room,new generation,92.0,yes,1978,1.330186,103.938717,0.0,bedok north,bedok,east region,2250,2022,5,70s
2,2022-10,toa payoh,157,lorong 1 toa payoh,3-room,improved,67.0,yes,1971,1.332242,103.845643,0.0,toa payoh central,toa payoh,central region,1900,2022,10,70s
3,2021-08,pasir ris,250,pasir ris street 21,executive,apartment,149.0,yes,1993,1.370239,103.962894,0.0,pasir ris drive,pasir ris,east region,2850,2021,8,90s
4,2022-11,kallang/whampoa,34,whampoa west,3-room,improved,68.0,yes,1972,1.320502,103.863341,0.0,bendemeer,kallang,central region,2100,2022,11,70s
5,2023-04,bukit panjang,654,senja road,executive,premium apartment,130.0,yes,2001,1.387847,103.764249,0.0,saujana,bukit panjang,west region,2300,2023,4,00s
6,2021-01,sengkang,407b,fernvale road,5-room,premium apartment,110.0,yes,2005,1.388997,103.875148,0.0,fernvale,sengkang,north-east region,2100,2021,1,00s
7,2022-06,ang mo kio,223,ang mo kio avenue 1,3-room,new generation,67.0,yes,1978,1.366048,103.838123,0.0,shangri-la,ang mo kio,north-east region,2300,2022,6,70s
8,2021-10,bishan,149,bishan street 11,4-room,simplified,84.0,yes,1987,1.344279,103.855556,0.0,bishan east,bishan,central region,2100,2021,10,80s
9,2021-04,punggol,133,edgedale plains,5-room,premium apartment,112.0,yes,2003,1.392832,103.91062,0.0,punggol field,punggol,north-east region,2100,2021,4,00s


In [9]:
train_set_cleaned['rent_approval_year'].value_counts()

2021    24909
2022    21399
2023    13692
Name: rent_approval_year, dtype: int64

### auxiliary data

In [10]:
def haversine(lat1, lon1, lat2, lon2):
    # Radius of the Earth in km
    R = 6371.0
    
    # Convert latitude and longitude from degrees to radians
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)
    
    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    distance = R * c
    
    return distance


In [11]:
def calculate_distance_to_facilities(data):
    existing_mrt_df = pd.read_csv('Datasets/auxiliary-data/sg-mrt-existing-stations.csv')
    shopping_malls_df = pd.read_csv('Datasets/auxiliary-data/sg-shopping-malls.csv')
    primary_schools_df = pd.read_csv('Datasets/auxiliary-data/sg-primary-schools.csv')

    # Calculate distances to existing MRT stations
    property_latitudes = data['latitude'].values
    property_longitudes = data['longitude'].values
    existing_mrt_latitudes = existing_mrt_df['latitude'].values
    existing_mrt_longitudes = existing_mrt_df['longitude'].values
    shopping_malls_latitudes = shopping_malls_df['latitude'].values
    shopping_malls_longitudes = shopping_malls_df['longitude'].values
    primary_schools_latitudes = primary_schools_df['latitude'].values
    primary_schools_longitudes = primary_schools_df['longitude'].values
    # planned_mrt_latitudes = planned_mrt_df['latitude'].values
    # planned_mrt_longitudes = planned_mrt_df['longitude'].values

    # Calculate distances
    distances_to_existing_mrt = haversine(
        property_latitudes[:, np.newaxis], property_longitudes[:, np.newaxis],
        existing_mrt_latitudes, existing_mrt_longitudes
    )

    distances_to_shopping_malls = haversine(
        property_latitudes[:, np.newaxis], property_longitudes[:, np.newaxis],
        shopping_malls_latitudes, shopping_malls_longitudes
    )

    distances_to_primary_schools = haversine(
        property_latitudes[:, np.newaxis], property_longitudes[:, np.newaxis],
        primary_schools_latitudes, primary_schools_longitudes
    )

    # distances_to_planned_mrt = haversine(
    #     property_latitudes[:, np.newaxis], property_longitudes[:, np.newaxis],
    #     planned_mrt_latitudes, planned_mrt_longitudes
    # )

    # Find the minimum distances
    min_distances_to_existing_mrt = np.min(distances_to_existing_mrt, axis=1)
    min_distances_to_shopping_mall = np.min(distances_to_shopping_malls, axis=1)
    min_distances_to_primary_school = np.min(distances_to_primary_schools, axis=1)

    # Add the minimum distance as new features 'distance_to_nearest_existing_mrt', 'distance_to_nearest_shopping_mall', 'distance_to_nearest_primary_school'
    data['distance_to_nearest_existing_mrt'] = min_distances_to_existing_mrt
    data['distance_to_nearest_shopping_mall'] = min_distances_to_shopping_mall
    data['distance_to_nearest_primary_school'] = min_distances_to_primary_school
    
    data.drop(columns=['latitude', 'longitude'], inplace=True)

    data.head()
    return data

In [12]:
train_set_cleaned = calculate_distance_to_facilities(train_set_cleaned)
train_set_cleaned.head()

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,elevation,subzone,planning_area,region,monthly_rent,rent_approval_year,rent_approval_month,lease_date_cat,distance_to_nearest_existing_mrt,distance_to_nearest_shopping_mall,distance_to_nearest_primary_school
0,2021-09,jurong east,257,jurong east street 24,3-room,new generation,67.0,yes,1983,0.0,yuhua east,jurong east,west region,1600,2021,9,80s,0.699127,1.202674,0.334846
1,2022-05,bedok,119,bedok north road,4-room,new generation,92.0,yes,1978,0.0,bedok north,bedok,east region,2250,2022,5,70s,0.898991,1.114338,0.607716
2,2022-10,toa payoh,157,lorong 1 toa payoh,3-room,improved,67.0,yes,1971,0.0,toa payoh central,toa payoh,central region,1900,2022,10,70s,0.218603,0.468297,0.42576
3,2021-08,pasir ris,250,pasir ris street 21,executive,apartment,149.0,yes,1993,0.0,pasir ris drive,pasir ris,east region,2850,2021,8,90s,1.54604,0.402359,0.564969
4,2022-11,kallang/whampoa,34,whampoa west,3-room,improved,68.0,yes,1972,0.0,bendemeer,kallang,central region,2100,2022,11,70s,0.187856,1.073354,0.271723


### encoding

In [13]:
# create a new dataframe
encoded_train_set = train_set_cleaned[['monthly_rent']].copy()

#### spatial information

In [14]:
# encode regions with the mean of target variable for each region

region_encoder = TargetEncoder()
encoded_train_set['region_encoded'] = region_encoder.fit_transform(train_set_cleaned['region'], train_set_cleaned['monthly_rent'])

In [15]:
# encode planning areas with target variable information

planning_area_encoder = TargetEncoder()
encoded_train_set['planning_area_encoded'] = planning_area_encoder.fit_transform(train_set_cleaned['planning_area'], train_set_cleaned['monthly_rent'])

In [16]:
# encode subzones with target variable information

subzone_encoder = TargetEncoder()
encoded_train_set['subzone_encoded'] = subzone_encoder.fit_transform(train_set_cleaned['subzone'], train_set_cleaned['monthly_rent'])

In [17]:
# encode streets with target variable information

street_encoder = TargetEncoder()
encoded_train_set['street_encoded'] = street_encoder.fit_transform(train_set_cleaned['street_name'], train_set_cleaned['monthly_rent'])

In [18]:
## encode block (with street name) with target variable information

block_encoder = TargetEncoder()
encoded_train_set['block_encoded'] = block_encoder.fit_transform(train_set_cleaned['street_name']+train_set_cleaned['block'], train_set_cleaned['monthly_rent'])

In [19]:
# spatial hierarchical correlation

region_planning_corr = encoded_train_set['region_encoded'].corr(encoded_train_set['planning_area_encoded'])
planning_subzone_corr = encoded_train_set['planning_area_encoded'].corr(encoded_train_set['subzone_encoded'])
subzone_street_corr = encoded_train_set['subzone_encoded'].corr(encoded_train_set['street_encoded'])

print(f'The correlation between region and planning area is {region_planning_corr:3f}')
print(f'The correlation between planning area and subzone is {planning_subzone_corr:3f}')
print(f'The correlation between subzone and street is {subzone_street_corr:3f}')

The correlation between region and planning area is 0.579100
The correlation between planning area and subzone is 0.669665
The correlation between subzone and street is 0.828036


In [20]:
# spatial information correlation with monthly rent

region_rent_corr = encoded_train_set['region_encoded'].corr(encoded_train_set['monthly_rent'])
planning_rent_corr = encoded_train_set['planning_area_encoded'].corr(encoded_train_set['monthly_rent'])
subzone_rent_corr = encoded_train_set['subzone_encoded'].corr(encoded_train_set['monthly_rent'])
street_rent_corr = encoded_train_set['street_encoded'].corr(encoded_train_set['monthly_rent'])
block_rent_corr = encoded_train_set['block_encoded'].corr(encoded_train_set['monthly_rent'])

print(f'Region and Monthly Rental correlation is {region_rent_corr:3f}')
print(f'Planning Area and Monthly Rental correlation is {planning_rent_corr:3f}')
print(f'Subzone and Monthly Rental correlation is {subzone_rent_corr:3f}')
print(f'Street and Monthly Rental correlation is {street_rent_corr:3f}')
print(f'Block and Monthly Rental correlation is {block_rent_corr:3f}')

Region and Monthly Rental correlation is 0.124776
Planning Area and Monthly Rental correlation is 0.215462
Subzone and Monthly Rental correlation is 0.320711
Street and Monthly Rental correlation is 0.374731
Block and Monthly Rental correlation is 0.521544


#### rent date & lease date

In [21]:
# rental approval date

rental_encoder = TargetEncoder()
encoded_train_set['rental_approval_date_encoded'] = rental_encoder.fit_transform(train_set_cleaned['rent_approval_date'].astype(str), train_set_cleaned['monthly_rent'])

rental_date_rent_corr = encoded_train_set['rental_approval_date_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Rental Approval Date and Monthly Rental correlation is {rental_date_rent_corr:3f}')

Rental Approval Date and Monthly Rental correlation is 0.546074


In [22]:
# rental approval year

rental_year_encoder = TargetEncoder()
encoded_train_set['rental_approval_year_encoded'] = rental_year_encoder.fit_transform(train_set_cleaned['rent_approval_year'].astype(str), train_set_cleaned['monthly_rent'])

rental_year_rent_corr = encoded_train_set['rental_approval_year_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Rental Approval Year and Monthly Rental correlation is {rental_year_rent_corr:3f}')

Rental Approval Year and Monthly Rental correlation is 0.504737


In [23]:
# rental approval month

rental_month_encoder = TargetEncoder()
encoded_train_set['rental_approval_month_encoded'] = rental_month_encoder.fit_transform(train_set_cleaned['rent_approval_month'].astype(str), train_set_cleaned['monthly_rent'])

rental_month_rent_corr = encoded_train_set['rental_approval_month_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Rental Approval Month and Monthly Rental correlation is {rental_month_rent_corr:3f}')

Rental Approval Month and Monthly Rental correlation is 0.106482


In [24]:
# lease commence year (in decade)

lease_encoder = TargetEncoder()
encoded_train_set['lease_commence_date_encoded'] = lease_encoder.fit_transform(train_set_cleaned['lease_date_cat'], train_set_cleaned['monthly_rent'])

lease_date_rent_corr = encoded_train_set['lease_commence_date_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Lease Commence Date and Monthly Rental correlation is {lease_date_rent_corr:3f}')

Lease Commence Date and Monthly Rental correlation is 0.223300


In [25]:
## rental date + type (NOT good for GBR)

type_by_date = TargetEncoder()
encoded_train_set['type_by_date_encoded'] = type_by_date.fit_transform(train_set_cleaned['rent_approval_date']+train_set_cleaned['flat_type'], train_set_cleaned['monthly_rent'])

type_by_date_rent_corr = encoded_train_set['type_by_date_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Flat Type by Rental Date and Monthly Rental correlation is {type_by_date_rent_corr:3f}')

Flat Type by Rental Date and Monthly Rental correlation is 0.655330


#### flat_type and flat_model

In [26]:
train_set_cleaned['flat_type'].value_counts()

4-room       21889
3-room       18897
5-room       14759
executive     3528
2-room         927
Name: flat_type, dtype: int64

In [27]:
# flat type

flat_type_encoder = TargetEncoder()
encoded_train_set['flat_type_encoded'] = flat_type_encoder.fit_transform(train_set_cleaned['flat_type'], train_set_cleaned['monthly_rent'])

flat_type_rent_corr = encoded_train_set['flat_type_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Flat Type and Monthly Rental correlation is {flat_type_rent_corr:3f}')

Flat Type and Monthly Rental correlation is 0.346146


In [28]:
# flat model

flat_model_encoder = TargetEncoder()
encoded_train_set['flat_model_encoded'] = flat_model_encoder.fit_transform(train_set_cleaned['flat_model'], train_set_cleaned['monthly_rent'])

flat_model_rent_corr = encoded_train_set['flat_model_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Flat Model and Monthly Rental correlation is {flat_model_rent_corr:3f}')

Flat Model and Monthly Rental correlation is 0.236876


In [29]:
## street + type

type_by_street = TargetEncoder()
encoded_train_set['type_by_street_encoded'] = type_by_street.fit_transform(train_set_cleaned['street_name']+train_set_cleaned['flat_type'], train_set_cleaned['monthly_rent'])

type_by_street_rent_corr = encoded_train_set['type_by_street_encoded'].corr(encoded_train_set['monthly_rent'])
print(f'Flat Type by Street and Monthly Rental correlation is {type_by_street_rent_corr:3f}')

Flat Type by Street and Monthly Rental correlation is 0.481737


#### floor area

In [30]:
# floor area sqm
encoded_train_set['floor_area_sqm'] = train_set_cleaned['floor_area_sqm'].copy()

floor_area_sqm_corr = encoded_train_set['floor_area_sqm'].corr(encoded_train_set['monthly_rent'])
print(f'Floor Area in Sqm and Monthly Rental correlation is {floor_area_sqm_corr:3f}')

Floor Area in Sqm and Monthly Rental correlation is 0.306466


#### Distance to facilities

In [31]:
# distance to nearest mrt
encoded_train_set['distance_to_nearest_existing_mrt'] = train_set_cleaned['distance_to_nearest_existing_mrt'].copy()

distance_to_nearest_mrt_rent_corr = encoded_train_set['distance_to_nearest_existing_mrt'].corr(encoded_train_set['monthly_rent'])
print(f'Distance to nearest mrt and Monthly Rental correlation is {distance_to_nearest_mrt_rent_corr:3f}')

Distance to nearest mrt and Monthly Rental correlation is -0.067058


In [32]:
# distance to nearest shopping mall
encoded_train_set['distance_to_nearest_shopping_mall'] = train_set_cleaned['distance_to_nearest_shopping_mall'].copy()

distance_to_nearest_shopping_mall_rent_corr = encoded_train_set['distance_to_nearest_shopping_mall'].corr(encoded_train_set['monthly_rent'])
print(f'Distance to nearest shopping mall and Monthly Rental correlation is {distance_to_nearest_shopping_mall_rent_corr:3f}')

Distance to nearest shopping mall and Monthly Rental correlation is -0.071104


In [33]:
# distance to nearest primary school
encoded_train_set['distance_to_nearest_primary_school'] = train_set_cleaned['distance_to_nearest_primary_school'].copy()

distance_to_nearest_primary_school_rent_corr = encoded_train_set['distance_to_nearest_primary_school'].corr(encoded_train_set['monthly_rent'])
print(f'Distance to nearest primary school and Monthly Rental correlation is {distance_to_nearest_primary_school_rent_corr:3f}')

Distance to nearest primary school and Monthly Rental correlation is 0.001750


### normalization

In [34]:
from sklearn.preprocessing import StandardScaler

In [35]:
encoded_train_set

Unnamed: 0,monthly_rent,region_encoded,planning_area_encoded,subzone_encoded,street_encoded,block_encoded,rental_approval_date_encoded,rental_approval_year_encoded,rental_approval_month_encoded,lease_commence_date_encoded,type_by_date_encoded,flat_type_encoded,flat_model_encoded,type_by_street_encoded,floor_area_sqm,distance_to_nearest_existing_mrt,distance_to_nearest_shopping_mall,distance_to_nearest_primary_school
0,1600,2569.167537,2595.146199,2542.158516,2312.179832,2319.910527,2233.926780,2225.773817,2489.108495,2479.803864,1971.702454,2276.033233,2369.965462,2237.215820,67.0,0.699127,1.202674,0.334846
1,2250,2570.667785,2438.227223,2360.371046,2404.212860,2596.971112,2517.128874,2651.014066,2618.130520,2421.705462,2611.061286,2692.359176,2369.965462,2591.822430,92.0,0.898991,1.114338,0.607716
2,1900,2737.201353,2516.680515,2808.893871,2403.464419,2317.549823,2928.483245,2651.014066,2563.328013,2421.705462,2523.925781,2276.033233,2636.211052,2301.216814,67.0,0.218603,0.468297,0.425760
3,2850,2570.667785,2686.857477,2610.338573,2757.834101,2678.043573,2249.901768,2225.773817,2470.895522,2700.899570,2500.002036,2892.857143,2878.725962,2840.416063,149.0,1.546040,0.402359,0.564969
4,2100,2737.201353,2702.635659,2793.525180,2407.998266,2407.998266,2986.739659,2651.014066,2611.993243,2421.705462,2615.208748,2276.033233,2636.211052,2407.998266,68.0,0.187856,1.073354,0.271723
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,2200,2558.822710,2416.700057,2390.887097,2336.012658,2549.933369,2233.926780,2225.773817,2489.108495,2421.705462,1971.702454,2276.033233,2369.965462,2206.830123,67.0,0.670095,0.949719,0.360651
59996,4100,2737.201353,2904.113924,2694.936709,2763.731680,2784.903851,3178.128128,3158.694858,2608.904470,2880.707364,3309.905020,2692.359176,2612.031305,2745.330132,83.0,0.619569,0.740674,0.562691
59997,2250,2570.667785,2638.489123,2602.823315,2509.602223,2651.586378,2582.606383,2651.014066,2658.595055,2479.803864,2801.307190,2815.593875,2636.211052,2541.557159,122.0,0.722926,0.439795,0.320031
59998,4700,2570.667785,2438.227223,2434.379786,2591.443246,2766.882904,3069.581639,3158.694858,2490.553580,2421.705462,3427.662037,2815.593875,2444.223986,2743.101626,123.0,0.923539,1.775198,0.219549


In [36]:
# initialize the feature scaler
scaler = StandardScaler()

normalized_features = scaler.fit_transform(encoded_train_set.iloc[:,1:])

In [37]:
df_describe = pd.DataFrame(normalized_features)
df_describe.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
count,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0
mean,3.504271e-15,3.138378e-15,1.884108e-15,-1.100081e-15,2.279641e-15,8.51208e-16,-2.717411e-15,-7.782885e-16,1.624649e-15,9.48446e-16,6.510163e-16,1.377269e-16,8.100779e-16,4.176437e-16,3.547949e-16,-3.962256e-16,-3.192853e-16
std,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008
min,-1.566138,-1.248074,-1.997123,-2.29649,-4.02188,-1.136227,-1.010297,-1.568908,-2.255897,-1.887184,-2.991037,-3.995729,-3.135778,-2.511392,-1.635873,-1.753767,-1.538429
25%,-0.3531893,-0.8052347,-0.6691684,-0.678632,-0.5535625,-0.8720157,-1.010297,-0.6184322,-0.692344,-0.6784907,-1.270077,-0.7507854,-0.6702463,-0.8919552,-0.7185363,-0.7637929,-0.7153713
50%,-0.2372201,0.03131269,-0.01654897,-0.1316916,-0.05096325,-0.3898313,0.1681796,0.2440221,-0.692344,-0.2476472,0.41231,0.1323593,0.004972374,-0.0614746,-0.1905539,-0.1982222,-0.1739076
75%,-0.2204018,0.3667341,0.2406733,0.4403614,0.4530094,1.227626,0.1681796,0.8967736,0.9533255,0.6145215,0.9103054,0.2772886,0.4817526,0.6444339,0.466845,0.5787523,0.4537277
max,1.646499,6.29796,6.454196,6.316627,9.372359,1.72115,1.575125,1.82876,1.818984,2.589183,1.222529,10.3054,5.686196,5.004457,4.291434,4.325789,6.629788


In [38]:
# normalize target variable
y_scaler = StandardScaler()

target_variable = y_scaler.fit_transform(encoded_train_set['monthly_rent'].to_numpy().reshape(-1, 1))
target_variable = target_variable.reshape(-1)

In [39]:
df_describe = pd.DataFrame(target_variable)
df_describe.describe()

Unnamed: 0,0
count,60000.0
mean,-2.100866e-16
std,1.000008
min,-3.203684
25%,-0.6858655
50%,-0.266229
75%,0.5730439
max,6.098257


### training

In [40]:
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

In [41]:
# train-test split

X_train, X_test, y_train, y_test = train_test_split(normalized_features, target_variable, test_size=0.33, random_state=42)

#### lasso regression model

In [42]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Lasso Regression with cross-validation
lasso = LassoCV(cv=kfold, random_state=42)
lasso.fit(X_train, y_train)

LassoCV(cv=KFold(n_splits=5, random_state=42, shuffle=True), random_state=42)

In [43]:
# evaluate the model

print("Optimal alpha:", lasso.alpha_)
print("Mean cross-validated score of the best estimator:", lasso.score(X_test, y_test))

Optimal alpha: 0.0006562409407921091
Mean cross-validated score of the best estimator: 0.5605662277876811


In [44]:
# making predictions 
y_pred = lasso.predict(X_test)

# evaluating the model
mse = mean_squared_error(y_test, y_pred)
print(f"Lasso Regression Mean Squared Error: {mse}")
print(f"Model Coefficients: \n{lasso.coef_}")

Lasso Regression Mean Squared Error: 0.4406857838889138
Model Coefficients: 
[ 0.07204814 -0.00120212 -0.02441907 -0.00312259  0.31457862  0.16104589
  0.01163763 -0.0019843  -0.00804206  0.4108467  -0.12786654 -0.01586773
  0.14141673  0.06044962 -0.0218949  -0.01064439  0.01445393]


#### graident boosting tree

In [45]:
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# implement gradient boosting tree with cross validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(gbr, X_train, y_train, cv=cv, scoring='neg_mean_squared_error')

gbr.fit(X_train, y_train)

GradientBoostingRegressor(random_state=42)

In [46]:
mean_mse = np.mean(-cross_val_scores)
print(f"Gradient Boosting Regressor Mean Squared Error: {mean_mse}")

Gradient Boosting Regressor Mean Squared Error: 0.4017942876955939


In [47]:
print("Mean cross-validated score of the best regressor:", gbr.score(X_test, y_test))

Mean cross-validated score of the best regressor: 0.5964691620654212


### test set evaluation

In [48]:
test_set = pd.read_csv('Datasets/test.csv')

test_set.head()

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region
0,2023-01,hougang,245,hougang street 22,5-room,improved,121.0,yes,1984,1.358411,103.891722,0.0,lorong ah soo,hougang,north-east region
1,2022-09,sembawang,316,sembawang vista,4-room,model a,100.0,yes,1999,1.446343,103.820817,0.0,sembawang central,sembawang,north region
2,2023-07,clementi,708,Clementi West Street 2,4-room,new generation,91.0,yes,1980,1.305719,103.762168,0.0,clementi west,clementi,west region
3,2021-08,jurong east,351,Jurong East Street 31,3 room,model a,74.0,yes,1986,1.344832,103.730778,0.0,yuhua west,jurong east,west region
4,2022-03,jurong east,305,jurong east street 32,5-room,improved,121.0,yes,1983,1.345437,103.735241,0.0,yuhua west,jurong east,west region


In [49]:
(test_set.isna().sum(axis=1) > 0).sum()

0

In [50]:
# data preparation

test_set[['rent_approval_year','rent_approval_month']] = test_set['rent_approval_date'].str.split('-',expand=True)

# test_set['rent_approval_date'] = test_set['rent_approval_date'].astype('Period[M]')

# street name to lower case
test_set['street_name'] = test_set['street_name'].apply(str.lower)

# replace blank space with hyphen in flat_type (e.g. '2 room' to '2-room')
test_set['flat_type'] = test_set['flat_type'].apply(lambda x: x.replace(' ', '-'))

# categorize lease commence date by decades (remember to date_filter_condition function above first)
test_set['lease_date_cat'] = test_set['lease_commence_date'].apply(date_filter_condition)

In [51]:
# merge auxiliary data
test_set = calculate_distance_to_facilities(test_set)
test_set.head()

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,elevation,subzone,planning_area,region,rent_approval_year,rent_approval_month,lease_date_cat,distance_to_nearest_existing_mrt,distance_to_nearest_shopping_mall,distance_to_nearest_primary_school
0,2023-01,hougang,245,hougang street 22,5-room,improved,121.0,yes,1984,0.0,lorong ah soo,hougang,north-east region,2023,1,80s,0.820333,0.739403,0.149667
1,2022-09,sembawang,316,sembawang vista,4-room,model a,100.0,yes,1999,0.0,sembawang central,sembawang,north region,2022,9,90s,0.307784,0.25492,0.143325
2,2023-07,clementi,708,clementi west street 2,4-room,new generation,91.0,yes,1980,0.0,clementi west,clementi,west region,2023,7,80s,1.097689,0.49767,1.040844
3,2021-08,jurong east,351,jurong east street 31,3-room,model a,74.0,yes,1986,0.0,yuhua west,jurong east,west region,2021,8,80s,0.36281,1.596397,1.075525
4,2022-03,jurong east,305,jurong east street 32,5-room,improved,121.0,yes,1983,0.0,yuhua west,jurong east,west region,2022,3,80s,0.44898,1.461487,0.714777


In [52]:
# encoding test set

encoded_test_set = pd.DataFrame(index=test_set.index)

encoded_test_set['region_encoded'] = region_encoder.transform(test_set['region'])
encoded_test_set['planning_area_encoded'] = planning_area_encoder.transform(test_set['planning_area'])
encoded_test_set['subzone_encoded'] = subzone_encoder.transform(test_set['subzone'])
encoded_test_set['street_encoded'] = street_encoder.transform(test_set['street_name'])
encoded_test_set['block_encoded'] = block_encoder.transform(test_set['street_name']+test_set['block'])

encoded_test_set['rental_approval_date_encoded'] = rental_encoder.transform(test_set['rent_approval_date'])
encoded_test_set['rental_approval_year_encoded'] = rental_year_encoder.transform(test_set['rent_approval_year'])
encoded_test_set['rental_approval_month_encoded'] = rental_month_encoder.transform(test_set['rent_approval_month'])

encoded_test_set['lease_commence_date_encoded'] = lease_encoder.transform(test_set['lease_date_cat'])
encoded_test_set['type_by_date_encoded'] = type_by_date.transform(test_set['rent_approval_date']+test_set['flat_type'])

encoded_test_set['flat_type_encoded'] = flat_type_encoder.transform(test_set['flat_type'])
encoded_test_set['flat_model_encoded'] = flat_model_encoder.transform(test_set['flat_model'])
encoded_test_set['type_by_street_encoded'] = type_by_street.transform(test_set['street_name']+test_set['flat_type'])


encoded_test_set['floor_area_sqm'] = test_set['floor_area_sqm'].copy()

encoded_test_set['distance_to_nearest_existing_mrt'] = test_set['distance_to_nearest_existing_mrt'].copy()
encoded_test_set['distance_to_nearest_shopping_mall'] = test_set['distance_to_nearest_shopping_mall'].copy()
encoded_test_set['distance_to_nearest_primary_school'] = test_set['distance_to_nearest_primary_school'].copy()


In [53]:
encoded_test_set.head(10)

Unnamed: 0,region_encoded,planning_area_encoded,subzone_encoded,street_encoded,block_encoded,rental_approval_date_encoded,rental_approval_year_encoded,rental_approval_month_encoded,lease_commence_date_encoded,type_by_date_encoded,flat_type_encoded,flat_model_encoded,type_by_street_encoded,floor_area_sqm,distance_to_nearest_existing_mrt,distance_to_nearest_shopping_mall,distance_to_nearest_primary_school
0,2558.82271,2503.252886,2427.604167,2348.993316,2556.237503,3069.581639,3158.694858,2490.55358,2479.803864,3427.662037,2815.593875,2636.211052,2556.237503,121.0,0.820333,0.739403,0.149667
1,2450.623806,2540.49101,2592.33279,2640.043972,2509.793159,2812.587413,2651.014066,2489.108495,2700.89957,2924.288107,2692.359176,2612.031305,2614.003638,100.0,0.307784,0.25492,0.143325
2,2569.167537,2646.808979,2395.588235,2532.195122,2626.547123,3262.248898,3158.694858,2729.54235,2479.803864,3406.266491,2692.359176,2369.965462,2792.881615,91.0,1.097689,0.49767,1.040844
3,2569.167537,2595.146199,2400.15015,2780.896042,2540.333848,2249.901768,2225.773817,2470.895522,2479.803864,1988.479624,2276.033233,2612.031305,2565.461091,74.0,0.36281,1.596397,1.075525
4,2569.167537,2595.146199,2400.15015,2268.56436,2542.787517,2449.32243,2651.014066,2543.250298,2479.803864,2665.350877,2815.593875,2636.211052,2631.5261,121.0,0.44898,1.461487,0.714777
5,2569.167537,2646.808979,2395.588235,2229.881779,2629.794822,2351.327643,2651.014066,2490.55358,2479.803864,2085.797101,2276.033233,2369.965462,2176.899154,67.0,0.936153,0.696375,0.815124
6,2558.82271,2665.555556,2654.294479,2590.503432,2620.45403,2216.537301,2225.773817,2618.13052,2742.515391,2390.219224,2815.593875,2709.678998,2616.033755,110.0,1.741423,0.475512,0.334461
7,2570.667785,2638.489123,2602.085865,3081.579221,2730.005494,2755.309735,2651.014066,2470.895522,2742.515391,2985.820896,2815.593875,3150.404313,3224.684165,108.0,0.523289,0.419289,0.387834
8,2569.167537,2640.890551,2873.930481,2820.173278,2649.937911,2339.852217,2225.773817,2680.680614,2742.515391,2612.492565,2892.857143,2709.678998,2671.955887,133.0,0.316459,0.464394,0.24738
9,2737.201353,2585.947712,2585.947712,2480.479822,2808.073641,3262.248898,3158.694858,2729.54235,2421.705462,3611.022044,2815.593875,2444.223986,2781.18844,120.0,1.97324,1.061466,0.32234


In [54]:
# normalization

normalized_test_features = scaler.transform(encoded_test_set)

In [55]:
# lasso regressor prediction
lasso_y_pred = lasso.predict(normalized_test_features).reshape(-1, 1)
lasso_y_pred = y_scaler.inverse_transform(lasso_y_pred)

In [56]:
lasso_y_pred[:20]

array([[3169.49872881],
       [2660.21570296],
       [3447.25804036],
       [2132.25134339],
       [2526.07298406],
       [2242.69439036],
       [2284.17440124],
       [3234.11512858],
       [2577.1222796 ],
       [3878.49904987],
       [2062.55337703],
       [2863.49845543],
       [2779.27757499],
       [2687.82559035],
       [2350.0958965 ],
       [2754.63212135],
       [2060.46736397],
       [2627.57866206],
       [2483.04233349],
       [2811.7781502 ]])

In [57]:
df = pd.DataFrame(lasso_y_pred, columns=['Predicted'])
df.to_csv("predictions/updated_main_features_team18_lasso.csv", index_label="Id", header=True, float_format='%.4f')

In [58]:
# Gradient Boosting Regressor prediction
gbr_y_pred = gbr.predict(normalized_test_features).reshape(-1, 1)
gbr_y_pred = y_scaler.inverse_transform(gbr_y_pred)

In [59]:
gbr_y_pred[:20]

array([[2886.71674548],
       [2495.1654708 ],
       [3456.5604013 ],
       [2016.25979698],
       [2520.23411509],
       [2243.87263907],
       [2258.89480432],
       [3354.78835519],
       [2557.97609179],
       [4279.6502066 ],
       [2110.25695084],
       [2756.40516839],
       [2973.22359469],
       [2758.6464102 ],
       [2307.02950048],
       [2844.54687903],
       [2097.97909881],
       [2678.3330577 ],
       [2457.48009184],
       [2522.28286326]])

In [60]:
df = pd.DataFrame(gbr_y_pred, columns=['Predicted'])
df.to_csv("predictions/updated_main_features_team18_gbr.csv", index_label="Id", header=True, float_format='%.4f')

### XGBoost Regression

In [61]:
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

In [62]:
X, y = normalized_features, target_variable

In [63]:
# Split the training data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [64]:
# Create regression matrices
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

#Printing basic information about the DMatrix
print("Number of training samples in DMatrix:", dtrain_reg.num_row())
print("Number of features in Dmatrix:", dtrain_reg.num_col())

Number of training samples in DMatrix: 45000
Number of features in Dmatrix: 17


In [65]:
# Define hyperparameters
params = {
    "objective": "reg:squarederror",
}

n = 100
model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
)

In [66]:
feature_score = model.get_score(importance_type="gain")
print(feature_score)

{'f0': 2.704887866973877, 'f1': 2.894948720932007, 'f2': 1.6756854057312012, 'f3': 1.8568716049194336, 'f4': 25.544483184814453, 'f5': 4.027935028076172, 'f6': 7.189028263092041, 'f7': 1.2777165174484253, 'f8': 4.500386714935303, 'f9': 47.62873077392578, 'f10': 5.025144100189209, 'f11': 2.5963199138641357, 'f12': 4.8141937255859375, 'f13': 2.1805896759033203, 'f14': 1.7063002586364746, 'f15': 1.7689251899719238, 'f16': 1.5675339698791504}


In [67]:
preds = model.predict(dtest_reg)

In [68]:
# compute and print accuracy score

rmse = mean_squared_error(y_test, preds, squared=False)
r2 = r2_score(y_test, preds)

print(f"RMSE of the base model: {rmse:.3f}")
print(f"R2 score of the base model: {r2:.3f}")

RMSE of the base model: 0.649
R2 score of the base model: 0.578


#### Early stopping
We can test our model at each step and see if adding a new tree/round improves performance. To do so, we define a test dataset and a metric that is used to assess performance at each round. If performance haven’t improved for N rounds (N is defined by the variable early_stopping_round), we stop the training and keep the best number of boosting rounds.

In [69]:
params = {
    "objective": "reg:squarederror",
}
n = 100

# evals = [(dtest_reg, "validation"), (dtrain_reg, "train")]
evals = [(dtest_reg, "validation")]

model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
   verbose_eval=2, # Every ten rounds
   early_stopping_rounds=10
)

[0]	validation-rmse:0.83666
[2]	validation-rmse:0.69238
[4]	validation-rmse:0.64932
[6]	validation-rmse:0.63763
[8]	validation-rmse:0.63393
[10]	validation-rmse:0.63275
[12]	validation-rmse:0.63221
[14]	validation-rmse:0.63253
[16]	validation-rmse:0.63268
[18]	validation-rmse:0.63269
[20]	validation-rmse:0.63283
[21]	validation-rmse:0.63311


#### XGBoost Cross Validation

In [70]:
params = {
    "objective": "reg:squarederror",
}
n = 100

results = xgb.cv(
   params, 
   dtrain_reg,
   num_boost_round=n,
   seed=42,
   nfold=5,
   metrics={'rmse'},
   early_stopping_rounds=20
)

In [71]:
best_rmse = results['test-rmse-mean'].min()
print(best_rmse)

0.6362644247537881


In [72]:
results.head()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,0.838628,0.001113,0.841377,0.003411
1,0.742887,0.000406,0.748738,0.003719
2,0.688714,0.00077,0.697374,0.003444
3,0.658079,0.000828,0.66906,0.003449
4,0.640461,0.000662,0.653977,0.003612


#### Optimize XGBoost Model using GridSearch
GridSearch is a method used for hyperparameter tuning where one specifies a subset of possible values for each hyperparameter of interest. The method then exhaustively tries out all possible combinations of these hyperparameters to find the combination that produces the best model performance, according to some metric.

##### max_depth & min_child_weight

In [73]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]
num_boost_round = 100

In [74]:
# Define initial best params and MAE
min_rmse = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain_reg,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best RMSE
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))


CV with max_depth=9, min_child_weight=5
	RMSE 0.6502624371148253 for 7 rounds
CV with max_depth=9, min_child_weight=6
	RMSE 0.6507908699962308 for 7 rounds
CV with max_depth=9, min_child_weight=7
	RMSE 0.6498794643178873 for 7 rounds
CV with max_depth=10, min_child_weight=5
	RMSE 0.6577788766736965 for 7 rounds
CV with max_depth=10, min_child_weight=6
	RMSE 0.6564725110794445 for 7 rounds
CV with max_depth=10, min_child_weight=7
	RMSE 0.6552226876039414 for 7 rounds
CV with max_depth=11, min_child_weight=5
	RMSE 0.6644275351998234 for 6 rounds
CV with max_depth=11, min_child_weight=6
	RMSE 0.664507820507153 for 7 rounds
CV with max_depth=11, min_child_weight=7
	RMSE 0.6641617435086413 for 6 rounds
Best params: 9, 7, RMSE: 0.6498794643178873


In [75]:
params['max_depth'] = 9
params['min_child_weight'] = 7

##### gamma

In [76]:
min_rmse = float("Inf")
best_params = None
for gamma in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]:
    print("CV with gamma={}".format(gamma))
    # We update our parameters
    params['gamma'] = gamma
    # Run and time CV
    cv_results = xgb.cv(
        params,
        dtrain_reg,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics=['rmse'],
        early_stopping_rounds=10
    )
    # Update best score
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds\n".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = gamma
print("Best params: {}, rmse: {}".format(best_params, min_rmse))

CV with gamma=0.1
	RMSE 0.6498931594616948 for 7 rounds

CV with gamma=0.2
	RMSE 0.6493063937587985 for 7 rounds

CV with gamma=0.3
	RMSE 0.6490956190407248 for 7 rounds

CV with gamma=0.4
	RMSE 0.6490031383978109 for 8 rounds

CV with gamma=0.5
	RMSE 0.6495015004312441 for 8 rounds

CV with gamma=0.6
	RMSE 0.6494134123603871 for 8 rounds

Best params: 0.4, rmse: 0.6490031383978109


In [77]:
params['gamma'] = 0.4

##### subsample & colsample

In [78]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [79]:
min_rmse = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain_reg,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (subsample,colsample)
print("Best params: {}, {}, rmse: {}".format(best_params[0], best_params[1], min_rmse))

CV with subsample=1.0, colsample=1.0
	RMSE 0.6490031383978109 for 8 rounds
CV with subsample=1.0, colsample=0.9
	RMSE 0.6478708788668337 for 8 rounds
CV with subsample=1.0, colsample=0.8
	RMSE 0.6479572339339199 for 8 rounds
CV with subsample=1.0, colsample=0.7
	RMSE 0.6479778045517759 for 8 rounds
CV with subsample=0.9, colsample=1.0
	RMSE 0.649672669218808 for 7 rounds
CV with subsample=0.9, colsample=0.9
	RMSE 0.6476089974498016 for 8 rounds
CV with subsample=0.9, colsample=0.8
	RMSE 0.6486833601081337 for 8 rounds
CV with subsample=0.9, colsample=0.7
	RMSE 0.6493337053548769 for 8 rounds
CV with subsample=0.8, colsample=1.0
	RMSE 0.6506450302533027 for 8 rounds
CV with subsample=0.8, colsample=0.9
	RMSE 0.6501997877901957 for 8 rounds
CV with subsample=0.8, colsample=0.8
	RMSE 0.648694100513421 for 8 rounds
CV with subsample=0.8, colsample=0.7
	RMSE 0.6488746689545661 for 8 rounds
CV with subsample=0.7, colsample=1.0
	RMSE 0.6515659589275992 for 7 rounds
CV with subsample=0.7, cols

In [80]:
params['subsample'] = .9
params['colsample_bytree'] = .9

##### alpha & lumbda

In [81]:
gridsearch_params = {
    'reg_alpha': [0.05, 0.1, 1, 2, 3]
}

In [82]:
min_rmse = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for reg_alpha in gridsearch_params['reg_alpha']:
        print("CV with reg_alpha={}".format(
                                 reg_alpha))
        # We update our parameters
        params['reg_alpha'] = reg_alpha
        # Run CV
        cv_results = xgb.cv(
            params,
            dtrain_reg,
            num_boost_round=num_boost_round,
            seed=42,
            nfold=5,
            metrics={'rmse'},
            early_stopping_rounds=10
        )
        # Update best score
        mean_rmse = cv_results['test-rmse-mean'].min()
        boost_rounds = cv_results['test-rmse-mean'].argmin()
        print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
        if mean_rmse < min_rmse:
            min_rmse = mean_rmse
            best_params = reg_alpha
print("Best params: {}, rmse: {}".format(best_params, min_rmse))

CV with reg_alpha=0.05
	RMSE 0.6488210661153069 for 8 rounds
CV with reg_alpha=0.1
	RMSE 0.6484553885084383 for 7 rounds
CV with reg_alpha=1
	RMSE 0.6480607081454384 for 8 rounds
CV with reg_alpha=2
	RMSE 0.6457240999543731 for 8 rounds
CV with reg_alpha=3
	RMSE 0.6445434514346117 for 8 rounds
Best params: 3, rmse: 0.6445434514346117


In [83]:
params['alpha'] = 3

In [84]:
%time

min_rmse = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    %time 
    cv_results = xgb.cv(
        params,
        dtrain_reg,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics=['rmse'],
        early_stopping_rounds=10
    )
    # Update best score
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds\n".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = eta
print("Best params: {}, rmse: {}".format(best_params, min_rmse))

CPU times: user 9 µs, sys: 2 µs, total: 11 µs
Wall time: 2.86 µs
CV with eta=0.3
CPU times: user 9 µs, sys: 1 µs, total: 10 µs
Wall time: 3.1 µs
	RMSE 0.6445434514346117 for 8 rounds

CV with eta=0.2
CPU times: user 12 µs, sys: 5 µs, total: 17 µs
Wall time: 3.81 µs
	RMSE 0.6421750619784514 for 15 rounds

CV with eta=0.1
CPU times: user 14 µs, sys: 5 µs, total: 19 µs
Wall time: 4.29 µs
	RMSE 0.639964887391508 for 32 rounds

CV with eta=0.05
CPU times: user 10 µs, sys: 3 µs, total: 13 µs
Wall time: 3.81 µs
	RMSE 0.639249234924217 for 73 rounds

CV with eta=0.01
CPU times: user 6 µs, sys: 7 µs, total: 13 µs
Wall time: 4.05 µs
	RMSE 0.7032565293193256 for 99 rounds

CV with eta=0.005
CPU times: user 1e+03 ns, sys: 5 µs, total: 6 µs
Wall time: 4.05 µs
	RMSE 0.7967112630279969 for 99 rounds

Best params: 0.05, rmse: 0.639249234924217


In [85]:
params['eta'] = .05

In [86]:
params

{'objective': 'reg:squarederror',
 'max_depth': 9,
 'min_child_weight': 7,
 'gamma': 0.4,
 'subsample': 0.9,
 'colsample_bytree': 0.9,
 'reg_alpha': 3,
 'alpha': 3,
 'eta': 0.05}

In [87]:
model = xgb.train(
    params,
    dtrain_reg,
    num_boost_round=num_boost_round,
    evals=[(dtest_reg, "Test")],
    early_stopping_rounds=10
)

[0]	Test-rmse:0.96974
[1]	Test-rmse:0.94256
[2]	Test-rmse:0.91738
[3]	Test-rmse:0.89403
[4]	Test-rmse:0.87232
[5]	Test-rmse:0.85229
[6]	Test-rmse:0.83369
[7]	Test-rmse:0.81666
[8]	Test-rmse:0.80097
[9]	Test-rmse:0.78643
[10]	Test-rmse:0.77312
[11]	Test-rmse:0.76078
[12]	Test-rmse:0.74959
[13]	Test-rmse:0.74025
[14]	Test-rmse:0.73163
[15]	Test-rmse:0.72281
[16]	Test-rmse:0.71460
[17]	Test-rmse:0.70712
[18]	Test-rmse:0.70107
[19]	Test-rmse:0.69564
[20]	Test-rmse:0.68980
[21]	Test-rmse:0.68455
[22]	Test-rmse:0.67968
[23]	Test-rmse:0.67529
[24]	Test-rmse:0.67134
[25]	Test-rmse:0.66768
[26]	Test-rmse:0.66431
[27]	Test-rmse:0.66143
[28]	Test-rmse:0.65883
[29]	Test-rmse:0.65630
[30]	Test-rmse:0.65417
[31]	Test-rmse:0.65243
[32]	Test-rmse:0.65047
[33]	Test-rmse:0.64867
[34]	Test-rmse:0.64710
[35]	Test-rmse:0.64595
[36]	Test-rmse:0.64465
[37]	Test-rmse:0.64351
[38]	Test-rmse:0.64246
[39]	Test-rmse:0.64150
[40]	Test-rmse:0.64056
[41]	Test-rmse:0.63978
[42]	Test-rmse:0.63911
[43]	Test-rmse:0.6385

In [88]:
print("Best RMSE: {:.3f} in {} rounds".format(model.best_score, model.best_iteration+1))

Best RMSE: 0.634 in 66 rounds


#### Saving the best model

In [89]:
dtrain = xgb.DMatrix(normalized_features, target_variable, enable_categorical=True)
dtest = xgb.DMatrix(normalized_test_features, enable_categorical=True)

In [90]:
num_boost_round = model.best_iteration + 1
best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest_reg, "Test")]
)

[0]	Test-rmse:0.96903
[1]	Test-rmse:0.94122
[2]	Test-rmse:0.91525
[3]	Test-rmse:0.89111
[4]	Test-rmse:0.86867
[5]	Test-rmse:0.84788
[6]	Test-rmse:0.82861
[7]	Test-rmse:0.81072
[8]	Test-rmse:0.79415
[9]	Test-rmse:0.77858
[10]	Test-rmse:0.76437
[11]	Test-rmse:0.75110
[12]	Test-rmse:0.73888
[13]	Test-rmse:0.72848
[14]	Test-rmse:0.71897
[15]	Test-rmse:0.70909
[16]	Test-rmse:0.69996
[17]	Test-rmse:0.69155
[18]	Test-rmse:0.68462
[19]	Test-rmse:0.67830
[20]	Test-rmse:0.67146
[21]	Test-rmse:0.66510
[22]	Test-rmse:0.65924
[23]	Test-rmse:0.65386
[24]	Test-rmse:0.64900
[25]	Test-rmse:0.64449
[26]	Test-rmse:0.64021
[27]	Test-rmse:0.63618
[28]	Test-rmse:0.63259
[29]	Test-rmse:0.62927
[30]	Test-rmse:0.62618
[31]	Test-rmse:0.62359
[32]	Test-rmse:0.62074
[33]	Test-rmse:0.61809
[34]	Test-rmse:0.61551
[35]	Test-rmse:0.61353
[36]	Test-rmse:0.61145
[37]	Test-rmse:0.60943
[38]	Test-rmse:0.60760
[39]	Test-rmse:0.60594
[40]	Test-rmse:0.60431
[41]	Test-rmse:0.60275
[42]	Test-rmse:0.60136
[43]	Test-rmse:0.5999

In [91]:
submission_test_pred = best_model.predict(dtest).reshape(-1, 1)
submission_test_pred = y_scaler.inverse_transform(submission_test_pred)

In [92]:
df = pd.DataFrame(submission_test_pred, columns=['Predicted'])
df.to_csv("predictions/updated_main_features_team18_xgb.csv", index_label="Id", header=True, float_format='%.4f')