In [1]:
import pandas as pd
import numpy as np
from math import sqrt

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, validation_curve, learning_curve
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, cross_validate

# For result evaluation
from sklearn.metrics import explained_variance_score, max_error, mean_absolute_error, mean_squared_error, median_absolute_error, r2_score, mean_absolute_percentage_error

from sklearn.linear_model import LinearRegression, SGDRegressor, LogisticRegression

from catboost import Pool, CatBoostRegressor, cv
from catboost.utils import eval_metric

In [2]:
# Load processed data
df_processed = pd.read_pickle('./processed_data.p')
# df_singa_airbnb = pd.read_csv('listings.csv')

col_train = [x for x in df_processed.columns if x not in ['name', 'host_name', 'last_review', 'last_review_date', 'price', 'id', 'host_name', 'host_id', 'last_review_year', 'last_review_month', 'last_review_week', 'last_review_day', 'last_review_dayofweek']]
# Removing latitude and longitude does not seem to improve model performance
# The price is heavily right-skewed and so removing high price values would make the model better

In [3]:
df_processed[col_train]

Unnamed: 0,latitude,longitude,room_type,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,neighbourhood_group_Central Region,neighbourhood_group_East Region,...,neighbourhood_Serangoon,neighbourhood_Singapore River,neighbourhood_Southern Islands,neighbourhood_Sungei Kadut,neighbourhood_Tampines,neighbourhood_Tanglin,neighbourhood_Toa Payoh,neighbourhood_Western Water Catchment,neighbourhood_Woodlands,neighbourhood_Yishun
0,1.44255,103.79580,2,180,1,0.01,2,365,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1.33235,103.78521,2,90,18,0.28,1,365,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1.44246,103.79667,2,6,20,0.20,2,365,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1.34541,103.95712,2,1,14,0.15,9,353,0,1,...,0,0,0,0,1,0,0,0,0,0
4,1.34567,103.95963,2,1,22,0.22,9,355,0,1,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7902,1.27973,103.78751,3,3,0,0.00,31,61,1,0,...,0,0,0,0,0,0,0,0,0,0
7903,1.29269,103.82623,3,6,0,0.00,34,365,1,0,...,0,0,0,0,0,1,0,0,0,0
7904,1.31286,103.85996,2,30,0,0.00,3,173,1,0,...,0,0,0,0,0,0,0,0,0,0
7905,1.29543,103.83801,2,14,0,0.00,2,30,1,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
X = df_processed[col_train].values
y = df_processed['price'].values

In [5]:
# Split model into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

# Utilize pooling from Catboost
train_pool = Pool(X_train, y_train) 
test_pool = Pool(X_test, y_test)

In [6]:
# Specify hyperparameters for the model
params = {
    'iterations': 6000,
    'learning_rate': 0.002,
    'random_seed': 42,
    'logging_level': 'Silent',
    'early_stopping_rounds': 500
}

In [7]:
model = CatBoostRegressor(**params)

In [8]:
model.fit(
    train_pool, 
    eval_set=test_pool, 
    verbose=False, 
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x1e5229d5850>

In [9]:
preds = model.predict(X_test)
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
rmse = sqrt(mean_squared_error(y_test, preds))
r2 = r2_score(y_test, preds)

print("MAE: " + str(mae))
print("RMSE: " + str(rmse))
print("MSE: " + str(mse))
print("R2: " + str(r2))

MAE: 51.93745240080429
RMSE: 87.3828752541597
MSE: 7635.766887684036
R2: 0.48417466289376776
