In [1]:
import pandas as pd
import numpy as np
from math import sqrt

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, validation_curve, learning_curve
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, cross_validate

# For result evaluation
from sklearn.metrics import explained_variance_score, max_error, mean_absolute_error, mean_squared_error, median_absolute_error, r2_score, mean_absolute_percentage_error

from sklearn.linear_model import LinearRegression, SGDRegressor, LogisticRegression, Lasso, ElasticNet, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load processed data
df_processed = pd.read_pickle('./processed_data_scaled.p')
# df_singa_airbnb = pd.read_csv('listings.csv')

col_train = [x for x in df_processed.columns if x not in ['name', 'host_name', 'last_review', 'last_review_date', 'price', 'id', 'host_name', 'host_id', 'last_review_year', 'last_review_month', 'last_review_week', 'last_review_day', 'last_review_dayofweek']]

In [3]:
X = df_processed[col_train].values
y = df_processed['price'].values

In [4]:
random_state = 0

In [5]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

In [6]:
# https://scikit-learn.org/stable/modules/model_evaluation.html
def print_regression_scores(clf, x_train, y_train, x_test, y_test, display = 1, cv = 10):
    
    dict_result = {}
    cross_val = StratifiedShuffleSplit(n_splits = cv, random_state=random_state)

    for name, feat, tar in [("TRAIN set", x_train, y_train), ("TEST set", x_test, y_test)]:
        explained_variance = explained_variance_score(tar, clf.predict(feat))
        max_err = max_error(tar, clf.predict(feat))
        mean_absolute_err = mean_absolute_error(tar, clf.predict(feat))
        root_mean_squared_err = sqrt(mean_squared_error(tar, clf.predict(feat)))
        median_absolute_err = median_absolute_error(tar, clf.predict(feat))
        r2 = r2_score(tar, clf.predict(feat))
        mean_absolute_percentage_err = mean_absolute_percentage_error(tar, clf.predict(feat))
               
        if display == 1:
            print("{} explained variance score: ".format(name), explained_variance)
            print("{} max error: ".format(name), max_err)
            print("{} mean absolute error: ".format(name), mean_absolute_err)
            print("{} root mean squared error: ".format(name), root_mean_squared_err)
            print("{} median absolute error: ".format(name), median_absolute_err)
            print("{} r2 score: ".format(name), r2)
            
            
            print("{} mean_absolute percentage error : ".format(name), mean_absolute_percentage_err, end = '\n\n')
        
        dict_result[name] = [explained_variance, max_err, mean_absolute_err, root_mean_squared_err, median_absolute_err, r2, mean_absolute_percentage_err]
        
    return dict_result

In [7]:
# def print_classification_scores(clf, x_train, y_train, x_test, y_test, display = 1, cv = 10):
    
#     dict_result = {}
#     cross_val = StratifiedShuffleSplit(n_splits = cv, random_state=random_state)

#     for name, feat, tar in [("TRAIN set", x_train, y_train), ("TEST set", x_test, y_test)]:
#         f1_macro = f1_score(tar, clf.predict(feat), average = 'macro')
#         recall_macro = recall_score(tar, clf.predict(feat), average = 'macro')
#         precision_macro = precision_score(tar, clf.predict(feat), average = 'macro')
#         balanced_accuracy = balanced_accuracy_score(tar, clf.predict(feat))
               
#         if display == 1:
#             print("{} balanced_accuracy score: ".format(name), balanced_accuracy)
#             print("{} recall score: ".format(name), recall_macro)
#             print("{} precision score: ".format(name), precision_macro)
#             print("{} f1_macro score: ".format(name), f1_macro, end = '\n\n')
        
#         dict_result[name] = [balanced_accuracy, recall_macro, precision_macro, f1_macro]
        
#     return dict_result

### 1. Linear regression

In [8]:
lr = LinearRegression()

reg = lr.fit(x_train,y_train)
reg.score(x_test, y_test)

0.42271188574251706

In [9]:
dict_score = print_regression_scores(lr, x_train, y_train, x_test, y_test)

TRAIN set explained variance score:  0.4266940658067121
TRAIN set max error:  458.36021637206056
TRAIN set mean absolute error:  52.47122954762088
TRAIN set root mean squared error:  70.43651843986052
TRAIN set median absolute error:  42.38500413062427
TRAIN set r2 score:  0.426694065806712
TRAIN set mean_absolute percentage error :  162393847407784.06

TEST set explained variance score:  0.4231560304384261
TEST set max error:  349.2675590763726
TEST set mean absolute error:  52.16358264524778
TEST set root mean squared error:  69.8153060675741
TEST set median absolute error:  41.67061030080899
TEST set r2 score:  0.42271188574251706
TEST set mean_absolute percentage error :  0.48857349596781613



### 2. Lasso

In [12]:
ls = Lasso()
ls_reg = ls.fit(x_train,y_train)
ls_reg.score(x_test, y_test)

0.40943192358818326

In [14]:
dict_score = print_regression_scores(ls, x_train, y_train, x_test, y_test)

TRAIN set explained variance score:  0.3875767288816402
TRAIN set max error:  462.3266299094923
TRAIN set mean absolute error:  54.17010993065625
TRAIN set root mean squared error:  72.79985348081297
TRAIN set median absolute error:  43.90136310550511
TRAIN set r2 score:  0.3875767288816402
TRAIN set mean_absolute percentage error :  148797716868153.4

TEST set explained variance score:  0.40998732666217785
TEST set max error:  329.6670618585535
TEST set mean absolute error:  52.7390627808669
TEST set root mean squared error:  70.61375751798226
TEST set median absolute error:  42.42623359519371
TEST set r2 score:  0.40943192358818326
TEST set mean_absolute percentage error :  0.4941572553461187



### 3. ElasticNet

In [15]:
em = ElasticNet()
em_reg = em.fit(x_train,y_train)
em_reg.score(x_test, y_test)

0.26400547900467786

In [16]:
dict_score = print_regression_scores(em, x_train, y_train, x_test, y_test)

TRAIN set explained variance score:  0.2555559963620111
TRAIN set max error:  393.86834657717407
TRAIN set mean absolute error:  62.01428942843103
TRAIN set root mean squared error:  80.26397812359083
TRAIN set median absolute error:  52.5176671524987
TRAIN set r2 score:  0.255555996362011
TRAIN set mean_absolute percentage error :  122330859382428.44

TEST set explained variance score:  0.26430715378123304
TEST set max error:  334.9733515116078
TEST set mean absolute error:  61.02098632647356
TEST set root mean squared error:  78.83001858948364
TEST set median absolute error:  53.085647766645565
TEST set r2 score:  0.26400547900467786
TEST set mean_absolute percentage error :  0.683143109512785



### 4. Ridge

In [17]:
rm = Ridge()
rm_reg = rm.fit(x_train,y_train)
rm_reg.score(x_test, y_test)

0.42390787687287623

In [19]:
dict_score = print_regression_scores(rm, x_train, y_train, x_test, y_test)

TRAIN set explained variance score:  0.4259562158149808
TRAIN set max error:  458.8870295527021
TRAIN set mean absolute error:  52.483962552379815
TRAIN set root mean squared error:  70.48183008781785
TRAIN set median absolute error:  42.31683474547436
TRAIN set r2 score:  0.4259562158149808
TRAIN set mean_absolute percentage error :  162123940833959.66

TEST set explained variance score:  0.42434842744271983
TEST set max error:  349.298441302116
TEST set mean absolute error:  52.15155477995522
TEST set root mean squared error:  69.74294897549916
TEST set median absolute error:  41.69689346382427
TEST set r2 score:  0.42390787687287623
TEST set mean_absolute percentage error :  0.4881039666429797



### 5. SVR

In [10]:
svr_linear = SVR(kernel='linear')
svr_linear_reg = svr_linear.fit(x_train,y_train)
svr_linear_reg.score(x_test, y_test)

0.37133056264850284

In [11]:
dict_score = print_regression_scores(svr_linear, x_train, y_train, x_test, y_test)

TRAIN set explained variance score:  0.39846709370479916
TRAIN set max error:  464.1919413733709
TRAIN set mean absolute error:  50.80467167099904
TRAIN set root mean squared error:  74.89192816988928
TRAIN set median absolute error:  33.785387096766335
TRAIN set r2 score:  0.3518721274760055
TRAIN set mean_absolute percentage error :  137656908333501.27

TEST set explained variance score:  0.41104364218574563
TEST set max error:  346.62294151577515
TEST set mean absolute error:  49.1644391187091
TEST set root mean squared error:  72.85603164142091
TEST set median absolute error:  31.540175292183676
TEST set r2 score:  0.37133056264850284
TEST set mean_absolute percentage error :  0.39043576420764514



In [14]:
svr_poly = SVR(kernel='poly')
svr_poly_reg = svr_poly.fit(x_train,y_train)
svr_poly_reg.score(x_test, y_test)

0.07979650528753257

In [15]:
dict_score = print_regression_scores(svr_poly, x_train, y_train, x_test, y_test)

TRAIN set explained variance score:  0.14460695902352838
TRAIN set max error:  382.5152907647169
TRAIN set mean absolute error:  63.438578568674494
TRAIN set root mean squared error:  89.35008427306221
TRAIN set median absolute error:  46.625986308315404
TRAIN set r2 score:  0.07746976785757531
TRAIN set mean_absolute percentage error :  92561636635289.06

TEST set explained variance score:  0.14374671273247053
TEST set max error:  374.8934846262342
TEST set mean absolute error:  63.63532926956606
TEST set root mean squared error:  88.14471491267511
TEST set median absolute error:  47.87149039954281
TEST set r2 score:  0.07979650528753257
TEST set mean_absolute percentage error :  0.6129086425000494



### 6. RandomForest Regressor

In [12]:
rfr = RandomForestRegressor(max_depth=10, random_state=0)
rfr_reg = rfr.fit(x_train,y_train)
rfr_reg.score(x_test, y_test)

0.6144167918410821

In [13]:
dict_score = print_regression_scores(rfr, x_train, y_train, x_test, y_test)

TRAIN set explained variance score:  0.7575451069736288
TRAIN set max error:  262.58763656049086
TRAIN set mean absolute error:  32.31988915988531
TRAIN set root mean squared error:  45.806134102026874
TRAIN set median absolute error:  22.61587988996699
TRAIN set r2 score:  0.7575411981174218
TRAIN set mean_absolute percentage error :  149948378560821.4

TEST set explained variance score:  0.6144183227762298
TEST set max error:  318.9079047777512
TEST set mean absolute error:  39.34745094438191
TEST set root mean squared error:  57.05758613599761
TEST set median absolute error:  26.44838926429648
TEST set r2 score:  0.6144167918410821
TEST set mean_absolute percentage error :  0.3357111253461821



### Logistic regression

with l2 penalty

In [22]:
# These models can be trained by GridSearchCV to get an ideas of the performance
# But it takes a while to train these models, so I am just breaking them to see the results
# as each model is trained 


lgr_l2 = LogisticRegression()

# param_grid = {'penalty': ['none', 'l1', 'l2', 'elasticnet']}

# clf = GridSearchCV(lgr, param_grid=param_grid, scoring='r2', n_jobs=1, cv=5)
lgr_l2_reg = lgr_l2.fit(x_train, y_train)
lgr_l2_reg.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.08132726089785296

In [23]:
print_regression_scores(lgr_l2_reg, x_train, y_train, x_test, y_test)

TRAIN set explained variance score:  0.24503470924519732
TRAIN set max error:  450
TRAIN set mean absolute error:  51.939492517892
TRAIN set root mean squared error:  81.55744071100173
TRAIN set median absolute error:  28.0
TRAIN set r2 score:  0.2313690773386775
TRAIN set mean_absolute percentage error :  146506168749853.84

TEST set explained variance score:  0.21465417432272937
TEST set max error:  391
TEST set mean absolute error:  53.47234873129473
TEST set root mean squared error:  82.02909412927517
TEST set median absolute error:  30.0
TEST set r2 score:  0.20305719432387248
TEST set mean_absolute percentage error :  0.4009693243029781



{'TRAIN set': [0.24503470924519732,
  450,
  51.939492517892,
  81.55744071100173,
  28.0,
  0.2313690773386775,
  146506168749853.84],
 'TEST set': [0.21465417432272937,
  391,
  53.47234873129473,
  82.02909412927517,
  30.0,
  0.20305719432387248,
  0.4009693243029781]}

With no penalty

In [None]:
lgr = LogisticRegression(penalty='none', max_iter=1000, solver='saga')

# param_grid = {'penalty': ['none', 'l1', 'l2', 'elasticnet']}

# clf = GridSearchCV(lgr, param_grid=param_grid, scoring='r2', n_jobs=1, cv=5)
lgr_reg = lgr.fit(x_train, y_train)
lgr_reg.score(x_test, y_test)

In [None]:
print_regression_scores(lgr_reg, x_train, y_train, x_test, y_test)

With l1 penalty

In [None]:
lgr_l1 = LogisticRegression(penalty='l1', max_iter=1000, solver='liblinear')

# param_grid = {'penalty': ['none', 'l1', 'l2', 'elasticnet']}

# clf = GridSearchCV(lgr, param_grid=param_grid, scoring='r2', n_jobs=1, cv=5)
lgr_l1_reg = lgr_l1.fit(x_train, y_train)
lgr_l1_reg.score(x_test, y_test)

In [None]:
print_regression_scores(lgr_l1reg, x_train, y_train, x_test, y_test)

with Elasticnet (both l1 and l2)

In [None]:
lgr_l1 = LogisticRegression(penalty='l1', max_iter=1000, solver='saga')

# param_grid = {'penalty': ['none', 'l1', 'l2', 'elasticnet']}

# clf = GridSearchCV(lgr, param_grid=param_grid, scoring='r2', n_jobs=1, cv=5)
lgr_l1_reg = lgr_l1.fit(x_train, y_train)
lgr_l1_reg.score(x_test, y_test)

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html#sklearn.linear_model.SGDRegressor