In [20]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, validation_curve, learning_curve
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, cross_validate

# For result evaluation
from sklearn.metrics import explained_variance_score, max_error, mean_absolute_error, mean_squared_error, median_absolute_error, r2_score, mean_absolute_percentage_error

from sklearn.linear_model import LinearRegression, SGDRegressor

In [67]:
# Load processed data
df_processed = pd.read_pickle('./processed_data.p')
# df_singa_airbnb = pd.read_csv('listings.csv')

col_train = [x for x in df_processed.columns if x not in ['name', 'host_name', 'last_review', 'last_review_date', 'price', 'id', 'host_name', 'host_id', 'last_review_year', 'last_review_month', 'last_review_week', 'last_review_day', 'last_review_dayofweek']]

In [68]:
X = df_processed[col_train].values
y = df_processed['price'].values

In [69]:
random_state = 0

In [70]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

In [71]:
# https://scikit-learn.org/stable/modules/model_evaluation.html
def print_regression_scores(clf, x_train, y_train, x_test, y_test, display = 1, cv = 10):
    
    dict_result = {}
    cross_val = StratifiedShuffleSplit(n_splits = cv, random_state=random_state)

    for name, feat, tar in [("TRAIN set", x_train, y_train), ("TEST set", x_test, y_test)]:
        explained_variance = explained_variance_score(tar, clf.predict(feat))
        max_err = max_error(tar, clf.predict(feat))
        mean_absolute_err = mean_absolute_error(tar, clf.predict(feat))
        mean_squared_err = mean_squared_error(tar, clf.predict(feat))
        median_absolute_err = median_absolute_error(tar, clf.predict(feat))
        r2 = r2_score(tar, clf.predict(feat))
        mean_absolute_percentage_err = mean_absolute_percentage_error(tar, clf.predict(feat))
               
        if display == 1:
            print("{} explained variance score: ".format(name), explained_variance)
            print("{} max error: ".format(name), max_err)
            print("{} mean absolute error: ".format(name), mean_absolute_err)
            print("{} mean squared error: ".format(name), mean_squared_err)
            print("{} median absolute error: ".format(name), median_absolute_err)
            print("{} r2 score: ".format(name), r2)
            
            
            print("{} mean_absolute percentage error : ".format(name), mean_absolute_percentage_err, end = '\n\n')
        
        dict_result[name] = [explained_variance, max_err, mean_absolute_err, mean_squared_err, median_absolute_err, r2, mean_absolute_percentage_err]
        
    return dict_result

In [72]:
# def print_classification_scores(clf, x_train, y_train, x_test, y_test, display = 1, cv = 10):
    
#     dict_result = {}
#     cross_val = StratifiedShuffleSplit(n_splits = cv, random_state=random_state)

#     for name, feat, tar in [("TRAIN set", x_train, y_train), ("TEST set", x_test, y_test)]:
#         f1_macro = f1_score(tar, clf.predict(feat), average = 'macro')
#         recall_macro = recall_score(tar, clf.predict(feat), average = 'macro')
#         precision_macro = precision_score(tar, clf.predict(feat), average = 'macro')
#         balanced_accuracy = balanced_accuracy_score(tar, clf.predict(feat))
               
#         if display == 1:
#             print("{} balanced_accuracy score: ".format(name), balanced_accuracy)
#             print("{} recall score: ".format(name), recall_macro)
#             print("{} precision score: ".format(name), precision_macro)
#             print("{} f1_macro score: ".format(name), f1_macro, end = '\n\n')
        
#         dict_result[name] = [balanced_accuracy, recall_macro, precision_macro, f1_macro]
        
#     return dict_result

### 1. Linear regression

In [65]:
lr = LinearRegression()

reg = lr.fit(X,y)
reg.score(X,y)

0.04294767783294129

In [73]:
dict_result = print_regression_scores(lr, x_train, y_train, x_test, y_test)

TRAIN set explained variance score:  0.037464461240404145
TRAIN set max error:  9874.52927377299
TRAIN set mean absolute error:  92.67884313652527
TRAIN set mean squared error:  117565.89902415503
TRAIN set median absolute error:  59.997901123606965
TRAIN set r2 score:  0.037464386691688945
TRAIN set mean_absolute percentage error :  186736630576594.03

TEST set explained variance score:  0.07267511305487862
TEST set max error:  8556.0017625842
TEST set mean absolute error:  91.91424081082708
TEST set mean squared error:  83466.15824051782
TEST set median absolute error:  61.13396362731568
TEST set r2 score:  0.072673495961591
TEST set mean_absolute percentage error :  0.7243692716185841



In [49]:
reg.coef_

array([-5.48362503e-07,  1.50400679e-08,  1.71512606e+01,  1.51974121e+00,
       -2.24830801e+02, -2.06692233e+01, -1.13936995e+02, -1.68196831e+00,
       -6.33846903e+00,  1.08030983e+01, -8.89347086e+01,  3.20033677e+01,
       -5.66622934e+00, -2.27182539e+00,  3.81584331e-01,  1.92736699e+00,
        9.87894419e-01])

In [52]:
X

array([[4.90910000e+04, 2.66763000e+05, 2.00000000e+00, ...,
        4.30000000e+01, 2.10000000e+01, 0.00000000e+00],
       [5.06460000e+04, 2.27796000e+05, 0.00000000e+00, ...,
        5.20000000e+01, 2.60000000e+01, 4.00000000e+00],
       [5.63340000e+04, 2.66763000e+05, 2.00000000e+00, ...,
        4.00000000e+01, 1.00000000e+00, 3.00000000e+00],
       ...,
       [3.81093360e+07, 2.81448565e+08, 0.00000000e+00, ...,
        2.60000000e+01, 2.70000000e+01, 3.00000000e+00],
       [3.81104930e+07, 2.43835202e+08, 0.00000000e+00, ...,
        2.60000000e+01, 2.70000000e+01, 3.00000000e+00],
       [3.81127620e+07, 2.87885200e+07, 0.00000000e+00, ...,
        2.60000000e+01, 2.70000000e+01, 3.00000000e+00]])