In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from category_encoders import TargetEncoder

from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import precision_recall_curve

import sklearn.metrics as sm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.dummy import DummyRegressor

from sklearn.tree import DecisionTreeRegressor, export_graphviz

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from imblearn.under_sampling import RandomUnderSampler

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR

from xgboost import XGBRegressor

from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the prepared dataset; the first CSV column becomes the index.
df = pd.read_csv('Data/df_final.csv', index_col=0)

# Remove 'israteperstay' before modelling
# (presumably a constant column -- TODO confirm against the data).
df = df.drop(columns=['israteperstay'])

# Turn the two flag-like columns into booleans.
df['ispromo'] = df['ispromo'].eq('Y')
df['Source'] = df['Source'].eq(5)

# Target-encode the categorical columns with many distinct values,
# using 'price' as the encoding target.
# NOTE(review): the encoder is fitted on every row before the train/test
# split performed later in the notebook, so test-set prices influence the
# encoded features (target leakage) -- consider fitting on the training
# rows only.
high_cardinality_cols = ['roomtype', 'city', 'country', 'ratetype', 'propertytype']
encoder = TargetEncoder()
df[high_cardinality_cols] = encoder.fit_transform(df[high_cardinality_cols],
                                                  df['price'])

# Multiplying by 1 turns the boolean columns into integers.
df = df * 1

In [None]:
# Separate the predictors from the 'price' target.
# The target is kept as a one-column DataFrame, as in the rest of the notebook.
is_target = df.columns == 'price'
X_dev = df.loc[:, ~is_target]
y_dev = df.loc[:, is_target]

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_dev, y_dev, test_size=0.2, random_state=123
)

## Baseline Regressor

In [None]:
# Baseline: always predict the median training price. Every later model
# should beat these numbers to be considered useful.
dummy_regr = DummyRegressor(strategy="median")

dummy_regr.fit(X_train, y_train)

# Mean absolute error of the constant prediction on both splits.
train_mae_dummy = sm.mean_absolute_error(y_train,
                                         dummy_regr.predict(X_train))

test_mae_dummy = sm.mean_absolute_error(y_test,
                                        dummy_regr.predict(X_test))

print(f"The best training MAE is   : {round(train_mae_dummy,3)}")
print(f"The best testing MAE is    : {round(test_mae_dummy,3)}")

# `score` returns R^2 for regressors; it is reported as a percentage here.
# The second line previously carried the "training" label although it
# reports the held-out test score.
print(f"The training accuracy is   : {round(dummy_regr.score(X_train, y_train)*100,4)}%")
print(f"The testing accuracy is    : {round(dummy_regr.score(X_test, y_test)*100,4)}%")

## Decision Tree Regressor

### Default Settings

In [None]:
# tree_para = {'max_depth':[25,50,75,100],
#              'max_features':[15,20,25,30]}

# clf_dt = GridSearchCV(DecisionTreeRegressor(random_state = 123), 
#                    tree_para, 
#                    cv=5)

# clf_dt.fit(X_train, y_train)

# train_mae_dt = sm.mean_absolute_error(y_train, 
#                                    clf_dt.best_estimator_.predict(X_train))

# test_mae_dt = sm.mean_absolute_error(y_test, 
#                                   clf_dt.best_estimator_.predict(X_test))

# print(f"The optimal parameter is   : {clf_dt.best_params_}")
# print(f"The best training MAE is   : {round(train_mae_dt,3)}")
# print(f"The best testing MAE is    : {round(test_mae_dt,3)}")

# print(f"The training accuracy is   : {round(clf_dt.score(X_train, y_train)*100,4)}%")
# print(f"The testing accuracy is    : {round(clf_dt.score(X_test, y_test)*100,4)}%")

# The optimal parameter is   : {'max_depth': 25, 'max_features': 15}

In [None]:
# Decision tree built with the hyper-parameters recorded from the
# (commented-out) grid search above: max_depth=25, max_features=15.
dt = DecisionTreeRegressor(random_state = 123,
                           max_depth= 25,
                           max_features= 15)

dt.fit(X_train, y_train)

# `dt` is a plain estimator, not a GridSearchCV object, so it has no
# `best_estimator_` attribute -- predict with the fitted tree directly.
train_mae_dt = sm.mean_absolute_error(y_train,
                                      dt.predict(X_train))

test_mae_dt = sm.mean_absolute_error(y_test,
                                     dt.predict(X_test))

print(f"The best training MAE is   : {round(train_mae_dt,3)}")
print(f"The best testing MAE is    : {round(test_mae_dt,3)}")

# `score` is R^2, shown as a percentage.
print(f"The training accuracy is   : {round(dt.score(X_train, y_train)*100,4)}%")
print(f"The testing accuracy is    : {round(dt.score(X_test, y_test)*100,4)}%")

### Feature Selection
Consistent with the results above, the model is clearly overfitting: the training accuracy greatly exceeds the testing accuracy. The following steps therefore apply some techniques to reduce the overfitting.

In [None]:
# Importances come from the fitted decision tree `dt`. The grid-search
# object `clf_dt` only exists in commented-out code, so referencing it
# fails on a fresh kernel.
feature_importance = dt.feature_importances_

# Number of least-important features to discard.
# np.argpartition requires k to be smaller than the number of features.
k = 5

# After partitioning, the first k positions index the k smallest importances
# (in no particular order).
idx = np.argpartition(feature_importance, k)
dropped_features = X_train.columns[idx[:k]]

# Drop the same k least important features from both splits.
X_train_simplified = X_train.drop(columns = dropped_features)

X_test_simplified = X_test.drop(columns = dropped_features)

In [None]:
# tree_para = {'max_depth':[25,50,75,100],
#              'max_features':[15,20,25,30]}

# clf_dt_simp = GridSearchCV(DecisionTreeRegressor(random_state = 123), 
#                    tree_para, 
#                    cv=5)

# clf_dt_simp.fit(X_train_simplified, y_train)

# train_mae_dt_simp = sm.mean_absolute_error(y_train, 
#                                    clf_dt_simp.best_estimator_.predict(X_train_simplified))

# test_mae_dt_simp = sm.mean_absolute_error(y_test, 
#                                   clf_dt_simp.best_estimator_.predict(X_test_simplified))

# print(f"The optimal parameter is   : {clf_dt_simp.best_params_}")
# print(f"The best training MAE is : {round(train_mae_dt_simp,3)}")
# print(f"The best testing MAE is  : {round(test_mae_dt_simp,3)}")

# print(f"The training accuracy is   : {round(clf_dt_simp.score(X_train_simplified , y_train)*100,4)}%")
# print(f"The testing accuracy is    : {round(clf_dt_simp.score(X_test_simplified, y_test)*100,4)}%")

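# The optimal parameter is   : {'bootstrap': False, 'max_depth': 50, 'max_features': 'log2', 'min_samples_split': 5}
# NOTE(review): the line above is identical to the random-forest result recorded further down and
# contains keys ('bootstrap', 'min_samples_split') that are not part of this cell's grid; the
# decision-tree result was presumably not recorded here -- TODO confirm by re-running the search.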

In [None]:
# Decision tree trained on the reduced feature set (the k least important
# features removed) to check whether feature selection reduces overfitting.
dt_simp = DecisionTreeRegressor(random_state = 123,
                                max_depth = 25,
                                max_features= 15)

# Fit on the simplified features. Previously the model was fitted on the
# full X_train, so the comparison never used the reduced set.
dt_simp.fit(X_train_simplified, y_train)

# Evaluate `dt_simp` itself (not `dt`) on the matching simplified splits.
train_mae_dt_simp = sm.mean_absolute_error(y_train,
                                           dt_simp.predict(X_train_simplified))

test_mae_dt_simp = sm.mean_absolute_error(y_test,
                                          dt_simp.predict(X_test_simplified))

print(f"The best training MAE is   : {round(train_mae_dt_simp,3)}")
print(f"The best testing MAE is    : {round(test_mae_dt_simp,3)}")

# `score` is R^2, shown as a percentage.
print(f"The training accuracy is   : {round(dt_simp.score(X_train_simplified, y_train)*100,4)}%")
print(f"The testing accuracy is    : {round(dt_simp.score(X_test_simplified, y_test)*100,4)}%")

Thus, we can see that feature selection alone does not solve the overfitting issue. Another accessible way to reduce overfitting is to use an ensemble model.

## Random Forest Regressor

### Default Model

In [None]:
# tree_para = {'max_depth':[25,50,75,100,125,150],
#              'max_features':['auto', 'sqrt', 'log2'],
#              'min_samples_split': [2, 5, 10],
#              'bootstrap':[True, False]
#              }

# clf_rf = GridSearchCV(RandomForestRegressor(random_state = 123), 
#                    tree_para, 
#                    cv=5)

# clf_rf.fit(X_train, y_train)

# train_mae_rf = sm.mean_absolute_error(y_train, 
#                                       clf_rf.best_estimator_.predict(X_train))

# test_mae_rf = sm.mean_absolute_error(y_test, 
#                                      clf_rf.best_estimator_.predict(X_test))

# print(f"The optimal parameter is   : {clf_rf.best_params_}")
# print(f"The best training MAE is   : {round(train_mae_rf,3)}")
# print(f"The best testing MAE is    : {round(test_mae_rf,3)}")

# print(f"The training accuracy is   : {round(clf_rf.score(X_train_simplified, y_train)*100,4)}%")
# print(f"The testing accuracy is    : {round(clf_rf.score(X_test_simplified, y_test)*100,4)}%")

# The optimal parameter is   : {'bootstrap': False, 'max_depth': 50, 'max_features': 'log2', 'min_samples_split': 5}

In [None]:
# Random forest on the full feature set, configured with the parameters
# recorded from the (commented-out) grid search above.
rf_settings = dict(random_state=123,
                   bootstrap=False,
                   max_depth=50,
                   max_features='log2',
                   min_samples_split=5)
rf = RandomForestRegressor(**rf_settings)
rf.fit(X_train, y_train)

# Mean absolute error on each split.
rf_train_pred = rf.predict(X_train)
rf_test_pred = rf.predict(X_test)
train_mae_rf = sm.mean_absolute_error(y_train, rf_train_pred)
test_mae_rf = sm.mean_absolute_error(y_test, rf_test_pred)

print(f"The best training MAE is   : {round(train_mae_rf, 3)}")
print(f"The best testing MAE is    : {round(test_mae_rf, 3)}")

# `score` is R^2, shown as a percentage.
rf_train_score = rf.score(X_train, y_train)
rf_test_score = rf.score(X_test, y_test)
print(f"The training accuracy is   : {round(rf_train_score*100, 4)}%")
print(f"The testing accuracy is    : {round(rf_test_score*100, 4)}%")

In [None]:
# Grid search (5-fold CV) over the number of boosting stages and tree depth
# for a gradient boosting regressor trained on the full feature set.
tree_para = {'n_estimators':[25,50,75,100],
             'max_depth':[25,50,75,100],
             }

clf_gbr = GridSearchCV(GradientBoostingRegressor(random_state = 123),
                   tree_para,
                   cv=5)

clf_gbr.fit(X_train, y_train)

# Evaluate on the same feature set the model was fitted on. The simplified
# frames have fewer columns and do not match the fitted estimator.
train_mae_gbr = sm.mean_absolute_error(y_train,
                                   clf_gbr.best_estimator_.predict(X_train))

test_mae_gbr = sm.mean_absolute_error(y_test,
                                  clf_gbr.best_estimator_.predict(X_test))

print(f"The optimal parameter is   : {clf_gbr.best_params_}")
print(f"The best training MAE is : {round(train_mae_gbr,3)}")
print(f"The best testing MAE is  : {round(test_mae_gbr,3)}")

In [None]:
# Grid search (5-fold CV) for an XGBoost regressor over the number of trees,
# the tree depth and the 'eta' setting.
tree_para = {
    'n_estimators': [25, 50, 75, 100],
    'max_depth': [25, 50, 75, 100],
    'eta': [0.01, 0.05],
}

xgb_base = XGBRegressor(random_state=123, verbosity=0)
clf_xgb = GridSearchCV(xgb_base, tree_para, cv=5)
clf_xgb.fit(X_train, y_train)

# Mean absolute error of the best estimator found by the search.
xgb_best = clf_xgb.best_estimator_
train_mae_xgb = sm.mean_absolute_error(y_train, xgb_best.predict(X_train))
test_mae_xgb = sm.mean_absolute_error(y_test, xgb_best.predict(X_test))

print(f"The optimal parameter is   : {clf_xgb.best_params_}")
print(f"The best training MAE is : {round(train_mae_xgb, 3)}")
print(f"The best testing MAE is  : {round(test_mae_xgb, 3)}")
