In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import precision_recall_curve

import sklearn.metrics as sm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.dummy import DummyRegressor

from sklearn.tree import DecisionTreeRegressor, export_graphviz

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from imblearn.under_sampling import RandomUnderSampler

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR

from xgboost import XGBRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the prepared dataset; the first CSV column is used as the index.
df = pd.read_csv('Data/df_final.csv', index_col=0)

# Remove the columns that hold only one value and the categorical
# variables that have too many distinct values.
columns_to_drop = ['israteperstay', 'ratetype', 'city']
df = df.drop(columns=columns_to_drop)

# Turn the 'Y' / not-'Y' promo flag into a boolean column.
df['ispromo'] = df['ispromo'].eq('Y')

# One-hot encode the remaining categorical features one at a time, using
# the column name itself as the prefix of the generated dummy columns.
for categorical in ('propertytype', 'roomtype', 'country'):
    df = pd.get_dummies(df, prefix=[categorical], columns=[categorical])

# Multiplying by 1 converts the boolean columns into integers.
df = df * 1

In [3]:
# Separate the target ('price') from the predictors with a column mask.
# Because the target is selected by mask, y_dev is a single-column
# DataFrame rather than a Series.
target_mask = df.columns == 'price'
X_dev = df.loc[:, ~target_mask]
y_dev = df.loc[:, target_mask]

# Hold out 20% of the rows as test data (80% dev / 20% test); the fixed
# random_state keeps the split reproducible.
splits = train_test_split(X_dev, y_dev, test_size=0.2, random_state=123)
X_train, X_test, y_train, y_test = splits

In [4]:
# Baseline model: always predicts the median of the training target.
# Its MAE is the reference point for the more advanced models.
dummy_regr = DummyRegressor(strategy="median")
dummy_regr.fit(X_train, y_train)

# Predict once per split, then score each prediction with MAE.
baseline_train_pred = dummy_regr.predict(X_train)
baseline_test_pred = dummy_regr.predict(X_test)

train_mae = sm.mean_absolute_error(y_train, baseline_train_pred)
test_mae = sm.mean_absolute_error(y_test, baseline_test_pred)

print(f"The best training MAE is : {round(train_mae,3)}")
print(f"The best testing MAE is  : {round(test_mae,3)}")

The best training MAE is : 63.856
The best testing MAE is  : 63.046


In [5]:
# Candidate hyper-parameters for a single decision tree.
tree_para = dict(max_depth=[40, 50, 60, 70, 80],
                 max_features=[30, 40, 50, 60, 70, 80])

# Exhaustive 5-fold search over the grid. No `scoring` is passed, so the
# candidates are ranked with the estimator's own default score.
clf_dt = GridSearchCV(estimator=DecisionTreeRegressor(random_state=123),
                      param_grid=tree_para,
                      cv=5)
clf_dt.fit(X_train, y_train)

# Evaluate the selected tree with MAE on both splits.
best_tree = clf_dt.best_estimator_
train_mae = sm.mean_absolute_error(y_train, best_tree.predict(X_train))
test_mae = sm.mean_absolute_error(y_test, best_tree.predict(X_test))

print(f"The optimal parameter is   : {clf_dt.best_params_}")
print(f"The best training MAE is : {round(train_mae,3)}")
print(f"The best testing MAE is  : {round(test_mae,3)}")

The optimal parameter is   : {'max_depth': 40, 'max_features': 30}
The best training MAE is : 9.165
The best testing MAE is  : 24.758


In [22]:
# Importances of the tree selected by the grid search stored in `clf_dt`.
feature_importance = clf_dt.best_estimator_.feature_importances_

# Guard against hidden notebook state: a later cell refits `clf_dt` on the
# reduced feature set, so re-running this cell afterwards would pair the
# importances with the wrong columns of X_train.
if len(feature_importance) != X_train.shape[1]:
    raise ValueError(
        "clf_dt was not fitted on X_train: got "
        f"{len(feature_importance)} importances for {X_train.shape[1]} columns. "
        "Re-run the grid search on the full feature set first."
    )

# Number of features to discard.
k = 10

# After partitioning, the first k entries of idx are the positions of the
# k smallest importances (in no particular order).
idx = np.argpartition(feature_importance, k)

# drop the k least important features (selected once, applied to both
# splits so that train and test keep identical columns)
least_important = X_train.columns[idx[:k]]

X_train_simplified = X_train.drop(columns=least_important)

X_test_simplified = X_test.drop(columns=least_important)


In [23]:
# Same decision-tree grid as before, now searched on the reduced feature set.
tree_para = dict(max_depth=[40, 50, 60, 70, 80],
                 max_features=[30, 40, 50, 60, 70, 80])

# 5-fold grid search; note that this rebinds `clf_dt` to a search fitted
# on X_train_simplified.
clf_dt = GridSearchCV(estimator=DecisionTreeRegressor(random_state=123),
                      param_grid=tree_para,
                      cv=5)
clf_dt.fit(X_train_simplified, y_train)

# Evaluate the selected tree with MAE on both (simplified) splits.
best_tree = clf_dt.best_estimator_
train_mae = sm.mean_absolute_error(y_train,
                                   best_tree.predict(X_train_simplified))
test_mae = sm.mean_absolute_error(y_test,
                                  best_tree.predict(X_test_simplified))

print(f"The optimal parameter is   : {clf_dt.best_params_}")
print(f"The best training MAE is : {round(train_mae,3)}")
print(f"The best testing MAE is  : {round(test_mae,3)}")

The optimal parameter is   : {'max_depth': 50, 'max_features': 60}
The best training MAE is : 9.065
The best testing MAE is  : 24.519


In [24]:
# Hyper-parameter grid for the random forest.
# 'auto' is replaced by 1.0: for forest regressors 'auto' only meant
# "use all features", it was deprecated in scikit-learn 1.1 and is rejected
# by later releases. 1.0 (100% of the features) is the equivalent setting.
tree_para = {'max_depth':[40,50,60,70,80],
             'max_features':[1.0, 'sqrt', 'log2'],
             'bootstrap':[True, False]
             }

# 5-fold grid search. The grid is 5 * 3 * 2 = 30 candidates (150 forest
# fits) and the previous run of this cell was interrupted, so the fits are
# spread over all available cores.
clf = GridSearchCV(RandomForestRegressor(random_state = 123), 
                   tree_para, 
                   cv=5,
                   n_jobs=-1)

# np.ravel hands the forest a 1-D target even when y_train is a
# single-column DataFrame.
clf.fit(X_train_simplified, np.ravel(y_train))

# Evaluate the selected forest with MAE on both (simplified) splits.
train_mae = sm.mean_absolute_error(y_train, 
                                   clf.best_estimator_.predict(X_train_simplified))

test_mae = sm.mean_absolute_error(y_test, 
                                  clf.best_estimator_.predict(X_test_simplified))

print(f"The optimal parameter is   : {clf.best_params_}")
print(f"The best training MAE is : {round(train_mae,3)}")
print(f"The best testing MAE is  : {round(test_mae,3)}")

KeyboardInterrupt: 

In [None]:
# Hyper-parameter grid for gradient boosting.
tree_para = {'n_estimators':[50,100,150,200],
             'max_depth':[20,30,40,50],
             }

# 5-fold grid search over the grid above.
clf = GridSearchCV(GradientBoostingRegressor(random_state = 123), 
                   tree_para, 
                   cv=5)

# np.ravel hands the booster a 1-D target even when y_train is a
# single-column DataFrame.
clf.fit(X_train, np.ravel(y_train))

# Report MAE on both splits, like the other model cells, so the results are
# directly comparable with the baseline. (The previous version printed
# `best_score_`, which is the mean cross-validated score, under the label
# "training score".)
train_mae = sm.mean_absolute_error(y_train, 
                                   clf.best_estimator_.predict(X_train))

test_mae = sm.mean_absolute_error(y_test, 
                                  clf.best_estimator_.predict(X_test))

print(f"The optimal parameter is   : {clf.best_params_}")
print(f"The best training MAE is : {round(train_mae,3)}")
print(f"The best testing MAE is  : {round(test_mae,3)}")

In [None]:
# Hyper-parameter grid for XGBoost ('eta' is the step-size / learning-rate
# parameter).
tree_para = {'n_estimators':[100, 150, 200, 250], 
             'max_depth':[10,12,15,20,25], 
             'eta':[0.01, 0.02, 0.05, 0.1]
             }

# Only the class is imported (`from xgboost import XGBRegressor`), so the
# module name `xgboost` is not bound in this notebook; reference the class
# directly instead of `xgboost.XGBRegressor`, which raised NameError.
clf = GridSearchCV(XGBRegressor(random_state = 123,
                                verbosity = 0), 
                   tree_para, 
                   cv=5)

clf.fit(X_train, y_train)

# Evaluate the selected model with MAE on both splits.
train_mae = sm.mean_absolute_error(y_train, 
                                   clf.best_estimator_.predict(X_train))

test_mae = sm.mean_absolute_error(y_test, 
                                  clf.best_estimator_.predict(X_test))

print(f"The optimal parameter is   : {clf.best_params_}")
print(f"The best training MAE is : {round(train_mae,3)}")
print(f"The best testing MAE is  : {round(test_mae,3)}")
