In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import keras_tuner
import keras

In [2]:
csv_file_path = './airbnb_dataset/milestone3.csv'
df = pd.read_csv(csv_file_path)
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,log_price,accommodates,bathrooms,host_response_rate,bedrooms,beds,cleaning_fee_True,host_has_profile_pic_t,host_identity_verified_t,instant_bookable_t,property_type_encoded,room_type_encoded,bed_type_encoded,cancellation_policy_encoded,city_encoded,des_sentiment_analysis,name_sentiment_analysis
0,5.010635,-0.072621,-0.404046,0.296013,-0.312048,-0.566461,1,1,1,0,-0.245313,0.876836,0.168067,1.021281,-0.434017,0.216242,-0.435248
1,5.129899,1.783653,-0.404046,0.296013,2.034955,1.027816,1,1,0,1,-0.245338,0.87683,0.168044,1.021245,-0.434042,0.837002,-1.788095
2,4.976734,0.855516,-0.404046,0.296013,-0.312048,1.027816,1,1,1,1,-0.245305,0.876838,0.168074,-0.593873,-0.434009,0.808508,-0.133695
3,6.620073,0.391448,-0.404046,0.296013,0.861454,0.230678,1,1,1,0,0.157557,0.876747,0.167757,-1.013554,2.675098,0.213627,-0.926212
4,4.744932,-0.536689,-0.404046,0.296013,-1.485549,-0.566461,1,1,1,1,-0.245255,0.876851,0.168119,-0.593754,1.412368,-0.101872,-0.095776


In [3]:
features = [col for col in df.columns if col != 'log_price' and df[col].dtype in ['int64', 'float64']]
X = df[features]
y = df['log_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Train With XGB

In [4]:
#Uncomment if first time run
#%pip install xgboost
import xgboost as xgb
from sklearn.metrics import mean_squared_error

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

#hyperparam may need here
param = {
    'max_depth': 5,
    'eta': 0.3,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse'
}


In [5]:
num_round = 100
model = xgb.train(param, dtrain, num_round, evals=[(dtrain, 'train')], early_stopping_rounds=10)
y_pred = model.predict(dtrain)

mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
print("Train RMSE: %f" % (rmse))
print("Train MSE: %f" % (mse))

[0]	train-rmse:0.50961
[1]	train-rmse:0.36632
[2]	train-rmse:0.26731
[3]	train-rmse:0.19940
[4]	train-rmse:0.15314
[5]	train-rmse:0.12297
[6]	train-rmse:0.10383
[7]	train-rmse:0.09098
[8]	train-rmse:0.08274
[9]	train-rmse:0.07665
[10]	train-rmse:0.07200
[11]	train-rmse:0.06952
[12]	train-rmse:0.06581
[13]	train-rmse:0.05927
[14]	train-rmse:0.05595
[15]	train-rmse:0.05327
[16]	train-rmse:0.05019
[17]	train-rmse:0.04887
[18]	train-rmse:0.04657
[19]	train-rmse:0.04456
[20]	train-rmse:0.04345
[21]	train-rmse:0.04226
[22]	train-rmse:0.04142
[23]	train-rmse:0.03989
[24]	train-rmse:0.03841
[25]	train-rmse:0.03767
[26]	train-rmse:0.03656
[27]	train-rmse:0.03623
[28]	train-rmse:0.03469
[29]	train-rmse:0.03327
[30]	train-rmse:0.03255
[31]	train-rmse:0.03113
[32]	train-rmse:0.03093
[33]	train-rmse:0.03066
[34]	train-rmse:0.03012
[35]	train-rmse:0.02890
[36]	train-rmse:0.02830
[37]	train-rmse:0.02731
[38]	train-rmse:0.02711
[39]	train-rmse:0.02680
[40]	train-rmse:0.02648
[41]	train-rmse:0.02629
[4

In [6]:
model = xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], early_stopping_rounds=10)

y_pred = model.predict(dtest)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Test RMSE: %f" % (rmse))
print("Test MSE: %f" % (mse))

[0]	test-rmse:0.51634
[1]	test-rmse:0.37144
[2]	test-rmse:0.27113
[3]	test-rmse:0.20207
[4]	test-rmse:0.15424
[5]	test-rmse:0.12247
[6]	test-rmse:0.10135
[7]	test-rmse:0.08746
[8]	test-rmse:0.07965
[9]	test-rmse:0.07461
[10]	test-rmse:0.07116
[11]	test-rmse:0.06931
[12]	test-rmse:0.06680
[13]	test-rmse:0.05975
[14]	test-rmse:0.05648
[15]	test-rmse:0.05496
[16]	test-rmse:0.05259
[17]	test-rmse:0.05181
[18]	test-rmse:0.05007
[19]	test-rmse:0.04902
[20]	test-rmse:0.04801
[21]	test-rmse:0.04739
[22]	test-rmse:0.04735
[23]	test-rmse:0.04615
[24]	test-rmse:0.04505
[25]	test-rmse:0.04449
[26]	test-rmse:0.04441
[27]	test-rmse:0.04412
[28]	test-rmse:0.04267
[29]	test-rmse:0.04129
[30]	test-rmse:0.04085
[31]	test-rmse:0.03975
[32]	test-rmse:0.03980
[33]	test-rmse:0.03971
[34]	test-rmse:0.03973
[35]	test-rmse:0.03947
[36]	test-rmse:0.03943
[37]	test-rmse:0.03894
[38]	test-rmse:0.03881
[39]	test-rmse:0.03865
[40]	test-rmse:0.03852
[41]	test-rmse:0.03826
[42]	test-rmse:0.03796
[43]	test-rmse:0.0374