In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import keras_tuner
import keras

In [2]:
csv_file_path = './airbnb_dataset/milestone3.csv'
df = pd.read_csv(csv_file_path)
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,log_price,accommodates,bathrooms,host_response_rate,bedrooms,beds,cleaning_fee_True,host_has_profile_pic_t,host_identity_verified_t,instant_bookable_t,property_type_encoded,room_type_encoded,bed_type_encoded,cancellation_policy_encoded,city_encoded,des_sentiment_analysis,name_sentiment_analysis
0,5.010635,-0.072621,-0.404046,0.296013,-0.312048,-0.566461,1,1,1,0,-0.221989,0.879947,0.206922,1.031946,-0.417068,0.216242,-0.435248
1,5.129899,1.783653,-0.404046,0.296013,2.034955,1.027816,1,1,0,1,-0.248121,0.883912,0.178912,1.034251,-0.425829,0.837002,-1.788095
2,4.976734,0.855516,-0.404046,0.296013,-0.312048,1.027816,1,1,1,1,-0.252342,0.869698,0.126796,-0.607644,-0.445747,0.808508,-0.133695
3,6.620073,0.391448,-0.404046,0.296013,0.861454,0.230678,1,1,1,0,0.187295,0.883912,0.178912,-0.976832,2.696289,0.213627,-0.926212
4,4.744932,-0.536689,-0.404046,0.296013,-1.485549,-0.566461,1,1,1,1,-0.242856,0.875005,0.145414,-0.634193,1.380088,-0.101872,-0.095776


In [3]:
features = [col for col in df.columns if col != 'log_price' and df[col].dtype in ['int64', 'float64']]
X = df[features]
y = df['log_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Train With XGB

In [4]:
#Uncomment if first time run
# %pip install xgboost

In [5]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

#hyperparam may need here
param = {
    'max_depth': 5,
    'eta': 0.3,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse'
}


In [6]:
num_round = 100
model = xgb.train(param, dtrain, num_round, evals=[(dtrain, 'train')], early_stopping_rounds=10)

[0]	train-rmse:0.59756
[1]	train-rmse:0.52790
[2]	train-rmse:0.48754
[3]	train-rmse:0.46457
[4]	train-rmse:0.45096
[5]	train-rmse:0.44295
[6]	train-rmse:0.43715
[7]	train-rmse:0.43387
[8]	train-rmse:0.43124
[9]	train-rmse:0.42862
[10]	train-rmse:0.42670
[11]	train-rmse:0.42536
[12]	train-rmse:0.42425
[13]	train-rmse:0.42332
[14]	train-rmse:0.42223
[15]	train-rmse:0.42129
[16]	train-rmse:0.42052
[17]	train-rmse:0.41994
[18]	train-rmse:0.41965
[19]	train-rmse:0.41877
[20]	train-rmse:0.41845
[21]	train-rmse:0.41785
[22]	train-rmse:0.41738
[23]	train-rmse:0.41666
[24]	train-rmse:0.41623
[25]	train-rmse:0.41577
[26]	train-rmse:0.41535
[27]	train-rmse:0.41504
[28]	train-rmse:0.41462
[29]	train-rmse:0.41409
[30]	train-rmse:0.41378
[31]	train-rmse:0.41342
[32]	train-rmse:0.41310
[33]	train-rmse:0.41287
[34]	train-rmse:0.41255
[35]	train-rmse:0.41224
[36]	train-rmse:0.41183
[37]	train-rmse:0.41136
[38]	train-rmse:0.41083
[39]	train-rmse:0.41066
[40]	train-rmse:0.41045
[41]	train-rmse:0.41029
[4

In [7]:
# model = xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], early_stopping_rounds=10)

# Training error
y_pred = model.predict(dtrain)
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
print("Train RMSE: %f" % (rmse))
print("Train MSE: %f" % (mse))

# Test error
y_pred = model.predict(dtest)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Test RMSE: %f" % (rmse))
print("Test MSE: %f" % (mse))

Train RMSE: 0.396731
Train MSE: 0.157395
Test RMSE: 0.426739
Test MSE: 0.182106


## Train with LGBM

In [9]:
import lightgbm as lgb

# Prepare datasets
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Define parameters
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',  # or 'regression' for regression
    'metric': 'l2',  # or 'l2' for regression
    'num_leaves': 31,
    'learning_rate': 0.05
}

# Train model
num_round = 100
bst = lgb.train(params, train_data, num_round, valid_sets=[test_data], callbacks=[lgb.early_stopping(10)])


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002393 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 820
[LightGBM] [Info] Number of data points in the train set: 59137, number of used features: 16
[LightGBM] [Info] Start training from score 4.781865
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.183496


In [10]:
# Predictions
# Test error
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Test RMSE: %f" % (rmse))
print("Test MSE: %f" % (mse))

Test RMSE: 0.428365
Test MSE: 0.183496
