In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import keras_tuner
import keras

In [2]:
csv_file_path = './airbnb_dataset/milestone3.csv'
df = pd.read_csv(csv_file_path)
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,log_price,accommodates,bathrooms,host_response_rate,bedrooms,beds,cleaning_fee_True,host_has_profile_pic_t,host_identity_verified_t,instant_bookable_t,property_type_encoded,room_type_encoded,bed_type_encoded,cancellation_policy_encoded,city_encoded,des_sentiment_analysis,name_sentiment_analysis
0,5.010635,-0.072621,-0.404046,0.296013,-0.312048,-0.566461,True,True,True,False,-0.245313,0.876836,0.168067,1.021281,-0.434017,0.216242,-0.435248
1,5.129899,1.783653,-0.404046,0.296013,2.034955,1.027816,True,True,False,True,-0.245338,0.87683,0.168044,1.021245,-0.434042,0.837002,-1.788095
2,4.976734,0.855516,-0.404046,0.296013,-0.312048,1.027816,True,True,True,True,-0.245305,0.876838,0.168074,-0.593873,-0.434009,0.808508,-0.133695
3,6.620073,0.391448,-0.404046,0.296013,0.861454,0.230678,True,True,True,False,0.157557,0.876747,0.167757,-1.013554,2.675098,0.213627,-0.926212
4,4.744932,-0.536689,-0.404046,0.296013,-1.485549,-0.566461,True,True,True,True,-0.245255,0.876851,0.168119,-0.593754,1.412368,-0.101872,-0.095776


In [3]:
features = [col for col in df.columns if col != 'log_price' and df[col].dtype in ['int64', 'float64']]
X = df[features]
y = df['log_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Train With XGB

In [4]:
#Uncomment if first time run
# %pip install xgboost

In [5]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

#hyperparam may need here
param = {
    'max_depth': 5,
    'eta': 0.3,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse'
}


In [6]:
num_round = 100
model = xgb.train(param, dtrain, num_round, evals=[(dtrain, 'train')], early_stopping_rounds=10)

[0]	train-rmse:0.50961
[1]	train-rmse:0.36632
[2]	train-rmse:0.26731
[3]	train-rmse:0.19940
[4]	train-rmse:0.15314
[5]	train-rmse:0.12297
[6]	train-rmse:0.10383
[7]	train-rmse:0.09098
[8]	train-rmse:0.08274
[9]	train-rmse:0.07665
[10]	train-rmse:0.07200
[11]	train-rmse:0.06952
[12]	train-rmse:0.06581
[13]	train-rmse:0.05927
[14]	train-rmse:0.05595
[15]	train-rmse:0.05327
[16]	train-rmse:0.05019
[17]	train-rmse:0.04887
[18]	train-rmse:0.04657
[19]	train-rmse:0.04456
[20]	train-rmse:0.04345
[21]	train-rmse:0.04226
[22]	train-rmse:0.04142
[23]	train-rmse:0.03989
[24]	train-rmse:0.03841
[25]	train-rmse:0.03767
[26]	train-rmse:0.03656
[27]	train-rmse:0.03623
[28]	train-rmse:0.03469
[29]	train-rmse:0.03327
[30]	train-rmse:0.03242
[31]	train-rmse:0.03179
[32]	train-rmse:0.03110
[33]	train-rmse:0.03082
[34]	train-rmse:0.02975
[35]	train-rmse:0.02934
[36]	train-rmse:0.02877
[37]	train-rmse:0.02865
[38]	train-rmse:0.02779
[39]	train-rmse:0.02767
[40]	train-rmse:0.02738
[41]	train-rmse:0.02698
[4

In [7]:
# model = xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], early_stopping_rounds=10)

# Training error
y_pred = model.predict(dtrain)
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
print("Train RMSE: %f" % (rmse))
print("Train MSE: %f" % (mse))

# Test error
y_pred = model.predict(dtest)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Test RMSE: %f" % (rmse))
print("Test MSE: %f" % (mse))

Train RMSE: 0.015486
Train MSE: 0.000240
Test RMSE: 0.031724
Test MSE: 0.001006


## Train with LGBM

In [8]:
import lightgbm as lgb

# Prepare datasets
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Define parameters
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',  # or 'regression' for regression
    'metric': 'l2',  # or 'l2' for regression
    'num_leaves': 31,
    'learning_rate': 0.05
}

# Train model
num_round = 100
bst = lgb.train(params, train_data, num_round, valid_sets=[test_data], early_stopping_rounds=10)




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1855
[LightGBM] [Info] Number of data points in the train set: 59137, number of used features: 12
[LightGBM] [Info] Start training from score 4.781865
[1]	valid_0's l2: 0.474588
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l2: 0.428802
[3]	valid_0's l2: 0.387527
[4]	valid_0's l2: 0.350237
[5]	valid_0's l2: 0.316455
[6]	valid_0's l2: 0.285956
[7]	valid_0's l2: 0.258404
[8]	valid_0's l2: 0.233542
[9]	valid_0's l2: 0.211077
[10]	valid_0's l2: 0.190713
[11]	valid_0's l2: 0.172372
[12]	valid_0's l2: 0.155794
[13]	valid_0's l2: 0.140821
[14]	valid_0's l2: 0.127341
[15]	valid_0's l2: 0.11512
[16]	valid_0's l2: 0.104056
[17]	valid_0's l2: 0.0940511
[18]	valid_0's l2: 0.0850155
[19]	valid_0's l2: 0.0768436
[20]	valid_0's l2: 0.0695232
[21]	valid_0's l2: 0.0628523
[22]	valid_0's l2: 0.0568537
[23]	valid_0's l2: 0.0514

In [12]:
# Predictions
# Test error
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Test RMSE: %f" % (rmse))
print("Test MSE: %f" % (mse))

Test RMSE: 0.018050
Test MSE: 0.000326
