In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import keras_tuner
import keras

In [8]:
csv_file_path = './airbnb_dataset/milestone3.csv'
df = pd.read_csv(csv_file_path)
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,log_price,accommodates,bathrooms,host_response_rate,bedrooms,beds,cleaning_fee_True,host_has_profile_pic_t,host_identity_verified_t,instant_bookable_t,property_type_encoded,room_type_encoded,bed_type_encoded,cancellation_policy_encoded,city_encoded,des_sentiment_analysis,name_sentiment_analysis
0,5.010635,-0.072621,-0.404046,0.296013,-0.312048,-0.566461,1,1,1,0,-0.221989,0.879947,0.206922,1.031946,-0.417068,0.216242,-0.435248
1,5.129899,1.783653,-0.404046,0.296013,2.034955,1.027816,1,1,0,1,-0.248121,0.883912,0.178912,1.034251,-0.425829,0.837002,-1.788095
2,4.976734,0.855516,-0.404046,0.296013,-0.312048,1.027816,1,1,1,1,-0.252342,0.869698,0.126796,-0.607644,-0.445747,0.808508,-0.133695
3,6.620073,0.391448,-0.404046,0.296013,0.861454,0.230678,1,1,1,0,0.187295,0.883912,0.178912,-0.976832,2.696289,0.213627,-0.926212
4,4.744932,-0.536689,-0.404046,0.296013,-1.485549,-0.566461,1,1,1,1,-0.242856,0.875005,0.145414,-0.634193,1.380088,-0.101872,-0.095776


In [9]:
features = [col for col in df.columns if col != 'log_price' and df[col].dtype in ['int64', 'float64']]
X = df[features]
y = df['log_price']

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42)

### Train With XGB

In [10]:
#Uncomment if first time run
# %pip install xgboost

In [11]:
# import xgboost as xgb
# from sklearn.metrics import mean_squared_error

# dtrain = xgb.DMatrix(X_train, label=y_train)
# dval = xgb.DMatrix(X_val, label=y_val)
# dtest = xgb.DMatrix(X_test, label=y_test)

# #hyperparam may need here
# param = {
#     'max_depth': 6,
#     'eta': 0.1,
#     'objective': 'reg:squarederror',
#     'eval_metric': 'rmse'
# }


In [13]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Define a wide range of hyperparameters for tuning
param_grid = {
    'max_depth': [3, 4, 5, 6, 7],
    'eta': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
}

# Create an XGBoost regressor object
xgb_model = xgb.XGBRegressor(n_estimators=250, random_state=42,objective='reg:squarederror', eval_metric='rmse',early_stopping_rounds=10)

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=False, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train_full, y_train_full, eval_set=[(X_val, y_val)], verbose=False)

# Retrieve the best model
best_model = grid_search.best_estimator_

# Make predictions with the best model
y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

# Calculate the metrics
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

print("Best parameters found: ", grid_search.best_params_)
print("Train MSE: ", mse_train)
print("Test MSE: ", mse_test)


Best parameters found:  {'colsample_bytree': 0.6, 'eta': 0.05, 'max_depth': 7, 'subsample': 0.8}
Train MSE:  0.1490166415744588
Test MSE:  0.17929578089980588


In [14]:
# num_round = 150
# evals = [(dtrain, 'train'), (dval, 'eval')]
# model = xgb.train(param, dtrain, num_round, evals=evals, early_stopping_rounds=10)

In [None]:
# # model = xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], early_stopping_rounds=10)

# # Training error
# y_pred_train = model.predict(dtrain)
# mse_train = mean_squared_error(y_train, y_pred_train)
# rmse_train = np.sqrt(mse_train)
# print("Train RMSE: %f" % rmse_train)
# print("Train MSE: %f" % mse_train)

# # Test error
# y_pred_test = model.predict(dtest)
# mse_test = mean_squared_error(y_test, y_pred_test)
# rmse_test = np.sqrt(mse_test)
# print("Test RMSE: %f" % rmse_test)
# print("Test MSE: %f" % mse_test)

Train RMSE: 0.397022
Train MSE: 0.157627
Test RMSE: 0.425769
Test MSE: 0.181279


## Train with LGBM

In [None]:
import lightgbm as lgb

# Prepare datasets
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_val, label=y_val)


# Define parameters
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',  # or 'regression' for regression
    'metric': 'l2',  # or 'l2' for regression
    'num_leaves': 31,
    'learning_rate': 0.05
}

# Train model
num_round = 100
bst = lgb.train(params, train_data, num_round, valid_sets=[valid_data], callbacks=[lgb.early_stopping(10)])


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001862 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 811
[LightGBM] [Info] Number of data points in the train set: 44352, number of used features: 16
[LightGBM] [Info] Start training from score 4.784914
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.179938


In [None]:
# Predictions
# Test error
y_pred_test = bst.predict(X_test, num_iteration=bst.best_iteration)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = np.sqrt(mse_test)
print("Test RMSE: %f" % rmse_test)
print("Test MSE: %f" % mse_test)

Test RMSE: 0.428728
Test MSE: 0.183807


### XGBoost Model K-Fold Cross-Validation

We performed K-fold cross-validation on our dataset using the XGBoost algorithm to predict Airbnb listing prices. This approach helps us to understand how well our model generalizes on unseen data by dividing the dataset into `k` distinct subsets (or folds), then iteratively training the model on `k-1` subsets while using the remaining subset for validation. The process is repeated `k` times, with each subset serving as the validation set exactly once.

#### Cross-Validation Parameters
- **Objective**: Regression with squared error.
- **Max Depth**: 5 layers to control the complexity of the model.
- **Eta (Learning Rate)**: 0.3 to control the model's learning rate.
- **Evaluation Metric**: Root Mean Squared Error (RMSE), a standard metric for regression tasks.

#### Results
After performing 5-fold cross-validation, the model demonstrated the following results:
- **Mean RMSE**: 0.42461, indicating the average error across all folds.
- **Standard Deviation of RMSE**: 0.00387, showing the variability in the RMSE across folds. This low standard deviation suggests that the model's performance is relatively consistent across different subsets of the data.

The detailed training process showed a steady decrease in RMSE from the initial rounds to the final iteration.The XGBoost model's performance, as evidenced by K-fold cross-validation, suggests it is a reliable and consistent approach for predicting Airbnb listing prices. The relatively low and stable RMSE across folds signifies good model generalization. Further optimization and testing may refine the model, potentially leading to even more accurate predictions.