In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction import FeatureHasher

from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
data = pd.read_csv("data_processed.csv")

In [3]:
pd.set_option('display.max_columns', 50)
data

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,neighbourhood_group_Staten Island,neighbourhood_1,neighbourhood_2,neighbourhood_3
0,40.64749,-73.97237,149,1,9,0.21,6,365,0,1,0,0,1,0,0,0,0.0,0.0,1.0
1,40.75362,-73.98377,225,1,45,0.38,2,355,1,0,0,0,0,1,0,0,0.0,0.0,1.0
2,40.80902,-73.94190,150,3,0,0.00,1,365,0,1,0,0,0,1,0,0,-1.0,0.0,0.0
3,40.68514,-73.95976,89,1,270,4.64,1,194,1,0,0,0,1,0,0,0,1.0,0.0,0.0
4,40.79851,-73.94399,80,10,9,0.10,1,0,1,0,0,0,0,1,0,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47425,40.67853,-73.94995,70,2,0,0.00,2,9,0,1,0,0,1,0,0,0,1.0,0.0,0.0
47426,40.70184,-73.93317,40,4,0,0.00,2,36,0,1,0,0,1,0,0,0,0.0,-1.0,0.0
47427,40.81475,-73.94867,115,10,0,0.00,1,27,1,0,0,0,0,1,0,0,-1.0,0.0,0.0
47428,40.75751,-73.99112,55,1,0,0.00,6,2,0,0,1,0,0,1,0,0,-1.0,0.0,0.0


### Adaboost

In [5]:
#split the data
X = data.select_dtypes(exclude='object').drop(['price'], axis=1)
y = data['price'].to_list()

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the AdaBoost classifier and set the hyperparameters
adaboost = AdaBoostRegressor(estimator=DecisionTreeRegressor(max_depth = 5, min_samples_split = 4, min_samples_leaf = 12),
                             n_estimators=50,
                             learning_rate=0.01,
                             loss = 'linear',
                             random_state=42)

# Fit the model to the training data
adaboost.fit(X_train, y_train)

# Evaluate the performance of the model on the testing data
y_pred = adaboost.predict(X_test)

#calculate performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("Mean Absolute Error (MAE): {:.2f}".format(mae))
print("Root Mean Squared Error (RMSE): {:.2f}".format(rmse))
print("R^2 score: {:.2f}".format(r2))

Mean Absolute Error (MAE): 41.47
Root Mean Squared Error (RMSE): 59.79
R^2 score: 0.47


#### Random search

In [10]:
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
  'n_estimators': [50, 100],
  'learning_rate' : [0.01,0.05,0.1,0.3,1],
  'loss' : ['linear', 'square', 'exponential'],
  'base_estimator__max_depth': [2, 3, 4, 5],
  'base_estimator__min_samples_split': [2, 3, 4],
  'base_estimator__min_samples_leaf' : [10, 15, 20]
 }

pre_gs_inst = RandomizedSearchCV(AdaBoostRegressor(base_estimator=DecisionTreeRegressor(random_state=42)),
 param_distributions = param_dist,
 cv=3,
 n_iter = 10,
 n_jobs=-1)

pre_gs_inst.fit(X_train, y_train)



In [12]:
pre_gs_inst.best_params_

{'n_estimators': 50,
 'loss': 'exponential',
 'learning_rate': 0.01,
 'base_estimator__min_samples_split': 2,
 'base_estimator__min_samples_leaf': 15,
 'base_estimator__max_depth': 5}

Random search tests a random set of hyperparameters from the search space. In this context, it is prefered to grid search cv, as we care about computation time and as the number of hyperparameters to test and data size are important (grid search cv takes hours to run when we tried implementing it). It points towards a possible combination of optimal hyperparameters, but it may not be the optimal one. We started by inputing the hyperparameters found by the random search and modified some of them by hand until we could observe the lowest RMSE. This is why the output of the random search and the final model's hyperparameters do not math perfectly. 