# Import data

https://www.kaggle.com/questions-and-answers/214333


In [None]:
# Upload the Kaggle API token (kaggle.json) into the Colab working directory.
from google.colab import files

# Bind the result instead of leaving it as the cell's last expression: the
# returned dict maps file names to the raw file bytes, so displaying it would
# write the API key into the saved notebook output.
uploaded = files.upload()
print("Uploaded:", ", ".join(uploaded))

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"yunqiuqiu","key":"<REDACTED - revoke this key and generate a new one>"}'}

In [None]:
!pip install opendatasets --upgrade --quiet


In [None]:
import opendatasets as od

# Kaggle page of the dataset to fetch
dataset_url = 'https://www.kaggle.com/datasets/dgomonov/new-york-city-airbnb-open-data'
# Per the recorded cell output, the archive lands in ./new-york-city-airbnb-open-data
od.download(dataset_url)

Downloading new-york-city-airbnb-open-data.zip to ./new-york-city-airbnb-open-data


100%|██████████| 2.44M/2.44M [00:00<00:00, 3.99MB/s]







# Data preprocessing

In [None]:
# Load necessary library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')
plt.style.use('seaborn')

from wordcloud import WordCloud

%matplotlib inline

# set default plot size
plt.rcParams["figure.figsize"] = (15,8)

In [None]:
# Load and preview data
# The path is relative to the working directory, i.e. the folder that
# od.download() created, so the notebook does not depend on Colab's /content.
data_path = os.path.join("new-york-city-airbnb-open-data", "AB_NYC_2019.csv")
ab_nyc = pd.read_csv(data_path)
ab_nyc.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [None]:
# drop the listing/host id and name columns
# Rebinding with errors='ignore' (instead of inplace=True) lets this cell be
# re-run without a KeyError once the columns are already gone.
ab_nyc = ab_nyc.drop(columns=['id', 'name', 'host_id', 'host_name'], errors='ignore')
ab_nyc.describe()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48895.0,48895.0,48895.0,48895.0,38843.0,48895.0,48895.0
mean,40.728949,-73.95217,152.720687,7.029962,23.274466,1.373221,7.143982,112.781327
std,0.05453,0.046157,240.15417,20.51055,44.550582,1.680442,32.952519,131.622289
min,40.49979,-74.24442,0.0,1.0,0.0,0.01,1.0,0.0
25%,40.6901,-73.98307,69.0,1.0,1.0,0.19,1.0,0.0
50%,40.72307,-73.95568,106.0,3.0,5.0,0.72,1.0,45.0
75%,40.763115,-73.936275,175.0,5.0,24.0,2.02,2.0,227.0
max,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0


In [None]:
# Number of missing values in each column
ab_nyc.isna().sum()

neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

Next, we **remove the outliers** in the **price and minimum_nights** columns. We calculate the z-score for both columns and remove all records whose absolute z-score is 3 or greater.

In [None]:
# compute z-scores used to remove outliers in the price and minimum nights columns

from scipy import stats

# store the absolute z-score of each column next to the raw values
for score_col, raw_col in (('z_price', 'price'), ('z_min_nights', 'minimum_nights')):
    ab_nyc[score_col] = np.abs(stats.zscore(ab_nyc[raw_col]))
ab_nyc.head()

Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,z_price,z_min_nights
0,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365,0.015493,0.293996
1,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,0.300974,0.293996
2,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365,0.011329,0.196484
3,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194,0.265335,0.293996
4,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0,0.302811,0.144807


In [None]:
# keep only the rows whose price AND minimum_nights z-scores are both below 3

# Both masks come from ab_nyc and are applied to ab_nyc in a single step, so
# the mask and the frame always share the same index (no implicit reindexing).
# .copy() yields an independent frame, so the columns added to ab_nyc_final in
# later cells are not written to a slice of ab_nyc.
within_three_sigma = (ab_nyc['z_price'] < 3) & (ab_nyc['z_min_nights'] < 3)
ab_nyc_final = ab_nyc[within_three_sigma].copy()

For the columns that are **highly skewed**, for example **minimum_nights and calculated_host_listings_count**, we convert them into categorical variables.

Based on the summary statistics, the 25th percentile of minimum_nights is 1, the median is 3 and the 75th percentile is 5, so a reasonable categorization is one night, two nights, three nights, four nights, and five nights or more. We then check that each group has enough values and that they are reasonably evenly distributed, using groupby() and size().

In [None]:
# convert numeric variables into categorical variables

# Build the label column in one vectorised step. The previous form,
# df[col][mask] = value, is chained indexing: pandas may write to a temporary
# copy, so the assignment is not guaranteed to reach ab_nyc_final.
nights = ab_nyc_final['minimum_nights']
ab_nyc_final['minimum_nights_group'] = np.select(
    [nights == 1, nights == 2, nights == 3, nights == 4, nights > 4],
    ['one night', 'two nights', 'three nights', 'four nights', 'five nights or more'],
    default='Others',  # any value not matched above (e.g. below 1)
)

In [None]:
# ab_nyc_final.groupby('minimum_nights_group').size()

# Bucket the number of listings per host into three labels. This is done with
# one vectorised assignment rather than chained indexing (df[col][mask] = v),
# which may write to a temporary copy instead of ab_nyc_final.
listings = ab_nyc_final['calculated_host_listings_count']
ab_nyc_final['calculated_host_listings_count_group'] = np.select(
    [listings == 1, listings == 2, listings > 2],
    ['one listing', 'two listings', 'more than two listings'],
    default='Others',  # any value not matched above (e.g. below 1)
)

In [None]:
# remove the helper z-score columns and the raw columns that are no longer used
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable after the columns have already been removed.
unused_columns = ['z_price', 'z_min_nights', 'minimum_nights', 'last_review', 'neighbourhood',
                  'calculated_host_listings_count', 'reviews_per_month']
ab_nyc_final = ab_nyc_final.drop(columns=unused_columns, errors='ignore')
ab_nyc_final.head()

Unnamed: 0,neighbourhood_group,latitude,longitude,room_type,price,number_of_reviews,availability_365,minimum_nights_group,calculated_host_listings_count_group
0,Brooklyn,40.64749,-73.97237,Private room,149,9,365,one night,more than two listings
1,Manhattan,40.75362,-73.98377,Entire home/apt,225,45,355,one night,two listings
2,Manhattan,40.80902,-73.9419,Private room,150,0,365,three nights,one listing
3,Brooklyn,40.68514,-73.95976,Entire home/apt,89,270,194,one night,one listing
4,Manhattan,40.79851,-73.94399,Entire home/apt,80,9,0,five nights or more,one listing


In [None]:
# modelling frame: everything in ab_nyc_final except the coordinates
ab_nyc_model = ab_nyc_final.drop(columns=['latitude', 'longitude'])
ab_nyc_model.head()

Unnamed: 0,neighbourhood_group,room_type,price,number_of_reviews,availability_365,minimum_nights_group,calculated_host_listings_count_group
0,Brooklyn,Private room,149,9,365,one night,more than two listings
1,Manhattan,Entire home/apt,225,45,355,one night,two listings
2,Manhattan,Private room,150,0,365,three nights,one listing
3,Brooklyn,Entire home/apt,89,270,194,one night,one listing
4,Manhattan,Entire home/apt,80,9,0,five nights or more,one listing


In [None]:
# Building the model
# first convert categorical variables to dummy variables using one hot encoding

categorical_var = ['neighbourhood_group','room_type','minimum_nights_group','calculated_host_listings_count_group']

# One get_dummies call replaces each listed column with indicator columns named
# "<column>_<level>", placed after the untouched columns. This gives the same
# layout as concatenating and dropping one variable at a time.
ab_nyc_model = pd.get_dummies(ab_nyc_model, columns=categorical_var)

ab_nyc_model.head()

Unnamed: 0,price,number_of_reviews,availability_365,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,neighbourhood_group_Staten Island,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,minimum_nights_group_five nights or more,minimum_nights_group_four nights,minimum_nights_group_one night,minimum_nights_group_three nights,minimum_nights_group_two nights,calculated_host_listings_count_group_more than two listings,calculated_host_listings_count_group_one listing,calculated_host_listings_count_group_two listings
0,149,9,365,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0
1,225,45,355,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1
2,150,0,365,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0
3,89,270,194,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0
4,80,9,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0


In [None]:
# predictors: every column except the target; target: price cast to float
x = ab_nyc_model.drop(columns=['price'])
y = ab_nyc_model['price'].astype(float)

# hold out 30% of the rows as the test set (fixed seed for a repeatable split)
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=42)

# shapes of the train parts first, then the test parts
for part in (train_x, train_y, test_x, test_y):
    print(part.shape)

(33737, 18)
(33737,)
(14460, 18)
(14460,)


# Random Forest

In [None]:
# fit a 100-tree random forest with a fixed seed; fit() returns the estimator itself
rf_regressor = RandomForestRegressor(random_state=0, n_estimators=100).fit(train_x, train_y)
rf_regressor

In [None]:
# R^2 on the training rows alone overstates performance, so also report the
# score on the held-out test split.
{
    'train_r2': rf_regressor.score(train_x, train_y),
    'test_r2': rf_regressor.score(test_x, test_y),
}

0.7416102080516221

# Bagging

https://www.kaggle.com/code/olgabelitskaya/regressors/notebook

In [None]:
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.ensemble import BaggingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor,RadiusNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# Baseline bagging model with default hyperparameters.
# random_state is fixed so the bootstrap samples, and therefore the scores
# reported in later cells, are reproducible between runs.
ba_regressor = BaggingRegressor(random_state=0)
ba_regressor.fit(train_x,train_y)

In [None]:
# Report R^2 on both the training rows and the held-out test split; the
# training score alone says nothing about generalisation.
{
    'train_r2': ba_regressor.score(train_x, train_y),
    'test_r2': ba_regressor.score(test_x, test_y),
}

0.7163530776976836

In [None]:
# hyperparameter values to try: three per parameter, 27 combinations in total
param_grid = dict(
    n_estimators=[10, 50, 100],
    max_samples=[0.5, 0.8, 1.0],
    max_features=[0.5, 0.8, 1.0],
)

In [None]:
# 5-fold cross-validated search over param_grid, using ba_regressor as the template estimator
grid_search = GridSearchCV(estimator=ba_regressor, param_grid=param_grid, cv=5)

In [None]:
grid_search.fit(train_x,train_y)

# cv_results_ holds one entry per hyperparameter combination (27 for this
# grid), each already averaged over the 5 CV folds. Label every row with its
# parameters; these rows are not folds.
for params, score in zip(grid_search.cv_results_['params'],
                         grid_search.cv_results_['mean_test_score']):
    print(f"{params} - Mean CV score: {score:.4f}")

Fold 1 - Mean score: 0.3170
Fold 2 - Mean score: 0.3433
Fold 3 - Mean score: 0.3465
Fold 4 - Mean score: 0.3124
Fold 5 - Mean score: 0.3448
Fold 6 - Mean score: 0.3460
Fold 7 - Mean score: 0.3201
Fold 8 - Mean score: 0.3407
Fold 9 - Mean score: 0.3447
Fold 10 - Mean score: 0.3316
Fold 11 - Mean score: 0.3674
Fold 12 - Mean score: 0.3709
Fold 13 - Mean score: 0.3343
Fold 14 - Mean score: 0.3593
Fold 15 - Mean score: 0.3631
Fold 16 - Mean score: 0.3293
Fold 17 - Mean score: 0.3566
Fold 18 - Mean score: 0.3581
Fold 19 - Mean score: 0.3091
Fold 20 - Mean score: 0.3461
Fold 21 - Mean score: 0.3493
Fold 22 - Mean score: 0.2872
Fold 23 - Mean score: 0.3210
Fold 24 - Mean score: 0.3235
Fold 25 - Mean score: 0.2717
Fold 26 - Mean score: 0.3047
Fold 27 - Mean score: 0.3088


In [None]:
# parameter combination with the highest mean cross-validated score
print(f"Best hyperparameters: {grid_search.best_params_}")

Best hyperparameters: {'max_features': 0.8, 'max_samples': 0.5, 'n_estimators': 100}


In [None]:
# Disabled experiment: grid search over the estimator wrapped by BaggingRegressor.
# The whole cell is now commented out. Previously the dict literal was left
# uncommented, so running the cell still instantiated three regressors and
# displayed the dict even though nothing used it.
# NOTE(review): recent scikit-learn releases name this BaggingRegressor
# parameter `estimator` rather than `base_estimator` - confirm before re-enabling.

#from sklearn.model_selection import GridSearchCV

#param_grid_br=\
#{'base_estimator':[DecisionTreeRegressor(),RandomForestRegressor(),GradientBoostingRegressor()],
#'n_estimators':range(90,151,30)}


#gridsearch_br=GridSearchCV(BaggingRegressor(),param_grid_br,n_jobs=5).fit(train_x,train_y)


#gridsearch_br.best_params_


In [None]:
# final model using the hyperparameters selected by the grid search
# random_state is fixed so the reported scores are reproducible.
ba_regressor = BaggingRegressor(n_estimators=100, max_features=0.8, max_samples=0.5,
                                random_state=0)
ba_regressor.fit(train_x, train_y)

# The held-out test R^2 is the figure to quote for the final model; the
# training R^2 is shown alongside it only to expose the gap between the two.
{
    'train_r2': ba_regressor.score(train_x, train_y),
    'test_r2': ba_regressor.score(test_x, test_y),
}

0.6116962603755404