In [247]:
import pandas as pd
import os 
import numpy as np
path = os.getcwd()
nyc = pd.read_csv(path +'/data/nyc.csv',sep=';')
airbnb = pd.read_csv(path + '/data/nyc_airbnb.csv',low_memory=False)

**Data Exploration**

In [155]:
nyc.head()

Unnamed: 0.1,Unnamed: 0,Tourist_Spot,Address,Zipcode,Lat,Long,Rating
0,0,47th Street - The Diamond District,"47th St. betw. Fifth & Sixth Aves. Manhattan, ...",10036,40.757344,-73.980446,5.0
1,1,9/11 Memorial & Museum,World Trade Center (museum: 180 Greenwich St.)...,10007,40.711415,-74.012479,4.8
2,2,Abyssinian Baptist Church,"132 Odell Clark Place New York, NY 10030, Harlem",10030,40.816679,-73.94145,0.0
3,3,Alice Austen House Museum,"2 Hylan Blvd. Staten Island, NY 10305, Staten ...",10305,40.615203,-74.063071,4.6
4,4,Alice Tully Hall,"1941 Broadway Manhattan, NY 10023, Upper West ...",10023,40.7735,-73.98279,4.8


In [156]:
airbnb.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75356,-73.98559,Entire home/apt,150,30,48,2019-11-04,0.33,3,338,0,
1,3831,"Whole flr w/private bdrm, bath & kitchen(pls r...",4869,LisaRoxanne,Brooklyn,Bedford-Stuyvesant,40.68494,-73.95765,Entire home/apt,75,1,409,2021-10-22,4.86,1,194,32,
2,5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68535,-73.95512,Private room,60,30,50,2016-06-05,0.52,2,365,0,
3,5136,"Spacious Brooklyn Duplex, Patio + Garden",7378,Rebecca,Brooklyn,Sunset Park,40.66265,-73.99454,Entire home/apt,275,5,2,2021-08-08,0.02,1,123,1,
4,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Midtown,40.76457,-73.98317,Private room,68,2,507,2021-11-08,3.68,1,192,33,


In [179]:
airbnb.isna()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,neighbourhood-info
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38272,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,True
38273,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,True
38274,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False
38275,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False


Some values are NaN for reviews_per_month. I offer to replace these values with the mean of all values in the dataset. For neighbourhood-info NaN values we can drop them as there are very few values that are NaN. For last_review NaN values we won't be using it in a first approach.

In [157]:
cols = ['neighbourhood_group','neighbourhood','room_type']
for col in cols :
    print(airbnb[col].unique().shape)

(5,)
(222,)
(4,)


I noticed that there were many different neighbourhood. Some of these might be good to help predict the price but some may not be. Need to figure which ones are likely to help predict the price. For this we need datas thus if a neighbourhood is not really represented in the dataset it is better to not take it into account in our predictions.

In [158]:
#license is NaN for all values thus we can drop it
airbnb = airbnb.drop(['license'],axis=1)

In [159]:
s = airbnb.groupby('neighbourhood').count()
neighbourhood = s.where(s['id']>=50).dropna().index

In [160]:
neighbourhood.shape

(90,)

We can reduce the number of neighbourhood to 90 or even less if we chose to change the threshold of 50 later on.

In [161]:
replace = {i:i+'_neighbourhood' for i in neighbourhood}

airbnb['neighbourhood-info'] = airbnb['neighbourhood'].map(replace)   

In [162]:
airbnb['neighbourhood-info'].unique().shape

(91,)

One Hot Encoding our 'neighbourhood', 'neighbourhood-group' and 'room_type' for learning

In [163]:

cols = ['neighbourhood-info','neighbourhood_group','room_type']
new_airbnb = pd.DataFrame()
for col in cols :
    one_hot = pd.get_dummies(airbnb[col], drop_first = True)
    new_airbnb = pd.concat([new_airbnb,one_hot],axis=1)

In [164]:
airbnb.shape

(38277, 18)

In [165]:
new_airbnb.columns

Index(['Astoria_neighbourhood', 'Battery Park City_neighbourhood',
       'Bay Ridge_neighbourhood', 'Bedford-Stuyvesant_neighbourhood',
       'Bensonhurst_neighbourhood', 'Boerum Hill_neighbourhood',
       'Borough Park_neighbourhood', 'Brighton Beach_neighbourhood',
       'Brooklyn Heights_neighbourhood', 'Brownsville_neighbourhood',
       'Bushwick_neighbourhood', 'Canarsie_neighbourhood',
       'Carroll Gardens_neighbourhood', 'Chelsea_neighbourhood',
       'Chinatown_neighbourhood', 'Clinton Hill_neighbourhood',
       'Cobble Hill_neighbourhood', 'Corona_neighbourhood',
       'Crown Heights_neighbourhood', 'Cypress Hills_neighbourhood',
       'Ditmars Steinway_neighbourhood', 'Downtown Brooklyn_neighbourhood',
       'East Elmhurst_neighbourhood', 'East Flatbush_neighbourhood',
       'East Harlem_neighbourhood', 'East New York_neighbourhood',
       'East Village_neighbourhood', 'Elmhurst_neighbourhood',
       'Financial District_neighbourhood', 'Flatbush_neighbourhood'

In [166]:
cols = ['id','name','host_id','host_name','latitude','longitude','price','minimum_nights','number_of_reviews','last_review','reviews_per_month','calculated_host_listings_count','availability_365','number_of_reviews_ltm']
for col in cols :
    new_airbnb[col] = airbnb[col]

new_airbnb now contains all the informations necessary for learning how pricing works.
In a first approach we will use everything we have.

In [182]:
from sklearn.model_selection import train_test_split
X = new_airbnb.dropna().drop(['id','name','host_id','host_name','price','last_review'],axis = 1)
y = new_airbnb.dropna()['price']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)

In [183]:
X_train.shape

(22997, 104)

In [184]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

lin_reg.fit(X_train,y_train)

In [197]:
def MSE (pred,label): 
    return 1/len(label)*sum((pred-label)**2)
def OSR2(pred,label) :
    return 1-MSE(pred,label)/MSE(pred.mean(),label)

In [200]:
print(OSR2(lin_reg.predict(X_test),y_test))


0.0695955957756701


In [248]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

grid_values = {'ccp_alpha': np.linspace(0.0001, 0.001, 10)}

cart = DecisionTreeRegressor(random_state=88)
dtc_cv = GridSearchCV(cart, param_grid=grid_values, cv=5).fit(X_train, y_train)

In [246]:
print(OSR2(dtc_cv.predict(X_test),y_test))


0.0756270581492251


In [229]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train,y_train)

In [230]:
print(OSR2(rf.predict(X_test),y_test))


0.055247302399529574


In [237]:
from sklearn.ensemble import BaggingRegressor

bg = BaggingRegressor(rf,n_estimators = 5)
bg.fit(X_train,y_train)

In [236]:
print(OSR2(bg.predict(X_test),y_test))


0.06940311068808591


In [256]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

grid_values_2 = {'ccp_alpha': np.linspace(0, 0.001, 51)}

dtr_2 = DecisionTreeRegressor(min_samples_leaf=5, min_samples_split=2000, random_state=88)
dtr_cv_2 = GridSearchCV(dtr_2, param_grid=grid_values, scoring='r2', cv=5, verbose=0)
dtr_cv_2.fit(X_train, y_train)

In [257]:
print(OSR2(dtr_cv_2.predict(X_test),y_test))

0.0756270581492251
