In [1]:
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the Airbnb listings

# Path to the listings CSV (relative, so it is resolved against the working directory)
file_path = "AirbnbData.csv"

# Parse the CSV into a DataFrame using pandas' default settings
airbnb_df = pd.read_csv(filepath_or_buffer=file_path)

# Show the first rows as a quick sanity check
airbnb_df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,10/19/2018,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,5/21/2019,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,7/5/2019,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,11/19/2018,0.1,1,0


In [3]:
# Build the working dataset: every row of airbnb_df that has no missing value.
# dropna() returns a new DataFrame, so airbnb_df itself is left untouched.
# NOTE(review): the null check spans every column, including ones that a later
# cell discards (name, host_name, last_review, ...), so a row missing only one
# of those values is removed as well -- confirm this is intended.
dataset = airbnb_df.dropna()

# first rows of the cleaned data
dataset.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,10/19/2018,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,5/21/2019,0.38,2,355
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,7/5/2019,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,11/19/2018,0.1,1,0
5,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,6/22/2019,0.59,1,129


In [4]:
# Columns that are removed before the features are built
unused_columns = ["id", "name", "host_id", "host_name", "last_review", "neighbourhood"]

# Remove them and renumber the rows from 0; drop=True discards the old index
# instead of keeping it as a column.
# NOTE: this rebinds `dataset`, so running the cell a second time raises
# KeyError because the listed columns are already gone.
dataset = dataset.drop(columns=unused_columns).reset_index(drop=True)

# first rows of the trimmed data
dataset.head()

Unnamed: 0,neighbourhood_group,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,40.64749,-73.97237,Private room,149,1,9,0.21,6,365
1,Manhattan,40.75362,-73.98377,Entire home/apt,225,1,45,0.38,2,355
2,Brooklyn,40.68514,-73.95976,Entire home/apt,89,1,270,4.64,1,194
3,Manhattan,40.79851,-73.94399,Entire home/apt,80,10,9,0.1,1,0
4,Manhattan,40.74767,-73.975,Entire home/apt,200,3,74,0.59,1,129


In [5]:
# Separate the prediction target from the input features
target_column = "price"

# y: the target column; x: every remaining column
y = dataset[target_column]
x = dataset.drop(columns=[target_column])

In [6]:
# One-hot encode the categorical features

# Empty prefix / prefix_sep: each indicator column is named after the bare
# category value (e.g. "Brooklyn") rather than "<column>_<value>".
# NOTE(review): without a prefix, two source columns that share a category
# value would produce duplicate column names -- the column list displayed
# below shows no such clash.
x = pd.get_dummies(data=x, prefix="", prefix_sep="")

# Report the encoded frame's dimensions, then preview its first rows
print(x.shape)
x.head()

(38821, 15)


Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,Bronx,Brooklyn,Manhattan,Queens,Staten Island,Entire home/apt,Private room,Shared room
0,40.64749,-73.97237,1,9,0.21,6,365,0,1,0,0,0,0,1,0
1,40.75362,-73.98377,1,45,0.38,2,355,0,0,1,0,0,1,0,0
2,40.68514,-73.95976,1,270,4.64,1,194,0,1,0,0,0,1,0,0
3,40.79851,-73.94399,10,9,0.1,1,0,0,0,1,0,0,1,0,0
4,40.74767,-73.975,3,74,0.59,1,129,0,0,1,0,0,1,0,0


In [7]:
# Keep the encoded column labels; a later cell pairs them with the model's
# feature_importances_ values.
feature_names = x.columns

In [8]:
# Hold out part of the data for evaluation.
# test_size=0.25 spells out scikit-learn's default split (75% train / 25% test);
# random_state fixes the shuffle so the same rows land in each split on every run.
# train_test_split is imported with the other dependencies in the first cell.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=1
)

In [9]:
# Create a StandardScaler and fit it on the training split only, so the test
# rows do not influence the scaling statistics.
# StandardScaler is imported with the other dependencies in the first cell.
x_scaler = StandardScaler().fit(x_train)

In [10]:
# Apply the scaling fitted by x_scaler to both the training and testing features
x_train_scaled, x_test_scaled = (
    x_scaler.transform(split) for split in (x_train, x_test)
)

In [11]:
# Sizes of the training and testing targets, one per line
print(y_train.shape, y_test.shape, sep="\n")

(29115,)
(9706,)


In [12]:
# Random-forest regressor; random_state pins the estimator's randomness so
# repeated runs build the same forest.
# NOTE(review): max_depth=200 is presumably deeper than any tree grows on this
# data, i.e. effectively no depth limit -- confirm the value is intentional.
regr = RandomForestRegressor(
    random_state=1,
    max_depth=200,
)

# Train on the scaled training features; fit() returns the estimator, which
# the notebook displays as this cell's output.
regr.fit(X=x_train_scaled, y=y_train)

RandomForestRegressor(max_depth=200, random_state=1)

In [13]:
# Evaluate on the held-out split; for scikit-learn regressors score() is the
# coefficient of determination (R^2).
regr.score(X=x_test_scaled, y=y_test)

0.07384951142969798

In [14]:
# Pair every importance score with its feature name and rank the pairs from
# most to least important. Tuples compare on the score first, so a plain
# descending sort orders by importance.
importance_pairs = list(zip(regr.feature_importances_, feature_names))
importance_pairs.sort(reverse=True)
importance_pairs

[(0.23650009300946195, 'longitude'),
 (0.1743653125895845, 'latitude'),
 (0.17097960267848572, 'minimum_nights'),
 (0.11068506165804569, 'availability_365'),
 (0.10523841505706961, 'reviews_per_month'),
 (0.08255269426092846, 'Entire home/apt'),
 (0.053820254528542916, 'number_of_reviews'),
 (0.04594619098608058, 'calculated_host_listings_count'),
 (0.007015786910757031, 'Brooklyn'),
 (0.006738056055691164, 'Queens'),
 (0.003986760636951048, 'Manhattan'),
 (0.0011522411703940147, 'Private room'),
 (0.0007906644360595462, 'Shared room'),
 (0.00011955462576350304, 'Bronx'),
 (0.00010931139618435675, 'Staten Island')]