# Vorbereitung

In [22]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
path = "drive/My Drive/Python Programming Projekt/"

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm

In [23]:
# Read in the data as CSV and store it in the variable airbnb
airbnb = pd.read_csv(path + "AB_NYC_2019.csv",nrows=20000)
airbnb.head() #Shows the Top 5 entries in the dataset

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [24]:
# The features name & host_name are dropped, because they are not relevant for the model
# For creating an easier model, we also delete the variable last_review, otherwise it would have to many features
airbnb = airbnb.drop(["name","host_name","last_review","host_id","id","latitude","longitude","reviews_per_month",],axis=1)

# Datensatz transformieren

In [25]:
# Transform the object variables room_type & neighbourhood group into dummy variables
airbnb_raw = airbnb
airbnb = pd.get_dummies(airbnb, prefix=['room_type', 'neighbourhood_group','neighbourhood'], columns=['room_type','neighbourhood_group','neighbourhood'])

In [26]:
airbnb = airbnb.dropna()
airbnb_raw = airbnb_raw.dropna()

In [27]:
airbnb = airbnb.loc[airbnb.price <= 1000,:]
airbnb_raw = airbnb_raw.loc[airbnb_raw.price <= 1000,:]

# Vorhersagemodell

In [28]:
airbnb.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19902 entries, 0 to 19999
Data columns (total 213 columns):
 #    Column                                   Dtype
---   ------                                   -----
 0    price                                    int64
 1    minimum_nights                           int64
 2    number_of_reviews                        int64
 3    calculated_host_listings_count           int64
 4    availability_365                         int64
 5    room_type_Entire home/apt                uint8
 6    room_type_Private room                   uint8
 7    room_type_Shared room                    uint8
 8    neighbourhood_group_Bronx                uint8
 9    neighbourhood_group_Brooklyn             uint8
 10   neighbourhood_group_Manhattan            uint8
 11   neighbourhood_group_Queens               uint8
 12   neighbourhood_group_Staten Island        uint8
 13   neighbourhood_Allerton                   uint8
 14   neighbourhood_Arrochar              

In [29]:
y = airbnb["price"]
X = airbnb.drop("price", axis=1)

In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False, random_state=1)

model  = RandomForestRegressor()
forest = model.fit(X_train,y_train)

print("R2: ", forest.score(X_train, y_train))
R2_3 = forest.score(X_train, y_train)

y_pred = model.predict(X_test)
print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))
RMSE_3 = mean_squared_error(y_test, y_pred, squared=False)

R2:  0.786896988374592
RMSE: 93.00014524875974


In [31]:
airbnb.to_csv(path + "airbnb_prediction_data.csv", index=False)

In [None]:
# airbnb_raw.to_csv(path + "airbnb_data_dev.csv", index=False)

In [32]:
import pickle
filename = path + 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [33]:
airbnb.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19902 entries, 0 to 19999
Data columns (total 213 columns):
 #    Column                                   Dtype
---   ------                                   -----
 0    price                                    int64
 1    minimum_nights                           int64
 2    number_of_reviews                        int64
 3    calculated_host_listings_count           int64
 4    availability_365                         int64
 5    room_type_Entire home/apt                uint8
 6    room_type_Private room                   uint8
 7    room_type_Shared room                    uint8
 8    neighbourhood_group_Bronx                uint8
 9    neighbourhood_group_Brooklyn             uint8
 10   neighbourhood_group_Manhattan            uint8
 11   neighbourhood_group_Queens               uint8
 12   neighbourhood_group_Staten Island        uint8
 13   neighbourhood_Allerton                   uint8
 14   neighbourhood_Arrochar              

In [34]:
airbnb_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19902 entries, 0 to 19999
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   neighbourhood_group             19902 non-null  object
 1   neighbourhood                   19902 non-null  object
 2   room_type                       19902 non-null  object
 3   price                           19902 non-null  int64 
 4   minimum_nights                  19902 non-null  int64 
 5   number_of_reviews               19902 non-null  int64 
 6   calculated_host_listings_count  19902 non-null  int64 
 7   availability_365                19902 non-null  int64 
dtypes: int64(5), object(3)
memory usage: 1.4+ MB


In [35]:
airbnb_test = pd.read_csv(path + "airbnb_data_dev.csv")
airbnb_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38736 entries, 0 to 38735
Data columns (total 13 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              38736 non-null  int64  
 1   host_id                         38736 non-null  int64  
 2   neighbourhood_group             38736 non-null  object 
 3   neighbourhood                   38736 non-null  object 
 4   latitude                        38736 non-null  float64
 5   longitude                       38736 non-null  float64
 6   room_type                       38736 non-null  object 
 7   price                           38736 non-null  int64  
 8   minimum_nights                  38736 non-null  int64  
 9   number_of_reviews               38736 non-null  int64  
 10  reviews_per_month               38736 non-null  float64
 11  calculated_host_listings_count  38736 non-null  int64  
 12  availability_365                