In [1]:
# Libraries set-up

%load_ext lab_black
import os
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats
from sklearn.metrics import mean_squared_error
from readtable import getairbnbdata

In [2]:
# Import data

# data_2021 = getairbnbdata()
# data_2021.columns

data_2021 = pd.read_csv(
    "../data/cleaned_data_updated.csv",
)
data_2021.columns

Index(['Unnamed: 0', 'id', 'month', 'List_month', 'last_scraped', 'host_id',
       'host_name', 'host_since', 'host_location', 'host_response_time',
       'host_response_rate', 'host_is_superhost', 'host_listings_count',
       'host_total_listings_count', 'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'latitude', 'longitude',
       'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'price', 'minimum_nights', 'maximum_nights', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'number_of_reviews', 'first_review', 'last_review',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'instant_bookable',
       'calculated_host_listings_count', 'reviews_per_month', 'amenities',
       'scrape_batch', 'ba

In [40]:
# Keep interesting subset of columns for all analyses, drop empty rows, remove data with wrong boroughs,
# rename borough and neighborhood columns

data = data_2021[
    [
        "price",
        "new_neighbourhood",
        "neighbourhood",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "review_scores_rating",
        "review_scores_accuracy",
        "review_scores_cleanliness",
        "review_scores_checkin",
        "review_scores_communication",
        "review_scores_location",
        "review_scores_value",
        "station_dist",
        "station_dist2",
        "park_dist",
        "park_dist2",
    ]
]

data = data.dropna()
data = data.reset_index()

data["borough"] = data["neighbourhood"].values
data.drop(["neighbourhood"], axis=1, inplace=True)
data["neighbourhood"] = data["new_neighbourhood"].values
data.drop(["new_neighbourhood"], axis=1, inplace=True)

data = data[
    data["borough"]
    == (
        "Brooklyn, New York, United States"
        or "Manhattan, New York, United States"
        or "Queens, New York, United States"
        or "Bronx, New York, United States"
        or "Staten Island, New York, United States"
    )
]

data = data.reset_index()
data["borough"] = data["borough"].str.replace(", New York, United States", "")

In [4]:
# Set up price as Y (dependent variable) for all regressions

Y = data["price"]

In [36]:
# Define function that outputs regressor matrix depending on what we want to do using subset of data taking as an input a list of regressors

regressors = []


def Xcreator(regressors):
    datax = data[regressors]
    if "borough" in datax.columns:
        datax = pd.get_dummies(datax, columns=["borough"])
    if "neighbourhood" in datax.columns:
        datax = pd.get_dummies(datax, columns=["neighbourhood"])
    Xt = datax
    X = sm.add_constant(Xt)
    return X

In [37]:
def regress(y, x):
    est = sm.OLS(y, x)
    estr = est.fit()
    estr.summary()

In [38]:
X = Xcreator(["borough"])
X
# regress(Y, X)

Unnamed: 0,borough_Brooklyn
0,1
1,1
2,1
3,1
4,1
...,...
8788,1
8789,1
8790,1
8791,1


In [None]:
# Set up data for simple linear regression of price on neighborhood group, Bronx used as baseline

datang = data[
    [
        "price",
        "neighbourhood_group_cleansed",
    ]
]
datang = pd.get_dummies(datang, columns=["neighbourhood_group_cleansed"])
datang.drop(["neighbourhood_group_cleansed_Bronx"], axis=1, inplace=True)
datang.astype(float)

Xng = datang
datang.drop(["price"], axis=1, inplace=True)
XngC = sm.add_constant(Xng)

In [None]:
# Run Linear regression of price on neighborhood group dummies

estng = sm.OLS(Y, XngC)
est_ng = estng.fit()
est_ng.summary()

In [None]:
# Set up data for simple linear regression of price on neighborhood, Midtown used as baseline

datan = data[
    [
        "price",
        "new_neighbourhood",
    ]
]
datan = pd.get_dummies(datan, columns=["new_neighbourhood"])
datan.drop(["new_neighbourhood_Greenpoint"], axis=1, inplace=True)
datan.astype(float)

Xn = datan
datan.drop(["price"], axis=1, inplace=True)
XnC = sm.add_constant(Xn)

In [None]:
# Run Linear regression of price on neighborhood group dummies

estn = sm.OLS(Y, XnC)
est_n = estn.fit()
est_n.summary()

In [None]:
# Neighbourhoods specifically seem to better predict price than boroughs (groups). We now do the same process adding other room type,
# total reviews, average reviews per month, and host total listings

data3 = data[
    [
        "price",
        "new_neighbourhood",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
    ]
]
data3 = pd.get_dummies(data3, columns=["new_neighbourhood"])
data3.drop(["new_neighbourhood_Greenpoint"], axis=1, inplace=True)
data3.astype(float)

X3 = data3
data3.drop(["price"], axis=1, inplace=True)
X3C = sm.add_constant(X3)

est3 = sm.OLS(Y, X3C)
est_3 = est3.fit()
est_3.summary()

In [None]:
# Finally, we add to the former regression two regressors: distance from nearest park. We first do this using linear distance and
# second by using taxi distance and choose which is better

In [None]:
# Using linear distance

data4 = data[
    [
        "price",
        "new_neighbourhood",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "station_dist",
        "park_dist",
    ]
]
data4 = pd.get_dummies(data4, columns=["new_neighbourhood"])
data4.drop(["new_neighbourhood_Greenpoint"], axis=1, inplace=True)
data4.astype(float)

X4 = data4
data4.drop(["price"], axis=1, inplace=True)
X4C = sm.add_constant(X4)

est4 = sm.OLS(Y, X4C)
est_4 = est4.fit()
est_4.summary()

In [None]:
# Using taxi distance

data5 = data[
    [
        "price",
        "new_neighbourhood",
        "room_type",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "station_dist2",
        "park_dist2",
    ]
]
data5 = pd.get_dummies(data5, columns=["new_neighbourhood"])
data5.drop(["new_neighbourhood_Greenpoint"], axis=1, inplace=True)
data5 = pd.get_dummies(data5, columns=["room_type"])
data5.drop(["room_type_Private room"], axis=1, inplace=True)
data5.astype(float)

X5 = data5
data5.drop(["price"], axis=1, inplace=True)
X5C = sm.add_constant(X5)

est5 = sm.OLS(Y, X5C)
est_5 = est5.fit()
est_5.summary()

In [None]:
# Taxi distance seems to be a better indicator than linear distance.

In [None]:
model = LinearRegression()
model.fit(X5C, Y)
X_predict = X5C
Y_predict = model.predict(X_predict)
rmse = mean_squared_error(Y, Y_predict, squared=False)
rmse