In [1]:
import csv
import json
import feather # pip install feather-format
import numpy as np
import pandas as pd
import os
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import geopandas as gpd
import ast
from sklearn import metrics
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,LogisticRegression
import geopy.distance
%matplotlib inline
## Some utility functions
def percentToInt(percentString):
    """Convert a percentage string such as ``"95%"`` into the integer ``95``.

    Trailing ``%`` characters are stripped and the remainder is parsed with
    ``int``.

    Args:
        percentString: Text ending in ``%``; non-string values are tolerated.

    Returns:
        The parsed integer, or ``None`` when the value has no usable
        ``rstrip`` (e.g. ``None`` or a float NaN) or the remaining text is
        not an integer.
    """
    try:
        return int(percentString.rstrip("%"))
    except (AttributeError, TypeError, ValueError):
        # Only conversion failures are expected here; a bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        return None
def trueFalseToBinary(tfstring):
    """Encode a ``'t'`` flag string as an integer.

    Args:
        tfstring: Flag value; only the exact string ``'t'`` counts as true.

    Returns:
        ``1`` when ``tfstring == 't'``, otherwise ``0``. Both branches return
        an ``int`` so a mapped column keeps a single numeric type.
    """
    return 1 if tfstring == 't' else 0
def stringListToList(stringList):
    """Parse the text of a Python literal (e.g. ``"['a', 'b']"``) into its value.

    The text is handed to ``ast.literal_eval``, which raises if it is not
    valid literal syntax.
    """
    parsed = ast.literal_eval(stringList)
    return parsed
def stringListToCount(stringList):
    """Return how many elements the literal written in ``stringList`` contains.

    The text is evaluated with ``ast.literal_eval`` and the length of the
    resulting object is returned.
    """
    items = ast.literal_eval(stringList)
    count = len(items)
    return count
def euclideanDistance(lat1, long1, lat2, long2):
    """Straight-line distance between two (lat, long) points.

    Latitude and longitude are treated as plain Cartesian coordinates, so the
    result is expressed in the same units as the inputs.
    """
    delta_long = long2 - long1
    delta_lat = lat2 - lat1
    squared_sum = delta_long ** 2 + delta_lat ** 2
    return squared_sum ** 0.5
    
# Locations of the input tables: three Feather files plus the CSV of sites.
reviews_data_path, listings_data_path, calendar_data_path = (
    "feather/reviews.feather",
    "feather/listings.feather",
    "feather/calendar.feather",
)
sites_data_path = "data/sites_boston.csv"

# Read the Feather tables in the order reviews, listings, calendar, then the CSV.
reviews_data, listings_data, calendar_data = (
    pd.read_feather(feather_path)
    for feather_path in (reviews_data_path, listings_data_path, calendar_data_path)
)
sites_data = pd.read_csv(sites_data_path)

# subway data
# https://api-v3.mbta.com/docs/swagger/index.html#/Stop
# holidays data
# https://pypi.org/project/holidays/

In [7]:
# Start from a fresh copy of the raw listings CSV; all cleaning below happens
# on df_none_null so listings_data stays untouched.
listings_data = pd.read_csv("data/listings.csv")
df_none_null = listings_data.copy(deep=True)

# Host tenure is measured against this fixed date.
# NOTE(review): presumably the date the dataset was scraped - confirm.
reference_date = datetime.datetime(2021, 7, 12)

# adding custom: host_number_of_years
# Vectorised date arithmetic: a missing host_since yields NaN instead of
# making a per-row strptime call raise on a non-string value.
host_since = pd.to_datetime(df_none_null['host_since'], format='%Y-%m-%d')
host_number_of_years = (pd.Timestamp(reference_date) - host_since).dt.days / 365
df_none_null["host_number_of_years"] = host_number_of_years
df_none_null = df_none_null.drop(['host_since'], axis=1)

# price: drop the leading currency character and thousands separators, then
# parse as float. Missing prices stay NaN rather than raising.
df_none_null['price'] = (
    df_none_null['price']
    .str[1:]
    .str.replace(',', '', regex=False)
    .astype(float)
)

# host_response_rate / host_acceptance_rate: "95%" -> 95; values that
# percentToInt cannot parse become None.
df_none_null['host_response_rate'] = df_none_null['host_response_rate'].map(percentToInt)
df_none_null['host_acceptance_rate'] = df_none_null['host_acceptance_rate'].map(percentToInt)

# host_response_time: replace the categorical column with one indicator
# column per category.
host_response_time_dummies = pd.get_dummies(df_none_null['host_response_time'], prefix='host_response_time')
df_none_null = pd.concat([df_none_null, host_response_time_dummies], axis=1)
df_none_null = df_none_null.drop(['host_response_time'], axis=1)

# host_verifications: the column holds a list written as text; keep only the
# number of entries.
df_none_null['host_verifications'] = df_none_null['host_verifications'].map(stringListToCount).astype('int32')

# availability_30 / _60 / _90 / _365: divide each column by the length of its
# window (a fraction of the window, assuming the raw values are day counts).
for availability_window in (30, 60, 90, 365):
    availability_column = f"availability_{availability_window}"
    df_none_null[availability_column] = df_none_null[availability_column] / float(availability_window)



In [10]:

# Pull the name and coordinate columns of the sites table into separate Series.
landmarks, landmarks_lats, landmarks_longs = (
    sites_data[column_name] for column_name in ("Place", "Latitude", "Longitude")
)

# pending further considerations
def RMeanSquared(list_distances):
    """Return the root-mean-square of the values in ``list_distances``.

    Every value is squared, the squares are averaged with ``np.mean`` and the
    square root of that average is returned.
    """
    squared = [distance ** 2 for distance in list_distances]
    mean_of_squares = np.mean(squared)
    return np.sqrt(mean_of_squares)

def SortAndGetTop(list_distances, LargeFirst=False, n=5):
    """Return the first ``n`` values of ``list_distances`` in sorted order.

    Args:
        list_distances: Iterable of mutually comparable values.
        LargeFirst: Sort descending when true, ascending (the default) otherwise.
        n: Number of leading values to keep; fewer are returned when the
            input is shorter.

    Returns:
        A new list. The input is left untouched: ``sorted`` is used instead
        of ``list.sort`` so the caller's data is not reordered as a side
        effect, and any iterable (not only a list) is accepted.
    """
    return sorted(list_distances, reverse=LargeFirst)[:n]

# Closeness-to-landmark feature: for every listing, measure the geopy distance
# (in km) to each landmark, keep the nearest five (SortAndGetTop defaults) and
# summarise them with their root-mean-square.

# Pair the coordinates by position. Indexing the Series with an enumerate
# counter would be a label lookup, which only works on a default 0..n-1 index.
landmark_coords = list(zip(landmarks_lats, landmarks_longs))

result = []
# Iterate the coordinate columns directly instead of materialising a row
# Series with .iloc[i] twice per listing.
for listing_lat, listing_long in zip(df_none_null["latitude"], df_none_null["longitude"]):
    listing_coords = (listing_lat, listing_long)
    dist_to_each_landmark = [
        geopy.distance.distance(landmark_coord, listing_coords).km
        for landmark_coord in landmark_coords
    ]
    result.append(RMeanSquared(SortAndGetTop(dist_to_each_landmark)))

df_none_null["closeness_to_landmark"] = result


In [11]:

# NOTE(review): this rebinds the name `json` (imported at the top of the
# notebook) to simplejson from this cell onwards.
import simplejson as json
subway_data_path = "data/transport/subway.json"
# Fields read from each record under 'data': attributes.name, .latitude,
# .longitude and .description (records also carry an id).
# attributes.description holds the Red / Blue / Green / Orange Line info.

# Line labels are tested in this order and the first one found in the
# description wins; a description matching none of them maps to None.
# NOTE(review): there is no code for the Green Line - confirm that is intended.
subway_line_codes = (("Red Line", "r"), ("Blue Line", "b"), ("Orange Line", "o"))

with open(subway_data_path) as subway_file:
    subway_data = json.load(subway_file)['data']

stop_attributes = [stop['attributes'] for stop in subway_data]
subway_names = [attributes['name'] for attributes in stop_attributes]
subway_lats = [attributes['latitude'] for attributes in stop_attributes]
subway_longs = [attributes['longitude'] for attributes in stop_attributes]
subway_lines = [
    next(
        (code for label, code in subway_line_codes if label in attributes['description']),
        None,
    )
    for attributes in stop_attributes
]


# Closeness-to-subway feature: for every listing, measure the geopy distance
# (in km) to each subway stop, keep the nearest five (SortAndGetTop defaults)
# and summarise them with their root-mean-square.
subway_coords = list(zip(subway_lats, subway_longs))

result = []
# The listing coordinates do not change across stations, so they are read
# once per listing (straight from the columns) rather than through two
# .iloc[i] row lookups inside the inner station loop.
for listing_lat, listing_long in zip(df_none_null["latitude"], df_none_null["longitude"]):
    listing_coords = (listing_lat, listing_long)
    dist_to_each_subway = [
        geopy.distance.distance(station_coords, listing_coords).km
        for station_coords in subway_coords
    ]
    result.append(RMeanSquared(SortAndGetTop(dist_to_each_subway)))

df_none_null["closeness_to_subway"] = result

        

In [12]:
# Write the engineered listings table to CSV. The DataFrame index goes out as
# the leading unnamed column (the to_csv default, spelled out here).
df_none_null.to_csv("data/listings_postgres.csv", index=True)

In [9]:
def price_to_float(price):
    """Parse a price string such as ``"$1,234.50"`` into a float.

    The first character (the currency symbol) is dropped and thousands
    separators are removed before conversion.

    Args:
        price: Price text; empty strings and non-string values are tolerated.

    Returns:
        The parsed float, or ``0`` when the value is empty, is not a string
        (e.g. ``None`` or a float NaN) or cannot be parsed as a number.
    """
    try:
        if price == "":
            return 0
        return float(price[1:].replace(',', ''))
    except (AttributeError, TypeError, ValueError):
        # Fall back to 0 for unparseable values only; a bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        return 0
# Reload the calendar table from CSV and turn both price columns into
# numbers; price_to_float yields 0 for values it cannot parse.
calendar_data = pd.read_csv("data/calendar.csv")
for price_column in ("price", "adjusted_price"):
    calendar_data[price_column] = calendar_data[price_column].map(price_to_float)


In [10]:
# Write the cleaned calendar table to CSV. The DataFrame index goes out as the
# leading unnamed column (the to_csv default, spelled out here).
calendar_data.to_csv("data/calendar_postgres.csv", index=True)