In [None]:
import sys
!{sys.executable} -m pip install reverse_geocoder
!{sys.executable} -m pip install geopy

In [40]:
import pandas as pd
import reverse_geocoder as rg
from pprint import pprint
from geopy.geocoders import Nominatim

In [3]:
airbnb_df = pd.read_csv("AB_NYC_2019.csv") # external dataset
train_df = pd.read_csv('https://grantmlong.com/data/SE_rents2018_train.csv', index_col=0) # training dataset

For this external dataset, I want to extract the average price per "space unit" (either zip code, borough, or neighborhood). There can be large discrepencies between rent prices across a single borough (such as Harlem vs Upper East Side in Manhattan) so this would only be used as a last resort. It would also be the easiest option as the EDA below shows both the training data and the Airbnb data include some reference to the borough. 

In [49]:
train_df.sample(5).transpose()

rental_id,7357615,7218382,7417561,7373953,7377523
addr_unit,#3C,#36,#3AA,#5HH,#2
building_id,348871,668974,54412,378199,1244827
bedrooms,1,1,0,4,2
bathrooms,1,1,1,2,1
size_sqft,805,650,0,1000,1000
created_at,2018-07-11 20:12:15,2018-05-30 09:37:34,2018-07-27 18:32:34,2018-07-17 11:46:49,2018-07-18 08:09:29
addr_street,150 NORTH 12TH STREET,150 96 STREET,814 10 AVENUE,17 MONITOR STREET,39-20 47 AVENUE
addr_city,Brooklyn,Brooklyn,New York,Brooklyn,Sunnyside
addr_zip,11249,11209,10019,11222,11104
addr_lat,40.7211,40.6162,40.7672,40.7194,40.7417


In [95]:
# Will need to do some cleaning since BROOKLYN and QUEENS are counted separately.
# This column is a rough mix between boroughs and neighborhoods so it might be necessary to convert lat/longs 
# to neighborhoods and add that as a key column for the eventual join. 
train_df["addr_city"].value_counts()

(12000, 38)

In [115]:
airbnb_df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
id,2539,2595,3647,3831,5022
name,Clean & quiet apt home by the park,Skylit Midtown Castle,THE VILLAGE OF HARLEM....NEW YORK !,Cozy Entire Floor of Brownstone,Entire Apt: Spacious Studio/Loft by central park
host_id,2787,2845,4632,4869,7192
host_name,John,Jennifer,Elisabeth,LisaRoxanne,Laura
neighbourhood_group,Brooklyn,Manhattan,Manhattan,Brooklyn,Manhattan
neighbourhood,Kensington,Midtown,Harlem,Clinton Hill,East Harlem
latitude,40.6475,40.7536,40.809,40.6851,40.7985
longitude,-73.9724,-73.9838,-73.9419,-73.9598,-73.944
room_type,Private room,Entire home/apt,Private room,Entire home/apt,Entire home/apt
price,149,225,150,89,80


In [5]:
airbnb_df["room_type"].describe()

count               48895
unique                  3
top       Entire home/apt
freq                25409
Name: room_type, dtype: object

In [6]:
airbnb_df["room_type"].value_counts()

Entire home/apt    25409
Private room       22326
Shared room         1160
Name: room_type, dtype: int64

In [7]:
home_airbnb_df = airbnb_df.loc[airbnb_df['room_type'] == "Entire home/apt"]
home_airbnb_df.shape

(25409, 16)

In [28]:
train_df["addr_zip"].unique().size

149

In [29]:
home_airbnb_df.sample(5).transpose()
# Possible keys for aggregation: neighborhood_group, neighborhood, latitude, longitude
# Can also possibly look at entire dataset (not just entire home/apt listings)

Unnamed: 0,5891,7126,37438,28925,6288
id,4294969,5179785,29705049,22291811,4593939
name,Welcome to my home,3 bedroom 2 1/2 bath duplex Park Slope,"Sunny, Cozy Apt in the Heart of BedStuy for Cheap",Charming flat with Williamsburg view,Charming and Convenient Garden Apt
host_id,4473916,10711342,82286,162923870,5050537
host_name,Ross,Mr. & Mrs. Kris,Linda,Etienne,Melissa
neighbourhood_group,Manhattan,Brooklyn,Brooklyn,Brooklyn,Brooklyn
neighbourhood,Greenwich Village,Sunset Park,Bedford-Stuyvesant,Williamsburg,Bedford-Stuyvesant
latitude,40.7317,40.663,40.6861,40.7197,40.684
longitude,-73.9952,-73.9901,-73.9356,-73.9643,-73.9416
room_type,Entire home/apt,Entire home/apt,Entire home/apt,Entire home/apt,Entire home/apt
price,450,150,79,149,69


In [14]:
home_airbnb_df["neighbourhood_group"].value_counts()

Manhattan        13199
Brooklyn          9559
Queens            2096
Bronx              379
Staten Island      176
Name: neighbourhood_group, dtype: int64

In [53]:
home_airbnb_df["neighbourhood"].unique()

array(['Midtown', 'Clinton Hill', 'East Harlem', 'Murray Hill',
       'Chinatown', 'Upper West Side', 'West Village', 'Williamsburg',
       'Fort Greene', 'Crown Heights', 'Bedford-Stuyvesant',
       "Hell's Kitchen", 'East Village', 'Bushwick', 'South Slope',
       'Harlem', 'Prospect-Lefferts Gardens', 'Greenpoint', 'Kips Bay',
       'SoHo', 'Chelsea', 'Upper East Side', 'Prospect Heights',
       'Park Slope', 'Lower East Side', 'Flatbush', 'Brooklyn Heights',
       'Carroll Gardens', 'Gowanus', 'Cobble Hill', 'Boerum Hill',
       'Financial District', 'Ridgewood', 'Middle Village',
       'Ditmars Steinway', 'Flatiron District', 'Greenwich Village',
       'Little Italy', 'East Flatbush', 'Astoria', 'Eastchester',
       'Washington Heights', 'Kingsbridge', 'Forest Hills', 'Nolita',
       'Windsor Terrace', 'Woodlawn', 'Gravesend', 'Gramercy', 'Allerton',
       'East New York', 'Sheepshead Bay', 'Theater District',
       'Long Island City', 'Fort Hamilton', 'Bensonhurst',

# Testing Reverse Geocoders
I tried testing 2 different geocoding libraries to determine which would provide me with the actual neighborhood. The first (reverse_geocoder) only returns the name of the city (NYC, Brooklyn, etc) whereas the second one returns a much more detailed address including the neighborhood.

One downside to using geopy is that it is an online reverse geocoding API that might not be suitable for larger projects. As the section of the Airbnb dataset I am looking at only consists of ~25000 items, it should be fine for the purposes of this final project.

In [43]:
# Testing reverse_geocoder 
coords = [(40.7317, -73.9952), (40.663, -73.9901)]
for coord in coords: 
    pprint(rg.search(coord))

[OrderedDict([('lat', '40.71427'),
              ('lon', '-74.00597'),
              ('name', 'New York City'),
              ('admin1', 'New York'),
              ('admin2', ''),
              ('cc', 'US')])]
[OrderedDict([('lat', '40.6501'),
              ('lon', '-73.94958'),
              ('name', 'Brooklyn'),
              ('admin1', 'New York'),
              ('admin2', 'Kings County'),
              ('cc', 'US')])]


In [61]:
# Trying different package: geopy
geolocator = Nominatim()
for coord in coords: 
    location = str(geolocator.reverse(coord))
    nbrhd = location.split(',')
    print(nbrhd)

  


['New York University', ' 6th Avenue', ' University Village', ' Greenwich Village', ' Manhattan Community Board 2', ' Manhattan', ' New York County', ' New York', ' 10019', ' United States of America']
['Purple Playground', ' 17th Street', ' Park Slope', ' Brooklyn', ' Kings County', ' New York', ' 11215', ' United States of America']


In [133]:
def combine(lat, lon):
    coord = str(lat) + ',' + str(lon)
    return coord

def lookupAddress(lat,lon):
    location = str(geolocator.reverse(combine(lat, lon)))
    return location

In [134]:
# This times out because there are in fact too many requests being sent to the online API. 
# There is also the problem of accessing the correct neighborhood from the address as it is either at the 3rd or 4th
# index. 
airbnb_df['coords'] = airbnb_df.apply (lambda row: lookupAddress(row.latitude, row.longitude), axis=1)

GeocoderTimedOut: ('Service timed out', 'occurred at index 6')