In [1]:
# Load (or reload) IPython's autoreload extension and select mode 2, so that
# edits to imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
# Enable the ipywidgets notebook extension for this environment (--sys-prefix).
# NOTE(review): presumably required for the tqdm_notebook progress bars used below - confirm.
!jupyter nbextension enable --py widgetsnbextension --sys-prefix

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: OK


In [3]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from IPython.core.display import display, HTML
from gsmls import get_listing_detail_preview

# Show every row when a DataFrame is rendered (no truncation).
pd.options.display.max_rows = None

def preview_mls(mlsid):
    """Display the listing-detail preview for ``mlsid`` as rendered HTML.

    The markup is whatever ``get_listing_detail_preview(mlsid)`` returns.
    """
    markup = get_listing_detail_preview(mlsid)
    rendered = HTML(markup)
    display(rendered)

def preview_df(df, columns):
    """Display the selected columns of ``df`` as an HTML table.

    Args:
        df: DataFrame to render. Its index values are rendered as links to
            ``https://gsmls.herokuapp.com/properties/<index value>``.
        columns: list of column labels to include in the table.

    Cell text is kept on one line via a ``white-space: nowrap`` style rule.
    """
    def link_to_property(mlsid):
        # Turns an index value into an anchor pointing at the property page.
        return f'<a href="https://gsmls.herokuapp.com/properties/{mlsid}">{mlsid}</a>'

    # Render the frame that was passed in. The previous version read the
    # notebook-global `a` and silently ignored the `df` argument.
    # escape=False is required so the anchor markup is not HTML-escaped.
    display(HTML(
        """<style>.dataframe td {white-space: nowrap;}</style>""" +
        df[columns].to_html(formatters={'__index__': link_to_property},
                            escape=False)))

**Download listings from [GSMLS](https://www.gsmls.com/)**

In [4]:
from gsmls import get_listings

# Earlier county list, which also included Passaic, kept for reference:
# counties = ['Morris', 'Bergen', 'Hudson', 'Union', 'Passaic', 'Essex']
counties = ['Morris', 'Bergen', 'Hudson', 'Union', 'Essex']

# Each county is queried once per (min, max) list-price band, in this order.
price_bands = [(300000, 400000), (400000, 500000)]

listings_list = []
for county in tqdm(counties):
    for lowest_price, highest_price in price_bands:
        listings_list.extend(get_listings(county,
                                          min_list_price=lowest_price,
                                          max_list_price=highest_price,
                                          min_bedrooms=3,
                                          min_bathrooms=2))

# Index the results by integer listing id, keeping only entries with a
# non-empty address. A listing returned by more than one query ends up stored once.
listings = {
    int(entry['id']): entry
    for entry in listings_list
    if entry['address']
}

print(f"Downloaded {len(listings)} listings from GSMLS.")


Downloaded 941 listings from GSMLS.


**Geocode each address using Google Maps API**

In [5]:
from geolocate import geocode
# Store the geocoder's result for "<address> <city/town>, NJ" on every listing.
for listing in tqdm(listings.values()):
    full_address = ' '.join((listing['address'], listing['city/town'])) + ', NJ'
    listing['geocoded'] = geocode(full_address)




**Find the closest NJ Transit park and ride to each address**

In [6]:
from geopy.distance import vincenty

# Read the park and ride records from the JSON file next to the notebook.
with open('park_and_rides.json') as handle:
    park_and_rides = json.loads(handle.read())

def get_listing_location(listing):
    """Return ``(lat, lng)`` taken from the listing's first geocoding result.

    Raises ``IndexError`` when ``listing['geocoded']`` is an empty list.
    """
    first_result = listing['geocoded'][0]
    point = first_result['geometry']['location']
    return point['lat'], point['lng']

# For every geocoded listing, record the nearest park and ride (by distance in
# miles) together with its name, type and distance.
# NOTE(review): geopy deprecated `vincenty` and removed it in 2.0; `geodesic`
# is the replacement if the dependency is ever upgraded.
for listing in tqdm(listings.values()):
    # Skip listings without a usable geocoding result (empty list or None).
    if not listing['geocoded']:
        continue

    # The listing's coordinates do not depend on the park and ride being
    # compared, so look them up once instead of once per candidate.
    home = get_listing_location(listing)
    distances = [vincenty(pr['location'], home).miles for pr in park_and_rides]

    closest_index = np.argmin(distances)
    closest = park_and_rides[closest_index]
    listing['park_and_ride'] = closest
    listing['park_and_ride_name'] = closest['name']
    listing['park_and_ride_type'] = closest['type']
    listing['park_and_ride_distance'] = distances[closest_index]




**Get the commute time to New York for each address, both via the closest park and ride and by transit/walking directly from the address**

In [7]:
from geolocate import get_directions, get_driving_directions

def _unpack_directions(directions):
    """Split a directions result into (duration value, duration text, instructions).

    Returns three ``None`` values when ``directions`` is falsy, i.e. when the
    lookup did not produce a route.
    """
    if not directions:
        return None, None, None
    duration = directions['duration']
    return duration['value'], duration['text'], directions['instructions']


for listing in tqdm(listings.values()):
    # Listings that were never matched to a park and ride are skipped entirely.
    if not listing.get('park_and_ride'):
        continue

    home = listing['address'] + ' ' + listing['city/town'] + ', ' + 'NJ'
    # "lat, lng" text form of the park and ride coordinates
    lot = str(tuple(listing['park_and_ride']['location']))[1:-1]

    # leg 1: home -> park and ride (driving)
    leg1_value, leg1_text, leg1_steps = _unpack_directions(
        get_directions(home, lot, mode='driving'))
    listing['park_and_ride_duration1'] = leg1_value
    listing['park_and_ride_duration_text1'] = leg1_text
    listing['park_and_ride_instructions1'] = leg1_steps

    # leg 2: park and ride -> NY (transit); rail lots go to Penn Station,
    # every other type goes to the Port Authority Bus Terminal
    if listing['park_and_ride']['type'] == 'rail':
        terminal = 'New York Penn Station'
    else:
        terminal = 'Port Authority Bus Terminal'
    leg2_value, leg2_text, leg2_steps = _unpack_directions(
        get_directions(lot, terminal, mode='transit'))
    listing['park_and_ride_duration2'] = leg2_value
    listing['park_and_ride_duration_text2'] = leg2_text
    listing['park_and_ride_instructions2'] = leg2_steps

    # Total home -> NY time. Either leg may be missing; adding or joining a
    # None used to raise TypeError and abort the whole loop, so the totals are
    # only computed when both legs were found and are stored as None otherwise.
    if leg1_value is not None and leg2_value is not None:
        listing['park_and_ride_duration'] = leg1_value + leg2_value
        listing['park_and_ride_duration_text'] = ', '.join([leg1_text, leg2_text])
    else:
        listing['park_and_ride_duration'] = None
        listing['park_and_ride_duration_text'] = None

    # direct walking/transit route from home to the Port Authority Bus Terminal
    nyc_value, nyc_text, nyc_steps = _unpack_directions(
        get_directions(home, 'Port Authority Bus Terminal', mode='transit'))
    listing['nyc_duration'] = nyc_value
    listing['nyc_duration_text'] = nyc_text
    listing['nyc_instructions'] = nyc_steps

print(sum(1 for listing in listings.values() if listing.get('park_and_ride_duration')),
      "out of", len(listings), "have park and ride commute times.")

print(sum(1 for listing in listings.values() if listing.get('nyc_duration')),
      "out of", len(listings), "have walking commute times.")


918 out of 941 have park and ride commute times.
731 out of 941 have walking commute times.


**Filter out listings**

In [8]:
# Commute-time thresholds, expressed in seconds (e.g. 90 minutes = 5400).
# ONE_HOUR_TEN_MINUTES was previously 3960, which is only 66 minutes.
ONE_HOUR_TEN_MINUTES = 70 * 60       # 4200
ONE_HOUR_THIRTY_MINUTES = 90 * 60    # 5400
ONE_HOUR_FORTY_MINUTES = 100 * 60    # 6000

# Cities/towns excluded from the results.
blacklist_cities = [
    'Paterson City', 'West Orange Twp.', 'Belleville Twp.', 'Elizabeth City',
    'Jefferson Twp.', 'Passaic City', 'Newark City']

# One row per listing (indexed by the listings dict key); the redundant 'id'
# column is dropped without mutating the frame in place.
df = pd.DataFrame(listings).T.drop('id', axis=1)

a = df[
    (df['address'].notnull()) & # must have non-null address
    (~df['city/town'].isin(blacklist_cities)) &
#     (df['park_and_ride_duration'] <= ONE_HOUR_THIRTY_MINUTES) &
    (df['sq_ft'] > 2000) &
    (df['tax'] < 10000) &
    # Exclude oil heat but keep listings whose heat source is null. na=False
    # makes a null count as "does not contain oil", so the negation keeps the
    # row. The previous na=True dropped those rows, contradicting this intent.
    (~df['heat_source'].str.contains('oil', case=False, na=False))
]

# Optional extra filters, currently disabled:
# (df['nyc_duration'].notnull()) & # must have nyc transit duration
# (df['nyc_duration'] <= ONE_HOUR_TEN_MINUTES) &
# (df['style'] == 'Colonial') &

print(len(a), "listings after filtering")

# oil
# Water: Well 
# Sewer: Septic

40 listings after filtering


In [9]:
# Columns shown in the preview table ('nyc_duration_text' is another candidate).
columns = [
    'address', 'bedrooms', 'total_baths', 'city/town', 'county', 'price',
    'sq_ft', 'tax', 'style', 'rooms',
    'park_and_ride_name', 'park_and_ride_duration_text',
    'park_and_ride_type', 'park_and_ride_distance',
]

# Shortest park and ride commute first; ties are broken by the shorter
# distance to the park and ride.
sort_keys = ['park_and_ride_duration', 'park_and_ride_distance']
a = a.sort_values(by=sort_keys, ascending=True)
preview_df(a, columns)

Unnamed: 0,address,bedrooms,total_baths,city/town,county,price,sq_ft,tax,style,rooms,park_and_ride_name,park_and_ride_duration_text,park_and_ride_type,park_and_ride_distance
3415348,65 Howcroft Rd,5,2.0,Maywood Boro,Bergen,449000,2304,9979,Bi-Level,10,Essex Street,"6 mins, 35 mins",rail,1.04117
3407560,26 W Elm St,4,3.0,Linden City,Union,499000,2600,0,Bi-Level,8,Linden,"3 mins, 39 mins",rail,0.400046
3396709,514 E Price St,5,3.0,Linden City,Union,399900,2419,5990,Bi-Level,9,Linden,"4 mins, 39 mins",rail,0.395793
3433084,600 Harrison Pl,4,2.1,Linden City,Union,450000,2600,9201,Colonial,8,Linden,"5 mins, 39 mins",rail,0.642435
3428886,150 Lexington Ave,5,3.0,Linden City,Union,449999,2700,9920,Bi-Level,8,Linden,"5 mins, 39 mins",rail,1.01406
3433455,786 Green Ln,3,3.1,Union Twp.,Union,389000,2100,9993,"Multi Floor Unit, Townhouse-Interior",7,Union,"2 mins, 42 mins",rail,0.176066
3424951,665 Millers Ln,4,2.1,Rahway City,Union,449000,2046,4300,Colonial,7,Rahway,"4 mins, 43 mins",rail,0.646716
3428874,1912 Montgomery St,4,3.0,Rahway City,Union,349000,2300,8605,"Colonial, Detached",10,Rahway,"4 mins, 43 mins",rail,0.534094
3414127,156 MOUNTAIN AVE UNIT 3,3,2.2,Springfield Twp.,Union,499999,2700,0,"Multi Floor Unit, Townhouse-Interior",7,Springfield Center - Springfield Twp,"1 min, 46 mins",bus,0.315407
3414146,156 Mountain Ave Unit 7,3,2.2,Springfield Twp.,Union,499999,2700,0,"Multi Floor Unit, Townhouse-Interior",7,Springfield Center - Springfield Twp,"1 min, 46 mins",bus,0.315407
