In [1]:
# Reload the autoreload extension and set mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [46]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from IPython.core.display import display, HTML

# Lift pandas' row display limit so frames are shown in full, never truncated.
pd.set_option('display.max_rows', None)

def preview_gsmls_df(mydf):
    """Display a fixed subset of columns of a GSMLS listings frame as HTML.

    The frame's index values are rendered through the ``'__index__'``
    formatter as links of the form
    ``https://gsmls.herokuapp.com/properties/<index value>``. Because
    ``escape=False`` is passed, neither the link markup nor the cell values
    are HTML-escaped. A small style rule keeps table cells on one line.

    Parameters
    ----------
    mydf : pandas.DataFrame
        Must contain every column named in ``shown_columns`` below.
    """
    shown_columns = [
        'address', 'city', 'county', 'bedrooms', 'baths_full', 'baths_part',
        'price', 'sq_ft', 'tax', 'style', 'rooms',
        'park_and_ride_name', 'park_and_ride_duration_text',
        'park_and_ride_type', 'park_and_ride_distance', 'source',
    ]

    def link_to_listing(listing_id):
        # Anchor pointing at the listing's page, labelled with the id itself.
        return f'<a href="https://gsmls.herokuapp.com/properties/{listing_id}">{listing_id}</a>'

    nowrap_css = """<style>.dataframe td {white-space: nowrap;}</style>"""
    table_html = mydf[shown_columns].to_html(
        formatters={'__index__': link_to_listing},
        escape=False)
    display(HTML(nowrap_css + table_html))

def preview_njmls_df(mydf):
    """Display a fixed subset of columns of an NJMLS listings frame as HTML.

    The frame's index values are rendered through the ``'__index__'``
    formatter as links of the form
    ``https://www.njmls.com/listings/index.cfm?action=dsp.info&mlsnum=<index value>``.
    Because ``escape=False`` is passed, neither the link markup nor the cell
    values are HTML-escaped. A small style rule keeps table cells on one line.

    Parameters
    ----------
    mydf : pandas.DataFrame
        Must contain every column named in ``shown_columns`` below.
    """
    shown_columns = [
        'address', 'city', 'bedrooms', 'baths_full', 'baths_part',
        'price', 'tax', 'style', 'rooms', 'list_date',
        'park_and_ride_name', 'park_and_ride_duration_text',
        'park_and_ride_type', 'park_and_ride_distance', 'source',
    ]

    def link_to_listing(mls_number):
        # Anchor pointing at the listing's page, labelled with the MLS number.
        return f'<a href="https://www.njmls.com/listings/index.cfm?action=dsp.info&mlsnum={mls_number}">{mls_number}</a>'

    nowrap_css = """<style>.dataframe td {white-space: nowrap;}</style>"""
    table_html = mydf[shown_columns].to_html(
        formatters={'__index__': link_to_listing},
        escape=False)
    display(HTML(nowrap_css + table_html))

**Define global parameters**

In [3]:
# Counties to search. The download cells below iterate over this list for both
# GSMLS and NJMLS. Alternative selections are left commented out.
# counties = ['Morris', 'Bergen', 'Hudson', 'Union', 'Passaic', 'Essex', 'Middlesex']
# counties = ['Morris', 'Bergen', 'Hudson', 'Union', 'Essex']
# counties = ['Passaic', 'Bergen']
counties = ['Morris', 'Middlesex']

**Download listings from [GSMLS](https://www.gsmls.com/)**

In [4]:
from gsmls import get_listings

# Query GSMLS once per county and per price band, collecting every result
# into a single flat list.
# NOTE(review): both bands use 400000 as a bound; whether a listing priced
# exactly there is returned twice depends on get_listings -- confirm.
gsmls_listings = []
for county in tqdm(counties):
    for band_min, band_max in ((300000, 400000), (400000, 500000)):
        gsmls_listings.extend(
            get_listings(county,
                         min_list_price=band_min,
                         max_list_price=band_max,
                         min_bedrooms=3,
                         min_bathrooms=2))

print(f"Downloaded {len(gsmls_listings)} listings from GSMLS.")

# Drop entries whose address is missing or empty; later cells build the
# geocoding query from it.
gsmls_listings = [entry for entry in gsmls_listings if entry['address']]
print(f"Filtered down to {len(gsmls_listings)} listings due to null address.")

# Tag every record with its origin so the combined list stays traceable.
for entry in gsmls_listings:
    entry['source'] = 'GSMLS'


Downloaded 564 listings from GSMLS.
Filtered down to 543 listings due to null address.


**Download listings from [NJMLS](http://www.njmls.com/)**

In [5]:
from njmls import get_listings, get_listing_detail

# NOTE: this import rebinds `get_listings`, which the GSMLS cell imports from
# a different module; each cell re-imports its own version before using it.

njmls_listings = []
for county in tqdm(counties):
    # Materialise the search results for this county into a list.
    search_results = list(get_listings(
            min_beds=3,
            min_baths=2,
            county_search=True,
            max_price=500000,
            counties=[county.upper()],
            proptypes=['1']))

    for summary in tqdm(search_results):
        # Fetch the detail record and carry the search result's coordinates
        # over onto it.
        detail = get_listing_detail(summary['id'])
        detail['lat'] = summary['lat']
        detail['lng'] = summary['lng']
        njmls_listings.append(detail)

print(f"Downloaded {len(njmls_listings)} listings from NJMLS.")

# Tag every record with its origin so the combined list stays traceable.
for detail in njmls_listings:
    detail['source'] = 'NJMLS'


Downloaded 64 listings from NJMLS.


In [37]:
# Combined view over both sources. The new list holds the same dict objects,
# so fields added to a listing later also show up in gsmls_listings and
# njmls_listings.
listings = [*gsmls_listings, *njmls_listings]

**Geocode each address using Google Maps API**

In [36]:
from geolocate import geocode


def _first_long_name(address_components, *component_types):
    """Return the ``long_name`` of the first component matching a type.

    ``component_types`` are tried in priority order. For each one, the
    components are scanned in order and the first whose ``types`` contains it
    wins. Returns ``None`` when no component matches any requested type.
    """
    for wanted in component_types:
        for component in address_components:
            if wanted in component['types']:
                return component['long_name']
    return None


# Geocode every listing and store coordinates, the formatted address, and the
# geocoder's city/county names on the listing dict itself.
for listing in tqdm(listings):
    # NJMLS records are not filtered for missing fields upstream; without this
    # guard the string concatenation below would raise on None.
    if listing.get('address') is None or listing.get('city') is None:
        print(f"could not geocode: missing address or city for listing {listing.get('id')}")
        continue

    source = listing['address'] + ' ' + listing['city'] + ', ' + 'NJ'
    geocoded = geocode(source)
    if not geocoded:
        print(f"could not geocode: {source}")
        continue

    # Only the first geocoding result is used.
    best_match = geocoded[0]
    location = best_match['geometry']['location']
    listing['lat'] = location['lat']
    listing['lng'] = location['lng']
    listing['formatted_address'] = best_match['formatted_address']

    components = best_match['address_components']

    # Prefer the 'locality' component and fall back to
    # 'administrative_area_level_3'. If neither exists, keep the city the
    # listing already had instead of failing.
    city = _first_long_name(components, 'locality', 'administrative_area_level_3')
    if city is not None:
        listing['city'] = city

    # Same for the county: keep any existing value (or None) when the result
    # carries no 'administrative_area_level_2' component.
    county = _first_long_name(components, 'administrative_area_level_2')
    listing['county'] = county if county is not None else listing.get('county')

could not geocode: 4 Shadtree Lane Mount Arlington Boro, NJ
could not geocode: 2 Shadetree Lane Mount Arlington Boro, NJ



**Find closest NJ Transit Park and Ride to each address**

In [38]:
# `vincenty` was removed in geopy 2.0. Try it first so results stay identical
# on installations where it still exists, and fall back to `geodesic` (same
# call shape, `.miles` attribute) on newer geopy releases.
try:
    from geopy.distance import vincenty as geo_distance
except ImportError:
    from geopy.distance import geodesic as geo_distance

with open('park_and_rides.json') as f:
    park_and_rides = json.load(f)

# TODO: find the closest BUS vs TRAIN park and ride separately?

# Attach the nearest park and ride (by straight-line distance in miles) to
# every listing that has coordinates.
for listing in tqdm(listings):
    # Listings without coordinates (e.g. failed geocoding) are left untouched.
    if not listing.get('lat') or not listing.get('lng'):
        continue

    home = (listing['lat'], listing['lng'])
    distances = [geo_distance(pr['location'], home).miles for pr in park_and_rides]

    # Plain int so the index is an ordinary Python list index.
    closest_index = int(np.argmin(distances))
    closest = park_and_rides[closest_index]

    listing['park_and_ride'] = closest
    listing['park_and_ride_name'] = closest['name']
    listing['park_and_ride_type'] = closest['type']
    listing['park_and_ride_distance'] = distances[closest_index]





**Get the commute time to NYC for each address, both via its nearest park and ride and by direct transit from home**

In [40]:
from geolocate import get_directions, get_driving_directions


def _store_directions(listing, prefix, directions, suffix=''):
    """Copy one directions result onto ``listing``.

    Writes ``{prefix}_duration{suffix}`` (``directions['duration']['value']``),
    ``{prefix}_duration_text{suffix}`` (``directions['duration']['text']``) and
    ``{prefix}_instructions{suffix}`` (``directions['instructions']``). All
    three are set to ``None`` when ``directions`` is falsy, i.e. no route was
    found.
    """
    found = bool(directions)
    listing[f'{prefix}_duration{suffix}'] = directions['duration']['value'] if found else None
    listing[f'{prefix}_duration_text{suffix}'] = directions['duration']['text'] if found else None
    listing[f'{prefix}_instructions{suffix}'] = directions['instructions'] if found else None


for listing in tqdm(listings):
    # Only listings that were matched to a park and ride can be routed.
    if not listing.get('park_and_ride'):
        continue

    park_and_ride = listing['park_and_ride']
    # "lat, lng" text form of the park and ride location.
    park_and_ride_coords = str(tuple(park_and_ride['location']))[1:-1]

    # A listing can have coordinates from its source but no formatted_address
    # when geocoding failed. Fall back to its coordinates in the same
    # "lat, lng" form already used for the park and ride.
    home = listing.get('formatted_address') or f"{listing['lat']}, {listing['lng']}"

    # leg 1: time from home -> park and ride
    directions = get_directions(home, park_and_ride_coords, mode='driving')
    if directions is None:
        print(f"unable to find driving directions from home ({home}) to park and ride ({park_and_ride_coords})")
    _store_directions(listing, 'park_and_ride', directions, suffix='1')

    # leg 2: time from park and ride -> NY (rail and bus end at different terminals)
    if park_and_ride['type'] == 'rail':
        terminal = 'New York Penn Station'
    else:
        terminal = 'Port Authority Bus Terminal'
    directions = get_directions(park_and_ride_coords, terminal, mode='transit')
    if directions is None:
        print(f"unable to find transit directions from park and ride ({park_and_ride_coords}) to NY ({terminal})")
    _store_directions(listing, 'park_and_ride', directions, suffix='2')

    # Total time from home -> NY. If either leg is missing, record None
    # rather than adding None to an int or joining None into the text.
    first_leg = listing['park_and_ride_duration1']
    second_leg = listing['park_and_ride_duration2']
    if first_leg is None or second_leg is None:
        listing['park_and_ride_duration'] = None
        listing['park_and_ride_duration_text'] = None
    else:
        listing['park_and_ride_duration'] = first_leg + second_leg
        listing['park_and_ride_duration_text'] = ', '.join(
            [listing['park_and_ride_duration_text1'], listing['park_and_ride_duration_text2']])

    # Direct transit directions from home to Port Authority (no park and ride).
    directions = get_directions(home, 'Port Authority Bus Terminal', mode='transit')
    _store_directions(listing, 'nyc', directions)

print(sum(1 for listing in listings if listing.get('park_and_ride_duration')),
      "out of", len(listings), "have park and ride commute times.")

print(sum(1 for listing in listings if listing.get('nyc_duration')),
  "out of", len(listings), "have walking commute times.")




Exception in thread Thread-17:
Traceback (most recent call last):
  File "/Users/johria/.pyenv/versions/3.6.3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/Users/johria/.local/share/virtualenvs/nj-house-search-88U_gf0r/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/Users/johria/.local/share/virtualenvs/nj-house-search-88U_gf0r/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration




605 out of 607 have park and ride commute times.
404 out of 607 have walking commute times.


**Final GSMLS Listings**

In [44]:
# Commute-time thresholds in seconds, written as minutes * 60 so the value
# can be checked against the name at a glance.
ONE_HOUR_TEN_MINUTES = 70 * 60      # 4200 (3960 would be 1.1 h = 66 min)
ONE_HOUR_THIRTY_MINUTES = 90 * 60   # 5400
ONE_HOUR_FORTY_MINUTES = 100 * 60   # 6000

# NOTE(review): these names look like GSMLS-style city names, but the geocoding
# cell overwrites `city` with the geocoder's name (the rendered table shows
# e.g. 'Woodbridge Township', 'South Amboy'). Confirm the blacklist still
# matches after geocoding.
blacklist_cities = [
    'Paterson City', 'West Orange Twp.', 'Belleville Twp.', 'Elizabeth City',
    'Jefferson Twp.', 'Passaic City', 'Newark City']

# Build the frame keyed by listing id without mutating in place.
df1 = pd.DataFrame(gsmls_listings).set_index('id')

# With na=True a missing heat/water/sewer value counts as a match, and the
# leading ~ then excludes it, so rows with unknown utilities are dropped too.
df1 = df1[
    (df1.address.notnull()) & # must have non-null address
    (~df1.city.isin(blacklist_cities)) &
    (df1.sq_ft > 2000) &
    (df1.tax < 12000) &
    (~df1.heat_source.str.contains('oil', case=False, na=True)) &
    (~df1.water.str.contains('well', case=False, na=True)) &
    (~df1.sewer.str.contains('septic', case=False, na=True))
]

# Shortest park-and-ride commute first; ties broken by distance to the lot.
df1 = df1.sort_values(by=['park_and_ride_duration', 'park_and_ride_distance'], ascending=[True, True])

# Optional extra filters, currently disabled:
# (df1['park_and_ride_duration'] <= ONE_HOUR_THIRTY_MINUTES) &
# (df1['nyc_duration'].notnull()) & # must have nyc transit duration
# (df1['nyc_duration'] <= ONE_HOUR_TEN_MINUTES) &
# (df1['style'] == 'Colonial') &

print(len(df1), "listings after filtering")

# nyc_duration_text
preview_gsmls_df(df1)

57 listings after filtering


Unnamed: 0_level_0,address,city,county,bedrooms,baths_full,baths_part,price,sq_ft,tax,style,rooms,park_and_ride_name,park_and_ride_duration_text,park_and_ride_type,park_and_ride_distance,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
3406304,60 Crampton Ave,Woodbridge Township,Middlesex County,4,2,1,435000,2226.0,11298,Bi-Level,7,Woodbridge,"3 mins, 43 mins",rail,0.367371,GSMLS
3425203,333 S Pine Ave,South Amboy,Middlesex County,4,2,0,399900,2900.0,9841,Colonial,10,South Amboy,"3 mins, 48 mins",rail,0.623283,GSMLS
3415638,43 S Shore Dr,South Amboy,Middlesex County,3,3,1,498000,2660.0,11995,"Multi Floor Unit, Townhouse-End Unit",8,South Amboy,"4 mins, 48 mins",rail,0.593926,GSMLS
3400652,190 S FELTUS ST,South Amboy,Middlesex County,3,2,1,390000,2800.0,8836,"Multi Floor Unit, Townhouse-End Unit",8,South Amboy,"4 mins, 48 mins",rail,0.571141,GSMLS
3424983,24 Yorkshire Pl,Sayreville,Middlesex County,6,3,0,385000,7501.0,9034,"Development Home, Ranch",12,Sayreville,"4 mins, 55 mins",bus,0.248533,GSMLS
3406445,375 Miller Ave,South Amboy,Middlesex County,4,2,1,460000,2600.0,8866,Colonial,9,Garden State Parkway Exit 120,"3 mins, 57 mins",bus,1.029879,GSMLS
3432410,52 GRAMERCY DR,Piscataway Township,Middlesex County,5,2,1,449900,2640.0,9544,Colonial,9,Edison,"7 mins, 56 mins",rail,1.910747,GSMLS
3433921,76 Overlook Ave,East Hanover,Morris County,4,2,0,380000,2288.0,7212,Cape Cod,9,Willowbrook Mall,"17 mins, 47 mins",bus,5.945371,GSMLS
3410717,29 Devon Dr,Piscataway Township,Middlesex County,5,2,1,499900,2494.0,9616,"Colonial, Detached",10,Edison,"8 mins, 56 mins",rail,1.806046,GSMLS
3430687,102 New St,Middlesex,Middlesex County,4,2,1,429900,2860.0,10933,Colonial,9,Dunellen,"7 mins, 1 hour 0 mins",rail,1.583544,GSMLS


**Final NJMLS Listings**

In [48]:
# Build the NJMLS frame keyed by listing id, ordered by descending `rooms`.
# No row filter is applied here, so the printed count is the full download.
# Alternative ordering, currently disabled:
# df2 = df2.sort_values(by=['park_and_ride_duration', 'park_and_ride_distance'], ascending=[True, True])
df2 = (
    pd.DataFrame(njmls_listings)
    .set_index('id')
    .sort_values(by='rooms', ascending=False)
)
print(len(df2), "listings after filtering")
preview_njmls_df(df2)

64 listings after filtering


Unnamed: 0_level_0,address,city,bedrooms,baths_full,baths_part,price,tax,style,rooms,list_date,park_and_ride_name,park_and_ride_duration_text,park_and_ride_type,park_and_ride_distance,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1713803,64 Pemberton Dr,Matawan,4,2,1,474990,9662,Col,12,04/11/2017,Aberdeen Matawan,"10 mins, 58 mins",rail,2.3464,NJMLS
1729823,7 Salmon Ln,Roxbury Township,4,2,1,299998,8700,Ranch,12,07/20/2017,Lake Hopatcong,"4 mins, 1 hour 52 mins",rail,1.473954,NJMLS
1732202,45 Lazarus Dr,Roxbury Township,4,2,1,475000,12197,Col,12,08/07/2017,Lake Hopatcong,"8 mins, 1 hour 52 mins",rail,1.566276,NJMLS
1642241,1 Great Sun Ter,Wharton,5,3,0,290000,11270,Contp,12,10/19/2016,Mount Arlington,"18 mins, 1 hour 48 mins",rail,5.896459,NJMLS
1722900,3 Seneca Ave,Rockaway,5,3,1,468000,13139,Col,11,06/06/2017,Denville,"9 mins, 1 hour 16 mins",rail,2.745783,NJMLS
1736725,89 Reservoir Ave,Butler,3,2,1,350000,9001,S/L,11,09/11/2017,Towaco,"15 mins, 1 hour 10 mins",rail,5.118248,NJMLS
1746977,25 Academy St,Dover,4,2,0,285000,6108,Ranch,10,12/01/2017,Dover,"4 mins, 1 hour 22 mins",rail,0.495754,NJMLS
1741850,82 Taylortown Rd,Montville,3,2,1,399000,8213,Ranch,10,10/18/2017,Boonton,"7 mins, 1 hour 16 mins",rail,2.059698,NJMLS
1742810,24 Joyce Dr,Roxbury Township,4,2,1,349999,10671,Col,10,10/26/2017,Mount Arlington,"12 mins, 1 hour 48 mins",rail,4.043165,NJMLS
1738929,2761 S Route 23,Jefferson,6,2,0,400000,5266,Col,10,09/26/2017,Newfoundland - West Milford NJ,"2 mins, 1 hour 15 mins",bus,0.376538,NJMLS
