# NJ Home Search

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from IPython.core.display import display, HTML
from concurrent.futures import ThreadPoolExecutor

pd.set_option('display.max_rows', None)

def preview_gsmls_df(mydf, save=False):
    """Show the GSMLS summary columns of ``mydf`` as a non-wrapping HTML table.

    Each index value is rendered as a link that opens the matching
    ``gsmls.herokuapp.com`` property page in a new tab.

    Args:
        mydf: DataFrame containing every column listed in ``summary_columns``.
        save: When true, also write the same columns to ``gsmls.csv``.
    """
    summary_columns = [
        'address', 'city', 'county', 'bedrooms', 'baths_full', 'baths_part',
        'sqft', 'price', 'tax', 'style', 'rooms', 'park_and_ride_name',
        'park_and_ride_duration_text', 'park_and_ride_type', 'park_and_ride_distance',
        'park_and_ride_bus_name', 'park_and_ride_bus_distance',
        'barnabas_duration_text', 'nyc_duration_text', 'source',
    ]
    summary = mydf[summary_columns]

    if save:
        summary.to_csv('gsmls.csv')

    def link_to_listing(id):
        # Turns an index value into an anchor tag; requires escape=False below.
        return f'<a target="_blank" href="https://gsmls.herokuapp.com/properties/{id}">{id}</a>'

    table_html = summary.to_html(formatters={'__index__': link_to_listing}, escape=False)
    nowrap_style = """<style>.dataframe td {white-space: nowrap;}</style>"""
    display(HTML(nowrap_style + table_html))

def preview_njmls_df(mydf, save=False):
    """Show the NJMLS summary columns of ``mydf`` as a non-wrapping HTML table.

    Each index value is rendered as a link that opens the matching
    ``njmls.com`` listing page (``mlsnum`` query parameter) in a new tab.

    Args:
        mydf: DataFrame containing every column listed in ``summary_columns``.
        save: When true, also write the same columns to ``njmls.csv``.
    """
    summary_columns = [
        'address', 'city', 'county', 'bedrooms', 'baths_full', 'baths_part',
        'price', 'tax', 'style', 'rooms', 'list_date', 'park_and_ride_name',
        'park_and_ride_duration_text', 'park_and_ride_type', 'park_and_ride_distance',
        'park_and_ride_bus_name', 'park_and_ride_bus_distance',
        'barnabas_duration_text', 'source',
    ]
    summary = mydf[summary_columns]

    if save:
        summary.to_csv('njmls.csv')

    def link_to_listing(id):
        # Turns an index value into an anchor tag; requires escape=False below.
        return f'<a target="_blank" href="https://www.njmls.com/listings/index.cfm?action=dsp.info&mlsnum={id}">{id}</a>'

    table_html = summary.to_html(formatters={'__index__': link_to_listing}, escape=False)
    nowrap_style = """<style>.dataframe td {white-space: nowrap;}</style>"""
    display(HTML(nowrap_style + table_html))


**Define global parameters**

In [3]:
# Counties that are searched on both MLS sites.
counties = [
    'Passaic',
    'Hudson',
    'Essex',
    'Middlesex',
    'Morris',
    'Bergen',
    'Union',
]

# Duration thresholds, all derived from ONE_HOUR (3600, i.e. an hour in seconds).
# The divided values are floats because of true division.
ONE_HOUR = 60 * 60
HALF_HOUR = ONE_HOUR / 2
TEN_MINUTES = ONE_HOUR / 6
ONE_HOUR_TEN_MINUTES = ONE_HOUR + TEN_MINUTES
ONE_HOUR_THIRTY_MINUTES = ONE_HOUR + HALF_HOUR
ONE_HOUR_FORTY_MINUTES = ONE_HOUR + HALF_HOUR + TEN_MINUTES

# Listings in these cities are dropped by the filters further down.
blacklist_cities = [
    'Paterson',
    'East Orange',
    'City of Orange',
    'Belleville',
    'Elizabeth',
    'Jefferson',
    'Passaic',
    'Newark',
    'East Newark',
    'Linden',
    'Nutley',
    'Garfield',
]

**Download listings from [GSMLS](https://www.gsmls.com/)**

In [4]:
from gsmls import get_listings

# Each county is queried once per (min_list_price, max_list_price) band.
# The bands share the 400000 boundary, so a listing priced exactly there may be
# returned by both queries (assuming the bounds are inclusive -- TODO confirm
# against gsmls.get_listings); duplicates are removed by id further down.
GSMLS_PRICE_BANDS = [(350000, 400000), (400000, 515000)]

gsmls_listings = []
for county in tqdm(counties):
    for min_list_price, max_list_price in GSMLS_PRICE_BANDS:
        gsmls_listings += get_listings(county,
                                       min_list_price=min_list_price,
                                       max_list_price=max_list_price,
                                       min_bedrooms=3,
                                       min_bathrooms=2)

print(f"Downloaded {len(gsmls_listings)} listings from GSMLS.")
gsmls_listings = [x for x in gsmls_listings if x['address']]
print(f"Filtered down to {len(gsmls_listings)} listings due to null address.")

# Keep the first occurrence of every listing id. Listings without an id cannot
# be compared, so they are all kept.
seen_ids = set()
unique_listings = []
for listing in gsmls_listings:
    listing_id = listing.get('id')
    if listing_id is not None:
        if listing_id in seen_ids:
            continue
        seen_ids.add(listing_id)
    unique_listings.append(listing)
duplicate_count = len(gsmls_listings) - len(unique_listings)
if duplicate_count:
    print(f"Removed {duplicate_count} duplicate listings returned by overlapping price bands.")
gsmls_listings = unique_listings

# Tag every record with its origin so the sources can be told apart once merged.
for listing in gsmls_listings:
    listing['source'] = 'GSMLS'


Downloaded 961 listings from GSMLS.
Filtered down to 909 listings due to null address.


**Download listings from [NJMLS](http://www.njmls.com/)**

In [5]:
from njmls import get_listings, get_listing_detail

def get_listing_detail_wrapper(listing):
    """Fetch the detail record for a search result and carry its coordinates over.

    Args:
        listing: A search result providing ``'id'``, ``'lat'`` and ``'lng'``.

    Returns:
        Whatever ``get_listing_detail`` returns for ``listing['id']``, with its
        ``'lat'`` and ``'lng'`` entries set from the search result.
    """
    detail = get_listing_detail(listing['id'])
    for coordinate in ('lat', 'lng'):
        detail[coordinate] = listing[coordinate]
    return detail
    
# Each county is searched once per (min_price, max_price) band. The bands share
# the 400000 boundary, so the same listing may come back from both searches
# (assuming the bounds are inclusive -- TODO confirm against njmls.get_listings).
NJMLS_PRICE_BANDS = [(350000, 400000), (400000, 515000)]

njmls_listings = []
for county in tqdm(counties):

    # Keyed by id so a listing returned by both bands triggers only one detail
    # request; insertion order keeps the original band-by-band ordering.
    listings_by_id = {}
    for min_price, max_price in NJMLS_PRICE_BANDS:
        band_listings = get_listings(
                min_beds=3,
                min_baths=2,
                county_search=True,
                min_price=min_price,
                max_price=max_price,
                counties=[county.upper()],
                proptypes=['1'])
        for search_result in band_listings:
            listings_by_id.setdefault(search_result['id'], search_result)

    current_listings = list(listings_by_id.values())

    # Detail pages are fetched concurrently; e.map yields results in input order.
    with ThreadPoolExecutor(max_workers=20) as e:
        njmls_listings += tqdm(e.map(get_listing_detail_wrapper, current_listings), total=len(current_listings))

print(f"Downloaded {len(njmls_listings)} listings from NJMLS.")

njmls_listings = [x for x in njmls_listings if x['address']]
print(f"Filtered down to {len(njmls_listings)} listings due to null address.")

# Tag every record with its origin so the sources can be told apart once merged.
for listing in njmls_listings:
    listing['source'] = 'NJMLS'


Downloaded 532 listings from NJMLS.
Filtered down to 515 listings due to null address.


In [6]:
# One combined work list for the enrichment cells below. The dicts are shared
# with gsmls_listings / njmls_listings, so fields added through `listings` are
# visible through those lists as well.
listings = [*gsmls_listings, *njmls_listings]

**Geocode each address using Google Maps API**

In [7]:
from geolocate import geocode
def _component_names(address_components, component_type):
    """Return the ``long_name`` of every address component tagged with ``component_type``."""
    return [part['long_name'] for part in address_components if component_type in part['types']]


for listing in tqdm(listings):
    if not listing.get('address'):
        continue

    # First attempt: address and city exactly as the MLS reported them.
    source = listing['address'] + ' ' + listing['city'] + ', ' + 'NJ'
    geocoded = geocode(source)
    if len(geocoded) == 0:
        # Second attempt: drop municipality-type words from the city name.
        simplified_city = listing['city'].replace(' City', '').replace(' Boro Twp.', '').replace(' Boro', '')
        source2 = listing['address'] + ' ' + simplified_city + ', ' + 'NJ'
        geocoded = geocode(source2)
        if len(geocoded) == 0:
            print(f"could not geocode: {source} or {source2}")
            continue

    # Only the first geocoding result is used.
    best_match = geocoded[0]
    location = best_match['geometry']['location']
    listing['lat'] = location['lat']
    listing['lng'] = location['lng']
    listing['formatted_address'] = best_match['formatted_address']

    # City: prefer the 'locality' component, fall back to 'administrative_area_level_3';
    # if neither exists the MLS-provided city is left untouched.
    city_names = _component_names(best_match['address_components'], 'locality')
    if not city_names:
        city_names = _component_names(best_match['address_components'], 'administrative_area_level_3')
    if city_names:
        listing['city'] = city_names[0]
    else:
        print("could not find city for", listing['formatted_address'], best_match['address_components'])

    # County: the 'administrative_area_level_2' component without its ' County' suffix.
    county_names = _component_names(best_match['address_components'], 'administrative_area_level_2')
    if county_names:
        listing['county'] = county_names[0].replace(' County', '')
    else:
        print("could not find county for", listing['source'], listing['id'], listing['formatted_address'], best_match['address_components'])

could not find county for GSMLS 3426835 46 Veranda Ave, North Caldwell, NJ, USA [{'long_name': '46', 'short_name': '46', 'types': ['street_number']}, {'long_name': 'Veranda Avenue', 'short_name': 'Veranda Ave', 'types': ['route']}, {'long_name': 'North Caldwell', 'short_name': 'North Caldwell', 'types': ['locality', 'political']}, {'long_name': 'New Jersey', 'short_name': 'NJ', 'types': ['administrative_area_level_1', 'political']}, {'long_name': 'United States', 'short_name': 'US', 'types': ['country', 'political']}]
could not geocode: 4 Shadtree Lane Mount Arlington Boro, NJ or 4 Shadtree Lane Mount Arlington, NJ
could not geocode: 2 Shadetree Lane Mount Arlington Boro, NJ or 2 Shadetree Lane Mount Arlington, NJ
could not geocode: 15 Shadetree Ln Mount Arlington Boro, NJ or 15 Shadetree Ln Mount Arlington, NJ
could not geocode: 1 Harbor Front Plz A1 Elizabeth City, NJ or 1 Harbor Front Plz A1 Elizabeth, NJ
could not geocode: 1 Harbor Front Ter D1 Elizabeth City, NJ or 1 Harbor Front 

**Find closest NJ Transit Park and Ride to each address**

In [8]:
# geopy deprecated `vincenty` in 1.13 and removed it in 2.0 in favour of
# `geodesic`; fall back to `vincenty` only on installations too old for `geodesic`.
try:
    from geopy.distance import geodesic as _distance
except ImportError:
    from geopy.distance import vincenty as _distance

with open('park_and_rides.json') as f:
    park_and_rides = json.load(f)

# The bus-only subset does not depend on the listing, so build it once.
park_and_rides_bus = [x for x in park_and_rides if x['type'] == 'bus']

# TODO: also find the closest rail-only park and ride separately?


def _closest_park_and_ride(candidates, home):
    """Return ``(park_and_ride, miles)`` for the candidate nearest to ``home``.

    Args:
        candidates: Park-and-ride records, each with a ``'location'`` entry.
        home: ``(lat, lng)`` tuple of the listing.

    Returns:
        The nearest record and its distance in miles, or ``None`` when
        ``candidates`` is empty.
    """
    if not candidates:
        return None
    distances = [_distance(pr['location'], home).miles for pr in candidates]
    closest_index = int(np.argmin(distances))
    return candidates[closest_index], distances[closest_index]


for listing in tqdm(listings):
    # Listings without coordinates cannot be measured.
    if not listing.get('lat') or not listing.get('lng'): continue
    home = (listing['lat'], listing['lng'],)

    # Nearest park and ride of any type.
    closest = _closest_park_and_ride(park_and_rides, home)
    if closest is not None:
        park_and_ride, miles = closest
        listing['park_and_ride'] = park_and_ride
        listing['park_and_ride_name'] = park_and_ride['name']
        listing['park_and_ride_type'] = park_and_ride['type']
        listing['park_and_ride_distance'] = miles

    # Nearest park and ride whose type is 'bus'.
    closest_bus = _closest_park_and_ride(park_and_rides_bus, home)
    if closest_bus is not None:
        park_and_ride_bus, miles_bus = closest_bus
        listing['park_and_ride_bus'] = park_and_ride_bus
        listing['park_and_ride_bus_name'] = park_and_ride_bus['name']
        listing['park_and_ride_bus_type'] = park_and_ride_bus['type']
        listing['park_and_ride_bus_distance'] = miles_bus




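**Get commute times for each address: public transit from home to Port Authority, driving to 94 Old Short Hills Road in Livingston (the `barnabas_*` fields), and driving to the nearest park and ride plus transit from there to New York**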

In [9]:
from geolocate import get_directions, get_driving_directions

def _duration_fields(directions):
    """Return ``(value, text)`` of a directions result's duration, or ``(None, None)`` if it is empty."""
    if not directions:
        return None, None
    return directions['duration']['value'], directions['duration']['text']


# (source, destination, exception) for every lookup that raised, so failures are
# visible after the run instead of being silently discarded.
direction_errors = []

for listing in tqdm(listings):
    # Listings that were not geocoded have no formatted address to route from.
    if not listing.get('formatted_address'): continue

    # get transit directions from source to port authority
    source = listing['formatted_address']
    destination = 'Port Authority Bus Terminal'
    try:
        directions = get_directions(source, destination, mode='transit')
    except Exception as e:
        # Best effort: the nyc_* fields stay unset for this listing.
        direction_errors.append((source, destination, e))
    else:
        listing['nyc_duration'], listing['nyc_duration_text'] = _duration_fields(directions)
        listing['nyc_instructions'] = directions['instructions'] if directions else None

    # get driving directions from source to 94 Old Short Hills Road, Livingston, NJ
    source = listing['formatted_address']
    destination = '94 Old Short Hills Road, Livingston, NJ'
    try:
        directions = get_directions(source, destination, mode='driving')
    except Exception as e:
        # Best effort: the barnabas_* fields stay unset for this listing.
        direction_errors.append((source, destination, e))
    else:
        listing['barnabas_duration'], listing['barnabas_duration_text'] = _duration_fields(directions)
        listing['barnabas_instructions'] = directions['instructions'] if directions else None

    if listing.get('park_and_ride'):

        # calculate time from home -> park and ride (requested once per listing)
        source = listing['formatted_address']
        # "lat, lng" text built from the stored location by stripping the tuple parentheses
        destination = str(tuple(listing['park_and_ride']['location']))[1:-1]
        directions = get_directions(source, destination, mode='driving')
        if directions is None:
            print(f"unable to find driving directions from home ({source}) to park and ride ({destination})")
        duration1, duration_text1 = _duration_fields(directions)
        listing['park_and_ride_duration1'] = duration1
        listing['park_and_ride_duration_text1'] = duration_text1

        # calculate time from park and ride -> NY
        if listing['park_and_ride']['type'] == 'rail':
            destination = 'New York Penn Station'
        else:
            destination = 'Port Authority Bus Terminal'
        source = str(tuple(listing['park_and_ride']['location']))[1:-1]
        directions = get_directions(source, destination, mode='transit')
        if directions is None:
            print(f"unable to find transit directions from park and ride ({source}) to NY ({destination})")
        duration2, duration_text2 = _duration_fields(directions)
        listing['park_and_ride_duration2'] = duration2
        listing['park_and_ride_duration_text2'] = duration_text2

        if not duration1 or not duration2:
            print(listing)

        # sum total time from home -> NY; without both legs there is no total
        # (adding None would raise TypeError and abort the whole run)
        if duration1 is None or duration2 is None:
            listing['park_and_ride_duration'] = None
            listing['park_and_ride_duration_text'] = None
        else:
            listing['park_and_ride_duration'] = duration1 + duration2
            listing['park_and_ride_duration_text'] = ', '.join([duration_text1, duration_text2])

print(sum(1 for listing in listings if listing.get('park_and_ride_duration')),
      "out of", len(listings), "have park and ride commute times.")

print(sum(1 for listing in listings if listing.get('nyc_duration')),
  "out of", len(listings), "have walking commute times.")

if direction_errors:
    print(len(direction_errors), "direction lookups raised an exception (see direction_errors).")


1418 out of 1424 have park and ride commute times.
1214 out of 1424 have walking commute times.


In [10]:
def gsmls_filters(x):
    """Build the boolean row mask that selects acceptable GSMLS listings.

    A row passes when all of the following hold:
      * it has an address and its city is not in ``blacklist_cities``;
      * ``barnabas_duration`` is below ``HALF_HOUR + TEN_MINUTES``;
      * either ``park_and_ride_duration1`` is below ``TEN_MINUTES`` and
        ``park_and_ride_duration`` is below ``ONE_HOUR_TEN_MINUTES``, or
        ``nyc_duration`` is below ``ONE_HOUR_TEN_MINUTES``;
      * its style contains none of the excluded keywords;
      * heat source, heat system, water and sewer do not mention oil, radiator,
        well or septic respectively.

    Missing text values count as a match (``na=True``) and therefore exclude
    the row, for ``style`` as well as for the utility columns.

    Args:
        x: DataFrame of GSMLS listings with the columns referenced above.

    Returns:
        Boolean Series aligned with ``x``.
    """
    # Style keywords that disqualify a listing (matched case-insensitively).
    excluded_styles = ['Townhouse', 'Bi-Level', 'Cape Cod', 'Multi Floor Unit', '1/2 Duplex']

    # `&` binds tighter than `|`; spell the grouping out instead of relying on it.
    park_and_ride_ok = ((x.park_and_ride_duration1 < TEN_MINUTES) &
                        (x.park_and_ride_duration < ONE_HOUR_TEN_MINUTES))
    commute_ok = park_and_ride_ok | (x.nyc_duration < ONE_HOUR_TEN_MINUTES)

    mask = ((x.address.notnull()) &
            (~x.city.isin(blacklist_cities)) &
            (x.barnabas_duration < HALF_HOUR + TEN_MINUTES) &
            commute_ok)

    for style_keyword in excluded_styles:
        # Without na=..., a null style yields NaN in the result and `~` fails on it.
        mask = mask & ~x['style'].str.contains(style_keyword, case=False, na=True)

    return (mask &
            (~x.heat_source.str.contains('oil', case=False, na=True)) &
            (~x.heat_system.str.contains('radiator', case=False, na=True)) &
            # (x.cool_system.str.contains('central', case=False, na=True)) &
            (~x.water.str.contains('well', case=False, na=True)) &
            (~x.sewer.str.contains('septic', case=False, na=True)))

def show_gsmls():
    """Return the GSMLS listings that pass ``gsmls_filters``, indexed by ``id``.

    Reads the module-level ``gsmls_listings``. Rows are ordered by
    ``park_and_ride_duration`` and then ``park_and_ride_distance``, both ascending.
    """
    indexed = pd.DataFrame(gsmls_listings).set_index('id')
    accepted = indexed[gsmls_filters(indexed)]
    return accepted.sort_values(
        by=['park_and_ride_duration', 'park_and_ride_distance'],
        ascending=[True, True],
    )

**GSMLS: Large square footage, low taxes**

In [11]:
# Filtered GSMLS listings with sqft above 2000 and tax below 10000,
# ordered by barnabas_duration.
df1a = show_gsmls()
df1a = df1a[(df1a.tax < 10000) & (df1a.sqft > 2000)].sort_values('barnabas_duration')
print(len(df1a), "listings after filtering")
preview_gsmls_df(df1a)

8 listings after filtering


Unnamed: 0_level_0,address,city,county,bedrooms,baths_full,baths_part,sqft,price,tax,style,rooms,park_and_ride_name,park_and_ride_duration_text,park_and_ride_type,park_and_ride_distance,park_and_ride_bus_name,park_and_ride_bus_distance,barnabas_duration_text,nyc_duration_text,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3424306,980 Moessner Ave,Union,Union,5,3,0,2637.0,429000,8911,Colonial,10,Springfield Center - Springfield Twp,"4 mins, 1 hour 3 mins",bus,0.939318,Springfield Center - Springfield Twp,0.939318,16 mins,57 mins,GSMLS
3435172,6 Boyden Ave,Maplewood,Essex,4,3,1,3724.0,490000,9971,Colonial,9,South Orange,"7 mins, 36 mins",rail,1.054738,Irvington Bus Terminal,1.213625,17 mins,47 mins,GSMLS
3400060,72 Crescent Ave,Totowa,Passaic,3,2,1,2232.0,429000,0,Colonial,8,Little Falls,"6 mins, 1 hour 4 mins",rail,1.310098,Wayne/Route 23 Transit Center,2.166353,30 mins,1 hour 5 mins,GSMLS
3435024,22 Euclid Ave,Ridgefield Park,Bergen,3,2,2,5001.0,479800,8665,Colonial,9,Vince Lombardi,"5 mins, 25 mins",bus,1.637446,Vince Lombardi,1.637446,31 mins,28 mins,GSMLS
3432363,460 Glenwood Ave,Teaneck,Bergen,3,2,0,3419.0,369999,7139,Colonial,6,Hackensack Terminal,"8 mins, 45 mins",bus,2.13727,Hackensack Terminal,2.13727,32 mins,34 mins,GSMLS
3400653,112 S Valley Rd,Lincoln Park,Morris,4,2,1,2371.0,499000,6389,Custom Home,8,Mountain View,"3 mins, 1 hour 11 mins",rail,0.537797,Mothers Park & Ride,0.96576,33 mins,1 hour 3 mins,GSMLS
3434968,6 First St,Pequannock Township,Morris,4,2,1,3300.0,350000,0,Colonial,10,Lincoln Park,"5 mins, 1 hour 5 mins",rail,1.674614,Mothers Park & Ride,2.179765,33 mins,1 hour 3 mins,GSMLS
3424983,24 Yorkshire Pl,Sayreville,Middlesex,6,3,0,7501.0,385000,9034,"Development Home, Ranch",12,Sayreville,"4 mins, 55 mins",bus,0.248533,Sayreville,0.248533,38 mins,1 hour 6 mins,GSMLS


**GSMLS: Large square footage, medium taxes**

In [12]:
# Filtered GSMLS listings with sqft above 2000 and tax in [10000, 13500),
# ordered by price. The lower bound is inclusive so that a tax of exactly 10000,
# which the low-tax table (tax < 10000) leaves out, is shown here.
df1b = show_gsmls()
df1b = df1b[(df1b.sqft > 2000) & (df1b.tax >= 10000) & (df1b.tax < 13500)]
df1b = df1b.sort_values('price')
print(len(df1b), "listings after filtering")
preview_gsmls_df(df1b)

8 listings after filtering


Unnamed: 0_level_0,address,city,county,bedrooms,baths_full,baths_part,sqft,price,tax,style,rooms,park_and_ride_name,park_and_ride_duration_text,park_and_ride_type,park_and_ride_distance,park_and_ride_bus_name,park_and_ride_bus_distance,barnabas_duration_text,nyc_duration_text,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3416166,1148-50 CUSHING RD,Plainfield,Union,5,3,0,2016.0,364900,12500,Ranch,13,Netherwood,"4 mins, 1 hour 2 mins",rail,0.825262,Watchung Park & Ride,1.83482,35 mins,1 hour 29 mins,GSMLS
3410954,425 Pine Brook Rd,Lincoln Park,Morris,3,3,1,2406.0,388000,11438,"Colonial, Custom Home",8,Lincoln Park,"3 mins, 1 hour 5 mins",rail,0.693314,Mothers Park & Ride,1.817466,30 mins,1 hour 6 mins,GSMLS
3436476,21 Rockaway Pl,Parsippany-Troy Hills,Morris,4,2,1,2120.0,434900,10208,"Colonial, Split Level",8,Boonton,"10 mins, 1 hour 16 mins",rail,2.532123,Willowbrook Mall,6.263236,21 mins,1 hour 8 mins,GSMLS
3425447,87 Hirliman Rd,Teaneck,Bergen,4,2,1,2560.0,475000,11446,Split Level,10,New Bridge Landing,"8 mins, 49 mins",rail,2.167687,Hackensack Terminal,2.618347,40 mins,38 mins,GSMLS
3433211,126 Linwood Ter,Clifton,Passaic,3,3,0,2200.0,489500,11067,Colonial,7,Allwood Road,"5 mins, 33 mins",bus,0.897368,Allwood Road,0.897368,27 mins,41 mins,GSMLS
3378717,4514 Liberty Ave,North Bergen,Hudson,4,2,1,2913.0,499000,10595,Colonial,9,Tonnelle Avenue,"1 min, 26 mins",light_rail,0.232849,North Bergen,0.742872,31 mins,30 mins,GSMLS
3434419,157 Indian Run Pkwy,Union,Union,4,4,0,2800.0,499000,10477,"Colonial, Custom Home",11,Union Center - Union Twp,"4 mins, 47 mins",bus,0.639539,Union Center - Union Twp,0.639539,23 mins,1 hour 9 mins,GSMLS
3430057,7 Robinwood Drive,Little Falls,Passaic,5,3,1,3080.0,499900,12758,Colonial,9,Montclair State University,"6 mins, 57 mins",rail,1.020244,Allwood Road,2.453861,25 mins,50 mins,GSMLS


**GSMLS: Large square footage, high taxes**

In [13]:
# Filtered GSMLS listings with sqft above 2000 and tax in [13500, 15000),
# ordered by tax. The lower bound is inclusive so that a tax of exactly 13500,
# which the medium-tax table (tax < 13500) leaves out, is shown here.
df1bb = show_gsmls()
df1bb = df1bb[(df1bb.sqft > 2000) & (df1bb.tax >= 13500) & (df1bb.tax < 15000)]
df1bb = df1bb.sort_values('tax')
print(len(df1bb), "listings after filtering")
preview_gsmls_df(df1bb)

4 listings after filtering


Unnamed: 0_level_0,address,city,county,bedrooms,baths_full,baths_part,sqft,price,tax,style,rooms,park_and_ride_name,park_and_ride_duration_text,park_and_ride_type,park_and_ride_distance,park_and_ride_bus_name,park_and_ride_bus_distance,barnabas_duration_text,nyc_duration_text,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3434778,79 Winding Ln,Bloomfield,Essex,4,3,0,3054.0,489000,13683,Split Level,11,Watchung Avenue,"6 mins, 45 mins",rail,1.334357,Allwood Road,1.641766,24 mins,56 mins,GSMLS
3433347,340 Walker Rd,West Orange,Essex,3,2,1,2066.0,440000,13755,Split Level,9,Mountain Station,"5 mins, 43 mins",rail,1.151145,Irvington Bus Terminal,3.526863,8 mins,1 hour 37 mins,GSMLS
3436125,12 Ridge Ave,Little Falls,Passaic,4,3,1,3170.0,439000,13938,Colonial,10,Little Falls,"3 mins, 1 hour 4 mins",rail,0.408777,Willowbrook Mall,1.827223,24 mins,50 mins,GSMLS
3418703,35 Van Winkle Ct,Woodland Park,Passaic,3,3,0,2500.0,499900,14610,Ranch,7,Montclair State University,"5 mins, 57 mins",rail,1.174653,Allwood Road,2.679599,30 mins,1 hour 5 mins,GSMLS


**GSMLS: Null square footage, low taxes**

In [14]:
# Filtered GSMLS listings with no recorded sqft and tax below 10000,
# ordered by city.
df1c = show_gsmls()
df1c = df1c[(df1c.tax < 10000) & (df1c.sqft.isnull())].sort_values('city')
print(len(df1c), "listings after filtering")
preview_gsmls_df(df1c)

60 listings after filtering


Unnamed: 0_level_0,address,city,county,bedrooms,baths_full,baths_part,sqft,price,tax,style,rooms,park_and_ride_name,park_and_ride_duration_text,park_and_ride_type,park_and_ride_distance,park_and_ride_bus_name,park_and_ride_bus_distance,barnabas_duration_text,nyc_duration_text,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3428699,27 Ira Rd,Cedar Grove,Essex,4,2,0,,438880,7619,Colonial,10,Upper Montclair,"6 mins, 48 mins",rail,1.039111,Allwood Road,2.897116,20 mins,51 mins,GSMLS
3433324,217 Stevens Ave,Cedar Grove,Essex,3,3,1,,509000,9231,Colonial,6,Little Falls,"2 mins, 1 hour 4 mins",rail,0.42371,Willowbrook Mall,1.753137,23 mins,1 hour 4 mins,GSMLS
3428633,447 Westfield Ave,Clark,Union,4,2,1,,399999,9631,"Custom Home, Split Level",10,Garwood,"9 mins, 53 mins",rail,1.974796,Rutgers Lane Hospital - Union Twp,4.251719,29 mins,1 hour 30 mins,GSMLS
3436414,44 KATHRYN ST,Clark,Union,4,2,0,,469900,8551,"Custom Home, See Remarks",7,Rahway,"7 mins, 48 mins",rail,1.507395,Carteret,4.25612,30 mins,1 hour 23 mins,GSMLS
3428897,80 Fulton St,Clark,Union,3,2,1,,449900,8584,Ranch,8,Rahway,"7 mins, 48 mins",rail,1.516515,Rutgers Lane Hospital - Union Twp,3.911853,29 mins,1 hour 21 mins,GSMLS
3435731,42 Livingston St,Clifton,Passaic,3,2,1,,478888,7605,Colonial,9,Clifton,"7 mins, 44 mins",rail,1.562199,Allwood Road,2.143985,27 mins,55 mins,GSMLS
3435342,52 Rowland Ave,Clifton,Passaic,4,2,1,,399000,8047,Custom Home,8,Passaic,"4 mins, 40 mins",rail,0.76888,Allwood Road,1.420538,30 mins,48 mins,GSMLS
3422738,119 Edgewood Ave,Clifton,Passaic,3,2,1,,374900,9515,Colonial,6,Passaic,"4 mins, 40 mins",rail,0.830379,Clifton Commons,1.141417,29 mins,51 mins,GSMLS
3425188,21 Clay St,Clifton,Passaic,4,2,1,,389900,8905,See Remarks,7,Clifton Commons,"3 mins, 47 mins",bus,0.283583,Clifton Commons,0.283583,28 mins,46 mins,GSMLS
3409537,118 W 2nd St,Clifton,Passaic,4,3,0,,379000,7469,Colonial,9,Clifton,"6 mins, 44 mins",rail,1.40358,Passaic Bus Terminal,2.239075,27 mins,51 mins,GSMLS


**GSMLS: Null square footage, medium taxes**

In [15]:
# Filtered GSMLS listings with no recorded sqft and tax in [10000, 13500),
# ordered by city. The lower bound is inclusive so that a tax of exactly 10000,
# which the low-tax table (tax < 10000) leaves out, is shown here.
df1d = show_gsmls()
df1d = df1d[(df1d.sqft.isnull()) & (df1d.tax >= 10000) & (df1d.tax < 13500)]
df1d = df1d.sort_values('city')
print(len(df1d), "listings after filtering")
preview_gsmls_df(df1d)

53 listings after filtering


Unnamed: 0_level_0,address,city,county,bedrooms,baths_full,baths_part,sqft,price,tax,style,rooms,park_and_ride_name,park_and_ride_duration_text,park_and_ride_type,park_and_ride_distance,park_and_ride_bus_name,park_and_ride_bus_distance,barnabas_duration_text,nyc_duration_text,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3400950,47 Hearthstone Rd,Bloomfield,Essex,4,2,0,,350000,12699,Split Level,10,Watchung Avenue,"6 mins, 45 mins",rail,1.459975,Allwood Road,1.638773,25 mins,57 mins,GSMLS
3425147,85 Bellevue Ter,Bloomfield,Essex,3,2,1,,424900,12668,Split Level,8,Allwood Road,"4 mins, 33 mins",bus,1.063173,Allwood Road,1.063173,27 mins,58 mins,GSMLS
3430823,6 Colony Dr,Caldwell,Essex,4,2,0,,430000,12856,Colonial,7,Little Falls,"6 mins, 1 hour 4 mins",rail,1.248499,Willowbrook Mall,1.470499,20 mins,1 hour 15 mins,GSMLS
3428051,42 KENNETH PL,Clark,Union,3,2,0,,439000,10643,Split Level,8,Garwood,"10 mins, 53 mins",rail,2.017407,Rutgers Lane Hospital - Union Twp,4.342921,29 mins,1 hour 32 mins,GSMLS
3435953,48 Godwin Pl,Clifton,Passaic,3,2,1,,399000,13426,Split Level,8,Allwood Road,"3 mins, 33 mins",bus,0.553667,Allwood Road,0.553667,27 mins,53 mins,GSMLS
3433595,1 Friar Ln,Clifton,Passaic,3,2,1,,489000,11135,Custom Home,7,Allwood Road,"4 mins, 33 mins",bus,0.84656,Allwood Road,0.84656,25 mins,44 mins,GSMLS
3401881,41 Ridgewood Rd,Clifton,Passaic,4,3,0,,399900,11114,Colonial,9,Allwood Road,"4 mins, 33 mins",bus,0.702121,Allwood Road,0.702121,28 mins,39 mins,GSMLS
3432605,790 Grove St,Clifton,Passaic,3,2,0,,499000,12971,Ranch,6,Montclair Heights,"4 mins, 52 mins",rail,0.712505,Allwood Road,0.777666,25 mins,52 mins,GSMLS
3426860,193 2nd St,Clifton,Passaic,5,2,1,,424900,10667,"Colonial, Victorian",9,Clifton,"4 mins, 44 mins",rail,0.72888,Passaic Bus Terminal,1.218341,29 mins,50 mins,GSMLS
3406313,51 Karen Dr,Clifton,Passaic,3,2,1,,423000,10009,Split Level,8,Clifton,"6 mins, 44 mins",rail,0.927682,Allwood Road,1.325885,27 mins,44 mins,GSMLS


**GSMLS: Null square footage, high taxes**

In [16]:
# Filtered GSMLS listings with no recorded sqft and tax in [13500, 15000),
# ordered by city. The lower bound is inclusive so that a tax of exactly 13500,
# which the medium-tax table (tax < 13500) leaves out, is shown here.
df1dd = show_gsmls()
df1dd = df1dd[(df1dd.sqft.isnull()) & (df1dd.tax >= 13500) & (df1dd.tax < 15000)]
df1dd = df1dd.sort_values('city')
print(len(df1dd), "listings after filtering")
preview_gsmls_df(df1dd)

8 listings after filtering


Unnamed: 0_level_0,address,city,county,bedrooms,baths_full,baths_part,sqft,price,tax,style,rooms,park_and_ride_name,park_and_ride_duration_text,park_and_ride_type,park_and_ride_distance,park_and_ride_bus_name,park_and_ride_bus_distance,barnabas_duration_text,nyc_duration_text,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3419648,13 BELLEVUE AVE,Bloomfield,Essex,3,3,0,,425000,14104,Colonial,7,Watchung Avenue,"5 mins, 45 mins",rail,1.211963,Allwood Road,1.26657,26 mins,52 mins,GSMLS
3417461,15 George St,Bloomfield,Essex,4,2,1,,439000,14719,Colonial,8,Watchung Avenue,"6 mins, 45 mins",rail,1.388071,Allwood Road,1.715933,22 mins,48 mins,GSMLS
3424669,3 Trella Ter,Clifton,Passaic,4,3,0,,499000,13539,"Custom Home, Ranch, Raised Ranch",10,Clifton,"8 mins, 44 mins",rail,1.260493,Allwood Road,1.546734,28 mins,47 mins,GSMLS
3420887,682 Union Ave,Hillside,Union,6,4,1,,489900,13602,Colonial,16,North Elizabeth,"5 mins, 32 mins",rail,0.742322,Rutgers Lane Hospital - Union Twp,2.446791,30 mins,57 mins,GSMLS
3419601,3 W Walnut St,Metuchen,Middlesex,5,2,1,,487000,13848,Colonial,7,Metuchen,"3 mins, 51 mins",rail,0.560384,Sayreville,4.991073,38 mins,1 hour 11 mins,GSMLS
3421163,201 Maple Ave,Metuchen,Middlesex,5,2,2,,500000,14096,Ranch,13,Metuchen,"3 mins, 51 mins",rail,0.447927,Sayreville,5.845925,38 mins,1 hour 16 mins,GSMLS
3424889,1561 Cooper Rd,Scotch Plains,Union,4,2,0,,449900,14690,Colonial,9,Fanwood,"6 mins, 58 mins",rail,1.365185,Watchung Park & Ride,2.50014,34 mins,1 hour 35 mins,GSMLS
3428855,41 Hardwick Ln,Wayne,Passaic,6,3,1,,500000,13940,Colonial,11,Broadway Bus Terminal - Paterson,"12 mins, 59 mins",bus,2.520993,Broadway Bus Terminal - Paterson,2.520993,36 mins,1 hour 10 mins,GSMLS


**Final NJMLS Listings**

In [17]:
# NJMLS listings indexed by id, then narrowed to the acceptable ones:
# not in a blacklisted city, basement neither a crawlspace nor the literal
# 'None', style not exactly 'Cape Cod', barnabas_duration under HALF_HOUR, and a
# park-and-ride commute with a drive under TEN_MINUTES and a total under
# ONE_HOUR_TEN_MINUTES.
df2 = pd.DataFrame(njmls_listings).set_index('id')
df2 = df2[
    (~df2.city.isin(blacklist_cities)) &
    # na=False: a missing basement value is not a crawlspace match. Without it
    # the result contains NaN for such rows and `~` fails on it.
    (~df2.basement.str.contains('Crawlspace', na=False)) &
    (df2.basement != 'None') &
    (df2['style'] != 'Cape Cod') &
    (df2.barnabas_duration < HALF_HOUR) &
    (df2.park_and_ride_duration1 < TEN_MINUTES) &
    (df2.park_and_ride_duration < ONE_HOUR_TEN_MINUTES)
]
df2 = df2.sort_values(by=['city',], ascending=[True])
print(len(df2), "listings after filtering")
preview_njmls_df(df2)

55 listings after filtering


Unnamed: 0_level_0,address,city,county,bedrooms,baths_full,baths_part,price,tax,style,rooms,list_date,park_and_ride_name,park_and_ride_duration_text,park_and_ride_type,park_and_ride_distance,park_and_ride_bus_name,park_and_ride_bus_distance,barnabas_duration_text,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1735422,338 Essex Avenue,Bloomfield,Essex,5,2,1,439900,16232.0,Colonial,9,09/01/2017,Walnut Street,"5 mins, 42 mins",rail,0.751703,Allwood Road,2.776278,24 mins,NJMLS
1741304,85 Bellevue Terrace,Bloomfield,Essex,3,2,1,424900,12668.0,Split Level,8,10/13/2017,Allwood Road,"4 mins, 33 mins",bus,1.063173,Allwood Road,1.063173,27 mins,NJMLS
1736975,15 George Street,Bloomfield,Essex,4,2,1,439000,14719.0,Colonial,8,09/12/2017,Watchung Avenue,"6 mins, 45 mins",rail,1.390126,Allwood Road,1.715707,22 mins,NJMLS
1742512,10 Claremont Avenue,Bloomfield,Essex,4,2,2,479800,16663.0,Colonial,11,10/18/2017,Watchung Avenue,"6 mins, 45 mins",rail,1.176241,Allwood Road,1.444312,25 mins,NJMLS
1746287,175 Jerome Place,Bloomfield,Essex,4,3,0,459000,8715.0,Colonial,8,11/28/2017,Bloomfield,"5 mins, 36 mins",rail,0.632988,Clifton Commons,3.454818,21 mins,NJMLS
1727036,47 Hearthstone Road,Bloomfield,Essex,4,2,0,350000,12699.0,Split Level,10,06/30/2017,Watchung Avenue,"6 mins, 45 mins",rail,1.459975,Allwood Road,1.638773,25 mins,NJMLS
1748219,151 Sadler Road,Bloomfield,Essex,3,2,0,370000,11603.0,Colonial,7,12/17/2017,Clifton Commons,"8 mins, 47 mins",bus,1.67228,Clifton Commons,1.67228,23 mins,NJMLS
1748481,89 Mountain Avenue,Bloomfield,Essex,3,2,0,399000,13884.0,Colonial,6,12/21/2017,Watchung Avenue,"6 mins, 45 mins",rail,1.141127,Allwood Road,1.494435,25 mins,NJMLS
1744849,6 Colony Drive,Caldwell,Essex,4,2,0,430000,12856.0,Colonial,7,11/13/2017,Little Falls,"6 mins, 1 hour 4 mins",rail,1.248499,Willowbrook Mall,1.470499,20 mins,NJMLS
1701438,70 Devonshire Road,Cedar Grove,Essex,4,2,1,399000,9420.0,Split Level,8,01/12/2017,Upper Montclair,"4 mins, 48 mins",rail,0.849114,Allwood Road,2.706911,22 mins,NJMLS


In [18]:
# set([x['city'] for x in gsmls_listings])

In [19]:
# df = pd.DataFrame([x for x in gsmls_listings if x['city'] in [
#     'Edison',
#     'Elizabeth', 'Woodbridge Township', 'Perth Amboy', 'South Amboy', 'Dunellen', 'Dover',
#     'Matawan', 'South Plainfield', 'Piscataway']])
# df.set_index('id', inplace=True)
# df.sort_values('city', inplace=True)
# print(len(df))
# preview_gsmls_df(df)

In [20]:
# df = pd.DataFrame([x for x in njmls_listings if x['city'] in [
#     'Elizabeth', 'Woodbridge Township', 'Perth Amboy', 'South Amboy', 
#     'Matawan', 'South Plainfield', 'Piscataway']])
# df.set_index('id', inplace=True)
# print(len(df))
# preview_njmls_df(df)

In [21]:
# preview_gsmls_df(pd.DataFrame([x for x in gsmls_listings if x['sqft'] and 3000 < x['sqft'] < 4000]).set_index('id').sort_values(by='barnabas_duration'))