**TODO**
- New bus lines?

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to them take effect without a
# kernel restart.
%reload_ext autoreload
%autoreload 2

In [2]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from IPython.core.display import display, HTML
from concurrent.futures import ThreadPoolExecutor

pd.set_option('display.max_rows', None)

def _preview_listings_df(mydf, columns, csv_path, url_template, save=False):
    """Render `mydf[columns]` as an HTML table whose index cells are links.

    Shared implementation behind preview_gsmls_df / preview_njmls_df.

    Parameters
    ----------
    mydf : pandas.DataFrame
        Listings frame; its index values are substituted into `url_template`.
    columns : list of str
        Columns to show (and to save). All of them must exist in `mydf`.
    csv_path : str
        File written when `save` is true.
    url_template : str
        Listing URL containing an ``{id}`` placeholder.
    save : bool
        When true, also write the selected columns to `csv_path`.
    """
    view = mydf[columns]
    if save:
        view.to_csv(csv_path)

    def link(listing_id):
        # Named `listing_id` rather than `id` to avoid shadowing the builtin.
        url = url_template.format(id=listing_id)
        return f'<a target="_blank" href="{url}">{listing_id}</a>'

    # escape=False is required so the anchor tags produced by `link` are
    # emitted as markup instead of being HTML-escaped.
    table = view.to_html(formatters={'__index__': link}, escape=False)
    display(HTML(
        """<style>.dataframe td {white-space: nowrap;}</style>""" + table))


def preview_gsmls_df(mydf, save=False):
    """Display GSMLS listings as an HTML table with each id linked to its page.

    With `save=True` the displayed columns are also written to 'gsmls.csv'.
    """
    columns = [
        'address', 'city', 'county', 'bedrooms', 'baths_full', 'baths_part',
        'sqft', 'price', 'tax', 'style', 'rooms', 'park_and_ride_name',
        'park_and_ride_duration_text', 'park_and_ride_type', 'park_and_ride_distance',
        'park_and_ride_bus_name', 'park_and_ride_bus_distance',
        'barnabas_duration_text', 'source']
    _preview_listings_df(
        mydf, columns, 'gsmls.csv',
        'https://gsmls.herokuapp.com/properties/{id}',
        save=save)


def preview_njmls_df(mydf, save=False):
    """Display NJMLS listings as an HTML table with each id linked to its page.

    With `save=True` the displayed columns are also written to 'njmls.csv'.
    """
    columns = ['address', 'city', 'county', 'bedrooms', 'baths_full', 'baths_part',
               'price', 'tax', 'style', 'rooms', 'list_date', 'park_and_ride_name',
               'park_and_ride_duration_text', 'park_and_ride_type', 'park_and_ride_distance',
               'park_and_ride_bus_name', 'park_and_ride_bus_distance',
               'barnabas_duration_text', 'source']
    _preview_listings_df(
        mydf, columns, 'njmls.csv',
        'https://www.njmls.com/listings/index.cfm?action=dsp.info&mlsnum={id}',
        save=save)


**Define global parameters**

In [3]:
# Counties to search; both download cells iterate over this list in order.
counties = [
    'Passaic',
    'Hudson',
    'Essex',
    'Middlesex',
    'Morris',
    'Bergen',
    'Union',
]

# Duration thresholds, expressed in seconds (3600 per hour). The derived
# values are floats because they are built with true division.
ONE_HOUR = 60 * 60
TEN_MINUTES = ONE_HOUR / 6
HALF_HOUR = ONE_HOUR / 2
ONE_HOUR_TEN_MINUTES = ONE_HOUR + TEN_MINUTES
ONE_HOUR_THIRTY_MINUTES = ONE_HOUR + HALF_HOUR
ONE_HOUR_FORTY_MINUTES = ONE_HOUR_THIRTY_MINUTES + TEN_MINUTES

# Demographic reference: https://censusreporter.org/
#
# City of Orange: https://censusreporter.org/profiles/06000US3401313045-city-of-orange-township-essex-county-nj/
# East Orange: https://censusreporter.org/profiles/16000US3419390-east-orange-nj/
# South Orange: https://censusreporter.org/profiles/06000US3401369274-south-orange-village-township-essex-county-nj/
# West Orange: https://censusreporter.org/profiles/06000US3401379800-west-orange-township-essex-county-nj/

# Cities removed from the final tables by the `city.isin(...)` filters.
blacklist_cities = [
    'Paterson',
    'East Orange',
    'City of Orange',
    'Belleville',
    'Elizabeth',
    'Jefferson',
    'Passaic',
    'Newark',
    'East Newark',
    'Linden',
    'Nutley',
    'Garfield',
]

**Download listings from [GSMLS](https://www.gsmls.com/)**

In [4]:
from gsmls import get_listings

# Query GSMLS one county at a time and pool everything into a single list.
gsmls_listings = []
for county in tqdm(counties):
    county_results = get_listings(
        county,
        min_list_price=400000,
        max_list_price=500000,
        min_bedrooms=3,
        min_bathrooms=2,
    )
    gsmls_listings.extend(county_results)

print(f"Downloaded {len(gsmls_listings)} listings from GSMLS.")

# Keep only listings whose address is truthy (drops None / empty addresses).
gsmls_listings = list(filter(lambda entry: entry['address'], gsmls_listings))
print(f"Filtered down to {len(gsmls_listings)} listings due to null address.")

# Tag every record with its origin so the pooled list stays attributable.
for listing in gsmls_listings:
    listing['source'] = 'GSMLS'


Downloaded 596 listings from GSMLS.
Filtered down to 565 listings due to null address.


**Download listings from [NJMLS](http://www.njmls.com/)**

In [5]:
from njmls import get_listings, get_listing_detail

def get_listing_detail_wrapper(listing):
    """Fetch the detail record for a search result and carry its coordinates over.

    Calls get_listing_detail with listing['id'], copies the 'lat' and 'lng'
    values from the search result onto the returned record, and returns it.
    """
    detail = get_listing_detail(listing['id'])
    for key in ('lat', 'lng'):
        detail[key] = listing[key]
    return detail
    
# Search NJMLS county by county, then fetch every listing's detail record.
njmls_listings = []
for county in tqdm(counties):
    # Materialize the search results so they can be counted for the progress bar.
    current_listings = list(get_listings(
            min_beds=3,
            min_baths=2,
            county_search=True,
            min_price=400000,
            max_price=500000,
            counties=[county.upper()],
            proptypes=['1']))

    # Detail lookups are issued through a thread pool; the `with` block waits
    # for all of them, and e.map yields results in input order.
    with ThreadPoolExecutor(max_workers=50) as e:
        njmls_listings += tqdm(e.map(get_listing_detail_wrapper, current_listings), total=len(current_listings))

print(f"Downloaded {len(njmls_listings)} listings from NJMLS.")

# Drop records with a missing or empty address. `.get` matches the lookup used
# by the geocoding cell and tolerates records that lack the key entirely.
njmls_listings = [x for x in njmls_listings if x.get('address')]
print(f"Filtered down to {len(njmls_listings)} listings due to null address.")

# Tag every record with its origin so the pooled list stays attributable.
for listing in njmls_listings:
    listing['source'] = 'NJMLS'


Downloaded 341 listings from NJMLS.
Filtered down to 329 listings due to null address.


In [6]:
# Pool both sources into one work list. The list itself is new, but the listing
# objects are shared: fields added while iterating `listings` in later cells are
# therefore also visible through `gsmls_listings` and `njmls_listings`, which
# the final tables are built from.
listings = gsmls_listings + njmls_listings

**Geocode each address using Google Maps API**

In [7]:
from geolocate import geocode


def _component_names(components, kind):
    """Long names of every address component whose 'types' include `kind`."""
    return [part['long_name'] for part in components if kind in part['types']]


for listing in tqdm(listings):
    if not listing.get('address'):
        continue

    # First attempt: the city exactly as the listing spells it.
    source = listing['address'] + ' ' + listing['city'] + ', NJ'
    geocoded = geocode(source)
    if len(geocoded) == 0:
        # Second attempt: strip municipality suffixes from the city name.
        short_city = listing['city']
        for suffix in (' City', ' Boro Twp.', ' Boro'):
            short_city = short_city.replace(suffix, '')
        source2 = listing['address'] + ' ' + short_city + ', NJ'
        geocoded = geocode(source2)
        if len(geocoded) == 0:
            print(f"could not geocode: {source} or {source2}")
            continue

    # Only the first geocoding result is used.
    best = geocoded[0]
    listing['lat'] = best['geometry']['location']['lat']
    listing['lng'] = best['geometry']['location']['lng']
    listing['formatted_address'] = best['formatted_address']
    components = best['address_components']

    # City: prefer the 'locality' component, else 'administrative_area_level_3'.
    # When neither exists the listing keeps its original city value.
    city_names = _component_names(components, 'locality')
    if not city_names:
        city_names = _component_names(components, 'administrative_area_level_3')
    if city_names:
        listing['city'] = city_names[0]
    else:
        print("could not find city for", listing['formatted_address'], components)

    # County: 'administrative_area_level_2' with the ' County' suffix removed.
    county_names = _component_names(components, 'administrative_area_level_2')
    if county_names:
        listing['county'] = county_names[0].replace(' County', '')
    else:
        print("could not find county for", listing['source'], listing['id'], listing['formatted_address'], components)

could not find county for GSMLS 3426835 46 Veranda Ave, North Caldwell, NJ, USA [{'long_name': '46', 'short_name': '46', 'types': ['street_number']}, {'long_name': 'Veranda Avenue', 'short_name': 'Veranda Ave', 'types': ['route']}, {'long_name': 'North Caldwell', 'short_name': 'North Caldwell', 'types': ['locality', 'political']}, {'long_name': 'New Jersey', 'short_name': 'NJ', 'types': ['administrative_area_level_1', 'political']}, {'long_name': 'United States', 'short_name': 'US', 'types': ['country', 'political']}]
could not geocode: 2 Shadetree Lane Mount Arlington Boro, NJ or 2 Shadetree Lane Mount Arlington, NJ
could not geocode: 15 Shadetree Ln Mount Arlington Boro, NJ or 15 Shadetree Ln Mount Arlington, NJ
could not find county for NJMLS 1742511 46 Veranda Ave, North Caldwell, NJ, USA [{'long_name': '46', 'short_name': '46', 'types': ['street_number']}, {'long_name': 'Veranda Avenue', 'short_name': 'Veranda Ave', 'types': ['route']}, {'long_name': 'North Caldwell', 'short_name'

**Find closest NJ Transit Park and Ride to each address**

In [8]:
# geopy 2.0 removed `vincenty`; `geodesic` is its replacement. Fall back to
# `vincenty` only for old geopy installs that predate `geodesic`.
try:
    from geopy.distance import geodesic as geo_distance
except ImportError:
    from geopy.distance import vincenty as geo_distance

with open('park_and_rides.json') as f:
    park_and_rides = json.load(f)

# The bus-only subset does not depend on the listing, so build it once.
park_and_rides_bus = [x for x in park_and_rides if x['type'] == 'bus']


def closest_park_and_ride(origin, candidates):
    """Return ``(lot, miles)`` for the candidate lot nearest to `origin`.

    Parameters
    ----------
    origin : tuple
        ``(lat, lng)`` of the listing.
    candidates : list
        Park-and-ride records, each with a 'location' entry accepted by geopy.

    Raises
    ------
    ValueError
        If `candidates` is empty (raised by ``np.argmin``).
    """
    distances = [geo_distance(pr['location'], origin).miles for pr in candidates]
    closest_index = int(np.argmin(distances))
    return candidates[closest_index], distances[closest_index]


# TODO: also record the closest RAIL park and ride separately (only the overall
# closest lot and the closest BUS lot are recorded below).

for listing in tqdm(listings):
    # Listings that could not be geocoded have no coordinates; skip them.
    if not listing.get('lat') or not listing.get('lng'): continue
    origin = (listing['lat'], listing['lng'],)

    # Closest lot of any type.
    lot, miles = closest_park_and_ride(origin, park_and_rides)
    listing['park_and_ride'] = lot
    listing['park_and_ride_name'] = lot['name']
    listing['park_and_ride_type'] = lot['type']
    listing['park_and_ride_distance'] = miles

    # Closest lot among bus lots only.
    bus_lot, bus_miles = closest_park_and_ride(origin, park_and_rides_bus)
    listing['park_and_ride_bus'] = bus_lot
    listing['park_and_ride_bus_name'] = bus_lot['name']
    listing['park_and_ride_bus_type'] = bus_lot['type']
    listing['park_and_ride_bus_distance'] = bus_miles




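**Get commute times for each address: transit directly from home to Port Authority, a drive to 94 Old Short Hills Road in Livingston, and the park-and-ride commute (drive to the lot, then transit into New York)**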

In [9]:
from geolocate import get_directions, get_driving_directions


def _record_trip(listing, prefix, directions):
    """Store one trip's duration value, duration text and instructions on `listing`.

    The values go under '<prefix>_duration', '<prefix>_duration_text' and
    '<prefix>_instructions'. A falsy `directions` stores None for all three.
    """
    listing[f'{prefix}_duration'] = directions['duration']['value'] if directions else None
    listing[f'{prefix}_duration_text'] = directions['duration']['text'] if directions else None
    listing[f'{prefix}_instructions'] = directions['instructions'] if directions else None


for listing in tqdm(listings):
    # Only geocoded listings carry a formatted address to route from.
    if not listing.get('formatted_address'): continue
    home = listing['formatted_address']

    # mode='transit' directions from home to Port Authority.
    try:
        directions = get_directions(home, 'Port Authority Bus Terminal', mode='transit')
    except Exception as e:
        # Best effort: the nyc_* keys stay unset, but the failure is reported
        # instead of being silently swallowed.
        print(f"transit directions to Port Authority failed for {home}: {e!r}")
    else:
        _record_trip(listing, 'nyc', directions)

    # mode='driving' directions from home to 94 Old Short Hills Road, Livingston, NJ.
    try:
        directions = get_directions(home, '94 Old Short Hills Road, Livingston, NJ', mode='driving')
    except Exception as e:
        # Best effort, as above: the barnabas_* keys stay unset.
        print(f"driving directions to Livingston failed for {home}: {e!r}")
    else:
        _record_trip(listing, 'barnabas', directions)

    if listing.get('park_and_ride'):
        # "lat, lng" text of the lot: the tuple repr with its parentheses removed.
        lot = str(tuple(listing['park_and_ride']['location']))[1:-1]

        # Leg 1: drive from home to the park and ride. (This lookup used to be
        # issued twice with identical arguments.)
        directions = get_directions(home, lot, mode='driving')
        if directions is None:
            print(f"unable to find driving directions from home ({home}) to park and ride ({lot})")
        listing['park_and_ride_duration1'] = directions['duration']['value'] if directions else None
        listing['park_and_ride_duration_text1'] = directions['duration']['text'] if directions else None

        # Leg 2: transit from the park and ride into New York. Rail lots route
        # to Penn Station, all other lots to Port Authority.
        if listing['park_and_ride']['type'] == 'rail':
            destination = 'New York Penn Station'
        else:
            destination = 'Port Authority Bus Terminal'
        directions = get_directions(lot, destination, mode='transit')
        if directions is None:
            print(f"unable to find transit directions from park and ride ({lot}) to NY ({destination})")
        listing['park_and_ride_duration2'] = directions['duration']['value'] if directions else None
        listing['park_and_ride_duration_text2'] = directions['duration']['text'] if directions else None

        # Total home -> NY time. Adding or joining a None leg would raise
        # TypeError, so an incomplete commute is recorded as None instead.
        leg1 = listing['park_and_ride_duration1']
        leg2 = listing['park_and_ride_duration2']
        if leg1 is None or leg2 is None:
            print(f"incomplete park and ride commute for {home}; total left empty")
            listing['park_and_ride_duration'] = None
            listing['park_and_ride_duration_text'] = None
        else:
            listing['park_and_ride_duration'] = leg1 + leg2
            listing['park_and_ride_duration_text'] = ', '.join(
                [listing['park_and_ride_duration_text1'], listing['park_and_ride_duration_text2']])

print(sum(1 for listing in listings if listing.get('park_and_ride_duration')),
      "out of", len(listings), "have park and ride commute times.")

print(sum(1 for listing in listings if listing.get('nyc_duration')),
  "out of", len(listings), "have walking commute times.")


891 out of 894 have park and ride commute times.
756 out of 894 have walking commute times.


In [10]:
def gsmls_filters(x):
    """Return the boolean row mask selecting acceptable GSMLS listings.

    A row passes when all of the following hold:
      * it has an address and its city is not in `blacklist_cities`;
      * the drive to Livingston is under HALF_HOUR, the drive to the park and
        ride is under TEN_MINUTES, and the whole park-and-ride commute is under
        ONE_HOUR_TEN_MINUTES;
      * its style does not mention Townhouse, Bi-Level or Cape Cod
        (case-insensitive);
      * heat source, water and sewer are known and do not mention oil, well or
        septic respectively (null values are rejected via ``na=True``).

    Parameters
    ----------
    x : pandas.DataFrame
        GSMLS listings frame.

    Returns
    -------
    pandas.Series
        Boolean mask aligned with `x`.
    """
    # na=False keeps rows whose style is unknown. Without an explicit `na`, one
    # null style makes str.contains return a non-boolean result and `~` raises.
    unwanted_style = x['style'].str.contains(
        'Townhouse|Bi-Level|Cape Cod', case=False, na=False)
    return (
        x.address.notnull()
        & ~x.city.isin(blacklist_cities)
        & (x.barnabas_duration < HALF_HOUR)
        & (x.park_and_ride_duration1 < TEN_MINUTES)
        & (x.park_and_ride_duration < ONE_HOUR_TEN_MINUTES)
        & ~unwanted_style
        & ~x.heat_source.str.contains('oil', case=False, na=True)
        & ~x.water.str.contains('well', case=False, na=True)
        & ~x.sewer.str.contains('septic', case=False, na=True)
    )

def show_gsmls():
    """Build the filtered GSMLS frame, shortest park-and-ride commute first.

    Reads the module-level `gsmls_listings`, indexes the frame by 'id', keeps
    the rows accepted by `gsmls_filters`, and sorts ascending by
    'park_and_ride_duration' and then 'park_and_ride_distance'.
    """
    return (
        pd.DataFrame(gsmls_listings)
        .set_index('id')
        .loc[lambda frame: gsmls_filters(frame)]
        .sort_values(
            by=['park_and_ride_duration', 'park_and_ride_distance'],
            ascending=[True, True])
    )

**GSMLS: Large square footage, low taxes**

In [11]:
# Known square footage above 2000 and tax below 10000, cheapest tax first.
df1a = (
    show_gsmls()
    .loc[lambda frame: (frame.sqft > 2000) & (frame.tax < 10000)]
    .sort_values('tax')
)
print(len(df1a), "listings after filtering")
preview_gsmls_df(df1a)

4 listings after filtering


Unnamed: 0_level_0,address,city,county,bedrooms,baths_full,baths_part,sqft,price,tax,style,rooms,park_and_ride_name,park_and_ride_duration_text,park_and_ride_type,park_and_ride_distance,park_and_ride_bus_name,park_and_ride_bus_distance,barnabas_duration_text,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
3399931,1308 WICKHAM TER,Clifton,Passaic,3,2,0,2083.0,499900,8615,"Custom Home, Multi Floor Unit",7,Clifton,"5 mins, 44 mins",rail,0.733538,Allwood Road,1.542001,30 mins,GSMLS
3424306,980 Moessner Ave,Union,Union,5,3,0,2637.0,429000,8911,Colonial,10,Springfield Center - Springfield Twp,"4 mins, 1 hour 3 mins",bus,0.939318,Springfield Center - Springfield Twp,0.939318,16 mins,GSMLS
3426177,150 Patricia Pl,Clifton,Passaic,4,2,0,5001.0,400000,9162,Ranch,5,Clifton Commons,"5 mins, 47 mins",bus,0.657984,Clifton Commons,0.657984,28 mins,GSMLS
3435172,6 Boyden Ave,Maplewood,Essex,4,3,1,3724.0,490000,9971,Detached,9,South Orange,"7 mins, 36 mins",rail,1.054738,Irvington Bus Terminal,1.213625,17 mins,GSMLS


**GSMLS: Large square footage, medium taxes**

In [12]:
# Known square footage above 2000 and tax in [10000, 13500), cheapest tax first.
# The lower bound is inclusive: the low-tax table stops at `tax < 10000`, so a
# strict `>` here would drop listings taxed at exactly 10000 from every table.
df1b = (
    show_gsmls()
    .loc[lambda frame: (frame.sqft > 2000) & (frame.tax >= 10000) & (frame.tax < 13500)]
    .sort_values('tax')
)
print(len(df1b), "listings after filtering")
preview_gsmls_df(df1b)

4 listings after filtering


Unnamed: 0_level_0,address,city,county,bedrooms,baths_full,baths_part,sqft,price,tax,style,rooms,park_and_ride_name,park_and_ride_duration_text,park_and_ride_type,park_and_ride_distance,park_and_ride_bus_name,park_and_ride_bus_distance,barnabas_duration_text,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
3434419,157 Indian Run Pkwy,Union,Union,4,4,0,2800.0,499000,10477,"Colonial, Custom Home",11,Union Center - Union Twp,"4 mins, 47 mins",bus,0.639539,Union Center - Union Twp,0.639539,23 mins,GSMLS
3433211,126 Linwood Ter,Clifton,Passaic,3,3,0,2200.0,489500,11067,Colonial,7,Allwood Road,"5 mins, 33 mins",bus,0.897368,Allwood Road,0.897368,27 mins,GSMLS
3430057,7 Robinwood Drive,Little Falls,Passaic,5,3,1,3080.0,499900,12758,Colonial,9,Montclair State University,"6 mins, 57 mins",rail,1.020244,Allwood Road,2.453861,25 mins,GSMLS
3374522,35 Morse Ave,Bloomfield,Essex,7,4,0,2550.0,400000,13028,Victorian,11,Watsessing Avenue,"5 mins, 33 mins",rail,0.555766,Irvington Bus Terminal,4.160248,21 mins,GSMLS


**GSMLS: Large square footage, high taxes**

In [13]:
# Known square footage above 2000 and tax in [13500, 15000), cheapest tax first.
# The lower bound is inclusive: the medium-tax table stops at `tax < 13500`, so
# a strict `>` here would drop listings taxed at exactly 13500 from every table.
df1bb = (
    show_gsmls()
    .loc[lambda frame: (frame.sqft > 2000) & (frame.tax >= 13500) & (frame.tax < 15000)]
    .sort_values('tax')
)
print(len(df1bb), "listings after filtering")
preview_gsmls_df(df1bb)

4 listings after filtering


Unnamed: 0_level_0,address,city,county,bedrooms,baths_full,baths_part,sqft,price,tax,style,rooms,park_and_ride_name,park_and_ride_duration_text,park_and_ride_type,park_and_ride_distance,park_and_ride_bus_name,park_and_ride_bus_distance,barnabas_duration_text,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
3434778,79 Winding Ln,Bloomfield,Essex,4,3,0,3054.0,489000,13683,Split Level,11,Watchung Avenue,"6 mins, 45 mins",rail,1.334357,Allwood Road,1.641766,24 mins,GSMLS
3433347,340 Walker Rd,West Orange,Essex,3,2,1,2066.0,440000,13755,Split Level,9,Mountain Station,"5 mins, 43 mins",rail,1.151145,Irvington Bus Terminal,3.526863,8 mins,GSMLS
3418468,12 Ridge Ave,Little Falls,Passaic,4,3,1,3170.0,439900,13938,Colonial,10,Little Falls,"3 mins, 1 hour 4 mins",rail,0.408777,Willowbrook Mall,1.827223,24 mins,GSMLS
3418703,35 Van Winkle Ct,Woodland Park,Passaic,3,3,0,2500.0,499900,14610,Ranch,7,Montclair State University,"5 mins, 57 mins",rail,1.174653,Allwood Road,2.679599,30 mins,GSMLS


**GSMLS: Null square footage, low taxes**

In [14]:
# Listings with no recorded square footage and tax below 10000, cheapest tax first.
df1c = (
    show_gsmls()
    .loc[lambda frame: frame.sqft.isnull() & (frame.tax < 10000)]
    .sort_values('tax')
)
print(len(df1c), "listings after filtering")
preview_gsmls_df(df1c)

18 listings after filtering


Unnamed: 0_level_0,address,city,county,bedrooms,baths_full,baths_part,sqft,price,tax,style,rooms,park_and_ride_name,park_and_ride_duration_text,park_and_ride_type,park_and_ride_distance,park_and_ride_bus_name,park_and_ride_bus_distance,barnabas_duration_text,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
3428229,983 Riverview Dr N,Totowa,Passaic,5,2,1,,499000,0,Raised Ranch,9,Wayne/Route 23 Transit Center,"4 mins, 53 mins",rail,0.998292,Wayne/Route 23 Transit Center,0.998292,26 mins,GSMLS
3432978,256 Whippany Rd,Hanover,Morris,4,2,0,,499900,6146,Colonial,8,Convent Station,"8 mins, 1 hour 0 mins",rail,2.200325,Springfield Center - Springfield Twp,9.305718,20 mins,GSMLS
3431872,79 Tooker Ave,Springfield Township,Union,3,2,0,,419000,6900,"1/2 Duplex, Multi Floor Unit",6,Springfield Center - Springfield Twp,"2 mins, 1 hour 3 mins",bus,0.406819,Springfield Center - Springfield Twp,0.406819,17 mins,GSMLS
3435828,731 Palisade Rd,Union,Union,4,3,0,,499900,7177,Colonial,9,Union,"4 mins, 41 mins",rail,0.422754,Rutgers Lane Hospital - Union Twp,1.552985,30 mins,GSMLS
3435428,1251 Grandview Ave,Union,Union,4,2,1,,424900,7420,Expanded Ranch,9,Union Center - Union Twp,"4 mins, 47 mins",bus,0.282648,Union Center - Union Twp,0.282648,22 mins,GSMLS
3435731,42 Livingston St,Clifton,Passaic,3,2,1,,478888,7605,Colonial,9,Clifton,"7 mins, 44 mins",rail,1.562199,Allwood Road,2.143985,27 mins,GSMLS
3428699,27 Ira Rd,Cedar Grove,Essex,4,2,0,,438880,7619,Colonial,10,Upper Montclair,"6 mins, 48 mins",rail,1.039111,Allwood Road,2.897116,20 mins,GSMLS
3434491,319 Boulevard,Kenilworth,Union,3,3,0,,425000,7624,Split Level,8,Cranford,"8 mins, 44 mins",rail,1.344642,Rutgers Lane Hospital - Union Twp,1.790486,23 mins,GSMLS
3434442,31 Menzel Ave,Maplewood,Essex,3,2,1,,425000,7919,Colonial,10,Maplewood,"7 mins, 31 mins",rail,1.390182,Pine Avenue - Union Twp,1.503745,19 mins,GSMLS
3433969,2579 Hamilton Ter,Union,Union,3,2,1,,410000,7979,Colonial,8,Pine Avenue - Union Twp,"6 mins, 55 mins",bus,1.236177,Pine Avenue - Union Twp,1.236177,17 mins,GSMLS


**GSMLS: Null square footage, medium taxes**

In [15]:
# Listings with no recorded square footage and tax in [10000, 13500), grouped
# by city. The lower bound is inclusive: the low-tax table stops at
# `tax < 10000`, so a strict `>` here would drop listings taxed at exactly
# 10000 from every table.
df1d = (
    show_gsmls()
    .loc[lambda frame: frame.sqft.isnull() & (frame.tax >= 10000) & (frame.tax < 13500)]
    .sort_values('city')
)
print(len(df1d), "listings after filtering")
preview_gsmls_df(df1d)

36 listings after filtering


Unnamed: 0_level_0,address,city,county,bedrooms,baths_full,baths_part,sqft,price,tax,style,rooms,park_and_ride_name,park_and_ride_duration_text,park_and_ride_type,park_and_ride_distance,park_and_ride_bus_name,park_and_ride_bus_distance,barnabas_duration_text,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
3425147,85 Bellevue Ter,Bloomfield,Essex,3,2,1,,424900,12668,Split Level,8,Allwood Road,"4 mins, 33 mins",bus,1.063173,Allwood Road,1.063173,27 mins,GSMLS
3433750,70 Golf Rd,Bloomfield,Essex,3,2,1,,449000,12340,Split Level,7,Allwood Road,"4 mins, 33 mins",bus,1.062829,Allwood Road,1.062829,26 mins,GSMLS
3430823,6 Colony Dr,Caldwell,Essex,4,2,0,,430000,12856,Colonial,7,Little Falls,"6 mins, 1 hour 4 mins",rail,1.248499,Willowbrook Mall,1.470499,20 mins,GSMLS
3428051,42 KENNETH PL,Clark,Union,3,2,0,,439000,10643,Split Level,8,Garwood,"10 mins, 53 mins",rail,2.017407,Rutgers Lane Hospital - Union Twp,4.342921,29 mins,GSMLS
3433595,1 Friar Ln,Clifton,Passaic,3,2,1,,489000,11135,Custom Home,7,Allwood Road,"4 mins, 33 mins",bus,0.84656,Allwood Road,0.84656,25 mins,GSMLS
3414582,114 Allwood Pl,Clifton,Passaic,4,2,0,,499000,11345,"Custom Home, Ranch",8,Passaic,"4 mins, 40 mins",rail,0.769685,Clifton Commons,1.376629,30 mins,GSMLS
3432605,790 Grove St,Clifton,Passaic,3,2,0,,499000,12971,Ranch,6,Montclair Heights,"4 mins, 52 mins",rail,0.712505,Allwood Road,0.777666,25 mins,GSMLS
3426860,193 2nd St,Clifton,Passaic,5,2,1,,424900,10667,"Colonial, Victorian",9,Clifton,"4 mins, 44 mins",rail,0.72888,Passaic Bus Terminal,1.218341,29 mins,GSMLS
3412525,44 Barkley Ave,Clifton,Passaic,3,2,1,,419000,10973,"Duplex, 1/2 Duplex",7,Clifton,"5 mins, 44 mins",rail,0.878032,Passaic Bus Terminal,1.504168,29 mins,GSMLS
3406313,51 Karen Dr,Clifton,Passaic,3,2,1,,423000,10009,Split Level,8,Clifton,"6 mins, 44 mins",rail,0.927682,Allwood Road,1.325885,27 mins,GSMLS


**GSMLS: Null square footage, high taxes**

In [16]:
# Listings with no recorded square footage and tax in [13500, 15000), grouped
# by city. The lower bound is inclusive: the medium-tax table stops at
# `tax < 13500`, so a strict `>` here would drop listings taxed at exactly
# 13500 from every table.
df1dd = (
    show_gsmls()
    .loc[lambda frame: frame.sqft.isnull() & (frame.tax >= 13500) & (frame.tax < 15000)]
    .sort_values('city')
)
print(len(df1dd), "listings after filtering")
preview_gsmls_df(df1dd)

5 listings after filtering


Unnamed: 0_level_0,address,city,county,bedrooms,baths_full,baths_part,sqft,price,tax,style,rooms,park_and_ride_name,park_and_ride_duration_text,park_and_ride_type,park_and_ride_distance,park_and_ride_bus_name,park_and_ride_bus_distance,barnabas_duration_text,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
3419648,13 BELLEVUE AVE,Bloomfield,Essex,3,3,0,,425000,14104,Colonial,7,Watchung Avenue,"5 mins, 45 mins",rail,1.211963,Allwood Road,1.26657,26 mins,GSMLS
3417461,15 George St,Bloomfield,Essex,4,2,1,,439000,14719,Colonial,8,Watchung Avenue,"6 mins, 45 mins",rail,1.388071,Allwood Road,1.715933,22 mins,GSMLS
3424669,3 Trella Ter,Clifton,Passaic,4,3,0,,499000,13539,"Custom Home, Ranch, Raised Ranch",10,Clifton,"8 mins, 44 mins",rail,1.260493,Allwood Road,1.546734,28 mins,GSMLS
3435400,136 Commonwealth Ave,New Providence,Union,3,2,0,,469000,14576,Ranch,9,New Providence,"5 mins, 1 hour 4 mins",rail,0.765485,Springfield Center - Springfield Twp,4.639458,19 mins,GSMLS
3434254,18 Sunset Ave,Verona,Essex,5,2,1,,479000,14880,Colonial,10,Walnut Street,"7 mins, 42 mins",rail,1.131817,Allwood Road,3.457023,16 mins,GSMLS


**Final NJMLS Listings**

In [17]:
# Build the NJMLS frame indexed by listing id (no in-place mutation, so the
# cell can be re-run safely).
df2 = pd.DataFrame(njmls_listings).set_index('id')
df2 = df2[
    (~df2.city.isin(blacklist_cities)) &
    # na=False keeps listings whose basement is unknown, matching the
    # `!= 'None'` test below. Without it a single null basement makes
    # str.contains return a non-boolean result and `~` raises.
    (~df2.basement.str.contains('Crawlspace', na=False)) &
    (df2.basement != 'None') &
    (df2['style'] != 'Cape Cod') &
    (df2.barnabas_duration < HALF_HOUR) &
    (df2.park_and_ride_duration1 < TEN_MINUTES) &
    (df2.park_and_ride_duration < ONE_HOUR_TEN_MINUTES)
]
df2 = df2.sort_values(by=['tax',], ascending=[True])
print(len(df2), "listings after filtering")
preview_njmls_df(df2)

36 listings after filtering


Unnamed: 0_level_0,address,city,county,bedrooms,baths_full,baths_part,price,tax,style,rooms,list_date,park_and_ride_name,park_and_ride_duration_text,park_and_ride_type,park_and_ride_distance,park_and_ride_bus_name,park_and_ride_bus_distance,barnabas_duration_text,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1744163,17 Mozart Avenue,Little Falls,Passaic,4,3,0,410000,7018.0,Colonial,10,11/06/2017,Willowbrook Mall,"7 mins, 47 mins",bus,0.589252,Willowbrook Mall,0.589252,24 mins,NJMLS
1738325,92 Union Place,North Arlington,Bergen,3,2,1,465000,7233.0,Contemporary,10,09/21/2017,Kingsland,"9 mins, 31 mins",rail,2.020837,Clifton Commons,2.973799,29 mins,NJMLS
1743595,27 Ira Road,Cedar Grove,Essex,4,2,0,438880,7619.0,Colonial,10,11/02/2017,Upper Montclair,"6 mins, 48 mins",rail,1.039111,Allwood Road,2.897116,20 mins,NJMLS
1728221,25 Riverview Drive,Wayne,Passaic,3,2,1,475000,8372.0,Split Level,7,07/07/2017,Wayne/Route 23 Transit Center,"4 mins, 53 mins",rail,1.214078,Wayne/Route 23 Transit Center,1.214078,28 mins,NJMLS
1746287,175 Jerome Place,Bloomfield,Essex,4,3,0,459000,8715.0,Colonial,8,11/28/2017,Bloomfield,"5 mins, 36 mins",rail,0.632988,Clifton Commons,3.454818,21 mins,NJMLS
1745902,26 Knollwood Road,Totowa,Passaic,3,2,1,474000,8893.0,Colonial,7,11/21/2017,Little Falls,"6 mins, 1 hour 4 mins",rail,1.522317,Wayne/Route 23 Transit Center,1.873463,27 mins,NJMLS
1719260,38 Hillman Drive,Elmwood Park,Bergen,3,2,1,500000,9177.0,Colonial,6,05/13/2017,Plauderville,"7 mins, 36 mins",rail,1.365767,Passaic Bus Terminal,2.310516,29 mins,NJMLS
1746703,1083 Bloomfield Avenue,West Caldwell,Essex,5,3,1,427000,9404.0,Colonial,12,12/01/2017,Willowbrook Mall,"10 mins, 47 mins",bus,3.302753,Willowbrook Mall,3.302753,19 mins,NJMLS
1730324,51 Karen Drive,Clifton,Passaic,3,2,1,423000,10009.0,Split Level,8,07/24/2017,Clifton,"6 mins, 44 mins",rail,0.927682,Allwood Road,1.325885,27 mins,NJMLS
1741810,29 Cadmus Avenue,Elmwood Park,Bergen,4,2,0,475000,10161.0,Ranch,6,10/18/2017,Plauderville,"5 mins, 36 mins",rail,0.84094,Passaic Bus Terminal,2.320759,29 mins,NJMLS
