In [7]:
import pandas as pd
import numpy as np

In [None]:
# Raise the notebook display limits so wide and long frames are shown in full
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 200)

In [None]:
# Load the active-listings dump into the working frame
raw_listings_csv = 'data/nwmls_dump_active.csv'
df = pd.read_csv(raw_listings_csv)

In [None]:
# drop unnecessary columns
# Every label appears exactly once here; the list previously repeated
# "directional_suffix".
unneeded_columns = [
    "lead_image", "flyprice", "original_price", "sold_date", "max_hoa",
    "mls_office_id", "member_id", "house_number", "directional_prefix",
    "directional_suffix", "street_suffix", "unit", "zip_plus_4",
    "publish_to_internet", "show_address_to_public", "show_map_link_to_public",
    "block", "open_house_soon_start", "open_house_soon_end", "commission",
    "comments", "offer_details", "pending_date", "selling_date", "selling_price",
]
df = df.drop(unneeded_columns, axis=1)

In [None]:
# define neighborhood based on Seattle City Neighborhood Boundaries
from __future__ import division
import pandas as pd
import numpy as np
from matplotlib.patches import Polygon
import matplotlib
from matplotlib.collections import PatchCollection
import mpl_toolkits
from mpl_toolkits.basemap import Basemap
from shapely.geometry import Point, Polygon, MultiPoint, MultiPolygon
from shapely.prepared import prep
from descartes import PolygonPatch
import matplotlib.pyplot as plt
from collections import defaultdict

% matplotlib inline

In [None]:
def plot_prepper(m, shapefilename, df_key):
    '''
    Generate dataframe for plotting

    Reads the shapefile into the map object under the name 'seattle' and
    builds one row per shape.

    Parameters
    ----------
    m : map object providing ``readshapefile`` (a Basemap instance in this
        notebook); after the read, ``m.seattle`` and ``m.seattle_info`` are
        consumed here.
    shapefilename : shapefile path, passed straight to ``m.readshapefile``.
    df_key : key looked up in each entry of ``m.seattle_info``; its value
        becomes the ``name`` column.

    Returns
    -------
    (_out, df_poly) : the value returned by ``m.readshapefile`` and a
        DataFrame with columns ``poly``, ``name``, ``centroid_1`` (centroid x)
        and ``centroid_2`` (centroid y).
    '''
    _out = m.readshapefile(shapefilename, name='seattle', drawbounds=False, color='none', zorder=2)
    # set up a map dataframe for neighborhood outlines
    # Build each polygon once and read the centroid coordinates from the
    # geometry itself rather than by splitting its WKT text.
    polys = [Polygon(points) for points in m.seattle]
    centroids = [poly.centroid for poly in polys]

    df_poly = pd.DataFrame({
        'poly': polys,
        'name': [item[df_key] for item in m.seattle_info],
    })
    df_poly['centroid_1'] = [c.x for c in centroids]
    df_poly['centroid_2'] = [c.y for c in centroids]
    return _out, df_poly

# Neighborhood shapefile and its bounding box,
# ordered (lower-left lon, lower-left lat, upper-right lon, upper-right lat)
hood_shapefilename = 'neighborhoods/WGS84/Neighborhoods'
hood_coords = (-122.43595589761988, 47.49551362397606,
               -122.23604445458412, 47.73416494684497)
west, south, east, north = hood_coords

# Width and height of the box in degrees, plus the fractional padding
# applied on every side of the map
w = east - west
h = north - south
extra = 0.005

# Figure whose aspect ratio follows the bounding box
figwidth = 12
fig = plt.figure(figsize=(figwidth, figwidth*h/w))
ax = fig.add_subplot(111, axisbg='w', frame_on=False)

# Transverse-Mercator map centred on the box and padded by `extra`
m = Basemap(
    projection='tmerc', ellps='WGS84',
    lon_0=np.mean([west, east]),
    lat_0=np.mean([south, north]),
    llcrnrlon=west - extra * w,
    llcrnrlat=south - (extra * h),
    urcrnrlon=east + extra * w,
    urcrnrlat=north + (extra * h),
    resolution='i', suppress_ticks=True,
)

# One row per neighborhood polygon, named by the S_HOOD attribute
_out, df_map = plot_prepper(m, hood_shapefilename, 'S_HOOD')
nbr_names = df_map['name'].unique()


def _outlined_patch(poly):
    '''Wrap a polygon in a patch carrying the outline style used on the map.'''
    return PolygonPatch(poly, ec='#050505', lw=.8, alpha=1., zorder=4)


# Outline every neighborhood and fill the collection with white
df_map['patches'] = df_map['poly'].map(_outlined_patch)
pc = PatchCollection(df_map['patches'], match_original=True)
pc.set_facecolor('w')
ax.add_collection(pc)

# Map scale bar
m.drawmapscale(
    west + 0.08, south - 0.01,
    west, south,
    10., barstyle='fancy',
    fillcolor1='w', fillcolor2='#555555', zorder=5, ax=ax,
)

lons = df.longitude
lats = df.latitude

# This plots all the latitude/longitude points.
# The whole coordinate column is projected into map coordinates with a
# single call, and all markers are drawn by a single plot call, instead of
# projecting and creating one plot artist per listing inside a Python loop.
xs, ys = m(lons.values, lats.values)
xs = np.asarray(xs, dtype=float).tolist()
ys = np.asarray(ys, dtype=float).tolist()
plt.plot(xs, ys, 'v', markersize=5, alpha=0.3, color='#483070')

# One shapely point per listing, in the same order as the rows of df
pt = [Point(xpt, ypt) for xpt, ypt in zip(xs, ys)]

Labels = []
nmlist = []

# Checks the neighborhood of point

# Creates list of neighborhoods: unique names in frame order, skipping names
# shorter than 4 characters once stripped. The name is read from the row
# directly rather than looked up again through a polygon-equality filter.
for nm in df_map['name']:
    if len(nm.strip()) < 4 or nm in nmlist:
        continue
    nmlist.append(nm)

# Determines neighborhood of point

# Pair each polygon with its name once, outside the per-point loop. The name
# is taken from the 'name' column by label, so the result does not depend on
# the column order of df_map.
hood_polys = list(zip(df_map['name'], df_map['poly']))

for i, p in enumerate(pt):
    # Tracks progress
    if i % 50000 == 0:
        print(i)
    # 'Unknown' is the placeholder for points not in any neighborhood.
    # Every polygon is tested; when several contain the point the last
    # match is the one recorded.
    label = 'Unknown'
    for nm, polygon in hood_polys:
        if polygon.contains(p):
            label = nm
    Labels.append(label)

# Write each neighborhood name once, at the centroid of the first polygon
# carrying that name; names of 3 characters or fewer are not drawn
nml = []
centroid_rows = zip(df_map['centroid_1'].values,
                    df_map['centroid_2'].values,
                    df_map['name'].values)
for label_x, label_y, hood_name in centroid_rows:
    if hood_name in nml:
        continue
    nml.append(hood_name)
    if len(hood_name) > 3:
        plt.text(label_x, label_y, hood_name, size=7, ha='center')

ax.axes.get_xaxis().set_visible(False)
#plt.savefig('../images/seattle_911_neighborhoods.png')

In [None]:
# Frame dimensions before the neighborhood labels are attached
df.shape

In [None]:
# Number of labels produced; it has to equal the row count of df for the
# column assignment that follows
len(Labels)

In [None]:
# Attach the per-listing neighborhood label; Labels is a plain list, so it is
# matched to the rows of df by position
df['city_neighborhood'] = Labels

In [None]:
# Keep only listings that fell inside a neighborhood polygon. The explicit
# copy makes the result an independent frame, so the columns added to it in
# later cells are not assignments on a slice of the original frame.
df = df[df['city_neighborhood'] != 'Unknown'].copy()

In [None]:
# Frame dimensions after removing the rows labelled 'Unknown'
df.shape

In [None]:
# Summary of the remaining columns (dtypes and non-null counts)
df.info()

In [None]:
import gmplot

# Centre point (latitude, longitude) and zoom level of the generated map
MAP_CENTER_LAT = 47.608013
MAP_CENTER_LON = -122.335167
MAP_ZOOM = 12

# Coordinates of the listings currently held in df
lats = df['latitude']
lons = df['longitude']

# Scatter the listings on a Google map and write it out as an HTML page
gmap = gmplot.GoogleMapPlotter(MAP_CENTER_LAT, MAP_CENTER_LON, MAP_ZOOM)
gmap.scatter(lats.values, lons.values, '#A600FF', size=150, marker=False)
gmap.draw("mymap.html")

![listing_map](../img/listing_map.png)

In [None]:
# Lookup from a city neighborhood name (the values stored in
# df['city_neighborhood']) to the label written to df['street_neighborhood'].
# Several keys share one value, so distinct city neighborhoods can collapse
# into a single street-level label.
# NOTE(review): 'Broadview Bitter Lake' maps to itself while 'Broadview' and
# 'Bitter Lake' map to 'Broadview / Bitter Lake' -- confirm which spelling is
# intended.
city_street_dict = {'Adams': 'Ballard',
 'Alki': 'Alki',
 'Arbor Heights': 'Arbor Heights',
 'Atlantic': 'Atlantic',
 'Belltown': 'Belltown',
 'Briarcliff': 'Magnolia',
 'Briarcrest': 'Briarcrest',
 'Brighton': 'Beacon Hill',
 'Broadview Bitter Lake': 'Broadview Bitter Lake',
 'Broadview': 'Broadview / Bitter Lake',
 'Bitter Lake': 'Broadview / Bitter Lake',
 'Broadway': 'Broadway (Capitol Hill)',
 'Cedar Park': 'Cedar Park / Meadowbrook',
 'Meadowbrook': 'Cedar Park / Meadowbrook',
 'Central Business District': 'Downtown (Central Business District)',
 'Columbia City': 'Columbia City',
 'Crown Hill': 'Phinney Ridge / Greenwood',
 'Dunlap': 'Rainier Beach',
 'East Queen Anne': 'East Queen Anne',
 'Eastlake': 'Eastlake',
 'Fairmount Park': 'Fairmount Park',
 'Fauntleroy': 'Fauntleroy',
 'First Hill': 'Downtown (Central Business District)',
 'Fremont': 'Fremont',
 'Gatewood': 'Gatewood',
 'Genesee': 'Genesee',
 'Georgetown': 'Georgetown',
 'Green Lake': 'Green Lake',
 'Haller Lake': 'Haller Lake',
 'Harbor Island': 'Harbor Island',
 'Harrison/Denny-Blaine': 'Harrison / Denny Blaine',
 'High Point': 'High Point',
 'Highland Park': 'Highland Park',
 'Holly Park': 'Beacon Hill',
 'Industrial District': 'Industrial District',
 'Interbay': 'Interbay',
 'International District': 'International District',
 'Laurelhurst': 'Laurelhurst / Sand Point',
 'Sand Point': 'Laurelhurst / Sand Point',
 'Lawton Park': 'Magnolia',
 'Leschi': 'Leschi',
 'Lower Queen Anne': 'Lower Queen Anne',
 'Loyal Heights': 'Sunset Hill',
 'Madison Park': 'Madison Park',
 'Madrona': 'Madrona',
 'Mann': 'Mann',
 'Maple Leaf': 'Northgate',
 'Matthews Beach': 'Wedgeview Ridge',
 'Mid-Beacon Hill': 'Beacon Hill',
 'Minor': 'Minor',
 'Montlake': 'Montlake',
 'Mount Baker': 'Mount Baker',
 'North Admiral': 'North Admiral',
 'North Beach/Blue Ridge': 'North Beach Blue Ridge',
 'North Beacon Hill': 'North Beacon Hill',
 'North College Park': 'Licton Springs',
 'North Delridge': 'North Delridge',
 'North Queen Anne': 'North Queen Anne',
 'Olympic Hills': 'Olympic Hills / Victory Heights',
 'Victory Heights': 'Olympic Hills / Victory Heights',
 'Phinney Ridge': 'Phinney Ridge / Greenwood',
 'Greenwood': 'Phinney Ridge / Greenwood',
 'Pike-Market': 'Pike Market',
 'Pinehurst': 'Northgate',
 'Pioneer Square': 'Pioneer Square',
 'Portage Bay': 'Portage Bay',
 'Rainier Beach': 'Rainier Beach',
 'Rainier View': 'Rainier Beach',
 'Ravenna': 'Ravenna / Bryant',
 'Bryant': 'Ravenna / Bryant',
 'Riverview': 'Riverview',
 'Roosevelt': 'Ravenna / Bryant',
 'Roxhill': 'Roxhill',
 'Seaview': 'Seaview',
 'Sunset Hill': 'Sunset Hill',
 'Seward Park': 'Seward Park',
 'South Beacon Hill': 'South Beacon Hill',
 'South Delridge': 'South Delridge',
 'South Lake Union': 'South Lake Union',
 'South Park': 'South Park',
 'Southeast Magnolia': 'Magnolia',
 'Stevens': 'Stevens (Capitol Hill)',
 'University District': 'University District',
 'View Ridge': 'Laurelhurst / Sand Point',
 'Wallingford': 'Wallingford',
 'Wedgwood': 'Wedgeview Ridge',
 'West Queen Anne': 'West Queen Anne',
 'West Woodland': 'Ballard',
 'Westlake': 'Westlake',
 'Whittier Heights': 'Whittier Heights',
 'Windermere': 'Laurelhurst / Sand Point',
 'Yesler Terrace': 'Yesler Terrace'}

In [2]:
#df=pd.read_csv('data/data_active_clean.csv')

In [None]:
def _street_label(city_hood):
    '''Return the street-level label for a city neighborhood (KeyError if unmapped).'''
    return city_street_dict[city_hood]


# Derive the street_neighborhood column from city_neighborhood
df['street_neighborhood'] = df['city_neighborhood'].apply(_street_label)

In [None]:
# Build the listing URL from each id; missing ids are skipped by the mapping
listing_url_template = 'https://www.flyhomes.com/listings/{}'
df['url'] = df['id'].map(listing_url_template.format, na_action='ignore')

In [4]:
# add 'type' column: start from prop_type, then relabel as 'Townhouse' the
# rows whose MLS property type is '32 - Townhouse' and whose type is
# 'Residential'. Only the proptype cells of those rows are written; the other
# columns of the matching rows are left untouched.
df['proptype'] = df['prop_type']
is_townhouse = df['mls_prop_type'] == '32 - Townhouse'
df.loc[is_townhouse & (df['proptype'] == 'Residential'), 'proptype'] = 'Townhouse'

In [11]:
#df.drop('Unnamed: 0', axis=1, inplace=True)

In [None]:
# Renumber the rows from 0 and discard the old index directly (no temporary
# 'index' column is created), then remove the status column. A new frame is
# bound to df rather than mutating the existing one in place.
df = df.reset_index(drop=True).drop('status', axis=1)

In [13]:
# Persist the cleaned listings. The index argument is left at its default, so
# the row index is written as an unnamed leading column.
# NOTE(review): reading this file back yields an 'Unnamed: 0' column; consider
# index=False if no consumer of the file relies on that column.
df.to_csv('data/data_active_clean.csv')