In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import nearest_points
from sklearn.neighbors import NearestNeighbors

# Read the four shapefiles in one pass; each path below is loaded in order
# and bound to the name in the same position on the left-hand side.
neighborhoods, transit, coffee, apts = (
    gpd.read_file(shapefile)
    for shapefile in (
        'data/chicago/v3 08192017/ChicagoNeighborhoods.shp',
        'data/chicago/v3 08192017/transit_upd081917.shp',
        'data/chicago/v3 08192017/coffee_upd081917.shp',
        'data/chicago/v3 08192017/clApts_upd081917.shp',
    )
)

In [2]:
### reference locations and preliminary filtering
# Both anchors are (longitude, latitude) points.
home = Point(-87.659349, 41.9880054)
work = Point(-87.6438878, 41.884193)
# Keep listings whose bedroom count is in the closed range [1, 2];
# rows with a missing bedroom value fail both comparisons and are dropped.
apts = apts.loc[(apts['bedroom'] >= 1.0) & (apts['bedroom'] <= 2.0)]

In [3]:
# set up functions
def getNearestPoint(pt,searchPts):
    """Return the distance from ``pt`` to its nearest *other* geometry in ``searchPts``.

    Parameters
    ----------
    pt : row-like
        Anything indexable with ``'geometry'`` (e.g. one row of a
        GeoDataFrame); that entry is the query geometry.
    searchPts : GeoDataFrame-like
        Layer to search. Only ``searchPts.geometry.distance(geom)`` is used,
        which must return one distance per row.

    Returns
    -------
    float
        The smallest distance, expressed in the coordinate units of the
        geometries. When the closest row lies at distance exactly 0 (the
        query geometry is itself a member of ``searchPts``) that single row
        is skipped and the next-closest distance is returned. ``nan`` is
        returned when no candidate row remains.
    """
    target = pt['geometry']

    # A single vectorised distance pass replaces building a union of the
    # whole layer and then looking the winner up again by geometry equality.
    dists = searchPts.geometry.distance(target).dropna()
    if dists.empty:
        return float('nan')

    if dists.min() == 0.0:
        # The query geometry is in the layer: drop that row (the first one at
        # distance 0) so a shop is not reported as its own nearest neighbour.
        dists = dists.drop(dists.idxmin())
        if dists.empty:
            return float('nan')

    return float(dists.min())

In [4]:
# clean the business data
# Drop rows without a longitude and keep each (longitude, latitude) pair once.
coffee = (
    coffee
    .dropna(subset=['LONGITUDE'])
    .drop_duplicates(subset=['LONGITUDE','LATITUDE'])
)
# Exclude businesses whose name contains 'STARBUCK' (case-sensitive).
# na=True keeps the earlier `== False` behaviour of also discarding rows with
# a missing name. .copy() detaches the filtered frame so the column
# assignments below do not raise SettingWithCopyWarning.
coffee = coffee[~coffee['DOING BUSI'].str.contains('STARBUCK', na=True)].copy()

# get distance from home and from work
# (vectorised; values are in the same coordinate units as `home` / `work`)
coffee['homeDist'] = coffee['geometry'].distance(home)
coffee['workDist'] = coffee['geometry'].distance(work)

# get distance to the nearest other coffee place and to the nearest transit stop
coffee['nearestCoffeeDist'] = coffee.apply(lambda shop: getNearestPoint(shop,coffee),axis=1)
coffee['nearestTransitDist'] = coffee.apply(lambda shop: getNearestPoint(shop,transit),axis=1)

In [5]:
# get designated neighborhood and reduce columns
# .area is computed in squared coordinate units of the neighborhoods layer.
neighborhoods['area'] = neighborhoods['geometry'].area
neighborhoods = neighborhoods.rename(columns={'pri_neigh':'neighborhood'})

# Attach to every coffee shop the neighborhood polygon it falls within.
# `predicate` is the current name of the keyword formerly spelled `op`
# (deprecated in geopandas 0.10 and removed in 1.0).
joinedData = gpd.sjoin(coffee,neighborhoods,predicate='within',how='inner')
joinedData = (
    joinedData[['neighborhood','DOING BUSI',
                'ADDRESS','CITY', 'STATE', 'ZIP CODE',
                'LONGITUDE','LATITUDE',
                'nearestTransitDist','workDist','homeDist','nearestCoffeeDist','geometry'
                ]]
    .reset_index(drop=True)  # discard the index carried over from `coffee`
    .reset_index()           # expose the fresh 0..n-1 position as an 'index' column
)

In [6]:
# get median price of the closest rentals
N_CLOSEST_RENTALS = 30  # how many of the nearest apartment listings feed the median

# For each shop: distance to every listing (vectorised), take the labels of the
# N_CLOSEST_RENTALS smallest, and use the median of their 'price' values.
joinedData['medianPrice'] = joinedData['geometry'].apply(
    lambda shop: apts.loc[
        apts['geometry'].distance(shop).sort_values().head(N_CLOSEST_RENTALS).index,
        'price',
    ].median()
)

# errors='ignore' lets this cell be re-run after the helper 'index' column
# has already been removed.
joinedData = joinedData.drop('index',axis=1,errors='ignore')

In [7]:
# Preview the first rows of the joined coffee-shop table.
joinedData.head()

Unnamed: 0,neighborhood,DOING BUSI,ADDRESS,CITY,STATE,ZIP CODE,LONGITUDE,LATITUDE,nearestTransitDist,workDist,homeDist,nearestCoffeeDist,geometry,medianPrice
0,Old Town,"EVA'S COFFEE, INC.",1447 N SEDGWICK ST 1ST,CHICAGO,IL,60610,-87.638375,41.908857,0.001808,0.025272,0.08188,0.014932,POINT (-87.63837516299999 41.908856726),2338.5
1,West Loop,ARTURO EXPRESS,130 S CANAL ST,CHICAGO,IL,60606,-87.639765,41.879616,0.004256,0.00616,0.110144,0.001574,POINT (-87.639764523 41.879616268),2117.5
2,West Loop,MEDDLE COFFEE BAR,601 W JACKSON BLVD 1 A,CHICAGO,IL,60661,-87.642609,41.877889,0.002857,0.006433,0.111382,0.003328,POINT (-87.642609175 41.877888529),2653.0
3,West Loop,PEET'S COFFEE & TEA,222 S RIVERSIDE PLZ 1ST,CHICAGO,IL,60606,-87.638579,41.878582,0.003879,0.007725,0.111378,0.001574,POINT (-87.63857866799999 41.878581561),2117.5
4,West Loop,GROUNDSWELL COFFEE ROASTERS,1168 W MADISON ST 1ST 2,CHICAGO,IL,60607,-87.656987,41.881729,0.006153,0.013329,0.106303,0.003664,POINT (-87.65698670899999 41.881728772),2700.0


At this point I saved the processed data to an Excel workbook and manually marked each location as visited or not visited (the `visitInd` column). After that, I reloaded the annotated data.

In [8]:
# writer = pd.ExcelWriter('./NSB_preprocessed_upd082317.xlsx')
# pd.DataFrame(joinedData.drop('geometry',axis=1)).to_excel(writer,index=False)
# writer.close()

In [9]:
# Reload the manually annotated workbook and keep only rows whose visitInd is
# False. .copy() makes the result an independent frame, so adding columns to
# it later does not raise SettingWithCopyWarning.
NSBData_processed = pd.read_excel('NSB_preprocessed_upd082317.xlsx')
NSBData_processed = NSBData_processed[NSBData_processed['visitInd']==False].copy()
NSBData_processed.head()

Unnamed: 0,neighborhood,visitInd,DOING BUSI,ADDRESS,CITY,STATE,ZIP CODE,LONGITUDE,LATITUDE,nearestTransitDist,workDist,homeDist,nearestCoffeeDist,medianPrice
0,Little Village,False,ACE COFFEE BAR,2650 S CALIFORNIA AVE 2ND,CHICAGO,IL,60608,-87.695309,41.842996,0.011126,0.065889,0.149402,0.024981,935.0
1,Grand Boulevard,False,ACE COFFEE BAR INC.,5001 S MICHIGAN AVE 1,CHICAGO,IL,60615,-87.622545,41.803731,0.004377,0.083244,0.187914,0.025786,1299.5
2,West Loop,False,"ACE COFFEE BAR, INC.",120 N SANGAMON ST,CHICAGO,IL,60607,-87.651067,41.883715,0.002184,0.007195,0.104619,0.00607,3034.0
3,Archer Heights,False,"ACE COFFEE BAR, INC.",3642 W 47TH ST,CHICAGO,IL,60632,-87.715395,41.808085,0.011644,0.10443,0.188447,0.029281,882.5
4,"Little Italy, UIC",False,Ace ICRE Roosevelt,1950 W ROOSEVELT RD BASEMENT,CHICAGO,IL,60608,-87.675813,41.866871,0.007835,0.036322,0.122248,0.004607,1800.0


In [10]:
# Label every shop by the anchor it is nearer to. A shop counts as 'home' only
# when workDist is strictly greater than homeDist; ties fall to 'work'.
NSBData_processed['closerTo'] = (
    NSBData_processed['workDist']
    .gt(NSBData_processed['homeDist'])
    .map({True: 'home', False: 'work'})
)
closerToHome = NSBData_processed.loc[NSBData_processed['closerTo'].eq('home')]
closerToWork = NSBData_processed.loc[NSBData_processed['closerTo'].eq('work')]

# Coordinate-only views of each group.
closerToHomePoints = closerToHome.loc[:, ['LONGITUDE','LATITUDE']]
closerToWorkPoints = closerToWork.loc[:, ['LONGITUDE','LATITUDE']]

In [11]:
# Fit a 3-nearest-neighbour index on the work-side coordinates and query it
# with those same coordinates.
neighborsWork = NearestNeighbors(n_neighbors=3)
neighborsWork.fit(closerToWorkPoints)
distancesWork, clustersWork = neighborsWork.kneighbors(closerToWorkPoints, return_distance=True)
# Show the rows that form the neighbour group of the first work-side shop.
closerToWork.iloc[clustersWork[0], :]

Unnamed: 0,neighborhood,visitInd,DOING BUSI,ADDRESS,CITY,STATE,ZIP CODE,LONGITUDE,LATITUDE,nearestTransitDist,workDist,homeDist,nearestCoffeeDist,medianPrice,closerTo
0,Little Village,False,ACE COFFEE BAR,2650 S CALIFORNIA AVE 2ND,CHICAGO,IL,60608,-87.695309,41.842996,0.011126,0.065889,0.149402,0.024981,935.0,work
68,"Little Italy, UIC",False,HOPE COFFEEHOUSE,2431 W ROOSEVELT RD 1ST,CHICAGO,IL,60608,-87.68683,41.866494,0.009127,0.046446,0.12458,0.011023,1625.0,work
99,Lower West Side,False,NITECAP COFFEE BAR LLC,1738 W 18TH ST 1 1,CHICAGO,IL,60608,-87.670284,41.857846,0.001138,0.037295,0.130618,0.002397,1872.5,work


In [12]:
# Fit a 3-nearest-neighbour index on the home-side coordinates and query it
# with those same coordinates.
neighborsHome = NearestNeighbors(n_neighbors=3)
neighborsHome.fit(closerToHomePoints)
distancesHome, clustersHome = neighborsHome.kneighbors(closerToHomePoints, return_distance=True)
# Show the rows that form the neighbour group of the first home-side shop.
closerToHome.iloc[clustersHome[0], :]

Unnamed: 0,neighborhood,visitInd,DOING BUSI,ADDRESS,CITY,STATE,ZIP CODE,LONGITUDE,LATITUDE,nearestTransitDist,workDist,homeDist,nearestCoffeeDist,medianPrice,closerTo
6,West Ridge,False,ADRIANA COFFEE SHOP,6345 N CALIFORNIA AVE 1ST,CHICAGO,IL,60659,-87.699476,41.997086,0.031115,0.125836,0.041141,0.013344,1250.0,home
34,West Ridge,False,CAFE ZIPO,5645 N LINCOLN AVE 1ST,CHICAGO,IL,60659,-87.696492,41.98408,0.018123,0.112892,0.03735,0.008693,1272.5,home
13,North Park,False,BABA'S COFFEE,5544-5546 N KEDZIE AVE 1,CHICAGO,IL,60625,-87.708996,41.982582,0.015234,0.117981,0.049942,0.011598,1450.0,home
