In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import nearest_points
import numpy as np
from altair import *

In [2]:
chi = pd.read_csv('data/chicago/v2 07012017/ChicagoLicenseData.csv',low_memory=False)
chi = chi[(chi['LEGAL NAME'].str.contains('COFFEE|ROASTER')) |
          (chi['DOING BUSINESS AS NAME'].str.contains('COFFEE|ROASTER'))]
chi = chi[pd.DatetimeIndex(chi['LICENSE TERM EXPIRATION DATE']) >= '1/1/2017']
chi = chi.drop_duplicates(subset=['LOCATION'])
chi = chi.dropna(subset=['LOCATION'])
chi = chi[['LEGAL NAME','DOING BUSINESS AS NAME', 'ADDRESS', 'CITY', 'STATE', 'ZIP CODE','LATITUDE', 'LONGITUDE', 'LOCATION']]

# change into geodataframe
chi['geometry'] = chi.apply(lambda z: Point( z.LONGITUDE,z.LATITUDE), axis=1)
chi = gpd.GeoDataFrame(chi,crs = {'init': 'epsg:4326'})

In [3]:
ctaTrain = pd.read_csv('data/chicago/v2 07012017/trainStopsCTA.csv')
ctaTrain = ctaTrain[['Location','STATION_DESCRIPTIVE_NAME']].drop_duplicates(subset=['STATION_DESCRIPTIVE_NAME'])
ctaTrain[['LATITUDE','LONGITUDE']] = ctaTrain['Location'].str[1:-1].str.split(', ',expand=True).astype(float)
ctaTrain['geometry'] = ctaTrain.apply(lambda z: Point( z.LONGITUDE,z.LATITUDE), axis=1)
ctaTrain = gpd.GeoDataFrame(ctaTrain,crs = {'init': 'epsg:4326'})

In [4]:
ctaBus = gpd.read_file('data/chicago/v2 07012017/CTABusStops/CleanBusStops.shp',)

In [5]:
def getNearestPoint(pt,searchPts):
    nearest = nearest_points(pt,searchPts.geometry.unary_union)[1]
    nearest = gpd.GeoDataFrame(searchPts[searchPts.geometry == nearest])
    name = nearest[nearest.columns[nearest.columns.str.contains('name|NAME|Name')]]
    x =name.iloc[0][0]
    y = nearest.geometry.distance(pt)
    if y.iloc[0] == 0.0:
        searchPts = searchPts.loc[[i for i in searchPts.index if i != nearest.index ],:]
        nearest = nearest_points(pt,searchPts.geometry.unary_union)[1]
        nearest = gpd.GeoDataFrame(searchPts[searchPts.geometry == nearest])
        name = nearest[nearest.columns[nearest.columns.str.contains('name|NAME|Name')]]
        x =name.iloc[0][0]
        y = nearest.geometry.distance(pt)
    try:
        y = float(y)
    except:
        y = 0.0
    return x,y

In [6]:
coffeeExtract = chi['geometry'].apply(lambda x: getNearestPoint(x,chi))
chi['nearestCoffeeDesc'] = coffeeExtract.apply(lambda x: x[0])
chi['nearestCoffeeDist'] = coffeeExtract.apply(lambda x: x[1])

In [7]:
trainExtract = chi['geometry'].apply(lambda x: getNearestPoint(x,ctaTrain))
chi['nearestTrainStopDesc'] = trainExtract.apply(lambda x: x[0])
chi['nearestTrainStopDist'] = trainExtract.apply(lambda x: x[1])

In [8]:
busExtract = chi['geometry'].apply(lambda x: getNearestPoint(x,ctaBus))
chi['nearestBusStopDesc'] = busExtract.apply(lambda x: x[0])
chi['nearestBusStopDist'] = busExtract.apply(lambda x: x[1])

In [9]:
neighborhoods = gpd.read_file('data/chicago/v2 07012017/ChicagoNeighborhoods.shp')

In [10]:
joinedData = pd.DataFrame(gpd.tools.sjoin(neighborhoods,chi,op='contains',how='inner'))
joinedData = joinedData[['pri_neigh','DOING BUSINESS AS NAME', 
            'ADDRESS','CITY', 'STATE', 'ZIP CODE',
            'nearestTrainStopDesc','nearestTrainStopDist','nearestBusStopDesc','nearestBusStopDist',
            'nearestCoffeeDesc','nearestCoffeeDist'
                        ]].reset_index(drop=True)

In [11]:
joinedData['sbInd'] = joinedData['DOING BUSINESS AS NAME'].str.contains('STARBUCK')
joinedData['nearestTrainStopDesc'] = joinedData['nearestTrainStopDesc'].replace(to_replace=' \(.*\)',value='',regex=True)
joinedData = joinedData.reset_index()

In [12]:
joinedData.head()

Unnamed: 0,index,pri_neigh,DOING BUSINESS AS NAME,ADDRESS,CITY,STATE,ZIP CODE,nearestTrainStopDesc,nearestTrainStopDist,nearestBusStopDesc,nearestBusStopDist,nearestCoffeeDesc,nearestCoffeeDist,sbInd
0,0,Grand Boulevard,ACE COFFEE BAR INC.,5001 S MICHIGAN AVE 1,CHICAGO,IL,60615,51st,0.004377,51st Street & Michigan,0.001629,DOLLOP HP LLC,0.025786,False
1,1,Printers Row,PRINTER'S ROW COFFEE COMPANY LLC,600 S DEARBORN ST 7TH 712,CHICAGO,IL,60605,Harrison,0.001929,Dearborn & Harrison,0.000346,STARBUCKS CORPORATION,0.000343,False
2,2,United Center,METRIC COFFEE CAFE,2021 W FULTON ST K101B,CHICAGO,IL,60612,Ashland,0.010789,Damen & Fulton,0.000845,INTELLIGENTSIA COFFEE INC.,0.003756,False
3,3,United Center,INTELLIGENTSIA COFFEE,1850 W FULTON ST,CHICAGO,IL,60612,Ashland,0.00715,Damen & Fulton,0.002791,"BOW & TRUSS, LLC",0.00315,False
4,4,United Center,BIG SHOULDERS COFFEE,324 N LEAVITT ST,CHICAGO,IL,60612,Western,0.01365,Grand & Leavitt,0.003259,Metric west fulton shop inc,0.004253,False


In [13]:
# aggFuncs = {'index':'count','nearestTrainStopDist':'mean','nearestBusStopDist':'mean','nearestCoffeeDist':'mean'}
aggFuncs = {'index':'count','nearestTrainStopDist':'mean','nearestCoffeeDist':'mean'}

In [14]:
sb = joinedData[joinedData['sbInd']==True].groupby('pri_neigh').agg(aggFuncs).reset_index()
not_sb = joinedData[joinedData['sbInd']==False].groupby('pri_neigh').agg(aggFuncs).reset_index()

nsbRatios = sb.merge(not_sb,how='left',on='pri_neigh',suffixes=['_sb','_nsb'])
nsbRatios = nsbRatios.fillna(0)
nsbRatios['nsbRatio'] = nsbRatios['index_nsb']/nsbRatios['index_sb']

In [15]:
nsbRatios.sort_values('nsbRatio',ascending=False).head(10)

Unnamed: 0,pri_neigh,index_sb,nearestTrainStopDist_sb,nearestCoffeeDist_sb,index_nsb,nearestTrainStopDist_nsb,nearestCoffeeDist_nsb,nsbRatio
17,Logan Square,1,0.0,0.004167,8.0,0.002107,0.004094,8.0
34,West Town,1,0.00548,0.000829,7.0,0.008623,0.004246,7.0
8,Edgewater,1,0.001554,0.010704,4.0,0.004842,0.002762,4.0
16,Lincoln Square,2,0.002089,0.004568,6.0,0.005074,0.003956,3.0
35,Wicker Park,1,0.00054,0.000853,3.0,0.005186,0.004183,3.0
13,Hyde Park,2,0.021987,0.005777,5.0,0.014643,0.001271,2.5
4,Boystown,1,0.009095,0.001523,2.0,0.007175,0.002895,2.0
7,East Village,1,0.003473,0.002072,2.0,0.005347,0.002512,2.0
21,North Center,3,0.007262,0.002774,6.0,0.005573,0.002226,2.0
31,Uptown,2,0.002389,0.003729,3.0,0.003522,0.003973,1.5


In [16]:
Chart(nsbRatios[['nsbRatio']]).mark_bar().encode(
    x=X('nsbRatio',
        bin=Bin(maxbins=5.0,),
       ),
    y='count(*)',
)

You can access infer_dtype as pandas.api.types.infer_dtype
  typ = pd.lib.infer_dtype(data)


In [18]:
nsbRatios.to_csv('chicago_ratios_081117.csv',index=False)