In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import nearest_points
import numpy as np
from altair import *

In [2]:
sf = pd.read_csv('data/sf/v1 073117/SFLicenseData.csv',low_memory=False)
sf = sf.drop_duplicates(subset=['DBA Name','Business Location'])
sf = sf.dropna(subset=['DBA Name','Business Location'])
sf = sf[(sf['Ownership Name'].str.contains('Coffee|Roaster')) 
    | (sf['DBA Name'].str.contains('Coffee|Roaster'))]
sf = sf[(pd.DatetimeIndex(sf['Business End Date']) >= '1/1/2017') 
    | sf['Business End Date'].isnull()==True]
sf = sf[['DBA Name', 'Street Address', 'City', 'State', 'Source Zipcode',
         'Neighborhoods - Analysis Boundaries','Business Location']]

businessLoc = sf['Business Location'].str.extract('\((.*)\)',expand=False)
businessCoords = businessLoc.str.split(', ',expand=True).astype(float)
businessCoords.columns = ['latitude','longitude']

sf = pd.concat([sf,businessCoords],axis=1)
sf = sf.dropna(subset=['Street Address','longitude','latitude'])

# change into geodataframe
sf['geometry'] = sf.apply(lambda z: Point( z.longitude,z.latitude), axis=1)
sf = gpd.GeoDataFrame(sf,crs = {'init': 'epsg:4326'})
sf = sf[['DBA Name', 'Street Address', 'City', 'State', 'Source Zipcode',
         'Neighborhoods - Analysis Boundaries','geometry']]

sf.head()

Unnamed: 0,DBA Name,Street Address,City,State,Source Zipcode,Neighborhoods - Analysis Boundaries,geometry
75,Bechellis Coffee Shop,2346 Chestnut St,San Francisco,CA,94123.0,Marina,POINT (-122.441968 37.800087)
338,Ocean Beach Coffee Shop,2898 Sloat Blvd,San Francisco,CA,94116.0,Sunset/Parkside,POINT (-122.505502 37.735572)
507,Europa Malvina Coffee Co,1411 Minnesota St,San Francisco,CA,94107.0,Bayview Hunters Point,POINT (-122.389662 37.752289)
1689,Flute Coffee And Wine Bar,750 Kearny St,San Francisco,CA,94108.0,Chinatown,POINT (-122.40491 37.795071)
3701,San Francisco Coffee Co,12 Sherman St,San Francisco,CA,94103.0,South of Market,POINT (-122.406695 37.777142)


In [3]:
sfBus = pd.read_csv('data/sf/v1 073117/stops.txt')
sfBus = sfBus[['stop_lat','stop_lon','stop_name']].drop_duplicates(subset=['stop_lat','stop_lon','stop_name'])
sfBus['geometry'] = sfBus.apply(lambda z: Point( z.stop_lon,z.stop_lat), axis=1)
sfBus = gpd.GeoDataFrame(sfBus,crs = {'init': 'epsg:4326'})
sfBus.head()

Unnamed: 0,stop_lat,stop_lon,stop_name,geometry
0,37.792357,-122.42101,Clay St & Polk St,POINT (-122.42101 37.792357)
1,37.793826,-122.409591,Clay St & Powell St,POINT (-122.409591 37.793826)
2,37.793653,-122.410823,Clay St & Mason St,POINT (-122.410823 37.79365300000001)
3,37.794682,-122.40277,Clay St & Montgomery St,POINT (-122.40277 37.794682)
4,37.792526,-122.419589,Clay St & Larkin St,POINT (-122.419589 37.792526)


In [4]:
parks = gpd.read_file('data/sf/v1 073117/SF_parks/geo_export_b3a96247-6dcf-4d25-9b67-80296dc300c2.shp')
parks['geometry'] = parks.apply(lambda z: Point( z.x,z.y), axis=1)
parks = parks.rename(columns={'map_park_n':'map_park_Name'},)
parks.head()

Unnamed: 0,acres,geometry,gis_fd_pk,map_park_Name,perimeter,sqft,x,y
0,2.012762,POINT (-122.39923889 37.79560513),142.0,Maritime Plaza,3208.814853,87675.92,-122.399239,37.795605
1,2.51593,POINT (-122.40595532 37.77700947),169.0,Victoria Manalo Draves Park,1498.021103,109593.9,-122.405955,37.777009
2,56.460314,POINT (-122.42923866 37.7140459),164.0,Crocker Amazon Playground,6738.886428,2459411.0,-122.429239,37.714046
3,0.15427,POINT (-122.42274353 37.76237833),137654.0,Dearborn Community Garden,338.000133,6720.006,-122.422744,37.762378
4,6.471178,POINT (-122.45745402 37.77874186),105.0,Angelo J. Rossi Playground,2609.521321,281884.5,-122.457454,37.778742


In [5]:
def getNearestPoint(pt,searchPts):
    nearest = nearest_points(pt,searchPts.geometry.unary_union)[1]
    nearest = gpd.GeoDataFrame(searchPts[searchPts.geometry == nearest])
    name = nearest[nearest.columns[nearest.columns.str.contains('name|NAME|Name')]]
    x =name.iloc[0][0]
    y = nearest.geometry.distance(pt)
    if y.iloc[0] == 0.0:
        searchPts = searchPts.loc[[i for i in searchPts.index if i != nearest.head(1).index ],:]
        nearest = nearest_points(pt,searchPts.geometry.unary_union)[1]
        nearest = gpd.GeoDataFrame(searchPts[searchPts.geometry == nearest])
        name = nearest[nearest.columns[nearest.columns.str.contains('name|NAME|Name')]]
        x =name.iloc[0][0]
        y = nearest.geometry.distance(pt)
    try:
        y = float(y)
    except:
        y = 0.0
    return x,y

In [6]:
coffeeExtract = sf['geometry'].apply(lambda x: getNearestPoint(x,sf))
sf['nearestCoffeeDesc'] = coffeeExtract.apply(lambda x: x[0])
sf['nearestCoffeeDist'] = coffeeExtract.apply(lambda x: x[1])

In [7]:
busExtract = sf['geometry'].apply(lambda x: getNearestPoint(x,sfBus))
sf['nearestBusStopDesc'] = busExtract.apply(lambda x: x[0])
sf['nearestBusStopDist'] = busExtract.apply(lambda x: x[1])

In [8]:
parkExtract = sf['geometry'].apply(lambda x: getNearestPoint(x,parks))
sf['nearestParkDesc'] = parkExtract.apply(lambda x: x[0])
sf['nearestParkDist'] = parkExtract.apply(lambda x: x[1])

In [9]:
neighborhoods = gpd.read_file('data/sf/v1 073117/Planning Neighborhood Groups Map/geo_export_bb4050c9-e7df-47b5-8626-2e7d1b342e50.shp')
neighborhoods['area'] = neighborhoods['geometry'].area

In [10]:
joinedData = pd.DataFrame(gpd.tools.sjoin(neighborhoods,sf,op='contains',how='inner'))

In [11]:
joinedData['sbInd'] = joinedData['DBA Name'].str.contains('Starbuck')
joinedData = joinedData.reset_index()

In [12]:
joinedData.head()

Unnamed: 0,index,geometry,neighborho,area,index_right,DBA Name,Street Address,City,State,Source Zipcode,Neighborhoods - Analysis Boundaries,nearestCoffeeDesc,nearestCoffeeDist,nearestBusStopDesc,nearestBusStopDist,nearestParkDesc,nearestParkDist,sbInd
0,0,POLYGON ((-122.4840890111613 37.78791033989035...,Seacliff,0.000191,88949,Wrecking Ball Coffee Roasters,426 45th Ave,San Francisco,CA,94121.0,Outer Richmond,Kalita Usa,0.0,Point Lobos Ave & 46th Ave,0.001234,Balboa Natural Area,0.006586,False
1,0,POLYGON ((-122.4840890111613 37.78791033989035...,Seacliff,0.000191,88950,Kalita Usa,426 45th Ave,San Francisco,CA,94121.0,Outer Richmond,Kalita Usa,0.0,Point Lobos Ave & 46th Ave,0.001234,Balboa Natural Area,0.006586,False
2,0,POLYGON ((-122.4840890111613 37.78791033989035...,Seacliff,0.000191,131687,Lucky Coffee,4150 Clement St 200,San Francisco,CA,94121.0,Lincoln Park,Wrecking Ball Coffee Roasters,0.0,43rd Ave & Clement St,0.00034,Lincoln Park,0.006345,False
3,1,POLYGON ((-122.4359639472376 37.76903838537972...,Haight Ashbury,0.000202,20135,Peet's Coffee & Tea,310 Broderick St,San Francisco,CA,94117.0,Hayes Valley,Repose Coffee Bar And Gallery,0.002231,Divisadero St & Oak St,0.001632,Buena Vista Park,0.005,False
4,1,POLYGON ((-122.4359639472376 37.76903838537972...,Haight Ashbury,0.000202,135811,Ritual Coffee Roasters,1300 Haight St,San Francisco,CA,94117.0,,Coffee To The People,0.001406,Masonic Ave & Haight St,0.001348,Buena Vista Park,0.003151,False


In [13]:
aggFuncs = {'index':'count','nearestBusStopDist':'mean','nearestCoffeeDist':'mean','nearestParkDist':'mean'}

In [14]:
sb = joinedData[joinedData['sbInd']==True].groupby('neighborho').agg(aggFuncs).reset_index()
not_sb = joinedData[joinedData['sbInd']==False].groupby('neighborho').agg(aggFuncs).reset_index()

nsbRatios = sb.merge(not_sb,how='left',on='neighborho',suffixes=['_sb','_nsb'])
nsbRatios = nsbRatios.fillna(0)
nsbRatios['nsbRatio'] = nsbRatios['index_nsb']/nsbRatios['index_sb']

In [15]:
nsbRatios.sort_values('nsbRatio',ascending=False)

Unnamed: 0,neighborho,nearestCoffeeDist_sb,nearestBusStopDist_sb,index_sb,nearestParkDist_sb,nearestCoffeeDist_nsb,nearestBusStopDist_nsb,index_nsb,nearestParkDist_nsb,nsbRatio
8,Mission,0.0,0.000279,1,0.002354,0.001306,0.001114,26,0.002771,26.0
0,Bayview,0.002994,0.000314,1,0.004877,0.002023,0.001146,13,0.004976,13.0
12,Outer Sunset,0.0,0.000134,1,0.006488,0.002817,0.001446,10,0.007005,10.0
10,Noe Valley,0.001651,7.1e-05,1,0.005723,0.00207,0.001458,8,0.004822,8.0
19,Western Addition,0.002543,0.000241,2,0.002511,0.001197,0.000679,16,0.001913,8.0
4,Inner Richmond,0.00172,0.000766,2,0.004901,0.00137,0.000603,15,0.00312,7.5
1,Castro/Upper Market,0.00106,0.000266,2,0.003271,0.001315,0.000503,14,0.002343,7.0
14,Potrero Hill,0.00058,0.000342,2,0.002772,0.000822,0.001058,13,0.00341,6.5
9,Nob Hill,0.001367,0.000443,1,0.005721,0.001572,0.000385,6,0.003569,6.0
15,Presidio Heights,0.002817,0.000103,1,0.003383,0.001148,0.00021,6,0.002593,6.0


In [16]:
nsbRatios.sort_values('nsbRatio',ascending=False).head(10)

Unnamed: 0,neighborho,nearestCoffeeDist_sb,nearestBusStopDist_sb,index_sb,nearestParkDist_sb,nearestCoffeeDist_nsb,nearestBusStopDist_nsb,index_nsb,nearestParkDist_nsb,nsbRatio
8,Mission,0.0,0.000279,1,0.002354,0.001306,0.001114,26,0.002771,26.0
0,Bayview,0.002994,0.000314,1,0.004877,0.002023,0.001146,13,0.004976,13.0
12,Outer Sunset,0.0,0.000134,1,0.006488,0.002817,0.001446,10,0.007005,10.0
10,Noe Valley,0.001651,7.1e-05,1,0.005723,0.00207,0.001458,8,0.004822,8.0
19,Western Addition,0.002543,0.000241,2,0.002511,0.001197,0.000679,16,0.001913,8.0
4,Inner Richmond,0.00172,0.000766,2,0.004901,0.00137,0.000603,15,0.00312,7.5
1,Castro/Upper Market,0.00106,0.000266,2,0.003271,0.001315,0.000503,14,0.002343,7.0
14,Potrero Hill,0.00058,0.000342,2,0.002772,0.000822,0.001058,13,0.00341,6.5
9,Nob Hill,0.001367,0.000443,1,0.005721,0.001572,0.000385,6,0.003569,6.0
15,Presidio Heights,0.002817,0.000103,1,0.003383,0.001148,0.00021,6,0.002593,6.0


In [17]:
def getSBDensity(x):
    if x.index_sb == 0:
        return 0.0
    else:
        return x.area / x.index_sb
        
def getNSBDensity(x):
    if x.index_nsb ==0:
        return 0.0
    else:
        return x.area / x.index_nsb
        
def getCoffeeDensity(x):
    if (x.index_nsb + x.index_sb ) ==0:
        return 0.0
    else:
        return x.area / (x.index_nsb + x.index_sb)

In [18]:
nsbRatios = nsbRatios.merge(neighborhoods[['neighborho','area']])
nsbRatios['areaPerSB'] = nsbRatios.apply(getSBDensity,axis=1)
nsbRatios['areaPerNSB'] = nsbRatios.apply(getNSBDensity,axis=1)
nsbRatios['areaPerCoffee'] = nsbRatios.apply(getCoffeeDensity,axis=1)
nsbRatios.head()

Unnamed: 0,neighborho,nearestCoffeeDist_sb,nearestBusStopDist_sb,index_sb,nearestParkDist_sb,nearestCoffeeDist_nsb,nearestBusStopDist_nsb,index_nsb,nearestParkDist_nsb,nsbRatio,area,areaPerSB,areaPerNSB,areaPerCoffee
0,Bayview,0.002994,0.000314,1,0.004877,0.002023,0.001146,13,0.004976,13.0,0.001296,0.001296,0.0001,9.3e-05
1,Castro/Upper Market,0.00106,0.000266,2,0.003271,0.001315,0.000503,14,0.002343,7.0,0.000227,0.000113,1.6e-05,1.4e-05
2,Downtown/Civic Center,0.000949,0.000368,8,0.002273,0.000704,0.000401,22,0.001886,2.75,0.000171,2.1e-05,8e-06,6e-06
3,Financial District,0.000713,0.000457,34,0.003505,0.000555,0.000531,45,0.004371,1.323529,0.000184,5e-06,4e-06,2e-06
4,Inner Richmond,0.00172,0.000766,2,0.004901,0.00137,0.000603,15,0.00312,7.5,0.000348,0.000174,2.3e-05,2e-05


In [19]:
Chart(nsbRatios[['nsbRatio']]).mark_bar().encode(
    x=X('nsbRatio',
        bin=Bin(maxbins=10,),
       ),
    y='count(*)',
)

You can access infer_dtype as pandas.api.types.infer_dtype
  typ = pd.lib.infer_dtype(data)


In [20]:
#nsbRatios.to_csv('sf_ratios_081117.csv',index=False)

In [21]:
neighborhoods

Unnamed: 0,geometry,neighborho,area
0,POLYGON ((-122.4840890111613 37.78791033989035...,Seacliff,0.000191
1,POLYGON ((-122.4359639472376 37.76903838537972...,Haight Ashbury,0.000202
2,POLYGON ((-122.4542835530925 37.70822211924219...,Outer Mission,0.000364
3,"POLYGON ((-122.420951677893 37.80896653451985,...",Russian Hill,0.000126
4,POLYGON ((-122.4255779199574 37.75661672602673...,Noe Valley,0.000237
5,POLYGON ((-122.4514357703022 37.75873500221411...,Inner Sunset,0.000354
6,POLYGON ((-122.4089123326097 37.79013165216868...,Downtown/Civic Center,0.000171
7,POLYGON ((-122.4355269224704 37.74145993624912...,Diamond Heights,9.1e-05
8,POLYGON ((-122.3731329665094 37.83225336477149...,Treasure Island/YBI,0.000235
9,POLYGON ((-122.4710508877837 37.70819787114457...,Lakeshore,0.000965
