In [27]:
import pandas as pd
import pickle
from haversine import haversine

In [28]:
# Load the pickled Foursquare POI data produced earlier in the project.
# pickle executes arbitrary code on load, so only open files you created yourself.
with open('foursquare_POIs_only.pickle', 'rb') as poi_file:
    fsq_df = pickle.load(poi_file)

In [29]:
# Keep only the venue id, category and coordinates, with the coordinate columns
# prefixed `poi_` so they cannot be confused with the station coordinates that
# are joined on later.
# The rename is done *before* the column selection so the cell is re-runnable:
# rename() silently skips labels that no longer exist, whereas selecting
# 'latitude'/'longitude' on a second run would raise KeyError because fsq_df
# has already been overwritten with the renamed columns.
fsq_df = (
    fsq_df
    .rename(columns={'latitude': 'poi_lat', 'longitude': 'poi_lon'})
    [['venueId', 'venueCategory', 'poi_lat', 'poi_lon']]
)

In [30]:
# Sanity check: (rows, columns) of the trimmed POI frame.
fsq_df.shape

(7523, 4)

In [31]:
# Collapse the frame to one row per venueId so each POI is measured against
# the stations only once.
# NOTE(review): max() is evaluated column by column, so a venue whose rows
# disagree on category or coordinates would end up with a mix of values --
# confirm that all rows of a venue are identical.
fsq_df_unique = fsq_df.groupby('venueId', as_index=False).max()
fsq_df_unique.shape

(2553, 4)

In [32]:
# Load the pickled station-level analysis frame.
# pickle executes arbitrary code on load, so only open files you created yourself.
with open('Benson_full_analysis.pickle', 'rb') as stations_file:
    entry_df = pickle.load(stations_file)

In [33]:
# Order the stations by ENTRIES, largest first, and keep only the first ten rows;
# the rest of the analysis looks at these top 10 stations only.
entry_df = entry_df.sort_values('ENTRIES', ascending=False).head(10)

In [34]:
# Start the distance table from the station names alone; the row order and
# index of entry_df are carried over unchanged.
distance_df = entry_df['STATION'].to_frame(name='station')

In [35]:
# Hand-entered (latitude, longitude) pairs for the top 10 stations.
# The order must match the row order of distance_df, because the values are
# attached to it by position.
stat_lat_lons = [
    (40.750497, -73.990877),  # 34 St - Penn Station
    (40.752962, -73.977219),  # Grand Central - 42 St
    (40.750326, -73.988067),  # 34 St - Herald Square
    (40.744376, -73.995652),  # 23 St (assuming the 7th Ave & 23rd St stop)
    (40.735101, -73.990758),  # 14 St - Union Square
    (40.758092, -73.991457),  # 42 St - Port Authority
    (40.709609, -74.008294),  # Fulton St
    (40.755453, -73.987285),  # Times Square - 42 St
    (40.789614, -73.976299),  # 86 St
    (40.804609, -73.937456),  # 125 St
]

In [36]:
# Attach the hand-entered coordinates by position: pair i belongs to row i.
distance_df['station_lat'] = [lat for lat, _ in stat_lat_lons]
distance_df['station_lon'] = [lon for _, lon in stat_lat_lons]
distance_df

Unnamed: 0,station,station_lat,station_lon
60,34 ST-PENN STA,40.750497,-73.990877
232,GRD CNTRL-42 ST,40.752962,-73.977219
58,34 ST-HERALD SQ,40.750326,-73.988067
45,23 ST,40.744376,-73.995652
14,14 ST-UNION SQ,40.735101,-73.990758
67,42 ST-PORT AUTH,40.758092,-73.991457
225,FULTON ST,40.709609,-74.008294
352,TIMES SQ-42 ST,40.755453,-73.987285
109,86 ST,40.789614,-73.976299
9,125 ST,40.804609,-73.937456


In [37]:
# Give both frames an identical constant key; merging on it pairs every
# station with every POI, which is what the distance calculation needs.
distance_df = distance_df.assign(merge_column=1)
fsq_df_unique = fsq_df_unique.assign(merge_column=1)

In [38]:
# Every row shares the same key value, so this join yields one row for each
# (station, POI) combination.
distance_df = distance_df.merge(fsq_df_unique, on='merge_column')

In [39]:
# The constant join key has served its purpose; remove it from the result.
distance_df = distance_df.drop('merge_column', axis=1)

In [40]:
# Preview the first five station/POI pairs.
distance_df.head(5)

Unnamed: 0,station,station_lat,station_lon,venueId,venueCategory,poi_lat,poi_lon
0,34 ST-PENN STA,40.750497,-73.990877,3fd66200f964a5200eea1ee3,Sushi Restaurant,40.727771,-74.000337
1,34 ST-PENN STA,40.750497,-73.990877,3fd66200f964a5200fea1ee3,Salad Place,40.73998,-73.986648
2,34 ST-PENN STA,40.750497,-73.990877,3fd66200f964a52010e41ee3,Molecular Gastronomy Restaurant,40.719649,-73.984607
3,34 ST-PENN STA,40.750497,-73.990877,3fd66200f964a5201ae61ee3,Sushi Restaurant,40.728898,-73.998128
4,34 ST-PENN STA,40.750497,-73.990877,3fd66200f964a5201be61ee3,Clothing Store,40.722283,-73.999529


In [41]:
def distance_to_station(df_row):
    """Return the haversine distance, in miles, between a row's station and POI.

    Parameters
    ----------
    df_row : row-like object
        Must expose ``station_lat``, ``station_lon``, ``poi_lat`` and
        ``poi_lon`` as attributes (e.g. a row passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The value returned by ``haversine(..., miles=True)`` for the two
    (latitude, longitude) points.
    """
    return haversine(
        (df_row.station_lat, df_row.station_lon),
        (df_row.poi_lat, df_row.poi_lon),
        miles=True,
    )

In [42]:
# Compute the station-to-POI distance row by row and store it as a new column.
distance_df = distance_df.assign(
    distance=distance_df.apply(distance_to_station, axis=1)
)

In [43]:
# Preview the first five rows, now including the distance column.
distance_df.head(5)

Unnamed: 0,station,station_lat,station_lon,venueId,venueCategory,poi_lat,poi_lon,distance
0,34 ST-PENN STA,40.750497,-73.990877,3fd66200f964a5200eea1ee3,Sushi Restaurant,40.727771,-74.000337,1.646483
1,34 ST-PENN STA,40.750497,-73.990877,3fd66200f964a5200fea1ee3,Salad Place,40.73998,-73.986648,0.759627
2,34 ST-PENN STA,40.750497,-73.990877,3fd66200f964a52010e41ee3,Molecular Gastronomy Restaurant,40.719649,-73.984607,2.156551
3,34 ST-PENN STA,40.750497,-73.990877,3fd66200f964a5201ae61ee3,Sushi Restaurant,40.728898,-73.998128,1.539851
4,34 ST-PENN STA,40.750497,-73.990877,3fd66200f964a5201be61ee3,Clothing Store,40.722283,-73.999529,2.001352


In [44]:
# Keep only the station/POI pairs that are strictly less than 1 mile apart,
# then summarise what is left.
distance_df = distance_df.loc[distance_df['distance'] < 1]
distance_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3747 entries, 1 to 25462
Data columns (total 8 columns):
station          3747 non-null object
station_lat      3747 non-null float64
station_lon      3747 non-null float64
venueId          3747 non-null object
venueCategory    3747 non-null object
poi_lat          3747 non-null float64
poi_lon          3747 non-null float64
distance         3747 non-null float64
dtypes: float64(5), object(3)
memory usage: 263.5+ KB


In [45]:
# Persist the filtered station/POI distance table for later use.
with open('distance_df_unique.pickle', 'wb') as out_file:
    pickle.dump(distance_df, out_file)

In [58]:
# Count the nearby venues for each (category, station) pair.
# count() fills every non-key column with the number of non-null values; the
# frame has no nulls, so all of them hold the same count and 'poi_lat' simply
# serves as the count column to rank stations within each category.
(
    distance_df
    .groupby(['venueCategory', 'station'], as_index=False)
    .count()
    .sort_values(['venueCategory', 'poi_lat'], ascending=[True, False])
)

Unnamed: 0,venueCategory,station,station_lat,station_lon,venueId,poi_lat,poi_lon,distance
4,Antique Shop,42 ST-PORT AUTH,4,4,4,4,4,4
6,Antique Shop,TIMES SQ-42 ST,4,4,4,4,4,4
0,Antique Shop,14 ST-UNION SQ,3,3,3,3,3,3
5,Antique Shop,GRD CNTRL-42 ST,3,3,3,3,3,3
1,Antique Shop,23 ST,2,2,2,2,2,2
2,Antique Shop,34 ST-HERALD SQ,1,1,1,1,1,1
3,Antique Shop,34 ST-PENN STA,1,1,1,1,1,1
9,Bridal Shop,34 ST-HERALD SQ,5,5,5,5,5,5
7,Bridal Shop,14 ST-UNION SQ,4,4,4,4,4,4
8,Bridal Shop,23 ST,4,4,4,4,4,4
