In [39]:
import pandas as pd
import pickle
from haversine import haversine

In [40]:
# Load the pickled Foursquare POI table from the working directory.
# NOTE: unpickling can execute code embedded in the file, so only load pickles from a trusted source.
with open('foursquare_POIs_only.pickle', mode='rb') as poi_file:
    fsq_df = pickle.load(poi_file)

In [41]:
# Keep only the venue id, category and coordinates, and prefix the coordinate
# columns with 'poi_' so they stay distinct from station coordinates added later.
poi_columns = ['venueId', 'venueCategory', 'latitude', 'longitude']
poi_renames = {'latitude': 'poi_lat', 'longitude': 'poi_lon'}
fsq_df = fsq_df[poi_columns].rename(columns=poi_renames)

In [42]:
# Display (rows, columns) of the trimmed POI table before de-duplicating venues.
fsq_df.shape

(7523, 4)

In [43]:
# Reduce the observations to one row per unique venue/POI for the distance calculation.
# drop_duplicates keeps an actual row for each venueId; a per-column groupby().max()
# could pair a category, latitude and longitude taken from different rows of the same venue.
# Rows without a venueId are dropped (as groupby did), and sorting by venueId plus a fresh
# 0..n-1 index reproduces the layout that groupby(as_index=False) returned.
fsq_df_unique = (
    fsq_df
    .dropna(subset=['venueId'])
    .drop_duplicates(subset='venueId')
    .sort_values('venueId')
    .reset_index(drop=True)
)
fsq_df_unique.shape

(2553, 4)

In [44]:
# Load the pickled per-station table (read below via its STATION and ENTRIES columns).
# NOTE: unpickling can execute code embedded in the file, so only load pickles from a trusted source.
with open('weekly_avg_by_station_clean.pickle', mode='rb') as station_file:
    entry_df = pickle.load(station_file)

In [45]:
# Order stations by ENTRIES, largest first, and keep only the first ten rows
# (only the top 10 stations are of interest).
entry_df = entry_df.sort_values(by='ENTRIES', ascending=False).head(10)

In [46]:
# Start the distance table from the station names alone; the row order and
# index of entry_df are carried over unchanged.
distance_df = entry_df['STATION'].to_frame(name='station')

In [47]:
# Hand-entered (latitude, longitude) pairs for the top 10 stations.
# The order matters: entries are matched to stations purely by position.
stat_lat_lons = [
    # 34 St - Herald Square
    (40.750326, -73.988067),
    # Times Square - 42nd St
    (40.755453, -73.987285),
    # 34 St - Penn Station
    (40.750497, -73.990877),
    # 23 St (assumed to be 7th Ave and 23rd St)
    (40.744376, -73.995652),
    # 59 St Columbus
    (40.768868, -73.980959),
    # 59 St (59th and Lexington)
    (40.763191, -73.967855),
    # Grand Central - 42 St
    (40.752962, -73.977219),
    # Chambers St
    (40.715745, -74.009259),
    # 47-50 Sts Rockefeller
    (40.758923, -73.981350),
    # 86 St
    (40.776353, -73.952861),
]
                

In [48]:
# Split the (lat, lon) pairs into two columns; values are assigned by position,
# so stat_lat_lons must be in the same order as the rows of distance_df.
distance_df['station_lat'] = [lat for lat, lon in stat_lat_lons]
distance_df['station_lon'] = [lon for lat, lon in stat_lat_lons]
distance_df

Unnamed: 0,station,station_lat,station_lon
39,34 ST-HERALD SQ,40.750326,-73.988067
200,TIMES SQ-42 ST,40.755453,-73.987285
41,34 ST-PENN STA,40.750497,-73.990877
31,23 ST,40.744376,-73.995652
60,59 ST COLUMBUS,40.768868,-73.980959
59,59 ST,40.763191,-73.967855
143,GRD CNTRL-42 ST,40.752962,-73.977219
115,CHAMBERS ST,40.715745,-74.009259
50,47-50 STS ROCK,40.758923,-73.98135
74,86 ST,40.776353,-73.952861


In [49]:
# Build every station x POI combination: give both frames the same constant key,
# join on it, then discard the key from the combined result.
# (fsq_df_unique keeps its 'merge_column' afterwards, exactly as before.)
fsq_df_unique['merge_column'] = 1
distance_df['merge_column'] = 1

distance_df = pd.merge(distance_df, fsq_df_unique, on='merge_column').drop('merge_column', axis=1)

In [50]:
# Preview the first rows of the station x POI table.
distance_df.head()

Unnamed: 0,station,station_lat,station_lon,venueId,venueCategory,poi_lat,poi_lon
0,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5200eea1ee3,Sushi Restaurant,40.727771,-74.000337
1,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5200fea1ee3,Salad Place,40.73998,-73.986648
2,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a52010e41ee3,Molecular Gastronomy Restaurant,40.719649,-73.984607
3,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5201ae61ee3,Sushi Restaurant,40.728898,-73.998128
4,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5201be61ee3,Clothing Store,40.722283,-73.999529


In [51]:
def distance_to_station(df_row):
    """Return the haversine (great-circle) distance in miles between a row's station and POI.

    Parameters
    ----------
    df_row
        A row exposing ``station_lat``, ``station_lon``, ``poi_lat`` and
        ``poi_lon`` attributes, e.g. the rows passed by
        ``DataFrame.apply(distance_to_station, axis=1)``.

    Returns
    -------
    float
        Distance between the two coordinate pairs, in miles.
    """
    # haversine() returns kilometres when no unit is requested. The old
    # ``miles=True`` keyword is not accepted by later releases of the package
    # (they use ``unit=`` instead), so convert explicitly rather than depend on
    # either keyword. 0.621371 is the factor the ``miles=True`` path applied.
    km_to_miles = 0.621371
    stat_coords = (df_row.station_lat, df_row.station_lon)
    poi_coords = (df_row.poi_lat, df_row.poi_lon)
    return haversine(stat_coords, poi_coords) * km_to_miles

In [52]:
# Compute the station-to-POI distance for every row (one function call per row).
distance_df['distance'] = distance_df.apply(distance_to_station, axis='columns')

In [53]:
# Preview the first rows, now including the computed 'distance' column.
distance_df.head()

Unnamed: 0,station,station_lat,station_lon,venueId,venueCategory,poi_lat,poi_lon,distance
0,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5200eea1ee3,Sushi Restaurant,40.727771,-74.000337,1.685613
1,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5200fea1ee3,Salad Place,40.73998,-73.986648,0.718688
2,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a52010e41ee3,Molecular Gastronomy Restaurant,40.719649,-73.984607,2.127333
3,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5201ae61ee3,Sushi Restaurant,40.728898,-73.998128,1.571412
4,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5201be61ee3,Clothing Store,40.722283,-73.999529,2.028399


In [54]:
# Keep only station/POI pairs whose 'distance' value is below 1 (distances are in miles).
distance_df = distance_df.loc[distance_df['distance'] < 1]
distance_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4014 entries, 1 to 25502
Data columns (total 8 columns):
station          4014 non-null object
station_lat      4014 non-null float64
station_lon      4014 non-null float64
venueId          4014 non-null object
venueCategory    4014 non-null object
poi_lat          4014 non-null float64
poi_lon          4014 non-null float64
distance         4014 non-null float64
dtypes: float64(5), object(3)
memory usage: 282.2+ KB


In [55]:
# Persist the filtered station/POI distance table for later use.
with open('distance_df_v2.pickle', mode='wb') as out_file:
    pickle.dump(distance_df, out_file)

In [56]:
# Count rows per (venueCategory, station) pair and list each category's stations
# from most to fewest POIs. count() fills every non-key column with the per-group
# count, so 'poi_lat' is used here simply as the count to sort on.
(
    distance_df
    .groupby(by=['venueCategory', 'station'], as_index=False)
    .count()
    .sort_values(['venueCategory', 'poi_lat'], ascending=[True, False])
)

Unnamed: 0,venueCategory,station,station_lat,station_lon,venueId,poi_lat,poi_lon,distance
3,Antique Shop,47-50 STS ROCK,4,4,4,4,4,4
5,Antique Shop,59 ST COLUMBUS,4,4,4,4,4,4
9,Antique Shop,TIMES SQ-42 ST,4,4,4,4,4,4
4,Antique Shop,59 ST,3,3,3,3,3,3
8,Antique Shop,GRD CNTRL-42 ST,3,3,3,3,3,3
0,Antique Shop,23 ST,2,2,2,2,2,2
1,Antique Shop,34 ST-HERALD SQ,1,1,1,1,1,1
2,Antique Shop,34 ST-PENN STA,1,1,1,1,1,1
6,Antique Shop,86 ST,1,1,1,1,1,1
7,Antique Shop,CHAMBERS ST,1,1,1,1,1,1
