In [68]:
import pandas as pd
import numpy as np
import pickle
from scipy import stats

In [69]:
# Read the pickled station/POI table from the working directory.
# NOTE: pickle.load can execute arbitrary code; only use this on a trusted file.
with open('distance_df_v2.pickle', 'rb') as distance_file:
    distance_df = pickle.load(distance_file)

# Preview the first rows.
distance_df.head()

Unnamed: 0,station,station_lat,station_lon,venueId,venueCategory,poi_lat,poi_lon,distance
1,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5200fea1ee3,Salad Place,40.73998,-73.986648,0.718688
11,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5204ceb1ee3,Sushi Restaurant,40.738116,-73.987964,0.843644
14,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a52083e31ee3,Sushi Restaurant,40.737531,-73.991127,0.898471
17,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5208ae71ee3,Cupcake Shop,40.75714,-73.993508,0.550242
19,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a520b7e51ee3,Clothing Store,40.739744,-73.994964,0.815413


In [70]:
# Read the pickled per-station entries/exits table from the working directory.
# NOTE: pickle.load can execute arbitrary code; only use this on a trusted file.
with open('Benson_full_analysis.pickle_v2', 'rb') as entries_file:
    station_entries_df = pickle.load(entries_file)

# Preview the first rows.
station_entries_df.head()

Unnamed: 0,STATION,ENTRIES,EXITS,mean_percentage
0,1 AV,108048.545455,121148.909091,1.383897
1,103 ST,173966.090909,108841.727273,2.228176
2,103 ST-CORONA,107435.272727,65010.454545,1.376042
3,104 ST,21531.545455,8188.272727,0.275778
4,110 ST,58447.818182,44527.0,0.748606


In [71]:
# Collapse distance_df to one row per station. count() replaces every other
# column with the number of non-null values in that station's group, so the
# column names no longer describe their contents (e.g. 'distance' is a count).
distance_by_station = (
    distance_df
    .groupby('station', as_index=False)
    .count()
)

In [72]:
# NOTE(review): the groupby in the previous cell assigns to distance_by_station and
# leaves distance_df untouched, so this repeats the first preview. Possibly
# distance_by_station was meant to be inspected here — confirm.
distance_df.head()

Unnamed: 0,station,station_lat,station_lon,venueId,venueCategory,poi_lat,poi_lon,distance
1,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5200fea1ee3,Salad Place,40.73998,-73.986648,0.718688
11,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5204ceb1ee3,Sushi Restaurant,40.738116,-73.987964,0.843644
14,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a52083e31ee3,Sushi Restaurant,40.737531,-73.991127,0.898471
17,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5208ae71ee3,Cupcake Shop,40.75714,-73.993508,0.550242
19,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a520b7e51ee3,Clothing Store,40.739744,-73.994964,0.815413


In [73]:
# Start the priorities table from the station name and its ENTRIES value,
# renamed so that the key column matches distance_df's 'station' column.
station_priorities = (
    station_entries_df
    .loc[:, ['STATION', 'ENTRIES']]
    .rename(columns={'STATION': 'station', 'ENTRIES': 'avg_entries'})
)

# Preview the first rows.
station_priorities.head()

Unnamed: 0,station,avg_entries
0,1 AV,108048.545455
1,103 ST,173966.090909
2,103 ST-CORONA,107435.272727
3,104 ST,21531.545455
4,110 ST,58447.818182


In [74]:
# Attach the per-station counts. This is an inner join (the pandas default, spelled
# out here), so only stations present in both frames are kept.
station_priorities = station_priorities.merge(
    distance_by_station,
    how='inner',
    on='station',
)

In [75]:
# Preview the merged frame. The columns contributed by distance_by_station
# (station_lat ... distance) hold per-station counts, not coordinates or distances.
station_priorities.head()

Unnamed: 0,station,avg_entries,station_lat,station_lon,venueId,venueCategory,poi_lat,poi_lon,distance
0,23 ST,546372.636364,515,515,515,515,515,515,515
1,34 ST-HERALD SQ,594431.181818,506,506,506,506,506,506,506
2,34 ST-PENN STA,784022.727273,498,498,498,498,498,498,498
3,47-50 STS ROCK,255785.818182,484,484,484,484,484,484,484
4,59 ST,328434.909091,341,341,341,341,341,341,341


In [76]:
# Cast avg_entries to integer. The cast drops the fractional part rather than
# rounding (e.g. 784022.727273 becomes 784022).
station_priorities = station_priorities.astype({'avg_entries': int})

In [77]:
# Remove the count columns that are not needed; only 'distance' is kept as the
# per-station count (all of them show the same value in the merged preview).
#
# The frame is reassigned instead of mutated with inplace=True, and
# errors='ignore' is passed so that the cell can be executed more than once:
# the in-place version raised KeyError on a second run because the columns had
# already been removed.
station_priorities = station_priorities.drop(
    columns=['station_lat', 'station_lon', 'venueId', 'venueCategory', 'poi_lat', 'poi_lon'],
    errors='ignore',
)

In [78]:
# 'distance' holds the per-station row count produced by groupby().count(),
# so give it a name that says what it contains.
station_priorities = station_priorities.rename(
    columns={'distance': 'nearby_poi_count'}
)

In [79]:
# Each station's share of the total avg_entries across the stations in this table.
station_priorities = station_priorities.assign(
    entry_weight=lambda frame: frame['avg_entries'] / frame['avg_entries'].sum()
)

In [80]:
# Each station's share of the total nearby_poi_count across the stations in this table.
station_priorities = station_priorities.assign(
    poi_weight=lambda frame: frame['nearby_poi_count'] / frame['nearby_poi_count'].sum()
)

In [81]:
# Three ways of combining the two shares into a single score per station:
# their product, their sum, and their row-wise harmonic mean.
station_priorities = station_priorities.assign(
    combined_weight_product=lambda frame: frame['entry_weight'] * frame['poi_weight'],
    combined_weight_sum=lambda frame: frame['entry_weight'] + frame['poi_weight'],
    combined_weight_harmonic_mean=lambda frame: stats.hmean(
        frame[['entry_weight', 'poi_weight']], axis=1
    ),
)

In [82]:
# Order the stations from highest to lowest harmonic-mean score.
station_priorities = station_priorities.sort_values(
    by='combined_weight_harmonic_mean',
    ascending=False,
)

In [83]:
# Display the whole table; the previous cell sorted it by
# combined_weight_harmonic_mean in descending order.
station_priorities

Unnamed: 0,station,avg_entries,nearby_poi_count,entry_weight,poi_weight,combined_weight_product,combined_weight_sum,combined_weight_harmonic_mean
2,34 ST-PENN STA,784022,498,0.165055,0.124066,0.020478,0.289121,0.141655
8,GRD CNTRL-42 ST,720653,483,0.151714,0.120329,0.018256,0.272043,0.134211
1,34 ST-HERALD SQ,594431,506,0.125142,0.126059,0.015775,0.2512,0.125598
0,23 ST,546372,515,0.115024,0.128301,0.014758,0.243325,0.1213
9,TIMES SQ-42 ST,463714,468,0.097623,0.116592,0.011382,0.214215,0.106267
4,59 ST,328434,341,0.069143,0.084953,0.005874,0.154096,0.076237
5,59 ST COLUMBUS,347784,318,0.073217,0.079223,0.0058,0.152439,0.076101
3,47-50 STS ROCK,255785,484,0.053849,0.120578,0.006493,0.174427,0.074449
7,CHAMBERS ST,287219,295,0.060466,0.073493,0.004444,0.133959,0.066346
6,86 ST,421655,106,0.088768,0.026408,0.002344,0.115176,0.040706


In [84]:
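# Disabled exploratory query: row counts per (venueCategory, station), ordered by
# category name and then by descending count within each category.
#distance_df.groupby(by=['venueCategory','station'], as_index=False).count().sort_values(['venueCategory','poi_lat'], ascending=[True,False])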

In [85]:
# Serialize the station_priorities frame to disk with pickle.
with open('final_station_priorities_v2.pickle', 'wb') as output_file:
    pickle.dump(station_priorities, output_file)
    