### Imports

In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import fiona
import shapely.geometry
import common
from shapely.geometry import Point, Polygon

from IPython.display import Image

# enables inline plots, without it plots don't show up in the notebook
%matplotlib inline

### Inputs that will change

In [2]:
# Run parameters: edit these two values to re-target the analysis.
number_of_stations = 5   # how many stations the final income ranking returns
date = '09/24/2017'      # target day as MM/DD/YYYY; only its weekday is used

### Make df of all subway stations

In [3]:
# Download the MTA station list, then overwrite its header with the
# upper-case labels used by the rest of the notebook. The labels are applied
# positionally, so the CSV must have exactly as many columns as df_columns.
df_train_stops = pd.read_csv(
    'http://web.mta.info/developers/data/nyct/subway/Stations.csv'
)
df_columns = [
    'STATION_ID',
    'COMPLEX_ID',
    'GTFS_STOP_ID',
    'DIVISION',
    'LINE',
    'STOP_NAME',
    'BOROUGH',
    'DAYTIME_ROUTES',
    'STRUCTURE',
    'LATITUDE',
    'LONGITUDE',
]
df_train_stops.columns = df_columns

### Load cleaned turnstile data - CHECK PATH

In [4]:
# Directory holding the pickled turnstile data. This is a machine-specific
# absolute path: adjust it before running on another machine.
path = '/Users/murdock/Documents/Metis/MTABenson_metis/pklfiles/'
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles that you produced yourself.
with open(path + 'another100.pkl', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [5]:
# Preview the first rows of the frame loaded from the pickle.
df.head()

Unnamed: 0,DATE,STATION,LINENAME,SCP,TIME,ENTRIES,EXITS,ENTRY_DIFF,EXIT_DIFF,HR_DELTA,WEEKDAY,MAX,DATE_OBJ
0,04/30/2016,59 ST,NQR456,02-00-00,0,5639941,1909983,,,,5,,2016-04-30
1,04/30/2016,59 ST,NQR456,02-00-00,4,5639991,1909993,50.0,10.0,4.0,5,2880.0,2016-04-30
2,04/30/2016,59 ST,NQR456,02-00-00,8,5640014,1910024,23.0,31.0,4.0,5,2880.0,2016-04-30
3,04/30/2016,59 ST,NQR456,02-00-00,12,5640158,1910134,144.0,110.0,4.0,5,2880.0,2016-04-30
4,04/30/2016,59 ST,NQR456,02-00-00,16,5640454,1910197,296.0,63.0,4.0,5,2880.0,2016-04-30


### Function to select top stations on a given weekday

In [6]:

def select_stations(date, number_of_stations, df, extra_candidates=50):
    """Rank station/line combinations by total traffic on one weekday.

    Args:
        date: Day as an 'MM/DD/YYYY' string. Only its weekday
            (Monday=0 ... Sunday=6) is used to filter the data.
        number_of_stations: Number of stations the caller ultimately wants.
        df: Turnstile frame with at least the columns ENTRY_DIFF, EXIT_DIFF,
            WEEKDAY, STATION and LINENAME. Rows containing a NaN in any
            column are ignored. The frame is NOT modified.
        extra_candidates: Additional rows returned on top of
            number_of_stations. The default of 50 keeps the original
            behaviour of returning number_of_stations + 50 rows.

    Returns:
        DataFrame with columns WEEKDAY, STATION, LINENAME, TOTAL, sorted by
        TOTAL (entries + exits) in descending order and holding at most
        number_of_stations + extra_candidates rows.

    Raises:
        ValueError: If date does not match the 'MM/DD/YYYY' format.
    """
    desired_weekday = datetime.strptime(date, '%m/%d/%Y').weekday()

    # Work on a NaN-free copy so the caller's frame keeps its rows and columns.
    complete_rows = df.dropna(how='any')
    traffic = complete_rows.assign(
        TOTAL=complete_rows['ENTRY_DIFF'] + complete_rows['EXIT_DIFF']
    )

    # Aggregate only TOTAL: summing the other columns (dates, turnstile ids)
    # is meaningless and raises or concatenates strings on recent pandas.
    totals = (
        traffic.groupby(['WEEKDAY', 'STATION', 'LINENAME'])['TOTAL']
        .sum()
        .reset_index()
        .sort_values(['TOTAL'], ascending=False)
    )

    on_weekday = totals[totals['WEEKDAY'] == desired_weekday]
    station_df = on_weekday[['WEEKDAY', 'STATION', 'LINENAME', 'TOTAL']]
    return station_df.iloc[:number_of_stations + extra_candidates]
    
# Pass the configured station count instead of a hard-coded 5, so that the
# "inputs that will change" cell actually drives this step.
stations = select_stations(date, number_of_stations, df)

In [7]:
# Preview the busiest station/line combinations for the chosen weekday.
stations.head()

Unnamed: 0,WEEKDAY,STATION,LINENAME,TOTAL
3390,6,GRD CNTRL-42 ST,4567S,54981950000.0
3311,6,CANAL ST,JNQRZ6W,54357200000.0
3166,6,42 ST-PORT AUTH,ACENQRS1237,44207840000.0
3323,6,CHRISTOPHER ST,1,42000980000.0
3167,6,42 ST-PORT AUTH,ACENQRS1237W,41134780000.0


### Function to obtain latitude and longitude for each top station

In [8]:
# Candidate stop names that find_lat_and_long fuzzy-matches station names against.
train_stops_for_mapping = df_train_stops['STOP_NAME']
# Route strings per stop. NOTE(review): not referenced anywhere else in the
# visible notebook; confirm before removing.
train_lines_for_mapping = df_train_stops['DAYTIME_ROUTES']

In [9]:
def find_lat_and_long(station_df):
    """Look up coordinates for each station by fuzzy-matching the MTA stop list.

    For every row of station_df, the STATION value is fuzzy-matched against
    train_stops_for_mapping. The LINENAME value is then fuzzy-matched against
    the DAYTIME_ROUTES of the stops carrying that name, which picks one row
    when several stops share a name.

    Relies on the notebook-level df_train_stops and train_stops_for_mapping.

    Args:
        station_df: Frame with STATION and LINENAME columns.

    Returns:
        A list with one [stop_name, daytime_routes, latitude, longitude]
        entry per input row, in input order.
    """
    plotting_data = []
    # Walk both columns together instead of rebuilding the LINENAME list on
    # every iteration.
    for station, linename in zip(station_df['STATION'], station_df['LINENAME']):
        stop = process.extractOne(station, train_stops_for_mapping, scorer=fuzz.token_set_ratio)
        # Every stop that carries the best-matching name.
        same_name = df_train_stops[df_train_stops['STOP_NAME'] == stop[0]]
        line2 = process.extractOne(linename, same_name['DAYTIME_ROUTES'], scorer=fuzz.token_set_ratio)
        # Filter once and read both coordinates from the same selection.
        match = same_name[same_name['DAYTIME_ROUTES'] == line2[0]]
        plotting_data.append([
            stop[0],
            line2[0],
            match['LATITUDE'].tolist()[0],
            match['LONGITUDE'].tolist()[0],
        ])
    return plotting_data


# Resolve every busy station to a stop / route / coordinate row, then tabulate.
# Naming caveat: the 'STATION' column holds the matched MTA stop name and the
# 'STOP_NAME' column holds the matched daytime-routes string.
station_coord = find_lat_and_long(stations)
station_coord_df = pd.DataFrame(
    data=station_coord,
    columns=['STATION', 'STOP_NAME', 'LAT', 'LONG'],
)

In [10]:
# Preview the matched stop names, routes and coordinates.
station_coord_df.head()

Unnamed: 0,STATION,STOP_NAME,LAT,LONG
0,Grand Central - 42 St,4 5 6,40.751776,-73.976848
1,Canal St,R W,40.719527,-74.001775
2,42 St - Port Authority Bus Terminal,A C E,40.757308,-73.989735
3,Christopher St - Sheridan Sq,1,40.733422,-74.002906
4,42 St - Port Authority Bus Terminal,A C E,40.757308,-73.989735


### Taking top matches and determining block and median income. CHECK FILENAME PATH

In [11]:
# Shapefile with the census block geometries. This is a machine-specific
# absolute path: adjust it before running on another machine.
filename = '/Users/murdock/Documents/metis/MTABenson_metis/thematic_map_shape/nyshapefile.shp'

# Collect one [GEO_ID, raw coordinate list] pair per feature. The shapely
# geometry that used to be built here was never used, so it is no longer
# constructed for every feature.
with fiona.open(filename) as data:
    blocks = [
        [feature['properties']['GEO_ID'], feature['geometry']['coordinates']]
        for feature in data
    ]

In [12]:
# Flatten the coordinate frame into (station, stop, longitude, latitude)
# tuples. Longitude comes first because shapely points are (x, y).
# The station names get their own variable so the `stations` DataFrame from
# the ranking step is not overwritten by a list; overwriting it made the
# earlier find_lat_and_long(stations) cell fail when re-run.
latitudes = station_coord_df['LAT'].tolist()
longitudes = station_coord_df['LONG'].tolist()
station_names = station_coord_df['STATION'].tolist()
stops = station_coord_df['STOP_NAME'].tolist()
coordinates = list(zip(station_names, stops, longitudes, latitudes))

In [13]:
def _block_polygons(coords):
    """Build the shapely Polygon(s) described by a GeoJSON-style coordinate list.

    Handles both layouts found in polygon shapefiles:
      * Polygon:      [ring, ...]         with ring = [(x, y), ...]
      * MultiPolygon: [[ring, ...], ...]  (one more nesting level)

    The first ring of each part is the outer shell and any further rings are
    holes. Parts whose shell has fewer than three vertices are skipped.
    Returns a (possibly empty) list of Polygon objects.
    """
    if not coords or not coords[0]:
        return []
    # In a plain Polygon, coords[0][0] is a vertex, so its first item is a number.
    is_single_polygon = isinstance(coords[0][0][0], (int, float))
    parts = [coords] if is_single_polygon else coords
    return [
        Polygon(rings[0], rings[1:])
        for rings in parts
        if rings and len(rings[0]) >= 3
    ]


# Build each station point once rather than once per (block, station) pair.
# coord[2:] is (longitude, latitude), i.e. shapely's (x, y) order.
station_points = [(coord, Point(coord[2:])) for coord in coordinates]

geo_id = []
for block in blocks:
    polygons = _block_polygons(block[1])
    if not polygons:
        continue
    for coord, point in station_points:
        # A station belongs to the block if any of the block's parts contains it.
        if any(poly.contains(point) for poly in polygons):
            geo_id.append([coord, block[0]])

print(len(geo_id), len(coordinates))
# Guard the sample print so an empty match list does not raise IndexError.
if geo_id:
    print(geo_id[0])

50 55
[('59 St', 'N R', -74.017881000000003, 40.641362000000001), '1500000US360470074002']


### Make a df of stations and which block from census data they are located within

In [14]:
# One row per (station, census block) match. Each geo_id entry is
# [(station, stop, longitude, latitude), block_geo_id]; the unpacking below
# swaps longitude and latitude into the column order declared in `columns`.
# A comprehension is used so no loop variable overwrites the notebook-level
# `stations` name.
columns = ['STATION', 'STOP_NAME', 'LATITUDE', 'LONGITUDE', 'GEO_ID']
geo_data = [
    [station_name, stop_routes, latitude, longitude, block_geo_id]
    for (station_name, stop_routes, longitude, latitude), block_geo_id in geo_id
]
income_df = pd.DataFrame(geo_data, columns=columns)


In [15]:
# Preview the stations together with the GEO_ID of the block containing each.
income_df.head()

Unnamed: 0,STATION,STOP_NAME,LATITUDE,LONGITUDE,GEO_ID
0,59 St,N R,40.641362,-74.017881,1500000US360470074002
1,Times Sq - 42 St,N Q R W,40.754672,-73.986754,1500000US360610113001
2,Times Sq - 42 St,1 2 3,40.75529,-73.987495,1500000US360610113001
3,42 St - Bryant Pk,B D F M,40.754222,-73.984569,1500000US360610113001
4,Coney Island - Stillwell Av,D F N Q,40.577422,-73.981233,1500000US360470350002


### Load census data and merge the df with the previous one to obtain median income near each station. CHECK TABLE PATH

In [16]:
# Census (ACS B19013) income table. This is a machine-specific absolute path:
# adjust it before running on another machine.
income_table = pd.read_csv('/Users/murdock/Documents/metis/MTABenson_metis/ACS_15_5YR_B19013_with_ann.csv')

In [17]:
# Discard the first row of the export (presumably a second, descriptive
# header row; confirm against the CSV).
# NOTE(review): income_table is reassigned from itself, so re-running this
# cell drops one more row each time; reload the CSV before re-running.
income_table = income_table.drop(income_table.index[0])
# Replace the export's column labels (positionally) with the names used below.
income_table.columns=['GEO_ID', 'GEO_ID2', 'GEO_DISPLAY_LABEL', 'MEDIAN_INCOME', 'MARGIN_OF_ERROR']
income_table.head()

Unnamed: 0,GEO_ID,GEO_ID2,GEO_DISPLAY_LABEL,MEDIAN_INCOME,MARGIN_OF_ERROR
1,1500000US360050001000,360050001000,"Block Group 0, Census Tract 1, Bronx County, N...",-,**
2,1500000US360050001001,360050001001,"Block Group 1, Census Tract 1, Bronx County, N...",-,**
3,1500000US360050002000,360050002000,"Block Group 0, Census Tract 2, Bronx County, N...",-,**
4,1500000US360050002001,360050002001,"Block Group 1, Census Tract 2, Bronx County, N...",61344,14654
5,1500000US360050002002,360050002002,"Block Group 2, Census Tract 2, Bronx County, N...",71768,27738


In [18]:
# Attach the census income columns to each station via its block GEO_ID.
# A left join keeps every station row even if the income table has no match.
final_station_selection_df = pd.merge(
    left=income_df,
    right=income_table,
    how='left',
    on='GEO_ID',
)

In [19]:
# Preview the stations with their merged census income columns.
final_station_selection_df.head()

Unnamed: 0,STATION,STOP_NAME,LATITUDE,LONGITUDE,GEO_ID,GEO_ID2,GEO_DISPLAY_LABEL,MEDIAN_INCOME,MARGIN_OF_ERROR
0,59 St,N R,40.641362,-74.017881,1500000US360470074002,360470074002,"Block Group 2, Census Tract 74, Kings County, ...",58804,12572
1,Times Sq - 42 St,N Q R W,40.754672,-73.986754,1500000US360610113001,360610113001,"Block Group 1, Census Tract 113, New York Coun...",82361,29445
2,Times Sq - 42 St,1 2 3,40.75529,-73.987495,1500000US360610113001,360610113001,"Block Group 1, Census Tract 113, New York Coun...",82361,29445
3,42 St - Bryant Pk,B D F M,40.754222,-73.984569,1500000US360610113001,360610113001,"Block Group 1, Census Tract 113, New York Coun...",82361,29445
4,Coney Island - Stillwell Av,D F N Q,40.577422,-73.981233,1500000US360470350002,360470350002,"Block Group 2, Census Tract 350, Kings County,...",28359,20779


### Selects the top stations based on median income

In [20]:
def top_stations(number_of_stations, df):
    """Return the stations with the highest median income.

    Args:
        number_of_stations: Maximum number of rows to return.
        df: Frame with the columns STATION, STOP_NAME, LATITUDE, LONGITUDE
            and MEDIAN_INCOME. MEDIAN_INCOME may hold strings such as
            '58804', '250,000+' or '-'.

    Returns:
        The five columns above for the top rows, ordered by MEDIAN_INCOME
        from highest to lowest. The ordering is numeric, and values that
        contain no number (for example '-') rank last. The MEDIAN_INCOME
        values themselves are returned unchanged.
    """
    # Use the frame that was passed in rather than a notebook-level global.
    selection_df = df[['STATION', 'STOP_NAME', 'LATITUDE', 'LONGITUDE', 'MEDIAN_INCOME']]

    # The income column arrives as text, and sorting text puts '93643' above
    # '126667'. Strip everything except digits and the decimal point, then
    # convert; entries with no digits become NaN.
    income = pd.to_numeric(
        selection_df['MEDIAN_INCOME'].astype(str).str.replace(r'[^0-9.]', '', regex=True),
        errors='coerce',
    )

    # Rank by position so the result is correct for any index on df.
    ranked_positions = (
        income.reset_index(drop=True)
        .sort_values(ascending=False, na_position='last')
        .index
    )
    return selection_df.iloc[ranked_positions[:number_of_stations].tolist()]

# Display the top number_of_stations stations from the income-annotated frame.
top_stations(number_of_stations, final_station_selection_df)

Unnamed: 0,STATION,STOP_NAME,LATITUDE,LONGITUDE,MEDIAN_INCOME
15,Delancey St,F,40.718611,-73.988114,93643
19,Bedford Av,L,40.717304,-73.956872,90370
14,2 Av,F,40.723402,-73.989938,89398
42,Grand St,B D,40.718267,-73.993753,86957
5,Lafayette Av,C,40.686113,-73.973946,86028


In [22]:
# income ranges from original census graph - can use to rank neighborhoods
#8,713 - 46,071
#46,087 - 78,068
#78,162 - 126,406
#126,667 - 249,083