In [345]:
import pandas as pd
import numpy as np

In [346]:
# load one week of turnstile readings straight from the MTA developer site
turnstile_url = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_160213.txt'
data = pd.read_csv(turnstile_url)

In [347]:
# list the column headings to check for malformed names (EXITS comes back padded with trailing spaces)
data.columns

Index([u'C/A', u'UNIT', u'SCP', u'STATION', u'LINENAME', u'DIVISION', u'DATE',
       u'TIME', u'DESC', u'ENTRIES',
       u'EXITS                                                               '],
      dtype='object')

In [348]:
# fix the EXITS column heading, which arrives padded with trailing whitespace.
# Stripping every heading (instead of matching one exact run of spaces) keeps
# this working if the amount of padding differs, and is safe to re-run.
# A lambda is used rather than str.strip because headings may be unicode objects.
data.rename(columns=lambda name: name.strip(), inplace=True)
data.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456,BMT,02/06/2016,03:00:00,REGULAR,5530860,1867106
1,A002,R051,02-00-00,59 ST,NQR456,BMT,02/06/2016,07:00:00,REGULAR,5530874,1867120
2,A002,R051,02-00-00,59 ST,NQR456,BMT,02/06/2016,11:00:00,REGULAR,5530973,1867216
3,A002,R051,02-00-00,59 ST,NQR456,BMT,02/06/2016,15:00:00,REGULAR,5531226,1867280
4,A002,R051,02-00-00,59 ST,NQR456,BMT,02/06/2016,19:00:00,REGULAR,5531634,1867368


In [349]:
# build a per-turnstile identifier by joining the UNIT and SCP codes
data['TURNSTILE'] = data['UNIT'].str.cat(data['SCP'])

In [350]:
# list the station names served by a given line so the spelling used below matches the data
data.loc[data['LINENAME'] == 'GL', 'STATION'].unique()

array(['METROPOLITAN AV'], dtype=object)

In [351]:
# keep only the rows belonging to the target station
# (for a uniquely named station a single test is enough, e.g. data['STATION'] == 'LORIMER ST')
# two stations are called METROPOLITAN AV, so the line name is needed to tell them apart
is_target_name = data['STATION'] == 'METROPOLITAN AV'
is_target_line = data['LINENAME'] == 'GL'
station = data.loc[is_target_name & is_target_line]
station.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,TURNSTILE
89754,N409,R268,00-00-00,METROPOLITAN AV,GL,IND,02/06/2016,00:00:00,REGULAR,4228752,9771582,R26800-00-00
89755,N409,R268,00-00-00,METROPOLITAN AV,GL,IND,02/06/2016,04:00:00,REGULAR,4228865,9771821,R26800-00-00
89756,N409,R268,00-00-00,METROPOLITAN AV,GL,IND,02/06/2016,08:00:00,REGULAR,4228882,9771871,R26800-00-00
89757,N409,R268,00-00-00,METROPOLITAN AV,GL,IND,02/06/2016,12:00:00,REGULAR,4228945,9772013,R26800-00-00
89758,N409,R268,00-00-00,METROPOLITAN AV,GL,IND,02/06/2016,16:00:00,REGULAR,4229121,9772446,R26800-00-00


In [352]:
# create lists of turnstiles and dates; these will become our column and index values
turn_list = np.asarray(station.TURNSTILE.unique())
date_list = np.asarray(station.DATE.unique())

# Each print takes a single parenthesized argument so the cell produces the
# same text under Python 2 (print statement) and Python 3 (print function).
print('Number of turnstiles: %d' % len(turn_list))
print(turn_list)
print('')
print('- - ' * 20)
print('')
print('Days of the week: %d' % len(date_list))
print(date_list)

Number of turnstiles: 7
['R26800-00-00' 'R26800-00-01' 'R26800-00-02' 'R26800-03-00' 'R26800-03-01'
 'R26800-06-00' 'R26800-06-01']

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

Days of the week: 6
['02/06/2016' '02/07/2016' '02/08/2016' '02/09/2016' '02/10/2016'
 '02/11/2016']


### This assumes that the number of people exiting through a turnstile on a given day is the maximum (end of the day) minus the minimum (beginning of the day) value of its EXITS counter

In [353]:
# Daily exit count per (date, turnstile): largest minus smallest EXITS reading
# recorded for that turnstile on that date. Values are appended date by date,
# turnstile by turnstile, so the flat list can later be reshaped to
# (number of dates) x (number of turnstiles).
my_list = []
for d in date_list:
    for i in turn_list:
        # select the readings once instead of once for max and once for min
        readings = station.EXITS[(station['TURNSTILE'] == i) & (station['DATE'] == d)]
        if len(readings) == 0:
            # No readings for this turnstile on this date (the metropolitan ave
            # issue): record a missing value. Previously the count computed for
            # the preceding turnstile/date was appended again here, which
            # fabricated data (or raised NameError on the very first pair).
            my_list.append(np.nan)
        else:
            my_list.append(readings.max() - readings.min())

In [354]:
# my_list is flat (one value per date/turnstile pair, dates outermost), so fold it
# into a 2-D array with one row per date and one column per turnstile;
# sizes come from len() so any number of days and turnstiles works
n_dates, n_turnstiles = len(date_list), len(turn_list)
turn_array = np.array(my_list).reshape((n_dates, n_turnstiles))
# plain Python list of turnstile ids for easier dataframe creation
turn_list = station.TURNSTILE.unique().tolist()

In [355]:
# one row per turnstile, one column per date (the transposed layout is more intelligible)
final = pd.DataFrame(turn_array.T, index=turn_list, columns=date_list)
final

Unnamed: 0,02/06/2016,02/07/2016,02/08/2016,02/09/2016,02/10/2016,02/11/2016
R26800-00-00,1455,1213,1234,1295,1328,320
R26800-00-01,787,696,508,490,552,119
R26800-00-02,391,351,202,227,207,62
R26800-03-00,365,299,264,264,286,92
R26800-03-01,906,754,646,719,737,205
R26800-06-00,249,218,148,183,156,65
R26800-06-01,734,683,450,591,598,309


## Don't forget to change file names below

In [356]:
# output file name is edited by hand for each station, on purpose
output_path = 'USI/station_metropolitan.csv'
final.to_csv(output_path)