### This notebook will be for processing and cleaning the subway turnstile dataset for 2019
- Will need to determine all the turnstile differences for entries between current and last value
- Same for exits 
- This can give me a representation of how many people enter/exit station X in a given time period
- Do this for only stations in Manhattan

In [1]:
import json
import pandas as pd

### Explanation of linking the 3 datasets to get useful information:
- turnstile_data is the dataset obtained from: https://data.ny.gov/Transportation/Turnstile-Usage-Data-2019/xfn5-qji9
- This dataset contains all the info in 2019 for all the turnstiles in the MTA system
- station_data is a dataset obtained from: http://web.mta.info/developers/data/nyct/subway/Stations.csv
- This dataset contains information for all the stations in the MTA system
- The problem that occurs is how to link the two datasets together
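- lookup_data is the table loaded from remote_complex_lookup.csv; it maps each remote/booth pair to a complex id and is used to link the two datasets together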

In [38]:
turnstile_data = pd.read_csv('Turnstile_Usage_Data__2019.csv')
turnstile_data.head()

Unnamed: 0,C/A,Unit,SCP,Station,Line Name,Division,Date,Time,Description,Entries,Exits
0,A033,R170,02-00-05,14 ST-UNION SQ,LNQR456W,BMT,12/27/2019,00:00:00,REGULAR,17538854,7031168
1,A033,R170,02-00-02,14 ST-UNION SQ,LNQR456W,BMT,12/27/2019,00:00:00,REGULAR,14983900,14554087
2,A033,R170,02-06-00,14 ST-UNION SQ,LNQR456W,BMT,12/27/2019,00:00:00,REGULAR,769115,559221
3,A033,R170,02-00-03,14 ST-UNION SQ,LNQR456W,BMT,12/27/2019,00:00:00,REGULAR,7191422,8417203
4,A033,R170,02-06-01,14 ST-UNION SQ,LNQR456W,BMT,12/27/2019,00:00:00,REGULAR,71047673,20925389


In [51]:
# Load the MTA station reference table. Passing the path lets pandas open and
# close the file itself (as the turnstile CSV cell above does) and decode it as
# UTF-8 instead of the platform's default encoding.
station_data = pd.read_csv('stations.csv')
station_data.head()

Unnamed: 0,ogc_fid,station id,complex id,gtfs stop id,division,line,stop name,borough,daytime routes,structure,gtfs latitude,gtfs longitude,north direction label,south direction label
0,1,1.0,1,R01,BMT,Astoria,Astoria - Ditmars Blvd,Q,N W,Elevated,40.775036,-73.912034,,Manhattan
1,2,2.0,2,R03,BMT,Astoria,Astoria Blvd,Q,N W,Elevated,40.770258,-73.917843,Ditmars Blvd,Manhattan
2,3,3.0,3,R04,BMT,Astoria,30 Av,Q,N W,Elevated,40.766779,-73.921479,Astoria - Ditmars Blvd,Manhattan
3,4,4.0,4,R05,BMT,Astoria,Broadway,Q,N W,Elevated,40.76182,-73.925508,Astoria - Ditmars Blvd,Manhattan
4,5,5.0,5,R06,BMT,Astoria,36 Av,Q,N W,Elevated,40.756804,-73.929575,Astoria - Ditmars Blvd,Manhattan


In [52]:
# Load the remote/booth -> complex id lookup table. Passing the path lets pandas
# open and close the file itself and decode it as UTF-8 instead of the
# platform's default encoding.
lookup_data = pd.read_csv('remote_complex_lookup.csv')
lookup_data.head()

Unnamed: 0,remote,booth,complex_id,station,line_name,division
0,R001,A060,635.0,WHITEHALL ST,R1,BMT
1,R001,A058,635.0,WHITEHALL ST,R1,BMT
2,R001,R101S,635.0,SOUTH FERRY,R1,IRT
3,R002,A077,628.0,FULTON ST,ACJZ2345,BMT
4,R002,A081,628.0,FULTON ST,ACJZ2345,BMT


In [81]:
#count the number of null values for complex id
lookup_data['complex_id'].isnull().sum()

0

In [83]:
# Store complex_id as text (the float 635.0 becomes the string '635.0') so that
# it can later be matched against the string ids of the station table.
complex_id_as_text = lookup_data['complex_id'].astype(str)
lookup_data = lookup_data.assign(complex_id=complex_id_as_text)
lookup_data.head()

Unnamed: 0,remote,booth,complex_id,station,line_name,division
0,R001,A060,635.0,WHITEHALL ST,R1,BMT
1,R001,A058,635.0,WHITEHALL ST,R1,BMT
2,R001,R101S,635.0,SOUTH FERRY,R1,IRT
3,R002,A077,628.0,FULTON ST,ACJZ2345,BMT
4,R002,A081,628.0,FULTON ST,ACJZ2345,BMT


### First lets look at all the station_data dataset
- First I will drop all rows that are not in Manhattan

In [53]:
# Remove the stations of the four other boroughs so only Manhattan remains.
# A vectorised mask replaces the row-by-row in-place drop: it is faster and the
# cell can be re-run without a KeyError on labels that were already dropped.
# Rows with a missing borough are kept, exactly as before.
non_manhattan_boroughs = ['Q', 'Bk', 'Bx', 'SI']
is_other_borough = station_data['borough'].isin(non_manhattan_boroughs)
station_data = station_data[~is_other_borough].copy()

station_data.head()

Unnamed: 0,ogc_fid,station id,complex id,gtfs stop id,division,line,stop name,borough,daytime routes,structure,gtfs latitude,gtfs longitude,north direction label,south direction label
6,7,7.0,613,R11,BMT,Astoria,Lexington Av/59 St,M,N W R,Subway,40.76266,-73.967258,Queens,Downtown & Brooklyn
7,8,8.0,8,R13,BMT,Astoria,5 Av/59 St,M,N W R,Subway,40.764811,-73.973347,Queens,Downtown & Brooklyn
8,9,9.0,9,R14,BMT,Broadway - Brighton,57 St - 7 Av,M,N Q R W,Subway,40.764664,-73.980658,Uptown & Queens,Downtown & Brooklyn
9,10,10.0,10,R15,BMT,Broadway - Brighton,49 St,M,N R W,Subway,40.759901,-73.984139,Uptown & Queens,Downtown & Brooklyn
10,11,11.0,611,R16,BMT,Broadway - Brighton,Times Sq - 42 St,M,N Q R W,Subway,40.754672,-73.986754,Uptown & Queens,Downtown & Brooklyn


In [56]:
station_data.tail(20)

Unnamed: 0,ogc_fid,station id,complex id,gtfs stop id,division,line,stop name,borough,daytime routes,structure,gtfs latitude,gtfs longitude,north direction label,south direction label
472,473,475.0,475,Q05,IND,Second Av,96 St,M,Q,Subway,40.784318,-73.947152,,Downtown & Brooklyn
473,474,476.0,476,Q04,IND,Second Av,86 St,M,Q,Subway,40.777891,-73.951787,Uptown,Downtown & Brooklyn
474,475,477.0,477,Q03,IND,Second Av,72 St,M,Q,Subway,40.768799,-73.958424,Uptown,Downtown & Brooklyn
496,497,,R468,,RIT,,Roosevelt Island Tram - Eastbound,,,,40.761186,-73.964191,,
497,498,,R469,,RIT,,Roosevelt Island Tram - Westbound,,,,40.757307,-73.954097,,
498,499,,R540,,PTH,,World Trade Center,,,,40.712052,-74.014129,,
499,500,,R541,,PTH,,33 St,,,,40.74785,-73.989875,,
500,501,,R542,,PTH,,23 St,,,,40.742672,-73.994177,,
501,502,,R543,,PTH,,Exchange Pl,,,,40.716554,-74.033372,,
502,503,,R544,,PTH,,Harrison,,,,40.740472,-74.157344,,


In [64]:
#take out the last 17 station names and put them in a list
station_names = station_data['stop name'].tail(17).tolist()
station_names

['Roosevelt Island Tram - Eastbound',
 'Roosevelt Island Tram - Westbound',
 'World Trade Center',
 '33 St',
 '23 St',
 'Exchange Pl',
 'Harrison',
 '14 St',
 'Pavonia/Newport',
 '9 St',
 'Christopher St',
 'Newark Penn Station',
 'Hoboken',
 'Grove St',
 'Journal Sq',
 'Howard Beach',
 'Jamaica']

In [70]:
# Stop names, chosen from the 17 trailing rows listed above, that should be
# removed from the station table.
station_names_to_drop = [
    'Roosevelt Island Tram - Eastbound',
    'Roosevelt Island Tram - Westbound',
    'Exchange Pl',
    'Harrison',
    'Pavonia/Newport',
    '9 St',
    'Newark Penn Station',
    'Hoboken',
    'Grove St',
    'Journal Sq',
    'Howard Beach',
    'Jamaica',
]

# Boolean mask marking the rows whose stop name is in the list.
mask = station_data['stop name'].isin(station_names_to_drop)

# Keep the remaining rows. The explicit copy makes station_data an independent
# frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning.
station_data = station_data[~mask].copy()

station_data.tail(10)
    

Unnamed: 0,ogc_fid,station id,complex id,gtfs stop id,division,line,stop name,borough,daytime routes,structure,gtfs latitude,gtfs longitude,north direction label,south direction label
470,471,469.0,610,901,IRT,Lexington - Shuttle,Grand Central - 42 St,M,S,Subway,40.752769,-73.979189,Times Sq,
471,472,471.0,471,726,IRT,Flushing,34 St - 11 Av,M,7,Subway,40.755882,-74.00191,Queens,
472,473,475.0,475,Q05,IND,Second Av,96 St,M,Q,Subway,40.784318,-73.947152,,Downtown & Brooklyn
473,474,476.0,476,Q04,IND,Second Av,86 St,M,Q,Subway,40.777891,-73.951787,Uptown,Downtown & Brooklyn
474,475,477.0,477,Q03,IND,Second Av,72 St,M,Q,Subway,40.768799,-73.958424,Uptown,Downtown & Brooklyn
498,499,,R540,,PTH,,World Trade Center,,,,40.712052,-74.014129,,
499,500,,R541,,PTH,,33 St,,,,40.74785,-73.989875,,
500,501,,R542,,PTH,,23 St,,,,40.742672,-73.994177,,
503,504,,R545,,PTH,,14 St,,,,40.738507,-73.998741,,
506,507,,R548,,PTH,,Christopher St,,,,40.732593,-74.008924,,


In [84]:
# The PATH rows (division 'PTH') carry their identifier in 'complex id' and have
# no 'gtfs stop id', so copy the identifier across. Selecting the rows by
# division rather than "the last 5 rows" keeps the cell correct if the row
# order or the number of remaining PATH rows changes.
path_rows = station_data['division'] == 'PTH'
station_data.loc[path_rows, 'gtfs stop id'] = station_data.loc[path_rows, 'complex id']

In [88]:
# Bring 'complex id' into the same textual form as lookup_data['complex_id']
# (a float rendered as text, e.g. '613.0') so the two columns can be merged.
# The suffix is appended only where it is missing, so re-running this cell does
# not produce values such as '613.0.0'.
complex_id_text = station_data['complex id'].astype(str)
already_suffixed = complex_id_text.str.endswith('.0')
station_data['complex id'] = complex_id_text.where(already_suffixed, complex_id_text + '.0')

### Now match up complex_id in station_data with complex_id in lookup_data

In [95]:
# Inner-join the station table to the remote/booth lookup on the textual
# complex id; stations without a lookup entry (and vice versa) are left out.
remote_booth_and_station = station_data.merge(
    lookup_data,
    how='inner',
    left_on='complex id',
    right_on='complex_id',
)
remote_booth_and_station.head()

Unnamed: 0,ogc_fid,station id,complex id,gtfs stop id,division_x,line,stop name,borough,daytime routes,structure,gtfs latitude,gtfs longitude,north direction label,south direction label,remote,booth,complex_id,station,line_name,division_y
0,7,7.0,613.0,R11,BMT,Astoria,Lexington Av/59 St,M,N W R,Subway,40.76266,-73.967258,Queens,Downtown & Brooklyn,R016,N305A,613.0,LEXINGTON-53 ST,EM6,IND
1,7,7.0,613.0,R11,BMT,Astoria,Lexington Av/59 St,M,N W R,Subway,40.76266,-73.967258,Queens,Downtown & Brooklyn,R017,N306,613.0,LEXINGTON-53 ST,EM6,IND
2,7,7.0,613.0,R11,BMT,Astoria,Lexington Av/59 St,M,N W R,Subway,40.76266,-73.967258,Queens,Downtown & Brooklyn,R017,N305,613.0,LEXINGTON-53 ST,EM6,IND
3,7,7.0,613.0,R11,BMT,Astoria,Lexington Av/59 St,M,N W R,Subway,40.76266,-73.967258,Queens,Downtown & Brooklyn,R050,R244,613.0,59 ST,456NQR,IRT
4,7,7.0,613.0,R11,BMT,Astoria,Lexington Av/59 St,M,N W R,Subway,40.76266,-73.967258,Queens,Downtown & Brooklyn,R050,R244A,613.0,59 ST,456NQR,IRT


In [96]:
# Remove the columns that are not needed after the join. 'complex id' goes too,
# because the lookup's 'complex_id' column holds the same key.
columns_to_drop = [
    'ogc_fid',
    'complex id',
    'gtfs stop id',
    'division_x',
    'line',
    'daytime routes',
    'structure',
    'north direction label',
    'south direction label',
    'line_name',
    'division_y',
]
remote_booth_and_station.drop(columns=columns_to_drop, inplace=True)

remote_booth_and_station.head()

Unnamed: 0,station id,stop name,borough,gtfs latitude,gtfs longitude,remote,booth,complex_id,station
0,7.0,Lexington Av/59 St,M,40.76266,-73.967258,R016,N305A,613.0,LEXINGTON-53 ST
1,7.0,Lexington Av/59 St,M,40.76266,-73.967258,R017,N306,613.0,LEXINGTON-53 ST
2,7.0,Lexington Av/59 St,M,40.76266,-73.967258,R017,N305,613.0,LEXINGTON-53 ST
3,7.0,Lexington Av/59 St,M,40.76266,-73.967258,R050,R244,613.0,59 ST
4,7.0,Lexington Av/59 St,M,40.76266,-73.967258,R050,R244A,613.0,59 ST


In [98]:
remote_booth_and_station.head(50)

Unnamed: 0,station id,stop name,borough,gtfs latitude,gtfs longitude,remote,booth,complex_id,station
0,7.0,Lexington Av/59 St,M,40.76266,-73.967258,R016,N305A,613.0,LEXINGTON-53 ST
1,7.0,Lexington Av/59 St,M,40.76266,-73.967258,R017,N306,613.0,LEXINGTON-53 ST
2,7.0,Lexington Av/59 St,M,40.76266,-73.967258,R017,N305,613.0,LEXINGTON-53 ST
3,7.0,Lexington Av/59 St,M,40.76266,-73.967258,R050,R244,613.0,59 ST
4,7.0,Lexington Av/59 St,M,40.76266,-73.967258,R050,R244A,613.0,59 ST
5,7.0,Lexington Av/59 St,M,40.76266,-73.967258,R050,A004,613.0,LEXINGTON AVE
6,7.0,Lexington Av/59 St,M,40.76266,-73.967258,R051,R245,613.0,59 ST
7,7.0,Lexington Av/59 St,M,40.76266,-73.967258,R051,R245A,613.0,59 ST
8,7.0,Lexington Av/59 St,M,40.76266,-73.967258,R051,A002,613.0,LEXINGTON AVE
9,400.0,59 St,M,40.762526,-73.967967,R016,N305A,613.0,LEXINGTON-53 ST


In [97]:
len(remote_booth_and_station)

480

### Lets now look at the turnstile_data dataset
- First I should drop all the rows that do not concern Manhattan stations, so that only Manhattan remains. How to do this is still an open question (??)
- I will convert date and time to unix timestamp, name: timestamp
- I will combine C/A, Unit, SCP and the timestamp to create a unique identifier for each row, name: id
- I will combine C/A, Unit and SCP to create a unique id for each, name: turnstile_id
- I will calculate the entry and exit values for each row, name: entry_diff and exit_diff

### Extract subway names and co-ordinates from overpass turbo json file


In [3]:
#there are 161 stations in this dataset

# Open the geojson file and read it
with open('manSubTurn.geojson') as file:
    data = json.load(file)

# Collect one record per feature and build the frame in a single call instead
# of growing it row by row with .loc, which re-allocates on every insert.
subway_records = []
for feature in data['features']:
    properties = feature['properties']
    coordinates = feature['geometry']['coordinates']
    subway_records.append({
        'name': properties['name'],
        # Not every feature has a short_name; the string 'None' is the
        # placeholder used by the later cells. dict.get replaces the bare
        # `except:`, which would also have hidden unrelated errors.
        'short_name': properties.get('short_name', 'None'),
        # This export stores each point as [longitude, latitude].
        'lat': coordinates[1],
        'lon': coordinates[0],
        # Filled in later with the matching turnstile station name.
        'Link to Turnstile Data': 'None',
    })

subway = pd.DataFrame(
    subway_records,
    columns=['name', 'short_name', 'lat', 'lon', 'Link to Turnstile Data'],
)
    

In [4]:
subway.tail(20)

Unnamed: 0,name,short_name,lat,lon,Link to Turnstile Data
142,Grand Central Terminal,,40.752806,-73.977179,
143,Marble Hill,,40.874924,-73.912702,
144,Manhattan,,41.418606,-87.989013,
145,59th Street,59 St,40.762707,-73.96788,
146,Chambers Street,Chambers St,40.714916,-74.007968,
147,Fulton Street,Fulton St,40.710186,-74.007664,
148,49th Street,49 St,40.759901,-73.984139,
149,34th Street–Herald Square,34 St–Herald Sq,40.749719,-73.987823,
150,23rd Street,23 St,40.742853,-73.992834,
151,Union Square,Union Sq,40.734603,-73.99036,


In [5]:
#remove duplicates? depends on how granular the data for 2019 is

### Now import the turnstile data for 2019

In [23]:
# Load the 2019 turnstile readings. Passing the path lets pandas open and close
# the file itself and decode it as UTF-8 instead of the platform's default
# encoding.
# NOTE(review): this is the same CSV that the earlier cell loads into
# turnstile_data; consider loading it once and reusing that frame.
turn_data = pd.read_csv('Turnstile_Usage_Data__2019.csv')

turn_data.head(20)

Unnamed: 0,C/A,Unit,SCP,Station,Line Name,Division,Date,Time,Description,Entries,Exits
0,A033,R170,02-00-05,14 ST-UNION SQ,LNQR456W,BMT,12/27/2019,00:00:00,REGULAR,17538854,7031168
1,A033,R170,02-00-02,14 ST-UNION SQ,LNQR456W,BMT,12/27/2019,00:00:00,REGULAR,14983900,14554087
2,A033,R170,02-06-00,14 ST-UNION SQ,LNQR456W,BMT,12/27/2019,00:00:00,REGULAR,769115,559221
3,A033,R170,02-00-03,14 ST-UNION SQ,LNQR456W,BMT,12/27/2019,00:00:00,REGULAR,7191422,8417203
4,A033,R170,02-06-01,14 ST-UNION SQ,LNQR456W,BMT,12/27/2019,00:00:00,REGULAR,71047673,20925389
5,A033,R170,02-00-01,14 ST-UNION SQ,LNQR456W,BMT,12/27/2019,00:00:00,REGULAR,2430093,2921770
6,A033,R170,02-00-00,14 ST-UNION SQ,LNQR456W,BMT,12/27/2019,00:00:00,REGULAR,271981,828662
7,A033,R170,02-00-04,14 ST-UNION SQ,LNQR456W,BMT,12/27/2019,00:00:00,REGULAR,6483080,4945335
8,A034,R170,03-00-02,14 ST-UNION SQ,LNQR456W,BMT,12/27/2019,00:00:00,REGULAR,4632187,3269462
9,A034,R170,03-03-00,14 ST-UNION SQ,LNQR456W,BMT,12/27/2019,00:00:00,REGULAR,69926,219187


In [24]:
#find how many unique subway names there are
turn_data['Station'].nunique()

379

#### Columns of interest
- Station, this is the station name which I will compare with Overpass turbo name data to only have manhattan turnstile data
- Date, needed for determining the day of the week and the time of year
- Time, needed for determining the time of day
- Entries, needed for determining how many people entered the station, useful for busyness
- Exits, needed for determining how many people exited the station, useful for busyness
- Might also need to include individual turnstile data to see where people are entering/exiting the station and for comparing that turnstile to its previous value to see the difference

### Drop the rows that are not in Manhattan
#### Change overpass turbo names to match turnstile names

In [8]:
# Upper-case both Overpass name columns so they can be compared with the
# upper-case station names used in the turnstile data.
for name_column in ('name', 'short_name'):
    subway[name_column] = subway[name_column].str.upper()
subway.head(20)

Unnamed: 0,name,short_name,lat,lon,Link to Turnstile Data
0,96TH STREET,96 ST,40.794379,-73.972,
1,BOWLING GREEN,NONE,40.704509,-74.014095,
2,SOUTH FERRY,NONE,40.70171,-74.013149,
3,86TH STREET,86 ST,40.779494,-73.955529,
4,28TH STREET,28 ST,40.743314,-73.984075,
5,14TH STREET–UNION SQUARE,14 ST–UNION SQ,40.735761,-73.990649,
6,28TH STREET,28 ST,40.745494,-73.988691,
7,CANAL STREET,CANAL ST,40.718092,-73.999892,
8,BROADWAY–LAFAYETTE STREET,B’WAY–LAFAYETTE ST,40.725297,-73.996204,
9,5TH AVENUE,5 AV,40.753743,-73.9819,


In [9]:
# Match every Overpass station to a turnstile 'Station' value: try the short
# name first, then the long name. Stations that match neither get a
# 'None for <name>' placeholder and are resolved by hand afterwards.

# Build the set of turnstile station names once. Testing membership against
# the full Station column inside the loop rescans every turnstile row twice
# per station.
turnstile_station_names = set(turn_data['Station'].unique())

stations_to_extract_from_turnstile = []
change_name_stations_to_extract_from_turnstile = []
for i in range(len(subway)):
    name = subway.iloc[i]['name']
    short_name = subway.iloc[i]['short_name']
    if short_name in turnstile_station_names:
        stations_to_extract_from_turnstile.append(short_name)
        subway.loc[i,'Link to Turnstile Data'] = short_name
    elif name in turnstile_station_names:
        stations_to_extract_from_turnstile.append(name)
        subway.loc[i,'Link to Turnstile Data'] = name
    else:
        change_name_stations_to_extract_from_turnstile.append('None for '+name)


print(stations_to_extract_from_turnstile)

['96 ST', 'BOWLING GREEN', 'SOUTH FERRY', '86 ST', '28 ST', '28 ST', 'CANAL ST', 'RECTOR ST', 'CHAMBERS ST', 'WALL ST', 'EAST BROADWAY', 'GRAND ST', 'BOWERY', 'BROAD ST', '2 AV', '51 ST', '7 AV', 'PRINCE ST', 'CANAL ST', '23 ST', 'ASTOR PL', 'BLEECKER ST', 'SPRING ST', '33 ST', '1 AV', 'CANAL ST', '14 ST', '3 AV', 'SPRING ST', '23 ST', '28 ST', '50 ST', 'HOUSTON ST', '18 ST', 'FRANKLIN ST', '72 ST', '77 ST', '96 ST', 'DYCKMAN ST', '125 ST', '125 ST', '135 ST', '145 ST', '79 ST', '86 ST', '110 ST', '116 ST', '103 ST', '125 ST', '155 ST', '116 ST', '103 ST', '116 ST', '14 ST', '57 ST', 'CITY HALL', '155 ST', 'CHAMBERS ST', 'PARK PLACE', 'CORTLANDT ST', 'RECTOR ST', '23 ST', 'DYCKMAN ST', '168 ST', '181 ST', '191 ST', '207 ST', '215 ST', '145 ST', '157 ST', '103 ST', '72 ST', '86 ST', '96 ST', 'CANAL ST', '9TH STREET', '14 ST', '14 ST', '33 ST', 'FULTON ST', 'WALL ST', '125 ST', '145 ST', '181 ST', '190 ST', '23 ST', '23 ST', 'CHRISTOPHER ST', '135 ST', '86 ST', '96 ST', '72 ST', 'CANAL S

In [10]:
subway.head(20)

Unnamed: 0,name,short_name,lat,lon,Link to Turnstile Data
0,96TH STREET,96 ST,40.794379,-73.972,96 ST
1,BOWLING GREEN,NONE,40.704509,-74.014095,BOWLING GREEN
2,SOUTH FERRY,NONE,40.70171,-74.013149,SOUTH FERRY
3,86TH STREET,86 ST,40.779494,-73.955529,86 ST
4,28TH STREET,28 ST,40.743314,-73.984075,28 ST
5,14TH STREET–UNION SQUARE,14 ST–UNION SQ,40.735761,-73.990649,
6,28TH STREET,28 ST,40.745494,-73.988691,28 ST
7,CANAL STREET,CANAL ST,40.718092,-73.999892,CANAL ST
8,BROADWAY–LAFAYETTE STREET,B’WAY–LAFAYETTE ST,40.725297,-73.996204,
9,5TH AVENUE,5 AV,40.753743,-73.9819,


In [11]:
print(change_name_stations_to_extract_from_turnstile)
#will need to change these names manually

['None for 14TH STREET–UNION SQUARE', 'None for BROADWAY–LAFAYETTE STREET', 'None for 5TH AVENUE', 'None for 42ND STREET–GRAND CENTRAL', 'None for 5TH AVENUE–59TH STREET', 'None for HARLEM–148TH STREET', 'None for 163RD STREET–AMSTERDAM AVENUE', 'None for WTC CORTLANDT', 'None for WEST 4TH STREET–WASHINGTON SQUARE', 'None for ESSEX STREET', 'None for LEXINGTON AVENUE–63RD STREET', 'None for 42ND STREET–BRYANT PARK', 'None for LEXINGTON AVENUE–59TH STREET', 'None for 5TH AVENUE–53RD STREET', 'None for 57TH STREET–7TH AVENUE', 'None for 8TH STREET–NEW YORK UNIVERSITY', 'None for 42ND STREET–PORT AUTHORITY BUS TERMINAL', 'None for ROOSEVELT ISLAND', 'None for NEW YORK PENN STATION', 'None for 66TH STREET–LINCOLN CENTER', 'None for WORLD TRADE CENTER', 'None for CENTRAL PARK NORTH–110TH STREET', 'None for BROOKLYN BRIDGE–CITY HALL', 'None for 42ND STREET–TIMES SQUARE', 'None for MARBLE HILL–225TH STREET', 'None for 137TH STREET–CITY COLLEGE', 'None for CATHEDRAL PARKWAY–110TH STREET', 'Non

In [12]:
print(len(change_name_stations_to_extract_from_turnstile))

56


In [13]:
# Hand-picked turnstile station names, one per unmatched Overpass station and
# in the same order as the 'None for <name>' placeholders printed above.
# 'None' / 'None as bus terminal' mark stations with no turnstile equivalent.
manual_turnstile_names = [
    '14 ST-UNION SQ',
    "B'WAY-LAFAYETTE",
    '5 AVE',
    'GRD CNTRL-42 ST',
    '5 AV/59 ST',
    'HARLEM 148 ST',
    '163 ST-AMSTERDM',
    'WTC-CORTLANDT',
    'W 4 ST-WASH SQ',
    'DELANCEY/ESSEX',
    'LEXINGTON AV/63',
    '42 ST-BRYANT PK',
    'None',
    '5 AV/53 ST',
    '57 ST-7 AV',
    '8 ST-NYU',
    'None as bus terminal',
    'ROOSEVELT ISLND',
    '34 ST-PENN STA',
    '66 ST-LINCOLN',
    'WORLD TRADE CTR',
    'CENTRAL PK N110',
    'BROOKLYN BRIDGE',
    'TIMES SQ-42 ST',
    'MARBLE HILL-225',
    '137 ST CITY COL',
    'CATHEDRAL PKWY',
    '116 ST-COLUMBIA',
    '34 ST-PENN STA',
    'TIMES SQ-42 ST',
    'CHRISTOPHER ST',
    'CATHEDRAL PKWY',
    '68ST-HUNTER CO',
    '34 ST-PENN STA',
    'LEXINGTON AV/53',
    '47-50 STS ROCK',
    'DELANCEY/ESSEX',
    '34 ST-HERALD SQ',
    '81 ST-MUSEUM',
    '59 ST COLUMBUS',
    'WHITEHALL S-FRY',
    'WORLD TRADE CTR',
    '14 ST-UNION SQ',
    'GRD CNTRL-42 ST',
    'GRD CNTRL-42 ST',
    'INWOOD-207 ST',
    '125 ST',
    'MARBLE HILL-225',
    'GRD CNTRL-42 ST',
    '34 ST-HERALD SQ',
    '14 ST-UNION SQ',
    '34 ST-HUDSON YD',
    '59 ST COLUMBUS',
    'TIMES SQ-42 ST',
    'TIMES SQ-42 ST',
    'TIMES SQ-42 ST',
]

# The list starts with the 'None for <name>' placeholders. Anything after them
# is a leftover from an earlier run of this cell, so it is removed before the
# names are added; re-running therefore does not append the names twice.
placeholder_count = sum(
    1
    for entry in change_name_stations_to_extract_from_turnstile
    if entry.startswith('None for ')
)
del change_name_stations_to_extract_from_turnstile[placeholder_count:]
change_name_stations_to_extract_from_turnstile.extend(manual_turnstile_names)

In [14]:
len(change_name_stations_to_extract_from_turnstile)

112

- Missing for LEXINGTON AVENUE–59TH STREET

- Have a triple of penn station 34 st
- Duplicate of TIMES SQ-42 ST
- Duplicate of CATHEDRAL PKWY
- Duplicate of DELANCEY/ESSEX
- Duplicate of WORLD TRADE CTR
- Duplicate of 14 ST-UNION SQ
- Quadruple of GRD CNTRL-42 ST
- Duplicate of 125 ST
- Duplicate of MARBLE HILL-225
- 



In [15]:
# Second pass over the Overpass stations: stations that match a turnstile name
# directly are linked as before, and every unmatched station takes the next
# hand-written turnstile name, in order.

# Build the lookup set once instead of rescanning the Station column per row.
turnstile_station_names = set(turn_data['Station'].unique())

# The hand-written names are the entries that are not 'None for <name>'
# placeholders. Deriving them this way replaces the hard-coded offset of 56.
manual_names = iter([
    entry
    for entry in change_name_stations_to_extract_from_turnstile
    if not entry.startswith('None for ')
])

stations_to_extract_from_turnstile = []
for i in range(len(subway)):
    # Write through the row's own label so the assignment updates row i even if
    # the index is not 0..n-1.
    row_label = subway.index[i]
    name = subway.iloc[i]['name']
    short_name = subway.iloc[i]['short_name']
    if short_name in turnstile_station_names:
        stations_to_extract_from_turnstile.append(short_name)
        subway.loc[row_label, 'Link to Turnstile Data'] = short_name
    elif name in turnstile_station_names:
        stations_to_extract_from_turnstile.append(name)
        subway.loc[row_label, 'Link to Turnstile Data'] = name
    else:
        manual_name = next(manual_names, None)
        if manual_name is None:
            raise ValueError('No manual turnstile name left for ' + str(name))
        subway.loc[row_label, 'Link to Turnstile Data'] = manual_name

subway.head(20)

Unnamed: 0,name,short_name,lat,lon,Link to Turnstile Data
0,96TH STREET,96 ST,40.794379,-73.972,96 ST
1,BOWLING GREEN,NONE,40.704509,-74.014095,BOWLING GREEN
2,SOUTH FERRY,NONE,40.70171,-74.013149,SOUTH FERRY
3,86TH STREET,86 ST,40.779494,-73.955529,86 ST
4,28TH STREET,28 ST,40.743314,-73.984075,28 ST
5,14TH STREET–UNION SQUARE,14 ST–UNION SQ,40.735761,-73.990649,14 ST-UNION SQ
6,28TH STREET,28 ST,40.745494,-73.988691,28 ST
7,CANAL STREET,CANAL ST,40.718092,-73.999892,CANAL ST
8,BROADWAY–LAFAYETTE STREET,B’WAY–LAFAYETTE ST,40.725297,-73.996204,B'WAY-LAFAYETTE
9,5TH AVENUE,5 AV,40.753743,-73.9819,5 AVE


In [16]:
#show the entire data frame
#pd.set_option('display.max_rows', None)
#subway

In [17]:
# Keep only the first row for each distinct 'Link to Turnstile Data' value.
is_repeated_link = subway.duplicated(subset=['Link to Turnstile Data'], keep='first')
subway = subway[~is_repeated_link]
len(subway)

96

### Now have 96 subway locations in Manhattan that we can reference in the turnstile data


#### Or is it 146 from station_link?

In [25]:
#station_link['station']
# NOTE(review): stations_link is not defined in any cell visible in this
# notebook, so this cell presumably fails with NameError on a fresh kernel —
# TODO confirm where the frame was meant to come from, or remove the cell.
# man_turn_data built here is reassigned by the cells that follow.
man_turn_data = pd.merge(stations_link, turn_data, left_on='station', right_on='Station')
man_turn_data.head(20)

Unnamed: 0,station,line_names,division,borough,subregion,C/A,Unit,SCP,Station,Line Name,Division,Date,Time,Description,Entries,Exits
0,1 AV,L,BMT,Manhattan,14 to 42,H007,R248,00-03-00,1 AV,L,BMT,12/27/2019,03:00:00,REGULAR,370818021,387993138
1,1 AV,L,BMT,Manhattan,14 to 42,H007,R248,00-00-00,1 AV,L,BMT,12/27/2019,03:00:00,REGULAR,15302393,17102988
2,1 AV,L,BMT,Manhattan,14 to 42,H007,R248,00-03-01,1 AV,L,BMT,12/27/2019,03:00:00,REGULAR,2569788,1109743
3,1 AV,L,BMT,Manhattan,14 to 42,H007,R248,00-03-02,1 AV,L,BMT,12/27/2019,03:00:00,REGULAR,6577135,553888
4,1 AV,L,BMT,Manhattan,14 to 42,H007,R248,00-00-01,1 AV,L,BMT,12/27/2019,03:00:00,REGULAR,60949253,38064126
5,1 AV,L,BMT,Manhattan,14 to 42,H007A,R248,02-03-01,1 AV,L,BMT,12/27/2019,03:00:00,REGULAR,14996,117639
6,1 AV,L,BMT,Manhattan,14 to 42,H007A,R248,02-03-00,1 AV,L,BMT,12/27/2019,03:00:00,REGULAR,3835,192369
7,1 AV,L,BMT,Manhattan,14 to 42,H007A,R248,02-03-04,1 AV,L,BMT,12/27/2019,03:00:00,REGULAR,84669,12012
8,1 AV,L,BMT,Manhattan,14 to 42,H007A,R248,02-03-03,1 AV,L,BMT,12/27/2019,03:00:00,REGULAR,50430,29628
9,1 AV,L,BMT,Manhattan,14 to 42,H007A,R248,02-03-02,1 AV,L,BMT,12/27/2019,03:00:00,REGULAR,32577,62861


In [18]:
#trim the csv data to a new file containing the stations in manhattan
#this will be the data we use for the rest of the project

# NOTE(review): the empty frame created here is replaced when man_turn_data is
# reassigned from pd.merge(subway, turn_data, ...) in the next code cell.
man_turn_data = pd.DataFrame(columns=turn_data.columns)  # Initialize an empty DataFrame with the same columns as turn_data
# The triple-quoted string below is a disabled row-by-row filter. As a bare
# string expression it is never executed; it is only echoed as the cell output.
# It compares every subway row with every turnstile row and relies on
# DataFrame.append, which was removed in pandas 2.0.
'''
for i in range(len(subway)):
    station_to_find=subway['Link to Turnstile Data'][i]
    print(station_to_find)
    for j in range(len(turn_data)):
        if turn_data['Station'][j] == station_to_find:
            man_turn_data = man_turn_data.append(turn_data.iloc[j], ignore_index=True)

man_turn_data
'''

"\nfor i in range(len(subway)):\n    station_to_find=subway['Link to Turnstile Data'][i]\n    print(station_to_find)\n    for j in range(len(turn_data)):\n        if turn_data['Station'][j] == station_to_find:\n            man_turn_data = man_turn_data.append(turn_data.iloc[j], ignore_index=True)\n\nman_turn_data\n"

In [19]:
# Attach every turnstile reading to its Overpass station by joining the curated
# link column to the turnstile 'Station' name; rows without a partner on either
# side are left out.
# NOTE(review): the join key is the station name only — confirm that these
# names do not also occur at stations outside Manhattan.
man_turn_data = subway.merge(
    turn_data,
    how='inner',
    left_on='Link to Turnstile Data',
    right_on='Station',
)


In [20]:
man_turn_data.head(10)

Unnamed: 0,name,short_name,lat,lon,Link to Turnstile Data,C/A,Unit,SCP,Station,Line Name,Division,Date,Time,Description,Entries,Exits
0,96TH STREET,96 ST,40.794379,-73.972,96 ST,R168A,R168,00-00-02,96 ST,123,IRT,12/27/2019,00:00:00,REGULAR,7050714,3195113
1,96TH STREET,96 ST,40.794379,-73.972,96 ST,R168A,R168,00-02-00,96 ST,123,IRT,12/27/2019,00:00:00,REGULAR,6592911,7757191
2,96TH STREET,96 ST,40.794379,-73.972,96 ST,R168A,R168,00-03-02,96 ST,123,IRT,12/27/2019,00:00:00,REGULAR,11982395,11692946
3,96TH STREET,96 ST,40.794379,-73.972,96 ST,R168A,R168,00-00-00,96 ST,123,IRT,12/27/2019,00:00:00,REGULAR,1875264,1933375
4,96TH STREET,96 ST,40.794379,-73.972,96 ST,R168A,R168,00-00-01,96 ST,123,IRT,12/27/2019,00:00:00,REGULAR,6189156,2545523
5,96TH STREET,96 ST,40.794379,-73.972,96 ST,R168A,R168,00-03-01,96 ST,123,IRT,12/27/2019,00:00:00,REGULAR,4028999,5063535
6,96TH STREET,96 ST,40.794379,-73.972,96 ST,R168A,R168,00-03-00,96 ST,123,IRT,12/27/2019,00:00:00,REGULAR,4580292,10195272
7,96TH STREET,96 ST,40.794379,-73.972,96 ST,R169,R168,01-00-01,96 ST,123,IRT,12/27/2019,00:00:00,REGULAR,1620619,988385
8,96TH STREET,96 ST,40.794379,-73.972,96 ST,R169,R168,01-03-02,96 ST,123,IRT,12/27/2019,00:00:00,REGULAR,9245121,8843365
9,96TH STREET,96 ST,40.794379,-73.972,96 ST,R169,R168,01-05-01,96 ST,123,IRT,12/27/2019,00:00:00,REGULAR,0,1607


In [21]:
len(man_turn_data)
# about 4.5 million rows, which is roughly 5.6 million fewer than the original dataset


4576249

### feature selection


In [22]:
# Columns that are not needed for the analysis from here on.
# Open question: should rows whose Description is "recover aud" be disregarded?
unneeded_columns = [
    'short_name',
    'Link to Turnstile Data',
    'Unit',
    'Line Name',
    'Division',
]
man_turn_data.drop(columns=unneeded_columns, inplace=True)

In [23]:
man_turn_data.head(20)

Unnamed: 0,name,lat,lon,C/A,SCP,Station,Date,Time,Description,Entries,Exits
0,96TH STREET,40.794379,-73.972,R168A,00-00-02,96 ST,12/27/2019,00:00:00,REGULAR,7050714,3195113
1,96TH STREET,40.794379,-73.972,R168A,00-02-00,96 ST,12/27/2019,00:00:00,REGULAR,6592911,7757191
2,96TH STREET,40.794379,-73.972,R168A,00-03-02,96 ST,12/27/2019,00:00:00,REGULAR,11982395,11692946
3,96TH STREET,40.794379,-73.972,R168A,00-00-00,96 ST,12/27/2019,00:00:00,REGULAR,1875264,1933375
4,96TH STREET,40.794379,-73.972,R168A,00-00-01,96 ST,12/27/2019,00:00:00,REGULAR,6189156,2545523
5,96TH STREET,40.794379,-73.972,R168A,00-03-01,96 ST,12/27/2019,00:00:00,REGULAR,4028999,5063535
6,96TH STREET,40.794379,-73.972,R168A,00-03-00,96 ST,12/27/2019,00:00:00,REGULAR,4580292,10195272
7,96TH STREET,40.794379,-73.972,R169,01-00-01,96 ST,12/27/2019,00:00:00,REGULAR,1620619,988385
8,96TH STREET,40.794379,-73.972,R169,01-03-02,96 ST,12/27/2019,00:00:00,REGULAR,9245121,8843365
9,96TH STREET,40.794379,-73.972,R169,01-05-01,96 ST,12/27/2019,00:00:00,REGULAR,0,1607


In [24]:
# Order the rows alphabetically by station name (Date and Time are not part of
# the sort key in this cell).
man_turn_data.sort_values(by='name', inplace=True)
man_turn_data.head(20)

Unnamed: 0,name,lat,lon,C/A,SCP,Station,Date,Time,Description,Entries,Exits
3007920,103RD STREET,40.790298,-73.947624,R252,00-00-01,103 ST,11/17/2019,16:00:00,REGULAR,4691357,1945485
3010850,103RD STREET,40.790298,-73.947624,R170,00-00-02,103 ST,10/18/2019,13:00:00,REGULAR,16673625,4540530
3010851,103RD STREET,40.790298,-73.947624,R170,00-00-01,103 ST,10/18/2019,13:00:00,REGULAR,13639657,3993107
3010852,103RD STREET,40.790298,-73.947624,R170,00-03-02,103 ST,10/18/2019,13:00:00,REGULAR,11693983,10975178
3010853,103RD STREET,40.790298,-73.947624,R170,00-00-00,103 ST,10/18/2019,13:00:00,REGULAR,2355096,925048
3010854,103RD STREET,40.790298,-73.947624,R170,00-00-00,103 ST,10/18/2019,13:00:00,REGULAR,2355096,925048
3010855,103RD STREET,40.790298,-73.947624,R170,00-03-01,103 ST,10/18/2019,13:00:00,REGULAR,12771924,10179811
3010856,103RD STREET,40.790298,-73.947624,R170,00-03-01,103 ST,10/18/2019,13:00:00,REGULAR,12771924,10179811
3010857,103RD STREET,40.790298,-73.947624,R170,00-00-01,103 ST,10/18/2019,13:00:00,REGULAR,13639657,3993107
3010858,103RD STREET,40.790298,-73.947624,R170,00-03-02,103 ST,10/18/2019,13:00:00,REGULAR,11693983,10975178


In [25]:
# Combine the Date (MM/DD/YYYY) and Time (HH:MM:SS) strings into one datetime
# column. The explicit format avoids format inference over millions of rows and
# rules out any day/month ambiguity.
man_turn_data['DateTime'] = pd.to_datetime(
    man_turn_data['Date'] + ' ' + man_turn_data['Time'],
    format='%m/%d/%Y %H:%M:%S',
)

# Seconds since the Unix epoch as floats, computed on the whole column instead
# of calling Timestamp.timestamp() once per row. The datetimes carry no time
# zone, so they are treated as UTC — the same result timestamp() gives for
# naive values.
unix_epoch = pd.Timestamp('1970-01-01')
man_turn_data['UnixTimestamp'] = (man_turn_data['DateTime'] - unix_epoch) / pd.Timedelta(seconds=1)
man_turn_data.head(20)


Unnamed: 0,name,lat,lon,C/A,SCP,Station,Date,Time,Description,Entries,Exits,DateTime,UnixTimestamp
3007920,103RD STREET,40.790298,-73.947624,R252,00-00-01,103 ST,11/17/2019,16:00:00,REGULAR,4691357,1945485,2019-11-17 16:00:00,1574006000.0
3010850,103RD STREET,40.790298,-73.947624,R170,00-00-02,103 ST,10/18/2019,13:00:00,REGULAR,16673625,4540530,2019-10-18 13:00:00,1571404000.0
3010851,103RD STREET,40.790298,-73.947624,R170,00-00-01,103 ST,10/18/2019,13:00:00,REGULAR,13639657,3993107,2019-10-18 13:00:00,1571404000.0
3010852,103RD STREET,40.790298,-73.947624,R170,00-03-02,103 ST,10/18/2019,13:00:00,REGULAR,11693983,10975178,2019-10-18 13:00:00,1571404000.0
3010853,103RD STREET,40.790298,-73.947624,R170,00-00-00,103 ST,10/18/2019,13:00:00,REGULAR,2355096,925048,2019-10-18 13:00:00,1571404000.0
3010854,103RD STREET,40.790298,-73.947624,R170,00-00-00,103 ST,10/18/2019,13:00:00,REGULAR,2355096,925048,2019-10-18 13:00:00,1571404000.0
3010855,103RD STREET,40.790298,-73.947624,R170,00-03-01,103 ST,10/18/2019,13:00:00,REGULAR,12771924,10179811,2019-10-18 13:00:00,1571404000.0
3010856,103RD STREET,40.790298,-73.947624,R170,00-03-01,103 ST,10/18/2019,13:00:00,REGULAR,12771924,10179811,2019-10-18 13:00:00,1571404000.0
3010857,103RD STREET,40.790298,-73.947624,R170,00-00-01,103 ST,10/18/2019,13:00:00,REGULAR,13639657,3993107,2019-10-18 13:00:00,1571404000.0
3010858,103RD STREET,40.790298,-73.947624,R170,00-03-02,103 ST,10/18/2019,13:00:00,REGULAR,11693983,10975178,2019-10-18 13:00:00,1571404000.0


In [26]:
# Sort rows by station name (alphabetical) first, then chronologically by
# Unix timestamp within each station.
man_turn_data = man_turn_data.sort_values(
    ['name', 'UnixTimestamp'],
    ascending=True,
)
man_turn_data.head(20)

Unnamed: 0,name,lat,lon,C/A,SCP,Station,Date,Time,Description,Entries,Exits,DateTime,UnixTimestamp
3037515,103RD STREET,40.790298,-73.947624,R252,00-03-02,103 ST,12/29/2018,00:00:00,REGULAR,8476658,26737489,2018-12-29 00:00:00,1546042000.0
3037514,103RD STREET,40.790298,-73.947624,R252,00-00-01,103 ST,12/29/2018,00:00:00,REGULAR,3999244,1607925,2018-12-29 00:00:00,1546042000.0
3037513,103RD STREET,40.790298,-73.947624,R252,00-03-00,103 ST,12/29/2018,00:00:00,REGULAR,408415,557699,2018-12-29 00:00:00,1546042000.0
3037512,103RD STREET,40.790298,-73.947624,R252,00-03-01,103 ST,12/29/2018,00:00:00,REGULAR,1659007,3170302,2018-12-29 00:00:00,1546042000.0
3037511,103RD STREET,40.790298,-73.947624,R170,00-00-00,103 ST,12/29/2018,00:00:00,REGULAR,1634656,656635,2018-12-29 00:00:00,1546042000.0
3037510,103RD STREET,40.790298,-73.947624,R170,00-00-02,103 ST,12/29/2018,00:00:00,REGULAR,16113503,4435458,2018-12-29 00:00:00,1546042000.0
3037509,103RD STREET,40.790298,-73.947624,R170,00-03-00,103 ST,12/29/2018,00:00:00,REGULAR,8149599,4546826,2018-12-29 00:00:00,1546042000.0
3037508,103RD STREET,40.790298,-73.947624,R170,00-03-02,103 ST,12/29/2018,00:00:00,REGULAR,11186555,10491083,2018-12-29 00:00:00,1546042000.0
3037507,103RD STREET,40.790298,-73.947624,R170,00-03-01,103 ST,12/29/2018,00:00:00,REGULAR,12358341,9863905,2018-12-29 00:00:00,1546042000.0
3037506,103RD STREET,40.790298,-73.947624,R170,00-00-01,103 ST,12/29/2018,00:00:00,REGULAR,13143142,3857058,2018-12-29 00:00:00,1546042000.0


- A combination of `C/A` and `SCP` is needed to tell the individual turnstiles apart, since several turnstiles at the same station report a reading at the same date and time.

In [27]:
# Display the last 50 rows of man_turn_data to inspect the end of the frame.
man_turn_data.tail(50)

Unnamed: 0,name,lat,lon,C/A,SCP,Station,Date,Time,Description,Entries,Exits,DateTime,UnixTimestamp
924855,WTC CORTLANDT,40.711115,-74.01227,R109,03-00-02,WTC-CORTLANDT,12/27/2019,16:00:00,REGULAR,299699,126691,2019-12-27 16:00:00,1577462000.0
924854,WTC CORTLANDT,40.711115,-74.01227,R108A,05-03-00,WTC-CORTLANDT,12/27/2019,16:00:00,REGULAR,52502,492805,2019-12-27 16:00:00,1577462000.0
924853,WTC CORTLANDT,40.711115,-74.01227,R108A,05-00-01,WTC-CORTLANDT,12/27/2019,16:00:00,REGULAR,30074,120271,2019-12-27 16:00:00,1577462000.0
924852,WTC CORTLANDT,40.711115,-74.01227,R108A,05-00-00,WTC-CORTLANDT,12/27/2019,16:00:00,REGULAR,18439,345021,2019-12-27 16:00:00,1577462000.0
924851,WTC CORTLANDT,40.711115,-74.01227,R108,02-00-04,WTC-CORTLANDT,12/27/2019,16:00:00,REGULAR,130025,5612,2019-12-27 16:00:00,1577462000.0
924850,WTC CORTLANDT,40.711115,-74.01227,R108,02-00-01,WTC-CORTLANDT,12/27/2019,16:00:00,REGULAR,124235,185865,2019-12-27 16:00:00,1577462000.0
924849,WTC CORTLANDT,40.711115,-74.01227,R108,02-00-02,WTC-CORTLANDT,12/27/2019,16:00:00,REGULAR,177185,74448,2019-12-27 16:00:00,1577462000.0
924848,WTC CORTLANDT,40.711115,-74.01227,R108,02-00-00,WTC-CORTLANDT,12/27/2019,16:00:00,REGULAR,88122,565721,2019-12-27 16:00:00,1577462000.0
924847,WTC CORTLANDT,40.711115,-74.01227,R108,02-00-03,WTC-CORTLANDT,12/27/2019,16:00:00,REGULAR,181054,23472,2019-12-27 16:00:00,1577462000.0
924828,WTC CORTLANDT,40.711115,-74.01227,R106,01-00-04,WTC-CORTLANDT,12/27/2019,16:00:00,REGULAR,705,1584,2019-12-27 16:00:00,1577462000.0


In [28]:
#man_turn_data.to_csv('man_turn_data.csv')