<a href="https://colab.research.google.com/github/CUSPADS2022IBX/IBXRidership/blob/main/Turnstile%20Data%20Processing/MTA_turnstile_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data Source: http://web.mta.info/developers/turnstile.html

Example:

The data below shows the entry/exit register values for one turnstile at control area (A002) from 09/27/14 at 00:00 hours to 09/29/14 at 00:00 hours

Schema Example:
C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
A002,R051,02-00-00,LEXINGTON AVE,456NQR,BMT,09-27-14,00:00:00,REGULAR,0004800073,0001629137,

Data cleaning and processing resources used:

1)https://medium.com/qri-io/taming-the-mtas-unruly-turnstile-data-c945f5f96ba0

2)https://toddwschneider.com/dashboards/nyc-subway-turnstiles/#notes


In [72]:
!pip install pyspark
!pip install --upgrade xlrd



In [124]:
import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.window import Window
from pyspark import SparkFiles

sc = pyspark.SparkContext.getOrCreate()
spark = SparkSession(sc)

#The start date has to match the date of the first URL in the MTA turnstile data, otherwise the URL pull will not work. Data is reported every Saturday.
#NOTE: On November 20, 2021 MTA changed their normal turnstile count periods from [12AM, 4AM, 8AM, 12PM, 4PM, 8PM] to [3AM, 7AM, 11AM, 3PM, 7PM, 11PM]
start_date = date(2022,4, 2)
end_date = date(2022, 4, 23)

#Create a list of dates for the date range requested
date_range = list(pd.date_range(start_date, end_date, freq='7D').strftime("%y%m%d"))

#MTA tunrstile schema, 'EXITS' kepts giving nulls when imported as IntegerType
mta_turnstile_schema = T.StructType([
  T.StructField('C/A', T.StringType(), True),
  T.StructField('UNIT', T.StringType(), True),
  T.StructField('SCP', T.StringType(), True),
  T.StructField('STATION', T.StringType(), True),
  T.StructField('LINENAME', T.StringType(), True),
  T.StructField('DIVISION', T.StringType(), True),
  T.StructField('DATE', T.StringType(), True),
  T.StructField('TIME', T.StringType(), True),
  T.StructField('DESC', T.StringType(), True),
  T.StructField('ENTRIES', T.IntegerType(), True),
  T.StructField('EXITS', T.FloatType(), True),
  ])

#Create empty dataframe with previous scheme
bigdf = spark.createDataFrame([], mta_turnstile_schema)

#Download each .txt file on to Spark job node and load into Spark DataFrame and union onto Empty DataFrame we created
for date_string in date_range:
  url = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt'.format(date_string)
  spark.sparkContext.addFile(url)
  df = spark.read.csv(SparkFiles.get('turnstile_{}.txt'.format(date_string)), mta_turnstile_schema, header=True)
  bigdf = bigdf.union(df)

#Change 'EXITS' column data type to IntegerType and concate 'DATE' and 'TIME' columns and cast to datetime
bigdf = bigdf.withColumn('EXITS',bigdf.EXITS.cast(T.IntegerType()))\
             .withColumn('timestamp',
                         F.unix_timestamp(F.concat(bigdf.DATE,bigdf.TIME),'MM/dd/yyyyHH:mm:ss').cast('timestamp'))

#Create columns to represent unique observation id, and unique turnstile id for data processing             
bigdf = bigdf.withColumn('unit_division', F.concat(bigdf.UNIT,bigdf.DIVISION))\
             .withColumn('unit_id', F.concat(bigdf['C/A'],bigdf.UNIT,bigdf.SCP))

#Use utility function window to partition by turnstile and order by timestamp
window = Window.partitionBy('unit_id').orderBy('timestamp')

#Use previous window to find the 'net_entries' and 'net_exits'. Remove all entries that are above 10000, because
#turnstiles act as odometers, and when turnstile reaches end it resets creating a large value. 10000 is a good cutoff.
#Also drop first rows of each turnstile data, because .lag function creates None for first row.
bigdf = bigdf.withColumn('net_entries', F.abs(F.col('ENTRIES') - F.lag(F.col('ENTRIES'), 1).over(window)))\
             .withColumn('net_exits', F.abs(F.col('EXITS') - F.lag(F.col('EXITS'), 1).over(window)))\
             .where((F.col('net_entries')<1000000) |\
                    (F.col('net_exits')<1000000) |\
                    (F.col('net_entries')!=None) |\
                    (F.col('net_exits')!=None))
             
#Aggregate on complex_id, weekend/weekday statu, Morning, Evening, Overnight
entry_exit_df = bigdf.groupBy('timestamp','unit_division').sum('net_entries','net_exits').withColumnRenamed('sum(net_entries)', 'entries')\
                                                                                         .withColumnRenamed('sum(net_exits)', 'exits')

#Create new column for 'DOW' (day of week) to aggregate by weekends and weekdays.
entry_exit_df = entry_exit_df.withColumn('DOW', F.when((F.dayofweek(F.col('timestamp'))<7) & (F.dayofweek(F.col('timestamp'))>1),'weekday')\
                                         .when((F.dayofweek(F.col('timestamp'))==7) | (F.dayofweek(F.col('timestamp'))==1),'weekend'))

#Creates new column 'TOD' (Time of Day)
#NOTE: If you want to analyze data before 11/20/2021 you will have to agregate at different hour intervals
entry_exit_df = entry_exit_df.withColumn('TOD', F.when((F.date_format(F.col('timestamp'), 'HH:mm:ss')> '23:00:00') | (F.date_format(F.col('timestamp'), 'HH:mm:ss')<= '07:00:00'), 'overnight')\
                                         .when((F.date_format(F.col('timestamp'), 'HH:mm:ss')> '07:00:00') & (F.date_format(F.col('timestamp'), 'HH:mm:ss')<= '15:00:00'), 'morning')\
                                         .when((F.date_format(F.col('timestamp'), 'HH:mm:ss')> '11:00:00') & (F.date_format(F.col('timestamp'), 'HH:mm:ss')<= '23:00:00'), 'evening'))

#Upload Remote_complex_lookup table and create key table for unit_division join
#Manually checked if complex_id was correct (google sheets for reference: https://docs.google.com/spreadsheets/d/1kMmoqzq3uWM5J8Esrzi1DPEBrdezzVtQ1Rv5ZAIsEfk/edit?usp=sharing)
remote_complex_url = 'https://raw.githubusercontent.com/qri-io/data-stories-scripts/master/nyc-turnstile-counts/lookup/remote_complex_lookup.csv'
remote_complex = pd.read_csv(remote_complex_url).sort_values('station')
remote_complex['complex_id'] = remote_complex['complex_id'].astype('Int64').astype('str')
remote_complex['unit_division'] = remote_complex['remote ']+remote_complex['division']
remote_complex_spark = spark.createDataFrame(remote_complex)

#join to the entry_exit_df to create unique complex_Id column to aggregate on
entry_exit_df = entry_exit_df.join(remote_complex_spark, entry_exit_df.unit_division==remote_complex_spark.unit_division, 'left')\
                             .select('entries','exits','DOW','TOD','complex_id').dropna()

In [125]:
entry_exit_df.show(50)

+-------+-----+-------+---------+----------+
|entries|exits|    DOW|      TOD|complex_id|
+-------+-----+-------+---------+----------+
|   2446| 1034|weekday|  evening|       613|
|   3252| 1524|weekday|  evening|         8|
|   3252| 1524|weekday|  evening|         8|
|   1740| 5926|weekday|  morning|         9|
|   1740| 5926|weekday|  morning|         9|
|    218| 1838|weekday|  morning|        10|
|    218| 1838|weekday|  morning|        10|
|    218| 1838|weekday|  morning|        10|
|    218| 1838|weekday|  morning|        10|
|   1979| 3230|weekday|  evening|       611|
|    475|  806|weekend|overnight|       607|
|    546|  433|weekend|overnight|       607|
|     53|  492|weekend|  morning|       623|
|     53|  492|weekend|  morning|       623|
|    579|  371|weekday|overnight|       623|
|    579|  371|weekday|overnight|       623|
|   1033| 2193|weekday|  morning|       635|
|   1033| 2193|weekday|  morning|       635|
|    311|  663|weekend|  morning|       625|
|    350| 

In [129]:
#Aggregate on complex_id, weekend/weekday statu, Morning, Evening, Overnight
final_agg_df = entry_exit_df.groupBy('DOW','TOD','complex_id').mean('entries','exits').sort(F.col('complex_ID'))

weekend_morning = final_agg_df.filter((final_agg_df.DOW == 'weekend') & (final_agg_df.TOD == 'morning')).toPandas()
weekend_evening = final_agg_df.filter((final_agg_df.DOW == 'weekend') & (final_agg_df.TOD == 'evening')).toPandas()
weekend_overnight = final_agg_df.filter((final_agg_df.DOW == 'weekend') & (final_agg_df.TOD == 'overnight')).toPandas()
weekday_morning = final_agg_df.filter((final_agg_df.DOW == 'weekday') & (final_agg_df.TOD == 'morning')).toPandas()
weekday_evening = final_agg_df.filter((final_agg_df.DOW == 'weekday') & (final_agg_df.TOD == 'evening')).toPandas()
weekday_overnight = final_agg_df.filter((final_agg_df.DOW == 'weekday') & (final_agg_df.TOD == 'overnight')).toPandas()

In [130]:
weekend_morning

Unnamed: 0,DOW,TOD,complex_id,avg(entries),avg(exits)
0,weekend,morning,1,605.687500,438.312500
1,weekend,morning,10,641.250000,1170.625000
2,weekend,morning,100,107.250000,152.312500
3,weekend,morning,101,343.750000,721.937500
4,weekend,morning,103,98.187500,406.687500
...,...,...,...,...,...
422,weekend,morning,96,324.312500,136.062500
423,weekend,morning,97,500.062500,392.250000
424,weekend,morning,98,329.250000,350.687500
425,weekend,morning,99,231.312500,280.812500


In [132]:
weekday_morning['avg(entries)'].sum() + weekday_evening['avg(entries)'].sum() + weekday_overnight['avg(entries)'].sum()

3199313.0987147233

In [87]:
weekend_morning['avg(net_entries)'].sum() + weekend_evening['avg(net_entries)'].sum() + weekend_overnight['avg(net_entries)'].sum()

1054223.477223908

In [133]:
weekend_evening.loc[:425].to_csv('weekend_evening.csv')
weekend_morning.loc[:425].to_csv('weekend_morning.csv')
weekend_overnight.loc[:425].to_csv('weekend_overnight.csv')
weekday_morning.loc[:425].to_csv('weekday_morning.csv')
weekday_evening.loc[:425].to_csv('weekday_evening.csv')
weekday_orvernight.loc[:425].to_csv('weekday_overnight.csv')

In [None]:
turnstile_stations = list(bigdf.select(['STATION']).distinct().sort(F.col('STATION')).toPandas()['STATION'])
turnstile_stations

In [None]:
turnstile_station_unit= bigdf.select(['STATION', 'UNIT', 'DIVISION']).distinct().sort(F.col('STATION')).toPandas()
turnstile_station_unit.to_csv('turnstile_stations.csv')

In [None]:
stations_url = 'http://web.mta.info/developers/data/nyct/subway/Stations.csv'

stations = pd.read_csv(stations_url).sort_values('Stop Name')

#print(stations.groupby(['Complex ID','Stop Name','GTFS Stop ID','Division']).size().reset_index().rename(columns={0:'count'}))
stations['Stop Name'] =  stations['Stop Name'].str.upper()
#stations['Complex ID', 'GTFS Stop ID','Stop Name'].value_counts()

stations.to_csv('stations_list.csv')

     Complex ID                 Stop Name GTFS Stop ID Division  count
0             1      Astoria-Ditmars Blvd          R01      BMT      1
1             2              Astoria Blvd          R03      BMT      1
2             3                     30 Av          R04      BMT      1
3             4                  Broadway          R05      BMT      1
4             5                     36 Av          R06      BMT      1
..          ...                       ...          ...      ...    ...
491         630        Myrtle-Wyckoff Avs          M08      BMT      1
492         635               South Ferry          142      IRT      1
493         635  Whitehall St-South Ferry          R27      BMT      1
494         636          Jay St-MetroTech          A41      IND      1
495         636          Jay St-MetroTech          R29      BMT      1

[496 rows x 5 columns]


In [None]:
stations_list = stations.groupby(['Complex ID','Stop Name','GTFS Stop ID', 'Division']).size().reset_index().rename(columns={0:'count'})
stations_list.to_csv('stations_list.csv')

In [None]:
stop_name = list(stations[['Stop Name','GTFS Stop ID','Complex ID']].unique())
stop_name

In [None]:
turnstile_key_url = 'http://web.mta.info/developers/resources/nyct/turnstile/Remote-Booth-Station.xls'

turnstile_key = pd.read_excel(turnstile_key_url)
pd.set_option('display.max_rows', turnstile_key.shape[0]+1)

turnstile_key.head()
print(turnstile_key.groupby(['Remote','Station']).size().reset_index().rename(columns={0:'count'}))

In [None]:

remote_complex_spark
#len(remote_complex['station'].unique())
#station_list = list(remote_complex['station'].unique())
#print(remote_complex.groupby(['complex_id','station']).size().reset_index().rename(columns={0:'count'}))

DataFrame[remote : string, booth: string, complex_id: string, station: string, line_name: string, division: string, unit_division: string]