<a href="https://colab.research.google.com/github/CUSPADS2022IBX/IBXRidership/blob/main/Turnstile%20Data%20Processing/MTA_turnstile_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data Source: http://web.mta.info/developers/turnstile.html

Example:

The data below shows the entry/exit register values for one turnstile at control area (A002) from 09/27/14 at 00:00 hours to 09/29/14 at 00:00 hours.


C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
A002,R051,02-00-00,LEXINGTON AVE,456NQR,BMT,09-27-14,00:00:00,REGULAR,0004800073,0001629137,


In [None]:
!pip install pyspark
!pip install --upgrade xlrd



In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.window import Window
from pyspark import SparkFiles

# Reuse the active SparkContext (or start one) and wrap it in a SparkSession.
sc = pyspark.SparkContext.getOrCreate()
spark = SparkSession(sc)

# Date range of the weekly turnstile files to load (one file every 7 days,
# starting at start_date).
start_date = date(2022, 1, 1)
end_date = date(2022, 4, 9)

# Per-interval deltas at or above this value are treated as counter artefacts
# (turnstiles act as odometers and produce a huge jump when they reset).
MAX_NET_COUNT = 10000

# Create a list of 'yymmdd' strings, one per weekly file in the requested range
date_range = list(pd.date_range(start_date, end_date, freq='7D').strftime("%y%m%d"))

# MTA turnstile schema. 'EXITS' kept giving nulls when imported as IntegerType,
# so it is read as a floating-point column and cast to integer afterwards.
# DoubleType is used instead of FloatType: a single-precision float represents
# integers exactly only up to 2**24 (16,777,216), so larger counter readings
# would be rounded before the cast and corrupt the computed deltas.
mta_turnstile_schema = T.StructType([
    T.StructField('C/A', T.StringType(), True),
    T.StructField('UNIT', T.StringType(), True),
    T.StructField('SCP', T.StringType(), True),
    T.StructField('STATION', T.StringType(), True),
    T.StructField('LINENAME', T.StringType(), True),
    T.StructField('DIVISION', T.StringType(), True),
    T.StructField('DATE', T.StringType(), True),
    T.StructField('TIME', T.StringType(), True),
    T.StructField('DESC', T.StringType(), True),
    T.StructField('ENTRIES', T.IntegerType(), True),
    T.StructField('EXITS', T.DoubleType(), True),
])

# Create an empty DataFrame with the schema above to union the weekly files onto
bigdf = spark.createDataFrame([], mta_turnstile_schema)

# Download each weekly .txt file onto the Spark job node, load it into a Spark
# DataFrame and union it onto the accumulated DataFrame.
for date_string in date_range:
    url = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt'.format(date_string)
    spark.sparkContext.addFile(url)
    df = spark.read.csv(SparkFiles.get('turnstile_{}.txt'.format(date_string)), mta_turnstile_schema, header=True)
    bigdf = bigdf.union(df)

# Cast 'EXITS' to IntegerType, and concatenate the 'DATE' and 'TIME' columns
# and parse the result into a timestamp column.
bigdf = (
    bigdf.withColumn('EXITS', bigdf.EXITS.cast(T.IntegerType()))
         .withColumn('timestamp',
                     F.unix_timestamp(F.concat(bigdf.DATE, bigdf.TIME), 'MM/dd/yyyyHH:mm:ss').cast('timestamp'))
)

# Create columns holding a unique observation id ('id') and a unique turnstile
# id ('unit_id') for data processing.
bigdf = (
    bigdf.withColumn('id', F.concat(bigdf['C/A'], bigdf.UNIT, bigdf.SCP, bigdf['timestamp']))
         .withColumn('unit_id', F.concat(bigdf['C/A'], bigdf.UNIT, bigdf.SCP))
)

# Window over each turnstile's readings in chronological order
window = Window.partitionBy('unit_id').orderBy('timestamp')

# Difference each cumulative counter against the previous reading of the same
# turnstile to obtain 'net_entries' and 'net_exits'.
#
# Rows are kept only when BOTH deltas are present and below MAX_NET_COUNT:
#   * F.lag yields NULL for the first reading of every turnstile, so those rows
#     have no delta and are dropped. isNotNull() is required here because
#     comparing a Column with `!= None` evaluates to NULL and never matches.
#   * The conditions are combined with `&`; with `|` a row carrying one huge
#     jump would be kept whenever the other delta happened to be small.
# NOTE(review): negative deltas are not filtered here -- confirm whether they
# should be.
bigdf = (
    bigdf.withColumn('net_entries', F.col('ENTRIES') - F.lag(F.col('ENTRIES'), 1).over(window))
         .withColumn('net_exits', F.col('EXITS') - F.lag(F.col('EXITS'), 1).over(window))
         .where(F.col('net_entries').isNotNull()
                & F.col('net_exits').isNotNull()
                & (F.col('net_entries') < MAX_NET_COUNT)
                & (F.col('net_exits') < MAX_NET_COUNT))
)
             


In [None]:
# Preview: pull the first five processed rows back to the driver.
bigdf.take(num=5)

[Row(C/A='A002', UNIT='R051', SCP='02-00-01', STATION='59 ST', LINENAME='NQR456W', DIVISION='BMT', DATE='12/25/2021', TIME='07:00:00', DESC='REGULAR', ENTRIES=6787892, EXITS=1564335, timestamp=datetime.datetime(2021, 12, 25, 7, 0), id='A002R05102-00-012021-12-25 07:00:00', unit_id='A002R05102-00-01', net_entries=1, net_exits=1),
 Row(C/A='A002', UNIT='R051', SCP='02-00-01', STATION='59 ST', LINENAME='NQR456W', DIVISION='BMT', DATE='12/25/2021', TIME='11:00:00', DESC='REGULAR', ENTRIES=6787895, EXITS=1564348, timestamp=datetime.datetime(2021, 12, 25, 11, 0), id='A002R05102-00-012021-12-25 11:00:00', unit_id='A002R05102-00-01', net_entries=3, net_exits=13),
 Row(C/A='A002', UNIT='R051', SCP='02-00-01', STATION='59 ST', LINENAME='NQR456W', DIVISION='BMT', DATE='12/25/2021', TIME='15:00:00', DESC='REGULAR', ENTRIES=6787910, EXITS=1564352, timestamp=datetime.datetime(2021, 12, 25, 15, 0), id='A002R05102-00-012021-12-25 15:00:00', unit_id='A002R05102-00-01', net_entries=15, net_exits=4),
 Ro

In [None]:
# Load a single weekly turnstile file into pandas to prototype the processing
# steps on a small sample.
test_url = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_220409.txt'
test = pd.read_csv(filepath_or_buffer=test_url)

test

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/02/2022,00:00:00,REGULAR,7698997,2686961
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/02/2022,04:00:00,REGULAR,7699002,2686968
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/02/2022,08:00:00,REGULAR,7699008,2686998
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/02/2022,12:00:00,REGULAR,7699024,2687085
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/02/2022,16:00:00,REGULAR,7699062,2687148


In [None]:
# Combine the DATE and TIME text columns into a single datetime column.
test['timestamp'] = pd.to_datetime(test.DATE + ' ' + test.TIME)

# Unique observation id: turnstile key followed by that row's own timestamp.
# .astype(str) converts element-wise; str(series) would instead append the
# repr of the entire Series to every row, giving all rows of a turnstile the
# same id.
test['obs_id'] = test['C/A'] + test['UNIT'] + test['SCP'] + test['timestamp'].astype(str)

# Unique turnstile id: control area + remote unit + SCP.
test['turnstile_id'] = test['C/A'] + test['UNIT'] + test['SCP']

# ENTRIES is a cumulative counter: difference consecutive readings of each
# turnstile in chronological order. The first reading of every turnstile has
# no predecessor and gets NaN. Assignment aligns on the index, so the sorted
# result lands on the right rows.
test['net_entries'] = (
    test.sort_values(by=['timestamp'], ascending=True)
        .groupby('turnstile_id')['ENTRIES']
        .diff()
)

In [None]:
# Inspect the first 30 rows, including the derived id and net_entries columns.
test.head(n=30)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,timestamp,obs_id,turntstile_id,turnstile_id,net_entries
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/02/2022,00:00:00,REGULAR,7698997,2686961,2022-04-02 00:00:00,A002R05102-00-000 2022-04-02 00:00:00\n...,A002R05102-00-00,A002R05102-00-00,
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/02/2022,04:00:00,REGULAR,7699002,2686968,2022-04-02 04:00:00,A002R05102-00-000 2022-04-02 00:00:00\n...,A002R05102-00-00,A002R05102-00-00,5.0
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/02/2022,08:00:00,REGULAR,7699008,2686998,2022-04-02 08:00:00,A002R05102-00-000 2022-04-02 00:00:00\n...,A002R05102-00-00,A002R05102-00-00,6.0
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/02/2022,12:00:00,REGULAR,7699024,2687085,2022-04-02 12:00:00,A002R05102-00-000 2022-04-02 00:00:00\n...,A002R05102-00-00,A002R05102-00-00,16.0
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/02/2022,16:00:00,REGULAR,7699062,2687148,2022-04-02 16:00:00,A002R05102-00-000 2022-04-02 00:00:00\n...,A002R05102-00-00,A002R05102-00-00,38.0
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/02/2022,20:00:00,REGULAR,7699127,2687203,2022-04-02 20:00:00,A002R05102-00-000 2022-04-02 00:00:00\n...,A002R05102-00-00,A002R05102-00-00,65.0
6,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/03/2022,00:00:00,REGULAR,7699148,2687216,2022-04-03 00:00:00,A002R05102-00-000 2022-04-02 00:00:00\n...,A002R05102-00-00,A002R05102-00-00,21.0
7,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/03/2022,04:00:00,REGULAR,7699154,2687224,2022-04-03 04:00:00,A002R05102-00-000 2022-04-02 00:00:00\n...,A002R05102-00-00,A002R05102-00-00,6.0
8,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/03/2022,08:00:00,REGULAR,7699159,2687237,2022-04-03 08:00:00,A002R05102-00-000 2022-04-02 00:00:00\n...,A002R05102-00-00,A002R05102-00-00,5.0
9,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/03/2022,12:00:00,REGULAR,7699172,2687304,2022-04-03 12:00:00,A002R05102-00-000 2022-04-02 00:00:00\n...,A002R05102-00-00,A002R05102-00-00,13.0


In [None]:
# Load the subway Stations.csv table from the MTA developer site.
stations_url = 'http://web.mta.info/developers/data/nyct/subway/Stations.csv'
stations = pd.read_csv(filepath_or_buffer=stations_url)

stations.head(n=5)

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude,North Direction Label,South Direction Label,ADA,ADA Direction Notes,ADA NB,ADA SB,Capital Outage NB,Capital Outage SB
0,1,1,R01,BMT,Astoria,Astoria-Ditmars Blvd,Q,N W,Elevated,40.775036,-73.912034,,Manhattan,0,,,,,
1,2,2,R03,BMT,Astoria,Astoria Blvd,Q,N W,Elevated,40.770258,-73.917843,Ditmars Blvd,Manhattan,1,,,,,
2,3,3,R04,BMT,Astoria,30 Av,Q,N W,Elevated,40.766779,-73.921479,Astoria - Ditmars Blvd,Manhattan,0,,,,,
3,4,4,R05,BMT,Astoria,Broadway,Q,N W,Elevated,40.76182,-73.925508,Astoria - Ditmars Blvd,Manhattan,0,,,,,
4,5,5,R06,BMT,Astoria,36 Av,Q,N W,Elevated,40.756804,-73.929575,Astoria - Ditmars Blvd,Manhattan,0,,,,,


In [None]:
# Load the Remote-Booth-Station workbook, which lists Remote, Booth, Station,
# Line Name and Division for each turnstile location.
turnstile_key_url = 'http://web.mta.info/developers/resources/nyct/turnstile/Remote-Booth-Station.xls'
turnstile_key = pd.read_excel(io=turnstile_key_url)

turnstile_key.head(n=5)

Unnamed: 0,Remote,Booth,Station,Line Name,Division
0,R001,A060,WHITEHALL ST,R1,BMT
1,R001,A058,WHITEHALL ST,R1,BMT
2,R001,R101S,SOUTH FERRY,R1,IRT
3,R002,A077,FULTON ST,ACJZ2345,BMT
4,R002,A081,FULTON ST,ACJZ2345,BMT
