<a href="https://colab.research.google.com/github/CUSPADS2022IBX/IBXRidership/blob/main/Turnstile%20Data%20Processing/MTA_turnstile_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data Source: http://web.mta.info/developers/turnstile.html

Example:

The data below shows the entry/exit register values for one turnstile at control area (A002) from 09/27/14 at 00:00 hours to 09/29/14 at 00:00 hours


C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
A002,R051,02-00-00,LEXINGTON AVE,456NQR,BMT,09-27-14,00:00:00,REGULAR,0004800073,0001629137,


In [1]:
!pip install pyspark
!pip install --upgrade xlrd

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 42 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 75.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=1dad244d832e05baea4f29c07c2e84f35b40a8cae8fafad47415d34e0663d2b7
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark import SparkFiles

sc = pyspark.SparkContext.getOrCreate()
spark = SparkSession(sc)
spark

In [10]:
start_date = date(2022, 1, 1)
end_date = date(2022, 4, 9)

date_range = list(pd.date_range(start_date, end_date, freq='7D').strftime("%y%m%d"))
date_range

['220101',
 '220108',
 '220115',
 '220122',
 '220129',
 '220205',
 '220212',
 '220219',
 '220226',
 '220305',
 '220312',
 '220319',
 '220326',
 '220402',
 '220409']

In [11]:
mta_turnstile_schema = T.StructType([
  T.StructField('C/A', T.StringType(), True),
  T.StructField('UNIT', T.StringType(), True),
  T.StructField('SCP', T.StringType(), True),
  T.StructField('STATION', T.StringType(), True),
  T.StructField('LINENAME', T.StringType(), True),
  T.StructField('DIVISION', T.StringType(), True),
  T.StructField('DATE', T.StringType(), True),
  T.StructField('TIME', T.StringType(), True),
  T.StructField('DESC', T.StringType(), True),
  T.StructField('ENTRIES', T.IntegerType(), True),
  T.StructField('EXITS', T.FloatType(), True),
  ])

bigdf = spark.createDataFrame([], mta_turnstile_schema)


for date in date_range:
  url = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt'.format(date)
  spark.sparkContext.addFile(url)
  df = spark.read.csv(SparkFiles.get('turnstile_{}.txt'.format(date)), mta_turnstile_schema, header=True)
  bigdf = bigdf.union(df)

In [28]:
bigdf.take(5)

[Row(C/A='A002', UNIT='R051', SCP='02-00-00', STATION='59 ST', LINENAME='NQR456W', DIVISION='BMT', DATE='12/25/2021', TIME='03:00:00', DESC='REGULAR', ENTRIES=7674633, EXITS=2647624.0),
 Row(C/A='A002', UNIT='R051', SCP='02-00-00', STATION='59 ST', LINENAME='NQR456W', DIVISION='BMT', DATE='12/25/2021', TIME='07:00:00', DESC='REGULAR', ENTRIES=7674636, EXITS=2647627.0),
 Row(C/A='A002', UNIT='R051', SCP='02-00-00', STATION='59 ST', LINENAME='NQR456W', DIVISION='BMT', DATE='12/25/2021', TIME='11:00:00', DESC='REGULAR', ENTRIES=7674641, EXITS=2647646.0),
 Row(C/A='A002', UNIT='R051', SCP='02-00-00', STATION='59 ST', LINENAME='NQR456W', DIVISION='BMT', DATE='12/25/2021', TIME='15:00:00', DESC='REGULAR', ENTRIES=7674654, EXITS=2647662.0),
 Row(C/A='A002', UNIT='R051', SCP='02-00-00', STATION='59 ST', LINENAME='NQR456W', DIVISION='BMT', DATE='12/25/2021', TIME='19:00:00', DESC='REGULAR', ENTRIES=7674684, EXITS=2647689.0)]

In [31]:
bigdf.rdd

MapPartitionsRDD[185] at javaToPython at NativeMethodAccessorImpl.java:0

In [25]:
stations = bigdf.select('UNIT').distinct()

In [27]:
stations.count()

469

In [None]:
!wget 'http://web.mta.info/developers/resources/nyct/turnstile/Remote-Booth-Station.xls'

In [3]:
station_key = pd.read_excel('Remote-Booth-Station.xls')

In [6]:
station_key.shape

(768, 5)

In [14]:
bigdf[bigdf['STATION']=='Jay Str.-Metrotech'].show()

+---+----+---+-------+--------+--------+----+----+----+-------+-----+
|C/A|UNIT|SCP|STATION|LINENAME|DIVISION|DATE|TIME|DESC|ENTRIES|EXITS|
+---+----+---+-------+--------+--------+----+----+----+-------+-----+
+---+----+---+-------+--------+--------+----+----+----+-------+-----+

