In [1]:
#Required Packages
import numpy as np
import pandas as pd
import datetime
from datetime import timedelta
import urllib.request
import matplotlib.pyplot as plt
import pickle

In [2]:
#Setup Configs
%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 25)
pd.set_option('display.precision', 3)

**Turnstile Data Import**

In [3]:
#Functions to retreive MTA turnstile data from http://web.mta.info/developers/turnstile.html

def get_next_weekday(startdate, weekday):
    """
    @startdate: given date, in format '2019-07-25'
    @weekday: week day as a integer, between 0 (Monday) to 6 (Sunday)
    """
    d = datetime.datetime.strptime(startdate, '%Y-%m-%d')
    t = timedelta((7 + weekday - d.weekday()) % 7)
    return (d + t).strftime('%Y-%m-%d')

def get_weekly_days(startdate,enddate):
    """
    @startdate: given date, in format '2019-07-25'. Must start on desired weekday
    @enddate: given date, in format '2019-07-25'. 
    """
    return pd.date_range(start=startdate, end=enddate, freq="7D")
    

def mta_import(startdate,enddate):
    """
    @startdate: given date, in format '2019-07-25'. Must start on desired weekday
    @enddate: given date, in format '2019-07-25'. 
    """
    base_url = ["http://web.mta.info/developers/data/nyct/turnstile/turnstile_",".txt"]
    date_range = get_weekly_days(get_next_weekday(startdate,5),enddate)
    df_list = []
    for day in date_range:
        try:
            df_list.append(pd.read_csv(day.strftime("%y%m%d").join(base_url)))
            print("Getting turnstile data for " + day.strftime("%Y-%m-%d"))
        except urllib.error.HTTPError as err:
            print("Failed to retreive turnstile data for " + day.strftime("%Y-%m-%d"))
    return pd.concat(df_list)

In [4]:
mta_data = mta_import("2019-04-01","2019-06-01").reset_index(drop=True)
unique_id = ['C/A','UNIT','SCP','STATION'] #unique turnstile reference used later on for grouping
mta_data.head(5)

Getting turnstile data for 2019-04-06
Getting turnstile data for 2019-04-13
Getting turnstile data for 2019-04-20
Getting turnstile data for 2019-04-27
Getting turnstile data for 2019-05-04
Getting turnstile data for 2019-05-11
Getting turnstile data for 2019-05-18
Getting turnstile data for 2019-05-25
Getting turnstile data for 2019-06-01


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/30/2019,00:00:00,REGULAR,6999064,2373568
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/30/2019,04:00:00,REGULAR,6999084,2373576
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/30/2019,08:00:00,REGULAR,6999107,2373622
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/30/2019,12:00:00,REGULAR,6999214,2373710
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/30/2019,16:00:00,REGULAR,6999451,2373781


In [5]:
#generating date and datetime values
mta_data['DATETIME'] = mta_data['DATE'] + ' ' + mta_data['TIME']
mta_data['DATETIME'] = pd.to_datetime(mta_data['DATETIME'], format = '%m/%d/%Y %H:%M:%S')
mta_data['DATE'] = mta_data['DATETIME'].dt.date
mta_data['TIME'] = mta_data['DATETIME'].dt.time

**Pre Transform Cleanup**

In [6]:
# Sanity check to verify that "C/A", "UNIT", "SCP", "STATION", "DATETIME" is unique
(mta_data
 .groupby(unique_id + ["DATETIME"])  
 .ENTRIES.count()
 .reset_index()  # or use as_index = False; otherwise makes groupby columns new index 
 .sort_values("ENTRIES", ascending=False)).head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATETIME,ENTRIES
913339,N525,R142,01-00-03,DELANCEY/ESSEX,2019-05-11 05:00:00,2
362666,JFK03,R536,00-00-01,JFK JAMAICA CT1,2019-04-07 01:00:00,2
913721,N525,R142,01-00-04,DELANCEY/ESSEX,2019-05-11 05:00:00,2
1504257,R283,R221,00-00-01,167 ST,2019-04-15 12:00:00,2
1504642,R283,R221,00-00-02,167 ST,2019-04-16 04:00:00,2


In [7]:
mta_data.DESC.value_counts()

REGULAR       1832524
RECOVR AUD       8339
Name: DESC, dtype: int64

In [8]:
# Get rid of the duplicate entry
mta_data.sort_values(unique_id + ["DATETIME"], inplace=True, \
                          ascending=False)
mta_data.drop_duplicates(subset= unique_id + ["DATETIME"], inplace=True)

In [9]:
# Sanity check to verify that "C/A", "UNIT", "SCP", "STATION", "DATETIME" is unique
(mta_data
 .groupby(unique_id + ["DATETIME"])  
 .ENTRIES.count()
 .reset_index()  # or use as_index = False; otherwise makes groupby columns new index 
 .sort_values("ENTRIES", ascending=False)).head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATETIME,ENTRIES
0,A002,R051,02-00-00,59 ST,2019-03-30 00:00:00,1
1227142,R142,R293,01-00-02,34 ST-PENN STA,2019-05-17 06:00:00,1
1227224,R142,R293,01-00-02,34 ST-PENN STA,2019-05-30 06:00:00,1
1227223,R142,R293,01-00-02,34 ST-PENN STA,2019-05-30 02:00:00,1
1227222,R142,R293,01-00-02,34 ST-PENN STA,2019-05-29 22:00:00,1


In [10]:
mta_data.columns = [column.strip() for column in mta_data.columns]
#Reordered columns and dropped "DESC" and "DIVISION" columns 
mta_data = mta_data[unique_id + ['LINENAME','DATETIME','DATE','TIME','ENTRIES','EXITS']]

In [12]:
#checking the number of time "buckets" in the dataset - we'd prefer to work with the same set of 6 times
(mta_data
 .groupby(['TIME'])  
 .ENTRIES.count()
 .reset_index()  # or use as_index = False; otherwise makes groupby columns new index 
 .sort_values("ENTRIES", ascending=False)).head(10)

Unnamed: 0,TIME,ENTRIES
0,00:00:00,155512
8529,04:00:00,155487
35356,16:00:00,155475
17319,08:00:00,155463
26748,12:00:00,155420
44032,20:00:00,155352
19872,09:00:00,114250
37543,17:00:00,114184
10665,05:00:00,114156
46228,21:00:00,114145


In [13]:
#Rounding all datetimes to the nearest 0:00,4:00,8:00,12:00,16:00,20:00 - breaks on YoY changes
def datetime_round(dt):
    rounding = 4*round((float(dt.hour) + float(dt.minute)/60 + float(dt.second)/60/60) / 4)
    if rounding == 24: #when a time rounds up to 24 hours, we have to switch to midnight of the next day
        day = 1
        hour = 0
    else:
        day = 0
        hour = rounding
    try:
        return datetime.datetime(dt.year, dt.month, dt.day + day, hour)
    except ValueError:
        return datetime.datetime(dt.year, dt.month + 1, 1, hour)
    
mta_data_retime = mta_data.copy(deep=True)
    
mta_data_retime['DATETIME'] = mta_data_retime['DATETIME'].apply(datetime_round)
                                                  
mta_data_retime['DATE'] = mta_data_retime['DATETIME'].dt.date
mta_data_retime['TIME'] = mta_data_retime['DATETIME'].dt.time

In [14]:
#checking the evenness of time buckets - they should hopefully be even
(mta_data_retime
 .groupby(['TIME'])  
 .ENTRIES.count()
 .reset_index()  # or use as_index = False; otherwise makes groupby columns new index 
 .sort_values("ENTRIES", ascending=False)).head(10)

Unnamed: 0,TIME,ENTRIES
2,08:00:00,325270
4,16:00:00,312065
0,00:00:00,309851
3,12:00:00,300219
5,20:00:00,297394
1,04:00:00,296026


In [15]:
# Checking to see if duplicate time entries were made (they were)
(mta_data_retime
 .groupby(unique_id + ["DATETIME"])  
 .ENTRIES.count()
 .reset_index()  # or use as_index = False; otherwise makes groupby columns new index 
 .sort_values("ENTRIES", ascending=False)).head(10)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATETIME,ENTRIES
1327379,R210,R044,00-03-01,BROOKLYN BRIDGE,2019-05-14 20:00:00,66
1327748,R210,R044,00-03-02,BROOKLYN BRIDGE,2019-05-14 20:00:00,66
1326267,R210,R044,00-00-00,BROOKLYN BRIDGE,2019-05-14 20:00:00,66
1328859,R210,R044,00-03-05,BROOKLYN BRIDGE,2019-05-14 20:00:00,66
1327009,R210,R044,00-03-00,BROOKLYN BRIDGE,2019-05-14 20:00:00,66
1329232,R210,R044,00-05-00,BROOKLYN BRIDGE,2019-05-14 20:00:00,66
1329610,R210,R044,00-05-01,BROOKLYN BRIDGE,2019-05-14 20:00:00,66
1330340,R210,R044,00-06-01,BROOKLYN BRIDGE,2019-05-14 20:00:00,66
1329966,R210,R044,00-06-00,BROOKLYN BRIDGE,2019-05-14 20:00:00,66
1328487,R210,R044,00-03-04,BROOKLYN BRIDGE,2019-05-14 20:00:00,66


In [16]:
#checking to see the problem days/turnstiles where many rows of the same time stamp are created
mask = ((mta_data["C/A"] == "R229") & 
        (mta_data["UNIT"] == "R143") & 
        (mta_data["SCP"] == "01-00-03") & 
        (mta_data["STATION"] == "28 ST") &
        (mta_data["DATE"] == datetime.date(2019,6,25)))

mta_data[mask].head(10)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DATETIME,DATE,TIME,ENTRIES,EXITS


In [17]:
#the working theory is that the majority of the repeats are from too many observations in the same 4hr period
#rather than incorrect bucketing - will confirm by checking time differences once column diffs are taken

#taking the maximum entry and exit for each turnstile/datetime combo

mta_data_max = (mta_data_retime
                .groupby(unique_id + ['LINENAME','DATETIME','DATE','TIME'])[['ENTRIES','EXITS']]  
                .max()
                .reset_index())
mta_data_max.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DATETIME,DATE,TIME,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 00:00:00,2019-03-30,00:00:00,6999064,2373568
1,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 04:00:00,2019-03-30,04:00:00,6999084,2373576
2,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 08:00:00,2019-03-30,08:00:00,6999107,2373622
3,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 12:00:00,2019-03-30,12:00:00,6999214,2373710
4,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 16:00:00,2019-03-30,16:00:00,6999451,2373781


In [18]:
#checking the evenness of time buckets - they should hopefully be even
(mta_data_max
 .groupby(['TIME'])  
 .ENTRIES.count()
 .reset_index()  # or use as_index = False; otherwise makes groupby columns new index 
 .sort_values("ENTRIES", ascending=False)).head(10)

Unnamed: 0,TIME,ENTRIES
2,08:00:00,302751
4,16:00:00,302591
0,00:00:00,302426
1,04:00:00,295525
3,12:00:00,295489
5,20:00:00,295198


In [19]:
mta_data_sorted = mta_data_max.sort_values(unique_id + ["DATETIME"], ascending = [True for _ in unique_id] + [True])
mta_data_sorted.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DATETIME,DATE,TIME,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 00:00:00,2019-03-30,00:00:00,6999064,2373568
1,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 04:00:00,2019-03-30,04:00:00,6999084,2373576
2,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 08:00:00,2019-03-30,08:00:00,6999107,2373622
3,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 12:00:00,2019-03-30,12:00:00,6999214,2373710
4,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 16:00:00,2019-03-30,16:00:00,6999451,2373781


**Calculating Per-Period Totals**

In [20]:
mta_data_sorted[["NEXT_DATETIME", "NEXT_ENTRIES","NEXT_EXITS"]] = (mta_data_sorted
                                                       .groupby(unique_id)["DATETIME", "ENTRIES","EXITS"]
                                                       .transform(lambda grp: grp.shift(-1)))
# transform() takes a function as parameter
# shift moves the index by the number of periods given (positive or negative)

mta_data_sorted.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DATETIME,DATE,TIME,ENTRIES,EXITS,NEXT_DATETIME,NEXT_ENTRIES,NEXT_EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 00:00:00,2019-03-30,00:00:00,6999064,2373568,2019-03-30 04:00:00,6999000.0,2374000.0
1,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 04:00:00,2019-03-30,04:00:00,6999084,2373576,2019-03-30 08:00:00,6999000.0,2374000.0
2,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 08:00:00,2019-03-30,08:00:00,6999107,2373622,2019-03-30 12:00:00,6999000.0,2374000.0
3,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 12:00:00,2019-03-30,12:00:00,6999214,2373710,2019-03-30 16:00:00,6999000.0,2374000.0
4,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 16:00:00,2019-03-30,16:00:00,6999451,2373781,2019-03-30 20:00:00,7000000.0,2374000.0


In [21]:
print(len(mta_data_sorted))
mta_data_sorted = mta_data_sorted.dropna().reset_index(drop=True)
print(len(mta_data_sorted))

1793980
1789088


In [22]:
mta_data_sorted['TIME_DELTA'] = mta_data_sorted['NEXT_DATETIME'] - mta_data_sorted['DATETIME']
mta_data_sorted['ENTRIES_DELTA'] = mta_data_sorted['NEXT_ENTRIES'] - mta_data_sorted['ENTRIES']
mta_data_sorted['EXITS_DELTA'] = mta_data_sorted['NEXT_EXITS'] - mta_data_sorted['EXITS']
mta_data_sorted.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DATETIME,DATE,TIME,ENTRIES,EXITS,NEXT_DATETIME,NEXT_ENTRIES,NEXT_EXITS,TIME_DELTA,ENTRIES_DELTA,EXITS_DELTA
0,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 00:00:00,2019-03-30,00:00:00,6999064,2373568,2019-03-30 04:00:00,6999000.0,2374000.0,04:00:00,20.0,8.0
1,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 04:00:00,2019-03-30,04:00:00,6999084,2373576,2019-03-30 08:00:00,6999000.0,2374000.0,04:00:00,23.0,46.0
2,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 08:00:00,2019-03-30,08:00:00,6999107,2373622,2019-03-30 12:00:00,6999000.0,2374000.0,04:00:00,107.0,88.0
3,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 12:00:00,2019-03-30,12:00:00,6999214,2373710,2019-03-30 16:00:00,6999000.0,2374000.0,04:00:00,237.0,71.0
4,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 16:00:00,2019-03-30,16:00:00,6999451,2373781,2019-03-30 20:00:00,7000000.0,2374000.0,04:00:00,345.0,56.0


In [23]:
mta_data_delta = mta_data_sorted.drop(['ENTRIES','EXITS','NEXT_DATETIME','NEXT_ENTRIES','NEXT_EXITS'],axis=1)
mta_data_delta.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DATETIME,DATE,TIME,TIME_DELTA,ENTRIES_DELTA,EXITS_DELTA
0,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 00:00:00,2019-03-30,00:00:00,04:00:00,20.0,8.0
1,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 04:00:00,2019-03-30,04:00:00,04:00:00,23.0,46.0
2,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 08:00:00,2019-03-30,08:00:00,04:00:00,107.0,88.0
3,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 12:00:00,2019-03-30,12:00:00,04:00:00,237.0,71.0
4,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 16:00:00,2019-03-30,16:00:00,04:00:00,345.0,56.0


In [24]:
# let's check that the number of entries for today is higher than entries for yesterday
mta_data_delta[mta_data_delta["EXITS_DELTA"] < 0].head(50)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DATETIME,DATE,TIME,TIME_DELTA,ENTRIES_DELTA,EXITS_DELTA
47399,A038,R085,00-00-01,8 ST-NYU,NRW,2019-04-22 08:00:00,2019-04-22,08:00:00,0 days 08:00:00,-3.584e+06,-1.246e+06
48937,A038,R085,00-06-00,8 ST-NYU,NRW,2019-04-28 08:00:00,2019-04-28,08:00:00,0 days 12:00:00,1.610e+05,-2.376e+04
49510,A039,R085,01-00-01,8 ST-NYU,NRW,2019-03-30 16:00:00,2019-03-30,16:00:00,0 days 08:00:00,-2.235e+06,-3.312e+06
54788,A042,R086,01-00-04,PRINCE ST,NRW,2019-04-01 16:00:00,2019-04-01,16:00:00,0 days 04:00:00,-1.116e+05,-3.247e+05
60584,A046,R463,00-00-03,CANAL ST,JNQRZ6W,2019-04-28 00:00:00,2019-04-28,00:00:00,0 days 04:00:00,-1.274e+05,-1.029e+04
69513,A047,R087,00-06-02,CITY HALL,NRW,2019-04-10 12:00:00,2019-04-10,12:00:00,1 days 04:00:00,-3.138e+04,-4.044e+04
101391,A069,R044,01-06-01,CHAMBERS ST,JZ456,2019-03-30 00:00:00,2019-03-30,00:00:00,0 days 04:00:00,1.200e+01,-1.100e+02
101392,A069,R044,01-06-01,CHAMBERS ST,JZ456,2019-03-30 04:00:00,2019-03-30,04:00:00,0 days 04:00:00,2.200e+01,-8.000e+01
101393,A069,R044,01-06-01,CHAMBERS ST,JZ456,2019-03-30 08:00:00,2019-03-30,08:00:00,0 days 04:00:00,3.100e+01,-8.670e+02
101394,A069,R044,01-06-01,CHAMBERS ST,JZ456,2019-03-30 12:00:00,2019-03-30,12:00:00,0 days 04:00:00,4.600e+01,-1.239e+03


In [25]:
print(len(mta_data_delta))
mta_data_delta = mta_data_delta.drop(mta_data_delta[mta_data_delta["ENTRIES_DELTA"] <= 0].index)
mta_data_delta = mta_data_delta.drop(mta_data_delta[mta_data_delta["EXITS_DELTA"] <= 0].index)
print(len(mta_data_delta))

1789088
1468091


In [26]:
mta_data_delta.sort_values(['ENTRIES_DELTA'],ascending=False).head(10)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DATETIME,DATE,TIME,TIME_DELTA,ENTRIES_DELTA,EXITS_DELTA
1772354,R729,R292,00-00-02,BAYCHESTER AV,5,2019-05-16 08:00:00,2019-05-16,08:00:00,04:00:00,2056000000.0,167100000.0
1540741,R405,R447,01-00-00,CYPRESS AV,6,2019-04-18 00:00:00,2019-04-18,00:00:00,12:00:00,2011000000.0,835100000.0
1444265,R258,R132,00-00-04,125 ST,456,2019-04-16 08:00:00,2019-04-16,08:00:00,04:00:00,1432000000.0,818900000.0
1780436,S101A,R070,01-00-03,ST. GEORGE,1,2019-05-02 08:00:00,2019-05-02,08:00:00,08:00:00,1127000000.0,1078000000.0
1362435,R231,R176,00-00-05,33 ST,6,2019-04-09 20:00:00,2019-04-09,20:00:00,08:00:00,1053000000.0,26280000.0
1528498,R332,R365,00-00-00,219 ST,25,2019-05-17 08:00:00,2019-05-17,08:00:00,04:00:00,902400000.0,317600000.0
1528080,R331,R364,00-05-01,GUN HILL RD,25,2019-05-10 12:00:00,2019-05-10,12:00:00,04:00:00,683300000.0,638300000.0
1442526,R257,R182,01-03-01,116 ST,6,2019-05-09 08:00:00,2019-05-09,08:00:00,04:00:00,635100000.0,584900000.0
1466525,R287,R244,00-05-00,BURNSIDE AV,4,2019-04-30 04:00:00,2019-04-30,04:00:00,08:00:00,606500000.0,1008000000.0
532979,N098,R028,00-02-00,FULTON ST,2345ACJZ,2019-05-24 04:00:00,2019-05-24,04:00:00,08:00:00,564500000.0,559300000.0


In [27]:
mta_data_delta.sort_values(['EXITS_DELTA'],ascending=False).head(10)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DATETIME,DATE,TIME,TIME_DELTA,ENTRIES_DELTA,EXITS_DELTA
1383039,R238,R046,00-06-00,GRD CNTRL-42 ST,4567S,2019-04-02 04:00:00,2019-04-02,04:00:00,12:00:00,310700000.0,1504000000.0
1780436,S101A,R070,01-00-03,ST. GEORGE,1,2019-05-02 08:00:00,2019-05-02,08:00:00,08:00:00,1127000000.0,1078000000.0
1466525,R287,R244,00-05-00,BURNSIDE AV,4,2019-04-30 04:00:00,2019-04-30,04:00:00,08:00:00,606500000.0,1008000000.0
1540741,R405,R447,01-00-00,CYPRESS AV,6,2019-04-18 00:00:00,2019-04-18,00:00:00,12:00:00,2011000000.0,835100000.0
1444265,R258,R132,00-00-04,125 ST,456,2019-04-16 08:00:00,2019-04-16,08:00:00,04:00:00,1432000000.0,818900000.0
1528080,R331,R364,00-05-01,GUN HILL RD,25,2019-05-10 12:00:00,2019-05-10,12:00:00,04:00:00,683300000.0,638300000.0
1442526,R257,R182,01-03-01,116 ST,6,2019-05-09 08:00:00,2019-05-09,08:00:00,04:00:00,635100000.0,584900000.0
532979,N098,R028,00-02-00,FULTON ST,2345ACJZ,2019-05-24 04:00:00,2019-05-24,04:00:00,08:00:00,564500000.0,559300000.0
655412,N220,R155,01-00-03,KINGSBRIDGE RD,BD,2019-04-15 04:00:00,2019-04-15,04:00:00,08:00:00,234000000.0,367700000.0
437805,N056,R188,01-00-01,50 ST,CE,2019-05-07 00:00:00,2019-05-07,00:00:00,12:00:00,331700000.0,332400000.0


In [28]:
print(len(mta_data_delta))
mta_data_delta = mta_data_delta.drop(mta_data_delta[mta_data_delta["ENTRIES_DELTA"] > 1000000].index)
mta_data_delta = mta_data_delta.drop(mta_data_delta[mta_data_delta["EXITS_DELTA"] > 1000000].index)
print(len(mta_data_delta))

1468091
1468050


In [29]:
(mta_data_delta
 .groupby(['TIME_DELTA'])  
 .ENTRIES_DELTA.count()
 .reset_index()  # or use as_index = False; otherwise makes groupby columns new index 
 .sort_values("ENTRIES_DELTA", ascending=False)).head(10)

Unnamed: 0,TIME_DELTA,ENTRIES_DELTA
0,0 days 04:00:00,1443776
1,0 days 08:00:00,23514
2,0 days 12:00:00,172
16,2 days 20:00:00,104
3,0 days 16:00:00,78
4,0 days 20:00:00,75
17,3 days 00:00:00,56
5,1 days 00:00:00,39
8,1 days 12:00:00,37
14,2 days 12:00:00,28


In [30]:
(mta_data_delta
 .groupby(unique_id + ['DATE'])  
 .TIME_DELTA.sum()
 .reset_index()  # or use as_index = False; otherwise makes groupby columns new index 
 .sort_values("TIME_DELTA", ascending=False)).head(50)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,TIME_DELTA
119836,N414A,R316,01-00-02,FLUSHING AV,2019-04-08,24 days 08:00:00
119765,N414A,R316,01-00-00,FLUSHING AV,2019-04-08,24 days 04:00:00
119801,N414A,R316,01-00-01,FLUSHING AV,2019-04-08,24 days 04:00:00
119905,N414A,R316,01-06-01,FLUSHING AV,2019-04-08,24 days 04:00:00
151754,PTH05,R543,00-01-07,EXCHANGE PLACE,2019-04-15,15 days 12:00:00
31781,C025,R215,00-03-00,86 ST,2019-04-16,14 days 12:00:00
151705,PTH05,R543,00-01-06,EXCHANGE PLACE,2019-04-15,12 days 04:00:00
127283,N508,R453,00-00-04,23 ST,2019-04-29,12 days 00:00:00
127235,N508,R453,00-00-03,23 ST,2019-04-29,12 days 00:00:00
127331,N508,R453,00-00-05,23 ST,2019-04-29,12 days 00:00:00


In [31]:
(mta_data_delta
 .groupby(unique_id + ['DATE'])  
 .TIME_DELTA.sum()
 .reset_index()  # or use as_index = False; otherwise makes groupby columns new index 
 .groupby(['TIME_DELTA'])
 .count())

Unnamed: 0_level_0,C/A,UNIT,SCP,STATION,DATE
TIME_DELTA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0 days 04:00:00,2692,2692,2692,2692,2692
0 days 08:00:00,3018,3018,3018,3018,3018
0 days 12:00:00,3361,3361,3361,3361,3361
0 days 16:00:00,6950,6950,6950,6950,6950
0 days 20:00:00,32888,32888,32888,32888,32888
1 days 00:00:00,212252,212252,212252,212252,212252
1 days 04:00:00,867,867,867,867,867
1 days 08:00:00,53,53,53,53,53
1 days 12:00:00,33,33,33,33,33
1 days 16:00:00,8,8,8,8,8


**Clean Outputs**

In [36]:
#Intraday Data
mta_data_intra = mta_data_delta.copy(deep=True)
print(len(mta_data_intra))
mta_data_intra = mta_data_intra.drop(mta_data_intra[mta_data_intra["TIME_DELTA"] != timedelta(hours=4)].index).reset_index(drop=True)
print(len(mta_data_intra))
mta_data_intra['TOTAL_DELTA'] = mta_data_intra['ENTRIES_DELTA'] + mta_data_intra['EXITS_DELTA']
mta_data_intra.head()
#TO DISCUSS: A few of the outputs above show that not all of the time deltas are four hours. What do we want to drop here?

1468050
1443776


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DATETIME,DATE,TIME,TIME_DELTA,ENTRIES_DELTA,EXITS_DELTA,TOTAL_DELTA
0,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 00:00:00,2019-03-30,00:00:00,04:00:00,20.0,8.0,28.0
1,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 04:00:00,2019-03-30,04:00:00,04:00:00,23.0,46.0,69.0
2,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 08:00:00,2019-03-30,08:00:00,04:00:00,107.0,88.0,195.0
3,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 12:00:00,2019-03-30,12:00:00,04:00:00,237.0,71.0,308.0
4,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30 16:00:00,2019-03-30,16:00:00,04:00:00,345.0,56.0,401.0


In [33]:
#Daily Data


mta_data_daily = (mta_data_delta
                  .groupby(unique_id + ['LINENAME','DATE'])['ENTRIES_DELTA','EXITS_DELTA']
                  .sum()
                  .reset_index()
                 )

time_deltas = (mta_data_delta
                  .groupby(unique_id + ['LINENAME','DATE'])
                  .TIME_DELTA.sum()  # this sum had to be done separate from the entries and exits delta sum above
                  .reset_index()
                 )

mta_data_daily['TIME_DELTA'] = time_deltas['TIME_DELTA']

mta_data_daily = mta_data_daily[unique_id + ['LINENAME','DATE','TIME_DELTA','ENTRIES_DELTA','EXITS_DELTA']]

print(len(mta_data_daily))
mta_data_daily = mta_data_daily.drop(mta_data_daily[mta_data_daily["TIME_DELTA"] != timedelta(days=1)].index).reset_index(drop=True)
print(len(mta_data_daily))

mta_data_daily['TOTAL_DELTA'] = mta_data_daily['ENTRIES_DELTA'] + mta_data_daily['EXITS_DELTA']

mta_data_daily.head()


262496
212252


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DATE,TIME_DELTA,ENTRIES_DELTA,EXITS_DELTA,TOTAL_DELTA
0,A002,R051,02-00-00,59 ST,NQR456W,2019-03-30,1 days,893.0,299.0,1192.0
1,A002,R051,02-00-00,59 ST,NQR456W,2019-03-31,1 days,571.0,228.0,799.0
2,A002,R051,02-00-00,59 ST,NQR456W,2019-04-02,1 days,1593.0,554.0,2147.0
3,A002,R051,02-00-00,59 ST,NQR456W,2019-04-03,1 days,1652.0,424.0,2076.0
4,A002,R051,02-00-00,59 ST,NQR456W,2019-04-04,1 days,1638.0,511.0,2149.0


In [34]:
#pickle
with open('pickle/mta_data_intra.pickle', 'wb') as to_write:
    pickle.dump(mta_data_intra, to_write)
    
with open('pickle/mta_data_daily.pickle', 'wb') as to_write:
    pickle.dump(mta_data_daily, to_write)