In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

In [4]:
url = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_170916.txt'
df = pd.read_csv(url)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198419 entries, 0 to 198418
Data columns (total 11 columns):
C/A                                                                     198419 non-null object
UNIT                                                                    198419 non-null object
SCP                                                                     198419 non-null object
STATION                                                                 198419 non-null object
LINENAME                                                                198419 non-null object
DIVISION                                                                198419 non-null object
DATE                                                                    198419 non-null object
TIME                                                                    198419 non-null object
DESC                                                                    198419 non-null object
ENTRIES                           

In [5]:
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/09/2017,00:00:00,REGULAR,6318862,2138544
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/09/2017,04:00:00,REGULAR,6318888,2138549
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/09/2017,08:00:00,REGULAR,6318905,2138584
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/09/2017,12:00:00,REGULAR,6318985,2138669
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/09/2017,16:00:00,REGULAR,6319212,2138731


In [6]:
## Create DATETIME column from 'DATE' and 'TIME' cols
df['DATETIME'] = df[['DATE','TIME']].apply(
    lambda x: datetime.datetime.strptime(
        '{0} {1}'.format(x[0],x[1]), 
        '%m/%d/%Y %H:%M:%S'), 
    axis=1)

In [7]:
## STANDARDIZE COL NAMES
df.columns = df.columns.str.strip()

## DROP UNNECESSARY COLUMNS
df_2 = df.drop(['LINENAME', 'DIVISION', 'DATE', 'TIME', 'DESC'], axis=1)

REGULAR       197939
RECOVR AUD       480
Name: DESC, dtype: int64

In [42]:
## create diff columns to take entries/exits for given time interval
df_2['entry_diff'] = df_2.sort_values(['STATION', 'C/A', 'UNIT', 'SCP','DATETIME']).groupby(['STATION', 'C/A', 'UNIT', 'SCP'])['ENTRIES'].diff()
df_2['exit_diff'] = df_2.sort_values(['STATION', 'C/A', 'UNIT', 'SCP','DATETIME']).groupby(['STATION', 'C/A', 'UNIT', 'SCP'])['EXITS'].diff()
df_2.head()

In [None]:
time deltas

In [43]:
df_2 = df_2.groupby(['STATION', 'C/A', 'UNIT', 'SCP',pd.Grouper(key='DATETIME', freq='4H')]).sum().reset_index()

In [44]:
## ASSUMPTIONS:
## 1. All negative actuals for entries and exits will be dropped from dataset
## 2. All outliers (+/- 1.5*IQR) for entries and exits will be dropped from dataset
len_entry_diffs = df_2['entry_diff'].shape[0]
len_exit_diffs = df_2['exit_diff'].shape[0]

negative_entries = df_2[df_2['entry_diff'] < 0].shape[0] / len_entry_diffs
negative_exits = df_2[df_2['exit_diff'] < 0].shape[0] / len_exit_diffs

print('negative_entries: ', negative_entries)
print('negative_exits: ', negative_exits)

## DROP NEGATIVES
df_2 = df_2[(df_2['entry_diff'] < 0) | (df_2['exit_diff'] < 0)]

negative_entries:  0.00803255918459056
negative_exits:  0.006832526502849439


In [58]:
entry_IQR = df_2['entry_diff'].quantile(0.75) - df_2['entry_diff'].quantile(0.25)
exit_IQR = df_2['exit_diff'].quantile(0.75) - df_2['exit_diff'].quantile(0.25)
cutoff_entry = entry_IQR*3
cutoff_exit = exit_IQR*3
print(cutoff_entry)
print(cutoff_exit)

1297.5
702.0


In [24]:
df_3 = df_2.groupby(['STATION', 'C/A', 'UNIT', 'SCP',pd.Grouper(key='DATETIME', freq='4H')]).sum().reset_index()

df_3.groupby('DATETIME').sum()


#times = [ x.time() for x in df_3['DATETIME'] ]
#for x in set(times): print(x)

#convert negatives to abs values
#df_3['entry_diff'] = df_3['entry_diff'].apply(lambda x: abs(x))
#df_3['exit_diff'] = df_3['exit_diff'].apply(lambda x: abs(x))

#check for negatives and non-integer values
#df_3[(df_3['entry_diff'] < 0) | (df_3['exit_diff'] < 0)]
#df_3[((df_3['entry_diff'] %1 != 0) & (df_3['entry_diff'].notnull())) | ((df_3['exit_diff'] %1 != 0) & (df_3['exit_diff'].notnull()))]

Unnamed: 0_level_0,ENTRIES,EXITS,entry_diff,exit_diff
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-09-09 00:00:00,183051397972,146933512082,,
2017-09-09 04:00:00,182899200980,146929854317,115600.0,125141.0
2017-09-09 08:00:00,183466108499,147514595136,-8109395.0,-765360.0
2017-09-09 12:00:00,184696743237,147755865617,327465.0,203883.0
2017-09-09 16:00:00,184956291332,147678185593,908339.0,694511.0
2017-09-09 20:00:00,184898972182,147753952148,886546.0,719875.0
2017-09-10 00:00:00,184900381310,147754611075,465159.0,414676.0
2017-09-10 04:00:00,184857360220,147720866698,112447.0,121408.0
2017-09-10 08:00:00,186441164039,149609106581,198599.0,135066.0
2017-09-10 12:00:00,201495655828,153534227475,542564.0,416166.0


In [None]:
df_3