Load data from the turnstile files

In [1]:
"""
Set Options
"""

# import libraries
import pandas as pd
import numpy as np
import os
import glob
import matplotlib.pyplot as plt
import matplotlib
from datetime import datetime
import pickle as pkl

# configuration options
# IPython magic (not plain Python): draw matplotlib figures inline in the
# notebook output
%matplotlib inline
# NOTE(review): newer matplotlib releases may only ship this style under a
# versioned name (e.g. "seaborn-v0_8-muted") - confirm against the installed
# matplotlib before upgrading
matplotlib.style.use("seaborn-muted")
# display every column of a DataFrame, but cap the rows shown at 50
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

In [2]:
"""
load, parse, and initially sample data
"""
# load and concatenate sample data
path = r'/Users/tbowling/ds/metis/working/Benson/data/'
# os.path.join avoids the doubled separator that `path + "/*.txt"` produced;
# sorting makes the load order deterministic (glob returns arbitrary order)
all_files = sorted(glob.glob(os.path.join(path, "*.txt")))
df = pd.concat((pd.read_csv(f) for f in all_files))

# strip spurious whitespace from column names FIRST, so that every later
# reference to a column by name works even when the raw header is padded
df.columns = [column.strip() for column in df.columns]

# rename the control area column ("C/A") to something usable as an identifier
df.rename(index=str, columns={"C/A": "CONTROL"}, inplace=True)

# drop columns that are not used further down
df = df.drop(['LINENAME', 'DIVISION'], axis=1)

# filter out audits; copy so that adding TIMING below writes to an
# independent frame rather than to a slice of the unfiltered data
df = df[df.DESC == 'REGULAR'].copy()

# combine the DATE and TIME strings into a single datetime column
df['TIMING'] = pd.to_datetime(df['DATE'] + ' ' + df['TIME'],
                              format='%m/%d/%Y %H:%M:%S')

In [3]:
"""
Convert data to daily flux vs turnstile for entries
"""

# columns that together identify one turnstile
turnstile_keys = ["CONTROL", "UNIT", "SCP", "STATION"]

# sort descending, so that within each turnstile/day the latest reading
# comes first
df.sort_values(turnstile_keys + ["TIMING"], inplace=True, ascending=False)

# because of the descending sort, the first row of every turnstile/day group
# is that day's last counter reading
daily_entries = (df.groupby(turnstile_keys + ["DATE"])
                 .ENTRIES.first()
                 .reset_index())

# DATE is an mm/dd/YYYY string, so the lexical order produced by groupby is
# wrong across a year boundary (e.g. "01/01/2017" sorts before "12/31/2016").
# Re-order each turnstile's rows chronologically before looking one row back.
date_order = pd.to_datetime(daily_entries["DATE"], format="%m/%d/%Y")
daily_entries = (daily_entries.assign(_DATE_ORDER=date_order)
                 .sort_values(turnstile_keys + ["_DATE_ORDER"])
                 .drop("_DATE_ORDER", axis=1)
                 .reset_index(drop=True))

# make columns for the previous day's data; the columns are selected with a
# list because tuple-style selection (["DATE", "ENTRIES"]) is rejected by
# pandas >= 2.0
previous = daily_entries.groupby(turnstile_keys)[["DATE", "ENTRIES"]].shift(1)
daily_entries["PREV_DATE"] = previous["DATE"]
daily_entries["PREV_ENTRIES"] = previous["ENTRIES"]

# drop each turnstile's first day: it has no previous reading to compare to
daily_entries.dropna(subset=["PREV_DATE"], axis=0, inplace=True)

# deal with negative data - from Lara's example
def get_daily_counts_entries(row, max_counter):
    counter = row["ENTRIES"] - row["PREV_ENTRIES"]
    if counter < 0:
        # Maybe counter is reversed?
        counter = -counter
    if counter > max_counter:
        #print(row["ENTRIES"], row["PREV_ENTRIES"])
        counter = min(row["ENTRIES"], row["PREV_ENTRIES"])
        # if current entries is bad, use yesterday's count as proxy
    if counter > max_counter:
        # Check it again to make sure we are not giving a counter that's too big
        return 0
    return counter

# A difference above one million suggests the counter was reset. Counters
# wrap at different limits, so no attempt is made to reconstruct the true
# value: the helper substitutes the smaller raw reading, or zero.
daily_entries["ENTRY_FLUX"] = daily_entries.apply(
    lambda row: get_daily_counts_entries(row, max_counter=1000000),
    axis=1,
)

In [4]:
"""
Convert data to daily flux vs turnstile for exits
"""

# columns that together identify one turnstile
turnstile_keys = ["CONTROL", "UNIT", "SCP", "STATION"]

# df is expected to be sorted with the newest reading first at this point
# (the in-place sort in the entries cell), so the first row of every
# turnstile/day group is that day's last counter reading
daily_exits = (df.groupby(turnstile_keys + ["DATE"])
               .EXITS.first()
               .reset_index())

# DATE is an mm/dd/YYYY string, so the lexical order produced by groupby is
# wrong across a year boundary (e.g. "01/01/2017" sorts before "12/31/2016").
# Re-order each turnstile's rows chronologically before looking one row back.
date_order = pd.to_datetime(daily_exits["DATE"], format="%m/%d/%Y")
daily_exits = (daily_exits.assign(_DATE_ORDER=date_order)
               .sort_values(turnstile_keys + ["_DATE_ORDER"])
               .drop("_DATE_ORDER", axis=1)
               .reset_index(drop=True))

# make columns for the previous day's data; the columns are selected with a
# list because tuple-style selection (["DATE", "EXITS"]) is rejected by
# pandas >= 2.0
previous = daily_exits.groupby(turnstile_keys)[["DATE", "EXITS"]].shift(1)
daily_exits["PREV_DATE"] = previous["DATE"]
daily_exits["PREV_EXITS"] = previous["EXITS"]

# drop each turnstile's first day: it has no previous reading to compare to
daily_exits.dropna(subset=["PREV_DATE"], axis=0, inplace=True)

# deal with negative data - from Lara's example
def get_daily_counts_exits(row, max_counter):
    counter = row["EXITS"] - row["PREV_EXITS"]
    if counter < 0:
        # Maybe counter is reversed?
        counter = -counter
    if counter > max_counter:
        #print(row["EXITS"], row["PREV_EXITS"])
        counter = min(row["EXITS"], row["PREV_EXITS"])
        # if current entries is bad, use yesterday's count as proxy
    if counter > max_counter:
        # Check it again to make sure we are not giving a counter that's too big
        return 0
    return counter

# A difference above one million suggests the counter was reset. Counters
# wrap at different limits, so no attempt is made to reconstruct the true
# value: the helper substitutes the smaller raw reading, or zero.
daily_exits["EXIT_FLUX"] = daily_exits.apply(
    lambda row: get_daily_counts_exits(row, max_counter=1000000),
    axis=1,
)

In [5]:
"""
merge entries and exits into single dataframe
"""

# the raw counters and the previous-day helper columns have served their
# purpose; remove them in place so only the keys and the flux remain
daily_entries.drop(labels=['ENTRIES', 'PREV_ENTRIES', 'PREV_DATE'],
                   axis='columns', inplace=True)
daily_exits.drop(labels=['EXITS', 'PREV_EXITS', 'PREV_DATE'],
                 axis='columns', inplace=True)

# keep only turnstile/day combinations present in both frames
flux = daily_entries.merge(
    daily_exits,
    how='inner',
    on=['CONTROL', 'UNIT', 'SCP', 'STATION', 'DATE'],
)

In [6]:
"""
reconvert date into datetime object
"""

# parse the mm/dd/YYYY strings, then replace the string column with the
# parsed one (re-added at the end of the frame)
_parsed_dates = pd.to_datetime(flux['DATE'], format='%m/%d/%Y')
del flux['DATE']
flux['DATE'] = _parsed_dates
# the row labels are converted to strings as well
flux.rename(index=str, inplace=True)

In [7]:
"""
pickle output
"""

# pickles are byte streams, so the target file is opened in binary mode
with open('./data/turnstile_data.pkl', mode='wb') as picklefile:
    pkl.Pickler(picklefile).dump(flux)