In [1]:
# Importing packages
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
from datetime import datetime, timedelta
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
# Accessing 2019 data from database
engine = create_engine('sqlite:///mta_2019.db')
df = pd.read_sql('SELECT * FROM mta_2019;',engine)

In [3]:

df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS ...
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/05/2019,03:00:00,REGULAR,0006897012,0002338472 ...
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/05/2019,07:00:00,REGULAR,0006897023,0002338487 ...
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/05/2019,11:00:00,REGULAR,0006897083,0002338565 ...
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/05/2019,15:00:00,REGULAR,0006897262,0002338624 ...


In [4]:
#set first line as header

new_header = df.iloc[0] 
df = df[1:] 
df.columns = new_header 

#reset index
df.reset_index(drop=True)


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/05/2019,03:00:00,REGULAR,0006897012,0002338472 ...
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/05/2019,07:00:00,REGULAR,0006897023,0002338487 ...
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/05/2019,11:00:00,REGULAR,0006897083,0002338565 ...
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/05/2019,15:00:00,REGULAR,0006897262,0002338624 ...
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/05/2019,19:00:00,REGULAR,0006897572,0002338679 ...
...,...,...,...,...,...,...,...,...,...,...,...
10470337,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,12/27/2019,04:00:00,REGULAR,5554,420
10470338,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,12/27/2019,08:00:00,REGULAR,5554,420
10470339,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,12/27/2019,12:00:00,REGULAR,5554,420
10470340,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,12/27/2019,16:00:00,REGULAR,5554,420


In [5]:
# Rename columns to remove whitespace
df.columns = df.columns.str.replace(" ", "")

In [6]:
# Create DATETIME column from 'DATE' and 'TIME' columns
df['DATETIME'] = (df['DATE'] + " " + df['TIME'])
df["DATETIME"] = pd.to_datetime(df["DATETIME"], errors='coerce', format='%m/%d/%Y %H:%M:%S')
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/05/2019,03:00:00,REGULAR,6897012,0002338472 ...,2019-01-05 03:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/05/2019,07:00:00,REGULAR,6897023,0002338487 ...,2019-01-05 07:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/05/2019,11:00:00,REGULAR,6897083,0002338565 ...,2019-01-05 11:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/05/2019,15:00:00,REGULAR,6897262,0002338624 ...,2019-01-05 15:00:00
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/05/2019,19:00:00,REGULAR,6897572,0002338679 ...,2019-01-05 19:00:00


In [7]:
#get date per row 
df["DATE"] = df["DATETIME"].dt.date

#get week per row
df["WEEK"] = df["DATETIME"].dt.isocalendar().week

df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME,WEEK
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-01-05,03:00:00,REGULAR,6897012,0002338472 ...,2019-01-05 03:00:00,1
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-01-05,07:00:00,REGULAR,6897023,0002338487 ...,2019-01-05 07:00:00,1
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-01-05,11:00:00,REGULAR,6897083,0002338565 ...,2019-01-05 11:00:00,1
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-01-05,15:00:00,REGULAR,6897262,0002338624 ...,2019-01-05 15:00:00,1
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-01-05,19:00:00,REGULAR,6897572,0002338679 ...,2019-01-05 19:00:00,1


In [8]:
df.dtypes

0
C/A                 object
UNIT                object
SCP                 object
STATION             object
LINENAME            object
DIVISION            object
DATE                object
TIME                object
DESC                object
ENTRIES             object
EXITS               object
DATETIME    datetime64[ns]
WEEK                UInt32
dtype: object

In [9]:
#convert entries and exits to int64
df["ENTRIES"] = pd.to_numeric(df["ENTRIES"], errors='coerce').convert_dtypes() 
df["EXITS"] = pd.to_numeric(df["EXITS"], errors='coerce').convert_dtypes() 
df.dtypes

0
C/A                 object
UNIT                object
SCP                 object
STATION             object
LINENAME            object
DIVISION            object
DATE                object
TIME                object
DESC                object
ENTRIES              Int64
EXITS                Int64
DATETIME    datetime64[ns]
WEEK                UInt32
dtype: object

In [10]:
df.DESC.unique()

array(['REGULAR', 'RECOVR AUD', 'DESC'], dtype=object)

In [11]:
# Remove non-REGULAR values from 'DESC'
df = df.drop(df.loc[df.DESC != 'REGULAR'].index)
df.DESC.unique()

array(['REGULAR'], dtype=object)

In [12]:
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME,WEEK
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-01-05,03:00:00,REGULAR,6897012,2338472,2019-01-05 03:00:00,1
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-01-05,07:00:00,REGULAR,6897023,2338487,2019-01-05 07:00:00,1
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-01-05,11:00:00,REGULAR,6897083,2338565,2019-01-05 11:00:00,1
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-01-05,15:00:00,REGULAR,6897262,2338624,2019-01-05 15:00:00,1
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-01-05,19:00:00,REGULAR,6897572,2338679,2019-01-05 19:00:00,1


In [13]:
#clean station names 
df["STATION"] = df["STATION"].str.replace(' ', '_')
df["STATION"] = df["STATION"].str.replace('-', '_')

df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME,WEEK
1,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,03:00:00,REGULAR,6897012,2338472,2019-01-05 03:00:00,1
2,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,07:00:00,REGULAR,6897023,2338487,2019-01-05 07:00:00,1
3,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,11:00:00,REGULAR,6897083,2338565,2019-01-05 11:00:00,1
4,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,15:00:00,REGULAR,6897262,2338624,2019-01-05 15:00:00,1
5,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,19:00:00,REGULAR,6897572,2338679,2019-01-05 19:00:00,1


In [14]:
#identify duplicate stations
lines = df.groupby(["STATION","LINENAME"]).first().reset_index()
lines.head()

Unnamed: 0,STATION,LINENAME,C/A,UNIT,SCP,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME,WEEK
0,103_ST,1,R170,R191,00-00-00,IRT,2019-01-05,00:00:00,REGULAR,1650805,662882,2019-01-05 00:00:00,1
1,103_ST,6,R252,R180,00-00-00,IRT,2019-01-05,00:00:00,REGULAR,36511390,505568831,2019-01-05 00:00:00,1
2,103_ST,BC,N037,R314,00-00-00,IND,2019-01-05,00:00:00,REGULAR,13867654,11191407,2019-01-05 00:00:00,1
3,103_ST_CORONA,7,R529,R208,00-00-00,IRT,2019-01-05,03:00:00,REGULAR,15381455,24226957,2019-01-05 03:00:00,1
4,104_ST,A,N137,R354,00-00-00,IND,2019-01-05,03:00:00,REGULAR,151836,97952,2019-01-05 03:00:00,1


In [15]:
#create station + lines columns
df["STATION_FULL"] = df["STATION"] + "_Line_" + df["LINENAME"]
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME,WEEK,STATION_FULL
1,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,03:00:00,REGULAR,6897012,2338472,2019-01-05 03:00:00,1,59_ST_Line_NQR456W
2,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,07:00:00,REGULAR,6897023,2338487,2019-01-05 07:00:00,1,59_ST_Line_NQR456W
3,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,11:00:00,REGULAR,6897083,2338565,2019-01-05 11:00:00,1,59_ST_Line_NQR456W
4,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,15:00:00,REGULAR,6897262,2338624,2019-01-05 15:00:00,1,59_ST_Line_NQR456W
5,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,19:00:00,REGULAR,6897572,2338679,2019-01-05 19:00:00,1,59_ST_Line_NQR456W


In [16]:
# Create previous datetime, entries and exits columns

df[["PREV_DATETIME", "PREV_ENTRIES", "PREV_EXITS"]] = (df
                                                .groupby(['STATION_FULL'])
                                                       ['DATETIME','ENTRIES','EXITS']
                                                .apply(lambda grp: grp.shift(1)))

  df[["PREV_DATETIME", "PREV_ENTRIES", "PREV_EXITS"]] = (df


In [17]:
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME,WEEK,STATION_FULL,PREV_DATETIME,PREV_ENTRIES,PREV_EXITS
1,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,03:00:00,REGULAR,6897012,2338472,2019-01-05 03:00:00,1,59_ST_Line_NQR456W,NaT,,
2,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,07:00:00,REGULAR,6897023,2338487,2019-01-05 07:00:00,1,59_ST_Line_NQR456W,2019-01-05 03:00:00,6897012.0,2338472.0
3,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,11:00:00,REGULAR,6897083,2338565,2019-01-05 11:00:00,1,59_ST_Line_NQR456W,2019-01-05 07:00:00,6897023.0,2338487.0
4,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,15:00:00,REGULAR,6897262,2338624,2019-01-05 15:00:00,1,59_ST_Line_NQR456W,2019-01-05 11:00:00,6897083.0,2338565.0
5,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,19:00:00,REGULAR,6897572,2338679,2019-01-05 19:00:00,1,59_ST_Line_NQR456W,2019-01-05 15:00:00,6897262.0,2338624.0


In [18]:
# Drop Na rows
print("Before drop NA: ", df.shape)
df.dropna(how='any', inplace=True)
print("After drop NA: ", df.shape)

Before drop NA:  (10423883, 17)
After drop NA:  (10423404, 17)


In [19]:
#count the actual number of entries and exits from cumulative numbers 
df["ENTRY_COUNT"] = abs(df["ENTRIES"] - df["PREV_ENTRIES"])
df["EXIT_COUNT"] = abs(df["EXITS"] - df["PREV_EXITS"])

In [20]:
#clean entry count for values in excess of 14400, equal to one person per sec in 4 hour interval

index_entry = df[ df['ENTRY_COUNT'] > 14400 ].index
df.drop(index_entry , inplace=True)

In [21]:
#clean exit count for values in excess of 14400, equal to one person per sec in 4 hours interval 

index_exit = df[df['EXIT_COUNT'] > 14400].index
df.drop(index_exit, inplace = True)

In [22]:
#sanity check
df.EXIT_COUNT.describe()

count    1.018562e+07
mean     1.390455e+02
std      2.719199e+02
min      0.000000e+00
25%      8.000000e+00
50%      5.100000e+01
75%      1.640000e+02
max      1.438900e+04
Name: EXIT_COUNT, dtype: float64

In [23]:
#sanity check
df.ENTRY_COUNT.describe()

count    1.018562e+07
mean     1.751737e+02
std      3.044214e+02
min      0.000000e+00
25%      1.000000e+01
50%      7.200000e+01
75%      2.330000e+02
max      1.440000e+04
Name: ENTRY_COUNT, dtype: float64

In [24]:
#get traffic per row
df["TRAFFIC"] = df["ENTRY_COUNT"] + df["EXIT_COUNT"]


In [25]:
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME,WEEK,STATION_FULL,PREV_DATETIME,PREV_ENTRIES,PREV_EXITS,ENTRY_COUNT,EXIT_COUNT,TRAFFIC
2,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,07:00:00,REGULAR,6897023,2338487,2019-01-05 07:00:00,1,59_ST_Line_NQR456W,2019-01-05 03:00:00,6897012,2338472,11,15,26
3,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,11:00:00,REGULAR,6897083,2338565,2019-01-05 11:00:00,1,59_ST_Line_NQR456W,2019-01-05 07:00:00,6897023,2338487,60,78,138
4,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,15:00:00,REGULAR,6897262,2338624,2019-01-05 15:00:00,1,59_ST_Line_NQR456W,2019-01-05 11:00:00,6897083,2338565,179,59,238
5,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,19:00:00,REGULAR,6897572,2338679,2019-01-05 19:00:00,1,59_ST_Line_NQR456W,2019-01-05 15:00:00,6897262,2338624,310,55,365
6,A002,R051,02-00-00,59_ST,NQR456W,BMT,2019-01-05,23:00:00,REGULAR,6897740,2338703,2019-01-05 23:00:00,1,59_ST_Line_NQR456W,2019-01-05 19:00:00,6897572,2338679,168,24,192


In [26]:
#sanity check
df.TRAFFIC.describe()

count    1.018562e+07
mean     3.142192e+02
std      4.775943e+02
min      0.000000e+00
25%      3.000000e+01
50%      1.660000e+02
75%      4.450000e+02
max      2.847100e+04
Name: TRAFFIC, dtype: float64

In [27]:
# check for nan
print("Before drop NA: ", df.shape)
df.dropna(how='any', inplace=True)
print("After drop NA: ", df.shape)

Before drop NA:  (10185624, 20)
After drop NA:  (10185624, 20)


In [28]:
#identify duplicate stations
df.groupby(["STATION_FULL"]).first()

Unnamed: 0_level_0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME,WEEK,PREV_DATETIME,PREV_ENTRIES,PREV_EXITS,ENTRY_COUNT,EXIT_COUNT,TRAFFIC
STATION_FULL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
103_ST_CORONA_Line_7,R529,R208,00-00-00,103_ST_CORONA,7,IRT,2019-01-05,07:00:00,REGULAR,15381600,24227079,2019-01-05 07:00:00,1,2019-01-05 03:00:00,15381455,24226957,145,122,267
103_ST_Line_1,R170,R191,00-00-00,103_ST,1,IRT,2019-01-05,04:00:00,REGULAR,1650840,662929,2019-01-05 04:00:00,1,2019-01-05 00:00:00,1650805,662882,35,47,82
103_ST_Line_6,R252,R180,00-00-00,103_ST,6,IRT,2019-01-05,04:00:00,REGULAR,36511419,505568907,2019-01-05 04:00:00,1,2019-01-05 00:00:00,36511390,505568831,29,76,105
103_ST_Line_BC,N037,R314,00-00-00,103_ST,BC,IND,2019-01-05,04:00:00,REGULAR,13867676,11191455,2019-01-05 04:00:00,1,2019-01-05 00:00:00,13867654,11191407,22,48,70
104_ST_Line_A,N137,R354,00-00-00,104_ST,A,IND,2019-01-05,07:00:00,REGULAR,151888,97954,2019-01-05 07:00:00,1,2019-01-05 03:00:00,151836,97952,52,2,54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WTC_CORTLANDT_Line_1,R106,R305,01-00-00,WTC_CORTLANDT,1,IRT,2019-01-05,04:00:00,REGULAR,10,0,2019-01-05 04:00:00,1,2019-01-05 00:00:00,10,0,0,0,0
W_4_ST_WASH_SQ_Line_ABCDEFM,N080,R138,00-00-00,W_4_ST_WASH_SQ,ABCDEFM,IND,2019-01-05,04:00:00,REGULAR,1573310,4325953,2019-01-05 04:00:00,1,2019-01-05 00:00:00,1573174,4325798,136,155,291
W_8_ST_AQUARIUM_Line_FQ,G011,R312,00-00-00,W_8_ST_AQUARIUM,FQ,BMT,2019-01-05,07:00:00,REGULAR,4326996,9160993,2019-01-05 07:00:00,1,2019-01-05 03:00:00,4326995,9160991,1,2,3
YORK_ST_Line_F,N530,R301,00-00-00,YORK_ST,F,IND,2019-01-05,04:00:00,REGULAR,14666421,25075866,2019-01-05 04:00:00,1,2019-01-05 00:00:00,14666360,25075847,61,19,80


In [29]:
#get traffic per date per station
df_daily = df.groupby(["STATION_FULL", "DATE"]).sum("TRAFFIC").reset_index()
df_daily.head()

Unnamed: 0,STATION_FULL,DATE,ENTRIES,EXITS,WEEK,PREV_ENTRIES,PREV_EXITS,ENTRY_COUNT,EXIT_COUNT,TRAFFIC
0,103_ST_CORONA_Line_7,2019-01-05,345125952,350075517,45,345113228,350066460,12724,9057,21781
1,103_ST_CORONA_Line_7,2019-01-06,414211683,420148190,54,414201104,420138787,10579,9403,19982
2,103_ST_CORONA_Line_7,2019-01-07,345258503,350180165,90,345239177,350165902,19326,14263,33589
3,103_ST_CORONA_Line_7,2019-01-08,414428242,420298131,108,414408653,420283950,19589,14181,33770
4,103_ST_CORONA_Line_7,2019-01-09,414546382,420383900,108,414526628,420369363,19754,14537,34291


In [30]:
# Drop unnecessary columns from daily df 
df_daily = df_daily.drop(["ENTRIES", "EXITS", "PREV_ENTRIES", "PREV_EXITS", "ENTRY_COUNT", "EXIT_COUNT","WEEK"], axis=1)
df_daily.head()

Unnamed: 0,STATION_FULL,DATE,TRAFFIC
0,103_ST_CORONA_Line_7,2019-01-05,21781
1,103_ST_CORONA_Line_7,2019-01-06,19982
2,103_ST_CORONA_Line_7,2019-01-07,33589
3,103_ST_CORONA_Line_7,2019-01-08,33770
4,103_ST_CORONA_Line_7,2019-01-09,34291


In [31]:
#make sure the last week is complete 
df_daily.DATE.tail(10)
#week 52 is december 23-29,2019 therefore week 52 data is incomplete 
#week 52 will be dropped from analysis for weekly traffic 

169970    2019-12-18
169971    2019-12-19
169972    2019-12-20
169973    2019-12-21
169974    2019-12-22
169975    2019-12-23
169976    2019-12-24
169977    2019-12-25
169978    2019-12-26
169979    2019-12-27
Name: DATE, dtype: object

In [32]:
#make sure week 1 is complete 
df_daily.DATE.head(10)
#week 1 is 12/31/2018 - 1/6/2019 therefore 
#week 1 data is incomplete and will be dropped from analysis for
#weekly traffic

0    2019-01-05
1    2019-01-06
2    2019-01-07
3    2019-01-08
4    2019-01-09
5    2019-01-10
6    2019-01-11
7    2019-01-12
8    2019-01-13
9    2019-01-14
Name: DATE, dtype: object

In [33]:
#get traffic per week per station
df_weekly = df.groupby(["STATION_FULL", "WEEK"]).sum("TRAFFIC").reset_index()
df_weekly.head()

Unnamed: 0,STATION_FULL,WEEK,ENTRIES,EXITS,PREV_ENTRIES,PREV_EXITS,ENTRY_COUNT,EXIT_COUNT,TRAFFIC
0,103_ST_CORONA_Line_7,1,759337635,770223707,759314332,770205247,23303,18460,41763
1,103_ST_CORONA_Line_7,2,2764353592,2803114524,2764232486,2803025366,121106,89158,210264
2,103_ST_CORONA_Line_7,3,3115275438,3157485761,3115154646,3157398445,120792,87316,208108
3,103_ST_CORONA_Line_7,4,2842975100,2880256141,2842864573,2880174967,110527,81174,191701
4,103_ST_CORONA_Line_7,5,2847807585,2883863851,2847690218,2883776363,117367,87488,204855


In [34]:
#remove week 1 data, as it is incomplete
df_weekly = df_weekly.drop(df_weekly.loc[df_weekly.WEEK == 1 ].index)
df_weekly.WEEK.unique()

array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52])

In [35]:
#remove week 31-52 data, as 52 is incomplete and we only use up to week 30 in 2021 data
no_weeks=list(range(31,53))
df_weekly = df_weekly.drop(df_weekly.loc[df_weekly.WEEK > 30 ].index)
df_weekly.WEEK.unique()


array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30])

In [36]:
# Drop unnecessary columns from weekly df 
df_weekly = df_weekly.drop(["ENTRIES", "EXITS", "PREV_ENTRIES", "PREV_EXITS", "ENTRY_COUNT", "EXIT_COUNT"], axis=1)
df_weekly.head()

Unnamed: 0,STATION_FULL,WEEK,TRAFFIC
1,103_ST_CORONA_Line_7,2,210264
2,103_ST_CORONA_Line_7,3,208108
3,103_ST_CORONA_Line_7,4,191701
4,103_ST_CORONA_Line_7,5,204855
5,103_ST_CORONA_Line_7,6,215747


In [37]:
df_weekly.shape


(13830, 3)

In [38]:
df_weekly.dtypes

0
STATION_FULL    object
WEEK             int64
TRAFFIC          Int64
dtype: object

In [39]:
#save data for 2019 for further use 
df_weekly.to_csv("./all_traffic_2019.csv", index=False)

In [40]:
#get average traffic over all stations for 2019

df_2019 = df_weekly.groupby("WEEK").mean("TRAFFIC").reset_index()
df_2019["STATION"] = "AVERAGE"
df_2019.head()

Unnamed: 0,WEEK,TRAFFIC,STATION
0,2,125855.382353,AVERAGE
1,3,125422.191176,AVERAGE
2,4,114158.398323,AVERAGE
3,5,124484.943396,AVERAGE
4,6,131055.735849,AVERAGE


In [41]:
#load top ten from 2021 
topten_2021 = pd.read_csv("./top_traffic_2021.csv")
topten_2021.head()


Unnamed: 0,STATION_FULL,WEEK,TRAFFIC
0,14_ST_UNION_SQ_Line_LNQR456W,1,175345
1,14_ST_UNION_SQ_Line_LNQR456W,2,177299
2,14_ST_UNION_SQ_Line_LNQR456W,3,171502
3,14_ST_UNION_SQ_Line_LNQR456W,4,170188
4,14_ST_UNION_SQ_Line_LNQR456W,5,152808


In [42]:
#get list of top_ten_2021
tt_2021_list=topten_2021["STATION_FULL"].unique()
tt_2021_list

array(['14_ST_UNION_SQ_Line_LNQR456W', '34_ST_HERALD_SQ_Line_BDFMNQRW',
       '34_ST_PENN_STA_Line_ACE', '42_ST_PORT_AUTH_Line_ACENQRS1237W',
       '86_ST_Line_456', 'FLUSHING_MAIN_Line_7',
       'FULTON_ST_Line_2345ACJZ', 'GRD_CNTRL_42_ST_Line_4567S',
       'JKSN_HT_ROOSVLT_Line_EFMR7', 'PATH_NEW_WTC_Line_1'], dtype=object)

In [43]:
#create filter for top stations of 2021
top2021_filter = df_weekly["STATION_FULL"].isin(tt_2021_list)
top2021_filter

1        False
2        False
3        False
4        False
5        False
         ...  
24766    False
24767    False
24768    False
24769    False
24770    False
Name: STATION_FULL, Length: 13830, dtype: bool

In [44]:
#get df of traffic data per week for top stations 
df_top_traffic_2021 = df_weekly[top2021_filter]
df_top_traffic_2021.head()

Unnamed: 0,STATION_FULL,WEEK,TRAFFIC
1645,14_ST_UNION_SQ_Line_LNQR456W,2,764230
1646,14_ST_UNION_SQ_Line_LNQR456W,3,743729
1647,14_ST_UNION_SQ_Line_LNQR456W,4,726197
1648,14_ST_UNION_SQ_Line_LNQR456W,5,755595
1649,14_ST_UNION_SQ_Line_LNQR456W,6,791544


In [45]:
#save df for further use 
df_top_traffic_2021.to_csv("./top_2021_in_2019.csv", index=False)