# MTA Project Practice

In [3]:
from sqlalchemy import create_engine
from sqlalchemy import inspect

import pandas as pd
import datetime

In [4]:
#weeks=[210501, 210508, 210515, 210522, 210529, 210605,210612, 210619, 210626, 210710, 210717, 210724, 210731, 210807, 210814, 210821, 210828, 210904]
weeks=[210605,210612, 210619, 210626, 210710, 210717, 210724, 210731, 210807, 210814, 210821, 210828]


In [5]:
def get_data(weeks):
    url = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    dfs = []
    for week in weeks:
        data_url = url.format(week)
        dfs.append(pd.read_csv(data_url))
    return pd.concat(dfs)

In [6]:
turnstiles_df = get_data(weeks)

In [7]:
turnstiles_df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,00:00:00,REGULAR,7578734,2590325
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,04:00:00,REGULAR,7578740,2590327
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,08:00:00,REGULAR,7578749,2590340
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,12:00:00,REGULAR,7578789,2590386
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,16:00:00,REGULAR,7578897,2590418


In [8]:
turnstiles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2512780 entries, 0 to 209067
Data columns (total 11 columns):
 #   Column                                                                Dtype 
---  ------                                                                ----- 
 0   C/A                                                                   object
 1   UNIT                                                                  object
 2   SCP                                                                   object
 3   STATION                                                               object
 4   LINENAME                                                              object
 5   DIVISION                                                              object
 6   DATE                                                                  object
 7   TIME                                                                  object
 8   DESC                                                           

In [9]:
turnstiles_df.describe()

Unnamed: 0,ENTRIES,EXITS
count,2512780.0,2512780.0
mean,41638210.0,33288140.0
std,218330400.0,191948000.0
min,0.0,0.0
25%,217912.0,99494.0
50%,1405159.0,854944.5
75%,6021537.0,3968502.0
max,2147417000.0,2122890000.0


In [10]:
turnstiles_df.shape

(2512780, 11)

In [11]:
#Detect if any missing Values ==> no
turnstiles_df.isna().sum()

C/A                                                                     0
UNIT                                                                    0
SCP                                                                     0
STATION                                                                 0
LINENAME                                                                0
DIVISION                                                                0
DATE                                                                    0
TIME                                                                    0
DESC                                                                    0
ENTRIES                                                                 0
EXITS                                                                   0
dtype: int64

In [12]:
#check Columns format
turnstiles_df.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES',
       'EXITS                                                               '],
      dtype='object')

In [13]:
#Rename/Clean coluns format : remove the ending spaces
turnstiles_df.columns = [column.strip() for column in turnstiles_df.columns]

turnstiles_df.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')

In [14]:
turnstiles_df.DESC.value_counts().sort_index()

RECOVR AUD      10776
REGULAR       2502004
Name: DESC, dtype: int64

In [15]:
# count the total numbers of data for each week 
turnstiles_df.DATE.value_counts().sort_index()
turnstiles_df.DATE.value_counts()

07/06/2021    30226
07/17/2021    30193
08/24/2021    30191
06/08/2021    30151
07/03/2021    30114
              ...  
08/26/2021    29803
06/06/2021    29790
07/21/2021    29772
07/07/2021    29726
08/27/2021    29537
Name: DATE, Length: 84, dtype: int64

In [16]:
turnstiles_df["DATE_TIME"] = pd.to_datetime( turnstiles_df.DATE + " "+ turnstiles_df.TIME,
                                            format="%m/%d/%Y %H:%M:%S")
turnstiles_df["DATE_TIME"]

0        2021-05-29 00:00:00
1        2021-05-29 04:00:00
2        2021-05-29 08:00:00
3        2021-05-29 12:00:00
4        2021-05-29 16:00:00
                 ...        
209063   2021-08-27 05:00:00
209064   2021-08-27 09:00:00
209065   2021-08-27 13:00:00
209066   2021-08-27 17:00:00
209067   2021-08-27 21:00:00
Name: DATE_TIME, Length: 2512780, dtype: datetime64[ns]

In [17]:
turnstiles_df.dtypes

C/A                  object
UNIT                 object
SCP                  object
STATION              object
LINENAME             object
DIVISION             object
DATE                 object
TIME                 object
DESC                 object
ENTRIES               int64
EXITS                 int64
DATE_TIME    datetime64[ns]
dtype: object

In [18]:
turnstiles_df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,00:00:00,REGULAR,7578734,2590325,2021-05-29 00:00:00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,04:00:00,REGULAR,7578740,2590327,2021-05-29 04:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,08:00:00,REGULAR,7578749,2590340,2021-05-29 08:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,12:00:00,REGULAR,7578789,2590386,2021-05-29 12:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,16:00:00,REGULAR,7578897,2590418,2021-05-29 16:00:00


In [19]:
mask = ((turnstiles_df["C/A"] == "A002") & 
(turnstiles_df["UNIT"] == "R051") & 
(turnstiles_df["SCP"] == "02-00-00") & 
(turnstiles_df["STATION"] == "59 ST") &
(turnstiles_df["DATE_TIME"].dt.date == datetime.datetime(2021, 5, 29).date()))

In [20]:
turnstiles_df[mask]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,00:00:00,REGULAR,7578734,2590325,2021-05-29 00:00:00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,04:00:00,REGULAR,7578740,2590327,2021-05-29 04:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,08:00:00,REGULAR,7578749,2590340,2021-05-29 08:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,12:00:00,REGULAR,7578789,2590386,2021-05-29 12:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,16:00:00,REGULAR,7578897,2590418,2021-05-29 16:00:00
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,20:00:00,REGULAR,7579021,2590439,2021-05-29 20:00:00


In [21]:
# check duplicate records 
(turnstiles_df
 .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"])
.EXITS.count()
.reset_index()
.sort_values("EXITS", ascending=False)).head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE_TIME,EXITS
2266804,R516,R291,00-00-00,33 ST-RAWSON ST,2021-08-25 12:00:00,2
2286936,R523,R147,00-00-07,61 ST WOODSIDE,2021-08-18 05:00:00,2
2266799,R516,R291,00-00-00,33 ST-RAWSON ST,2021-08-24 16:00:00,2
2266800,R516,R291,00-00-00,33 ST-RAWSON ST,2021-08-24 20:00:00,2
2266801,R516,R291,00-00-00,33 ST-RAWSON ST,2021-08-25 00:00:00,2


In [32]:
# export data to csv file 
turnstiles_df.to_csv (r'MTA_2021.csv', index = False, header=True)

In [22]:
mask = ((turnstiles_df["C/A"] == "R516") & 
(turnstiles_df["UNIT"] == "R291") & 
(turnstiles_df["SCP"] == "00-00-00") & 
(turnstiles_df["STATION"] == "33 ST-RAWSON ST") &
(turnstiles_df["DATE_TIME"].dt.date == datetime.datetime(2021, 8, 26).date()))

In [23]:
turnstiles_df[mask]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME
188785,R516,R291,00-00-00,33 ST-RAWSON ST,7,IRT,08/26/2021,00:00:00,REGULAR,8291032,13148994,2021-08-26 00:00:00
188786,R516,R291,00-00-00,33 ST-RAWSON ST,7,IRT,08/26/2021,00:00:00,RECOVR AUD,2073806,1083850,2021-08-26 00:00:00
188787,R516,R291,00-00-00,33 ST-RAWSON ST,7,IRT,08/26/2021,04:00:00,REGULAR,8291032,13149023,2021-08-26 04:00:00
188788,R516,R291,00-00-00,33 ST-RAWSON ST,7,IRT,08/26/2021,08:00:00,REGULAR,8291049,13149336,2021-08-26 08:00:00
188789,R516,R291,00-00-00,33 ST-RAWSON ST,7,IRT,08/26/2021,08:00:00,RECOVR AUD,2073825,1083952,2021-08-26 08:00:00
188790,R516,R291,00-00-00,33 ST-RAWSON ST,7,IRT,08/26/2021,12:00:00,REGULAR,8291075,13149753,2021-08-26 12:00:00
188791,R516,R291,00-00-00,33 ST-RAWSON ST,7,IRT,08/26/2021,16:00:00,REGULAR,8291133,13149865,2021-08-26 16:00:00
188792,R516,R291,00-00-00,33 ST-RAWSON ST,7,IRT,08/26/2021,20:00:00,REGULAR,8291237,13149965,2021-08-26 20:00:00


In [24]:
# remove RECOVER_AUD
regular_mask = (turnstiles_df.DESC == "REGULAR")
turnstiles_df_cleaned = turnstiles_df[regular_mask]


In [25]:
turnstiles_df.DESC.value_counts()

REGULAR       2502004
RECOVR AUD      10776
Name: DESC, dtype: int64

In [26]:
turnstiles_df_cleaned.DESC.value_counts()

REGULAR    2502004
Name: DESC, dtype: int64

In [27]:
#check duplicate records 
(turnstiles_df_cleaned
.groupby(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"])
.EXITS.count()
.reset_index()
.sort_values("EXITS", ascending=False))



Unnamed: 0,C/A,UNIT,SCP,STATION,DATE_TIME,EXITS
0,A002,R051,02-00-00,59 ST,2021-05-29 00:00:00,1
1668006,R138,R293,00-03-01,34 ST-PENN STA,2021-07-17 02:00:00,1
1667999,R138,R293,00-03-01,34 ST-PENN STA,2021-07-15 22:00:00,1
1668000,R138,R293,00-03-01,34 ST-PENN STA,2021-07-16 02:00:00,1
1668001,R138,R293,00-03-01,34 ST-PENN STA,2021-07-16 06:00:00,1
...,...,...,...,...,...,...
834001,N128,R200,00-00-00,EUCLID AV,2021-08-18 12:00:00,1
834002,N128,R200,00-00-00,EUCLID AV,2021-08-18 16:00:00,1
834003,N128,R200,00-00-00,EUCLID AV,2021-08-18 20:00:00,1
834004,N128,R200,00-00-00,EUCLID AV,2021-08-19 00:00:00,1


In [28]:
turnstiles_df_cleaned.shape

(2502004, 12)

In [30]:
#check if any duplicate entries

#len(turnstiles_df_cleaned[["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"]].drop_duplicates())
turnstiles_df_cleaned[["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"]].drop_duplicates().shape


(2502004, 5)

In [40]:
turnstiles_df_cleaned.head(10)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME,ID_turnstiles
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,00:00:00,REGULAR,7578734,2590325,2021-05-29 00:00:00,A002-R051-02-00-00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,04:00:00,REGULAR,7578740,2590327,2021-05-29 04:00:00,A002-R051-02-00-00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,08:00:00,REGULAR,7578749,2590340,2021-05-29 08:00:00,A002-R051-02-00-00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,12:00:00,REGULAR,7578789,2590386,2021-05-29 12:00:00,A002-R051-02-00-00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,16:00:00,REGULAR,7578897,2590418,2021-05-29 16:00:00,A002-R051-02-00-00
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/29/2021,20:00:00,REGULAR,7579021,2590439,2021-05-29 20:00:00,A002-R051-02-00-00
6,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/30/2021,00:00:00,REGULAR,7579078,2590451,2021-05-30 00:00:00,A002-R051-02-00-00
7,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/30/2021,04:00:00,REGULAR,7579084,2590452,2021-05-30 04:00:00,A002-R051-02-00-00
8,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/30/2021,08:00:00,REGULAR,7579085,2590454,2021-05-30 08:00:00,A002-R051-02-00-00
9,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/30/2021,12:00:00,REGULAR,7579093,2590493,2021-05-30 12:00:00,A002-R051-02-00-00


In [41]:
turnstiles_df_cleaned['ID_turnstiles'] = turnstiles_df_cleaned[['C/A', 'UNIT', 'SCP']].agg(''.join, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  turnstiles_df_cleaned['ID_turnstiles'] = turnstiles_df_cleaned[['C/A', 'UNIT', 'SCP']].agg(''.join, axis=1)


In [42]:
turnstiles_df_cleaned['ID'] = turnstiles_df_cleaned[['ID_turnstiles','DATE_TIME']].agg('-'.join, axis=1)

TypeError: sequence item 1: expected str instance, Timestamp found

In [49]:
turnstiles_df_csv = turnstiles_df_cleaned.drop(['DATE', 'TIME'],axis = 1)
turnstiles_df_csv.to_csv (r'MTA_Data_2021.csv', index = False, header=True)

In [33]:
#  Obtain the maximum `EXITS` value for each day, for each unique turnstile.

turnstiles_daily = (turnstiles_df_cleaned
                        .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE"],as_index=False)
                        .EXITS.first())

In [39]:
turnstiles_daily.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,EXITS
0,A002,R051,02-00-00,59 ST,05/29/2021,2590325
1,A002,R051,02-00-00,59 ST,05/30/2021,2590451
2,A002,R051,02-00-00,59 ST,05/31/2021,2590561
3,A002,R051,02-00-00,59 ST,06/01/2021,2590651
4,A002,R051,02-00-00,59 ST,06/02/2021,2590926


### Problem 3b
- Use the daily maximum `ENTRIES` calculations from Problem 3a. Recall that the `ENTRIES` column contains **cumulative entries** on each day. We would now like you to calculate **daily entries**, i.e. the number of new entries gained each day.
- *Hint:* Group the data by turnstile. Check out the `.shift()` and `.diff()` DataFrame methods for this purpose. *Ensure things make sense;* you *may* need to use the  `.apply()` method for abnormalities.