In [1]:
import pandas as pd

# Read the MTA daily ridership CSV export into a DataFrame
df = pd.read_csv(filepath_or_buffer="MTA_Daily_Ridership.csv")

# Preview the first five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,Date,Subways: Total Estimated Ridership,Subways: % of Comparable Pre-Pandemic Day,Buses: Total Estimated Ridership,Buses: % of Comparable Pre-Pandemic Day,LIRR: Total Estimated Ridership,LIRR: % of Comparable Pre-Pandemic Day,Metro-North: Total Estimated Ridership,Metro-North: % of Comparable Pre-Pandemic Day,Access-A-Ride: Total Scheduled Trips,Access-A-Ride: % of Comparable Pre-Pandemic Day,Bridges and Tunnels: Total Traffic,Bridges and Tunnels: % of Comparable Pre-Pandemic Day,Staten Island Railway: Total Estimated Ridership,Staten Island Railway: % of Comparable Pre-Pandemic Day
0,2020-03-01,2212965,97,984908,99,86790,100,55825,59,19922,113,786960,98,1636,52
1,2020-03-02,5329915,96,2209066,99,321569,103,180701,66,30338,102,874619,95,17140,107
2,2020-03-03,5481103,98,2228608,99,319727,102,190648,69,32767,110,882175,96,17453,109
3,2020-03-04,5498809,99,2177165,97,311662,99,192689,70,34297,115,905558,98,17136,107
4,2020-03-05,5496453,99,2244515,100,307597,98,194386,70,33209,112,929298,101,17203,108


## Inspecting the dataset

In [2]:
# Checking basic info about the dataset.
# df.info() writes its report (columns, non-null counts, dtypes) straight to stdout.
df.info()

# Checking if there are missing values.
# Only the last expression of a cell is rendered, so this intermediate result must be
# printed explicitly; as a bare expression it would be computed and silently discarded.
print("\nMissing values per column:")
print(df.isnull().sum())

# Checking for duplicate rows.
# As the final expression of the cell, this count is shown as the cell's output.
df.duplicated().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1706 entries, 0 to 1705
Data columns (total 15 columns):
 #   Column                                                   Non-Null Count  Dtype 
---  ------                                                   --------------  ----- 
 0   Date                                                     1706 non-null   object
 1   Subways: Total Estimated Ridership                       1706 non-null   int64 
 2   Subways: % of Comparable Pre-Pandemic Day                1706 non-null   int64 
 3   Buses: Total Estimated Ridership                         1706 non-null   int64 
 4   Buses: % of Comparable Pre-Pandemic Day                  1706 non-null   int64 
 5   LIRR: Total Estimated Ridership                          1706 non-null   int64 
 6   LIRR: % of Comparable Pre-Pandemic Day                   1706 non-null   int64 
 7   Metro-North: Total Estimated Ridership                   1706 non-null   int64 
 8   Metro-North: % of Comparable Pre-Pande

0

## Renaming columns for easier access

In [3]:
# Renaming columns: trim surrounding whitespace, turn spaces into underscores,
# drop colons, and lowercase everything.
# regex=False makes every replacement a literal substring replacement regardless of
# the pandas version (the default for `regex` differs between pandas releases).
df.columns = (
    df.columns.str.strip()
    .str.replace(" ", "_", regex=False)
    .str.replace(":", "", regex=False)
    .str.lower()
)
df.head()

Unnamed: 0,date,subways_total_estimated_ridership,subways_%_of_comparable_pre-pandemic_day,buses_total_estimated_ridership,buses_%_of_comparable_pre-pandemic_day,lirr_total_estimated_ridership,lirr_%_of_comparable_pre-pandemic_day,metro-north_total_estimated_ridership,metro-north_%_of_comparable_pre-pandemic_day,access-a-ride_total_scheduled_trips,access-a-ride_%_of_comparable_pre-pandemic_day,bridges_and_tunnels_total_traffic,bridges_and_tunnels_%_of_comparable_pre-pandemic_day,staten_island_railway_total_estimated_ridership,staten_island_railway_%_of_comparable_pre-pandemic_day
0,2020-03-01,2212965,97,984908,99,86790,100,55825,59,19922,113,786960,98,1636,52
1,2020-03-02,5329915,96,2209066,99,321569,103,180701,66,30338,102,874619,95,17140,107
2,2020-03-03,5481103,98,2228608,99,319727,102,190648,69,32767,110,882175,96,17453,109
3,2020-03-04,5498809,99,2177165,97,311662,99,192689,70,34297,115,905558,98,17136,107
4,2020-03-05,5496453,99,2244515,100,307597,98,194386,70,33209,112,929298,101,17203,108


## Converting Date Column to Datetime Format

In [4]:
# Parse the text values in 'date' into pandas datetime values
df['date'] = df['date'].pipe(pd.to_datetime)

## Converting Percentage Columns to Float Format

In [10]:
# Checking the exact column names.
# Prints the frame's current column labels so they can be referenced verbatim
# in the cells that follow.
print(df.columns)

Index(['date', 'subways_total_estimated_ridership',
       'subways_%_of_comparable_pre-pandemic_day',
       'buses_total_estimated_ridership',
       'buses_%_of_comparable_pre-pandemic_day',
       'lirr_total_estimated_ridership',
       'lirr_%_of_comparable_pre-pandemic_day',
       'metro-north_total_estimated_ridership',
       'metro-north_%_of_comparable_pre-pandemic_day',
       'access-a-ride_total_scheduled_trips',
       'access-a-ride_%_of_comparable_pre-pandemic_day',
       'bridges_and_tunnels_total_traffic',
       'bridges_and_tunnels_%_of_comparable_pre-pandemic_day',
       'staten_island_railway_total_estimated_ridership',
       'staten_island_railway_%_of_comparable_pre-pandemic_day'],
      dtype='object')


In [13]:
# Standardizing column names (remove special characters and lowercase).
# regex=False forces literal substring replacement on every pandas version, so the
# result does not depend on the version-specific default of the `regex` parameter.
df.columns = (
    df.columns.str.strip()  # Remove leading/trailing spaces
    .str.replace(" ", "_", regex=False)  # Replace spaces with underscores
    .str.replace("%", "percent", regex=False)  # Replace % with 'percent'
    .str.replace("-", "_", regex=False)  # Replace hyphens with underscores
    .str.lower()  # Convert to lowercase
)

# Displaying the updated column names
print(df.columns)

Index(['date', 'subways_total_estimated_ridership',
       'subways_percent_of_comparable_pre_pandemic_day',
       'buses_total_estimated_ridership',
       'buses_percent_of_comparable_pre_pandemic_day',
       'lirr_total_estimated_ridership',
       'lirr_percent_of_comparable_pre_pandemic_day',
       'metro_north_total_estimated_ridership',
       'metro_north_percent_of_comparable_pre_pandemic_day',
       'access_a_ride_total_scheduled_trips',
       'access_a_ride_percent_of_comparable_pre_pandemic_day',
       'bridges_and_tunnels_total_traffic',
       'bridges_and_tunnels_percent_of_comparable_pre_pandemic_day',
       'staten_island_railway_total_estimated_ridership',
       'staten_island_railway_percent_of_comparable_pre_pandemic_day'],
      dtype='object')


In [16]:
# Show the dtype of every column (displayed as the cell's output)
df.dtypes

date                                                            datetime64[ns]
subways_total_estimated_ridership                                        int64
subways_percent_of_comparable_pre_pandemic_day                           int64
buses_total_estimated_ridership                                          int64
buses_percent_of_comparable_pre_pandemic_day                             int64
lirr_total_estimated_ridership                                           int64
lirr_percent_of_comparable_pre_pandemic_day                              int64
metro_north_total_estimated_ridership                                    int64
metro_north_percent_of_comparable_pre_pandemic_day                       int64
access_a_ride_total_scheduled_trips                                      int64
access_a_ride_percent_of_comparable_pre_pandemic_day                     int64
bridges_and_tunnels_total_traffic                                        int64
bridges_and_tunnels_percent_of_comparable_pre_pandem

In [19]:
# Cast the "percent of comparable pre-pandemic day" columns from int64 to float64.
# Every such column is named "<service>_percent_of_comparable_pre_pandemic_day",
# so the list is assembled from the service prefixes.
percentage_columns = [
    f"{service}_percent_of_comparable_pre_pandemic_day"
    for service in (
        "subways",
        "buses",
        "lirr",
        "metro_north",
        "access_a_ride",
        "bridges_and_tunnels",
        "staten_island_railway",
    )
]

df[percentage_columns] = df[percentage_columns].astype("float64")

# Display all column dtypes to confirm the cast took effect
df.dtypes

date                                                            datetime64[ns]
subways_total_estimated_ridership                                        int64
subways_percent_of_comparable_pre_pandemic_day                         float64
buses_total_estimated_ridership                                          int64
buses_percent_of_comparable_pre_pandemic_day                           float64
lirr_total_estimated_ridership                                           int64
lirr_percent_of_comparable_pre_pandemic_day                            float64
metro_north_total_estimated_ridership                                    int64
metro_north_percent_of_comparable_pre_pandemic_day                     float64
access_a_ride_total_scheduled_trips                                      int64
access_a_ride_percent_of_comparable_pre_pandemic_day                   float64
bridges_and_tunnels_total_traffic                                        int64
bridges_and_tunnels_percent_of_comparable_pre_pandem

In [21]:
# Select the percentage columns by their shared name suffix rather than repeating a
# hard-coded list of names, so this check always reflects the frame's actual columns.
# The resulting list follows the frame's column order.
percentage_columns = [
    column
    for column in df.columns
    if column.endswith("_percent_of_comparable_pre_pandemic_day")
]

# Print data types of percentage columns
print(df.dtypes[percentage_columns])

subways_percent_of_comparable_pre_pandemic_day                  float64
buses_percent_of_comparable_pre_pandemic_day                    float64
lirr_percent_of_comparable_pre_pandemic_day                     float64
metro_north_percent_of_comparable_pre_pandemic_day              float64
access_a_ride_percent_of_comparable_pre_pandemic_day            float64
bridges_and_tunnels_percent_of_comparable_pre_pandemic_day      float64
staten_island_railway_percent_of_comparable_pre_pandemic_day    float64
dtype: object


## Handling Missing Values 

In [27]:
# Checking missing values again.
# The counts are printed explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed, so the check would otherwise produce no output.
print(df.isnull().sum())

# Filling missing values (if any) with 0, restricted to the numeric columns.
# A blanket fillna(0) would also target missing entries in the datetime 'date'
# column, where 0 is not a meaningful value.
numeric_columns = df.select_dtypes(include="number").columns
df[numeric_columns] = df[numeric_columns].fillna(0)

## Removing Duplicates

In [28]:
# Count rows that are exact duplicates of an earlier row (displayed as the cell's output)
df.duplicated().sum()

0

In [29]:
# Removing duplicate rows.
# Rebinding the result (instead of inplace=True) keeps the step explicit and
# chainable; drop_duplicates() keeps the first occurrence of each duplicated row.
df = df.drop_duplicates()

# Key Data Analysis Questions

## 1️⃣ COVID-19 Impact on Ridership

### How did subway, bus, LIRR, and Metro-North ridership change over time during the pandemic?
### How do current ridership levels compare to pre-pandemic levels?
### Which transportation mode recovered the fastest after the initial drop?
### Did different transportation services experience different rates of ridership decline and recovery?

## 2️⃣ Transportation Mode Analysis

### Which transportation service (subway, bus, LIRR, Metro-North, etc.) saw the largest drop in ridership during COVID-19?
### Which service recovered the quickest post-pandemic?
### Are ridership trends similar across subways, buses, LIRR, Metro-North, and Access-A-Ride, or do they differ?
### Did the decline in subway ridership lead to an increase in bus or other transit usage?

## 3️⃣ Peak Ridership Trends 

### What are the busiest days of the week for each mode of transport?
### Are there specific months or seasons with higher or lower ridership?
### How does ridership fluctuate on weekdays vs. weekends?
### What are the highest and lowest ridership days recorded?
### Are there patterns indicating holiday or event-based spikes in ridership?

## 4️⃣ Ridership Percentage vs. Pre-Pandemic Levels

### Is there a strong correlation between total ridership and the percentage of the comparable pre-pandemic day?
### Which transit mode has returned closest to pre-pandemic levels?
### How long did it take for each service to return to X% of pre-pandemic ridership?

## 5️⃣ Interdependencies Between Transit Services

### Is subway ridership correlated with bus ridership? (e.g., do more people take buses when subway usage is down?)
### Do bridge/tunnel traffic trends mirror changes in public transit ridership?
### Did declines in Access-A-Ride trips impact overall public transit usage?

## 6️⃣ External Factors Influencing Ridership

### Did major policy changes (e.g., fare changes, government restrictions) impact ridership?
### How did ridership respond to key milestones in reopening (e.g., office returns, school reopenings)?
### Were there weather-related dips in ridership?