In [26]:
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view
import matplotlib.pyplot as plt
import pandas as pd


import torch
import torch.nn as nn

In [54]:
df = pd.read_csv("MTA_Subway_Hourly_Ridership_Jan_Through_May_2025.csv", low_memory=False)

This dataframe includes the number of riders, per station complex, per hour, for every hour in every day for the months January-April.

My goal is to see if I can create a model to forecast subway ridership. It should work like this: Given some ridership, that is the riders for every hour of a given time period, can I forecast the ridership in the near future? For a preliminary investigation, I will consider a lookback length of 24 hours, and a prediction horizon of 6 hours. 

Naturally, I need the ridership by hour for each 24 hour period, and the ridership of the next 6 hours, for each station, to construct my training dataset. The dataset as is needs to be transformed

Exogenous features to consider alongside time series:

- It may be important to account for the day of the week. For example, it will be more difficult to predict Saturday from Friday, than say Tuesday from Monday, due to the distinct nature of weekend and weekday ridership. 

- Furthermore, station information may need to be taken into consideration. Different stations experience different daily ridership patterns. Popular stations such as Times Square-42nd St. have a lot of ridership constantly, while some stations on the outer edges of the system may have little to no ridership some days.

    - Note! One way to go about this is to include the (normazlied) lat/long coordinates with datapoints. This is better than simply one-hot encoding by station_complex because lat/long coordinates will contain information of where stations are with respect to one another. In a previous project, I identified a strong correlation between daily ridership pattern (on weekends) and location.
      
    - Note! In this dataset, there are sometimes multiple lat/long pairs listed for a given station_complex. This phenomenon is demonstrated in lat_long_discrepancy/lat_long_discrepancy.ipynb . Because I believe this is an error, I will take the most-appearing (mode) lat/long per station_complex as that station's true lat/long.

In [187]:
# Creating a df which contains station_complex, station_complex_id, latitude, longitude
# so I can keep just station_complex_id for now, and later merge station_info_df with on=station_complex_id to 
# recover latitudes and longitudes.

# As mentioned above, I take the mode(latitude) and mode(longitude) per station_complex.

station_info_df = df[df["transit_mode"] == "subway"][['station_complex_id', 'latitude', 'longitude']].drop_duplicates().reset_index(drop= True)
station_info_df = station_info_df.groupby(["station_complex_id"], as_index = False).agg({"latitude": pd.Series.mode, "longitude": pd.Series.mode})



In [189]:
# Transforming df to a workable form

# Only considering Subway!
df = df[df["transit_mode"] == "subway"]

# Dropping Other Unneccesary Rows!
df = df.drop(["transfers", "Georeference", "station_complex", "latitude", "longitude", "borough"], axis = 1)

# Pandas datetime format.
df['transit_timestamp'] = pd.to_datetime(df['transit_timestamp'], format = "%m/%d/%Y %I:%M:%S %p")

# Keeping only date and hour, then discarding transit_timestamp
df['date'] = df['transit_timestamp'].dt.date
df['hour'] = df['transit_timestamp'].dt.hour
df.drop(["transit_timestamp"], axis = 1)

# Aggregating total ridership across different payment methods.
df = df.groupby(['station_complex_id', 'date', 'hour'], as_index = False).agg(ridership = pd.NamedAgg(column = "ridership", aggfunc = "sum"))


In [199]:
# If there were 0 riders for a certain station at a certain hour, that data is not recored into the dataset.
# Therefore, we need to fill in these values with 0 riders.

# First, create a df for the cartesian product of all possibles dates, hours, and station complexes.
all_stations = df[["station_complex_id"]].drop_duplicates().reset_index(drop=True)                   # 424 station complexes
all_hours = pd.DataFrame({'hour':[i for i in range(24)]})                                            # 24 hours
all_dates = df[['date']].drop_duplicates().reset_index(drop = True)                                  # Jan-Apr

dates_hours = pd.merge(all_dates, all_hours, how = "cross")
dates_hours_stations = pd.merge(dates_hours, all_stations, how = "cross")
# dates_hours_stations contains every possible date, hour, and station complex in the recording period.

n_missing_values = dates_hours_stations.shape[0] - df.shape[0]

# Left join dates_hours_stations with the subway dataset, so that we have df with every date, hour, and station complex, with NaNs
# where there was no data recorded in the initial dataset.
df = pd.merge(dates_hours_stations, df, on = ["date", "hour", "station_complex_id"], how = "left")

# Replace NaN values with 0.
df = df.fillna(0)

print(f"{n_missing_values} missing entries detected for stations/hours with zero ridership.")


44145 missing entries detected for stations/hours with zero ridership.


# Generating Datapoints

In [10]:
data_points = np.empty((0, 30))

def generate_data_points_by_station_complex(x):
    global data_points
    
    x = x.sort_values(by = ["date", "hour"])

    riderships_array = x["ridership"].to_numpy()
    
    data_points = np.concatenate((data_points, sliding_window_view(riderships_array, 30)), axis = 0)


df.groupby(["station_complex", "borough"]).apply(generate_data_points_by_station_complex)  


  df.groupby(["station_complex", "borough"]).apply(generate_data_points_by_station_complex)
