In [129]:
import pandas as pd
# Socrata (SoQL) query against the data.ny.gov ridership dataset: for every
# station complex and every unique timestamp (date + hour) after 2023-01-01,
# sum the ridership across all payment types. The adjacent string literals
# below concatenate into a single URL.
df = pd.read_csv(
    "https://data.ny.gov/resource/wujg-7c2s.csv"
    "?$limit=500000000"
    "&$select=station_complex_id,transit_timestamp,sum(ridership)"
    "&$group=station_complex_id,transit_timestamp"
    "&$where=transit_timestamp>'2023-01-01T00:00:00'"
)

In [3]:
from utils import get_mean_hourly_ridership, get_station_ridership


def get_mean_hourly_ridership(
    csv: str = "https://data.ny.gov/resource/wujg-7c2s.csv?$limit=500000000&$select=station_complex_id,transit_timestamp,sum(ridership)&$group=station_complex_id,transit_timestamp&$where=transit_timestamp>'2023-01-01T00:00:00'",
):
    """This function reads in the data from the csv file and returns the mean hourly ridership for each station complex."""
    df = pd.read_csv(csv)  # Read in the data.
    df["hours"] = df["transit_timestamp"].apply(
        lambda x: x.split("T")[1].split(":")[0]
    )  # Adding a column of just the hours (24 hour military scale).

    return (
        df.groupby(["hours", "station_complex_id"])["sum_ridership"]
        .mean()
        .reset_index()
    )


# Display the hourly means; with no argument this reads the default CSV URL,
# so running the cell triggers a full download of the query result.
get_mean_hourly_ridership()

Unnamed: 0,hours,station_complex_id,sum_ridership
0,00,1,39.787402
1,00,10,293.478056
2,00,100,19.335962
3,00,101,96.899682
4,00,103,77.716443
...,...,...,...
10267,23,97,96.298742
10268,23,98,54.735016
10269,23,99,37.529874
10270,23,TRAM1,72.079937


In [2]:
# Get the mean ridership for the midnight hour ("00"), averaged over all days, for station complex 628.

def get_station_ridership(
    complex_id: str,
    hour: str,
    df: pd.DataFrame,
) -> dict:
    """Return the first row of ``df`` for one station complex and hour.

    Args:
        complex_id: Station complex id to look up, e.g. ``"628"``. It is
            compared as a string, so the lookup also works when the
            ``station_complex_id`` column was parsed as integers.
        hour: Value to match in the ``hours`` column, e.g. ``"00"``.
        df: Frame with at least the columns ``station_complex_id`` and
            ``hours``.

    Returns:
        The first matching row as a ``{column name: value}`` dict.

    Raises:
        IndexError: If no row matches ``complex_id`` and ``hour``.
    """
    is_station = df["station_complex_id"].astype(str) == str(complex_id)
    is_hour = df["hours"] == hour
    matches = df[is_station & is_hour]

    if matches.empty:
        # Same exception type a bare ``.iloc[0]`` on an empty frame raises,
        # but with a message that says which lookup failed.
        raise IndexError(
            f"no ridership row for station_complex_id={complex_id!r} "
            f"at hour={hour!r}"
        )

    return dict(matches.iloc[0])


# Look up station complex "628" at hour "00". The inner call rebuilds the
# hourly-mean table from the default CSV URL every time this cell runs.
get_station_ridership(
    "628", "00", get_mean_hourly_ridership()
)

{'hours': '00',
 'station_complex_id': '628',
 'sum_ridership': np.float64(434.6332288401254)}