In [129]:
import pandas as pd
# Hitting our magic endpoint. The query string asks data.ny.gov (dataset
# wujg-7c2s) to do the aggregation server-side:
#   $select / $group -> one row per (station_complex_id, transit_timestamp)
#                       carrying sum(ridership) for that pair
#   $where           -> only timestamps after 2023-01-01T00:00:00
#   $limit           -> a very large cap (presumably so the result is not
#                       truncated -- TODO confirm against the API's paging rules)
# Per the original note, each timestamp is a date + hour and the sum runs
# across all payment types.
df = pd.read_csv(
    "https://data.ny.gov/resource/wujg-7c2s.csv?$limit=500000000&$select=station_complex_id,transit_timestamp,sum(ridership)&$group=station_complex_id,transit_timestamp&$where=transit_timestamp>'2023-01-01T00:00:00'"
)

In [3]:
from utils import get_mean_hourly_ridership, get_station_ridership


def get_mean_hourly_ridership(
    csv: str = "https://data.ny.gov/resource/wujg-7c2s.csv?$limit=500000000&$select=station_complex_id,transit_timestamp,sum(ridership)&$group=station_complex_id,transit_timestamp&$where=transit_timestamp>'2023-01-01T00:00:00'",
) -> pd.DataFrame:
    """Return the mean ridership per hour of day for each station complex.

    Parameters
    ----------
    csv : str
        Anything ``pandas.read_csv`` accepts (URL, path, or file-like object).
        The data must provide the columns ``station_complex_id``,
        ``transit_timestamp`` (text containing ``T<hour>:``) and
        ``sum_ridership``. Defaults to the data.ny.gov query that sums
        ridership per station complex and timestamp after 2023-01-01.

    Returns
    -------
    pandas.DataFrame
        One row per (hour, station complex) with the columns ``hours`` (the
        hour text exactly as it appears in the timestamp, e.g. ``"00"``),
        ``station_complex_id`` and ``sum_ridership`` (the mean over all rows
        sharing that hour and station complex).

    Raises
    ------
    ValueError
        If any ``transit_timestamp`` is missing or has no ``T<hour>`` part.
    """
    # Pin the key columns to text: ids such as "TRAM1" sit next to numeric ids,
    # and on a large file chunked type inference could otherwise yield a mix of
    # ints and strings in the same column.
    ridership = pd.read_csv(
        csv,
        dtype={"station_complex_id": str, "transit_timestamp": str},
    )

    # Vectorized equivalent of x.split("T")[1].split(":")[0]: the text after
    # the first "T" up to the next ":" (24-hour clock).
    hours = ridership["transit_timestamp"].str.extract(r"T([^T:]*)", expand=False)
    if hours.isna().any():
        # groupby would silently drop these rows; fail loudly instead.
        raise ValueError(
            "transit_timestamp has missing values or values without a 'T<hour>' part"
        )

    # Mean ridership for each station complex at each hour of the day.
    return (
        ridership.assign(hours=hours)
        .groupby(["hours", "station_complex_id"])["sum_ridership"]
        .mean()
        .reset_index()
    )


# Our output is a table with one row per (hour, complex id) pair and the mean
# ridership at that hour across every timestamp the query returned (with the
# default URL: everything after 2023-01-01, not necessarily a full year).
get_mean_hourly_ridership()

Unnamed: 0,hours,station_complex_id,sum_ridership
0,00,1,39.787402
1,00,10,293.478056
2,00,100,19.335962
3,00,101,96.899682
4,00,103,77.716443
...,...,...,...
10267,23,97,96.298742
10268,23,98,54.735016
10269,23,99,37.529874
10270,23,TRAM1,72.079937


In [2]:
# Get the mean ridership at midnight, averaged across all days, for station complex 628.

def get_station_ridership(
    complex_id: str,
    hour: str,
    df: pd.DataFrame,
):
    """Return the row for one station complex at one hour as a plain dict.

    Parameters
    ----------
    complex_id : str
        Station complex id to look up (compared as text).
    hour : str
        Hour of day. Compared as zero-padded text, so ``"0"``, ``"00"`` and
        ``0`` all select the same row.
    df : pandas.DataFrame
        Table with ``hours`` and ``station_complex_id`` columns, e.g. the
        result of ``get_mean_hourly_ridership``. It is not modified.

    Returns
    -------
    dict
        Column name -> value for the first matching row, with values exactly
        as stored in ``df``.

    Raises
    ------
    IndexError
        If no row matches. The exception type is the one the bare ``iloc[0]``
        used to raise; the message now names the missing station and hour.
    """
    # Normalize both sides so the lookup does not depend on whether hours are
    # stored as "00", "0" or the integer 0.
    wanted_hour = str(hour).zfill(2)
    hour_labels = df["hours"].astype(str).str.zfill(2)
    station_labels = df["station_complex_id"].astype(str)

    matches = df[(station_labels == str(complex_id)) & (hour_labels == wanted_hour)]
    if matches.empty:
        raise IndexError(
            f"no ridership row for station complex {complex_id!r} at hour {hour!r}"
        )
    return dict(matches.iloc[0])


# The `hours` column holds the two-character hour text taken from the timestamp
# ("00" for midnight), so request midnight as "00" to match it exactly.
get_station_ridership(
    "628", "00", get_mean_hourly_ridership()
)

# Nice! Now we can set up an endpoint for the front end to retrieve specific metrics for a station at a given hour.

{'hours': '00',
 'station_complex_id': '628',
 'sum_ridership': np.float64(434.6332288401254)}

In [22]:
import pandas as pd
# Load the mean hourly ridership table from disk instead of re-querying the API.
# NOTE(review): no cell in view writes outputs/new_ridership.json -- presumably it
# was saved from get_mean_hourly_ridership()'s result; confirm, otherwise this
# cell fails on Restart & Run All.
df = pd.read_json("outputs/new_ridership.json")


def get_station_ridership(
    complex_id: str,
    hour: int,
    df: pd.DataFrame,
):
    """Return the row for one station complex at one hour as a plain dict.

    Parameters
    ----------
    complex_id : str
        Station complex id to look up (compared as text).
    hour : int
        Hour of day. Text such as ``"0"`` or ``"00"`` is accepted as well;
        the comparison is done on zero-padded text.
    df : pandas.DataFrame
        Table with ``hours`` and ``station_complex_id`` columns. It is left
        untouched: the string cast of ``hours`` happens on a copy.

    Returns
    -------
    dict
        Column name -> value for the first matching row, with ``hours``
        rendered as text (``str`` of the stored value, e.g. ``"0"``).

    Raises
    ------
    IndexError
        If no row matches (same type the bare ``iloc[0]`` raised, but with a
        message naming the station and hour).
    """
    # Cast into a separate Series; assigning back into `df` would silently
    # change the caller's DataFrame on every call.
    hours_as_text = df["hours"].astype(str)
    is_match = (df["station_complex_id"].astype(str) == str(complex_id)) & (
        hours_as_text.str.zfill(2) == str(hour).zfill(2)
    )
    if not is_match.any():
        raise IndexError(
            f"no ridership row for station complex {complex_id!r} at hour {hour!r}"
        )

    # `assign` returns a new frame, so the text version of `hours` ends up in
    # the returned row without modifying `df`.
    matches = df[is_match].assign(hours=hours_as_text[is_match])
    return dict(matches.iloc[0])


# Look up station complex 628 at midnight. The hour is passed as text ("0")
# because the lookup compares against the string form of the `hours` column.
get_station_ridership("628", "0", df)

{'hours': '0',
 'station_complex_id': '628',
 'sum_ridership': np.float64(434.6332288401)}

In [47]:
import pandas as pd

# Reload the mean hourly ridership table and pick the stations to compare.
df = pd.read_json("outputs/new_ridership.json")
stations = ["10", "628", "TRAM1"]

# Keep only the hour-12 rows of the chosen stations, reduce them to the
# ridership column and index the result by station complex id.
filtered_df = (
    df.loc[
        (df["hours"] == 12) & df["station_complex_id"].isin(stations),
        ["station_complex_id", "sum_ridership"],
    ]
    .set_index("station_complex_id")
)

# Transposing turns each station into a column, so station "10" can be pulled
# out as a Series holding its sum_ridership value.
filtered_df.T["10"]

sum_ridership    620.007825
Name: 10, dtype: float64

In [16]:
import pandas as pd
# Reload the mean hourly ridership table; the cell below labels each row with
# a time-of-day stratum.
# NOTE(review): no cell in view writes outputs/new_ridership.json -- confirm
# where it is produced so the notebook survives Restart & Run All.
df = pd.read_json("outputs/new_ridership.json")

def time_of_day(hour: int):
    """Map an hour of the day to the name of its time-of-day stratum.

    Hours 3-6 are "Early Morning", 7-10 "Morning", 11-14 "Midday",
    15-18 "Afternoon" and 19-22 "Evening". Anything else -- 23, 0, 1, 2 and
    any value outside those ranges -- is reported as "Overnight".
    """
    # Each stratum is a half-open range of hours: start inclusive, stop exclusive.
    strata = (
        (range(3, 7), "Early Morning"),
        (range(7, 11), "Morning"),
        (range(11, 15), "Midday"),
        (range(15, 19), "Afternoon"),
        (range(19, 23), "Evening"),
    )
    for span, label in strata:
        if hour in span:
            return label
    # Fallback bucket for everything the ranges above do not cover.
    return "Overnight"

# Now, we need to define our strata for the time of day: tag every row with
# the stratum its hour falls into.
df["time_of_day"] = df["hours"].map(lambda hour: time_of_day(int(hour)))

# Sanity check: the hours labeled "Overnight" should only be [23, 0, 1, 2].
df.loc[df["time_of_day"] == "Overnight", "hours"].unique()

array([ 0,  1,  2, 23])

In [18]:
# Nice, now we write the labeled table out to use as a 'database' for our app.
# Pre-processing helps reduce the response time, since the app can avoid these
# pandas computations (albeit vectorized ones).
df.to_json("outputs/new_ridership_times.json")
# Read the file straight back so the cell displays what was actually saved.
pd.read_json("outputs/new_ridership_times.json")

Unnamed: 0,hours,station_complex_id,sum_ridership,time_of_day
0,0,1,39.787402,Overnight
1,0,10,293.478056,Overnight
2,0,100,19.335962,Overnight
3,0,101,96.899682,Overnight
4,0,103,77.716443,Overnight
...,...,...,...,...
10267,23,97,96.298742,Overnight
10268,23,98,54.735016,Overnight
10269,23,99,37.529874,Overnight
10270,23,TRAM1,72.079937,Overnight
