In [9]:
import pandas as pd
from sodapy import Socrata

# Socrata client for the data.ny.gov portal; None is passed as the app token
client = Socrata("data.ny.gov", None)

# Request at most 5000 records from dataset "wujg-7c2s"
results = client.get("wujg-7c2s", limit=5000)

# Build the frame and fix the two column dtypes in a single chain:
#   - transit_timestamp -> datetime
#   - ridership         -> numeric (values that fail to parse become NaN)
df = pd.DataFrame.from_records(results).assign(
    transit_timestamp=lambda frame: pd.to_datetime(frame['transit_timestamp']),
    ridership=lambda frame: pd.to_numeric(frame['ridership'], errors='coerce'),
)

print(df.head())




    transit_timestamp transit_mode station_complex_id    station_complex  \
0 2023-09-13 09:00:00       subway                283  Greenpoint Av (G)   
1 2023-03-24 10:00:00       subway                283  Greenpoint Av (G)   
2 2023-02-16 20:00:00       subway                283  Greenpoint Av (G)   
3 2023-06-27 21:00:00       subway                283  Greenpoint Av (G)   
4 2023-01-18 13:00:00       subway                283  Greenpoint Av (G)   

    borough payment_method    fare_class_category  ridership transfers  \
0  Brooklyn      metrocard      Metrocard - Other       19.0       0.0   
1  Brooklyn      metrocard      Metrocard - Other       13.0       0.0   
2  Brooklyn      metrocard      Metrocard - Other        2.0       0.0   
3  Brooklyn      metrocard  Metrocard - Full Fare       23.0       2.0   
4  Brooklyn      metrocard  Metrocard - Full Fare       64.0       1.0   

             latitude           longitude  \
0           40.731354           -73.95445   
1  40.73

In [10]:
def get_average_ridership_by_station_and_time(dataframe, station_id, query_time):
    """
    Average ridership at one station for the weekday and hour of ``query_time``.

    A row contributes to the average when its station matches ``station_id``
    and its 'transit_timestamp' falls on the same day of the week and in the
    same hour as ``query_time``. The calendar date of ``query_time`` is
    otherwise ignored.

    :param dataframe: DataFrame containing the MTA data; the columns
        'station_complex_id', 'station_complex', 'transit_timestamp'
        (datetime) and 'ridership' (numeric) are read
    :param station_id: station complex ID (exact match) or any part of the
        station complex name (case-insensitive, literal substring match).
        Non-string values such as ``283`` are converted with ``str()``.
        Note that an ID also selects stations whose *name* contains that text.
    :param query_time: str, ``datetime.datetime``, ``pd.Timestamp`` or anything
        else ``pd.to_datetime`` accepts (e.g. '2021-01-01 08:00:00'). A
        time-only string such as '08:00:00' has no date of its own; pandas is
        expected to fill in the current date, so pass a full date to choose
        the weekday explicitly.
    :return: mean ridership for the matching rows; NaN when nothing matches
    """
    # Normalise every supported input to a pandas Timestamp. A plain
    # datetime.datetime has no `.dayofweek`, so converting only strings would
    # break the "datetime" half of the documented contract.
    query_time = pd.to_datetime(query_time)

    # `.str.contains` requires a string pattern; coercing also lets callers
    # pass an integer ID.
    station_key = str(station_id)

    # regex=False: station names contain regex metacharacters, e.g. the
    # parentheses in "Greenpoint Av (G)", which must be matched literally.
    station_mask = (
        (dataframe['station_complex_id'] == station_key) |
        dataframe['station_complex'].str.contains(
            station_key, case=False, na=False, regex=False
        )
    )

    # Same weekday (Monday=0) and same hour of day as the query.
    timestamps = dataframe['transit_timestamp']
    time_mask = (
        (timestamps.dt.dayofweek == query_time.dayofweek) &
        (timestamps.dt.hour == query_time.hour)
    )

    # Boolean masks avoid copying the frame or adding helper columns.
    return dataframe.loc[station_mask & time_mask, 'ridership'].mean()

# Demo call: only the weekday and hour of the query timestamp are used for matching
station_complex_id = '283'  # station complex ID, or a fragment of the station name
query_time = '2021-01-01 08:00:00'  # timestamp supplying the weekday and hour
average_ridership = get_average_ridership_by_station_and_time(
    dataframe=df,
    station_id=station_complex_id,
    query_time=query_time,
)
print(f"Average ridership for station {station_complex_id} on the same day of the week and time: {average_ridership}")


Average ridership for station 283 on the same day of the week and time: 39.3235294117647
