In [56]:
import pandas as pd
import numpy as np
from sodapy import Socrata
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [11]:
# Initialize the Socrata client and fetch data
client = Socrata("data.ny.gov", None)
results = client.get("wujg-7c2s", limit=5000)
results




[{'transit_timestamp': '2023-01-01T01:00:00.000',
  'transit_mode': 'subway',
  'station_complex_id': '284',
  'station_complex': 'Nassau Av (G)',
  'borough': 'Brooklyn',
  'payment_method': 'metrocard',
  'fare_class_category': 'Metrocard - Fair Fare',
  'ridership': '1.0',
  'transfers': '0.0',
  'latitude': '40.72463607788086',
  'longitude': '-73.95127868652344',
  'georeference': {'type': 'Point',
   'coordinates': [-73.95127868652344, 40.72463607788086]},
  ':@computed_region_kjdx_g34t': '2090',
  ':@computed_region_yamh_8v7k': '894',
  ':@computed_region_wbg7_3whc': '901'},
 {'transit_timestamp': '2023-07-26T05:00:00.000',
  'transit_mode': 'subway',
  'station_complex_id': '283',
  'station_complex': 'Greenpoint Av (G)',
  'borough': 'Brooklyn',
  'payment_method': 'omny',
  'fare_class_category': 'OMNY - Full Fare',
  'ridership': '28.0',
  'transfers': '0.0',
  'latitude': '40.731353759765625',
  'longitude': '-73.95445251464844',
  'georeference': {'type': 'Point',
   'coor

In [15]:

# Convert results to DataFrame
df = pd.DataFrame.from_records(results)

# Convert 'transit_timestamp' to datetime
df['transit_timestamp'] = pd.to_datetime(df['transit_timestamp'])

# Convert 'ridership' to numeric
df['ridership'] = pd.to_numeric(df['ridership'], errors='coerce')

print(df.head())
df

    transit_timestamp transit_mode station_complex_id    station_complex  \
0 2023-01-01 01:00:00       subway                284      Nassau Av (G)   
1 2023-07-26 05:00:00       subway                283  Greenpoint Av (G)   
2 2023-03-27 20:00:00       subway                283  Greenpoint Av (G)   
3 2023-09-11 22:00:00       subway                283  Greenpoint Av (G)   
4 2023-02-22 19:00:00       subway                283  Greenpoint Av (G)   

    borough payment_method          fare_class_category  ridership transfers  \
0  Brooklyn      metrocard        Metrocard - Fair Fare        1.0       0.0   
1  Brooklyn           omny             OMNY - Full Fare       28.0       0.0   
2  Brooklyn      metrocard  Metrocard - Unlimited 7-Day       23.0       0.0   
3  Brooklyn           omny             OMNY - Full Fare       77.0       0.0   
4  Brooklyn           omny             OMNY - Full Fare      238.0       2.0   

             latitude           longitude  \
0   40.7246360778

Unnamed: 0,transit_timestamp,transit_mode,station_complex_id,station_complex,borough,payment_method,fare_class_category,ridership,transfers,latitude,longitude,georeference,:@computed_region_kjdx_g34t,:@computed_region_yamh_8v7k,:@computed_region_wbg7_3whc
0,2023-01-01 01:00:00,subway,284,Nassau Av (G),Brooklyn,metrocard,Metrocard - Fair Fare,1.0,0.0,40.72463607788086,-73.95127868652344,"{'type': 'Point', 'coordinates': [-73.95127868...",2090,894,901
1,2023-07-26 05:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Full Fare,28.0,0.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901
2,2023-03-27 20:00:00,subway,283,Greenpoint Av (G),Brooklyn,metrocard,Metrocard - Unlimited 7-Day,23.0,0.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901
3,2023-09-11 22:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Full Fare,77.0,0.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901
4,2023-02-22 19:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Full Fare,238.0,2.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2023-06-30 08:00:00,subway,283,Greenpoint Av (G),Brooklyn,metrocard,Metrocard - Unlimited 7-Day,31.0,0.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901
4996,2023-01-20 18:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Full Fare,435.0,4.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901
4997,2023-04-18 09:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Seniors & Disability,2.0,0.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901
4998,2023-08-08 15:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Full Fare,222.0,1.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901


In [8]:
def get_average_ridership_by_station_and_time(dataframe, station_id, query_time):
    """
    Retrieve the average ridership for a given station and time, across the same day of the week.
    
    :param dataframe: DataFrame containing the MTA data
    :param station_id: ID or name of the station complex
    :param query_time: datetime or string representing the time to query (e.g., '08:00:00')
    :return: Average ridership for the given time and day of the week
    """
    # Ensure query_time is a datetime object for consistency
    if isinstance(query_time, str):
        query_time = pd.to_datetime(query_time)

    # Filter data for the given station
    station_filtered = dataframe[
        (dataframe['station_complex_id'] == station_id) |
        (dataframe['station_complex'].str.contains(station_id, case=False, na=False))
    ].copy()

    # Safely create new columns using loc
    station_filtered.loc[:, 'day_of_week'] = station_filtered['transit_timestamp'].dt.dayofweek
    station_filtered.loc[:, 'hour'] = station_filtered['transit_timestamp'].dt.hour

    # Filter by day of the week and hour
    day_hour_filtered = station_filtered[
        (station_filtered['day_of_week'] == query_time.dayofweek) &
        (station_filtered['hour'] == query_time.hour)
    ]

    # Calculate the average ridership
    average_ridership = day_hour_filtered['ridership'].mean()

    return average_ridership



In [9]:
# Example usage
station_complex_id = '283'  # Example Station ID or part of the name
query_time = '2021-01-01 08:00:00'  # Example query time
average_ridership = get_average_ridership_by_station_and_time(df, station_complex_id, query_time)
print(f"Average ridership for station {station_complex_id} on the same day of the week and time: {average_ridership}")

Average ridership for station 283 on the same day of the week and time: 33.666666666666664


In [47]:
earliest_timestamp = df['transit_timestamp'].min()
earliest_row = df[df['transit_timestamp'] == earliest_timestamp]

# Get the latest transit_timestamp
latest_timestamp = df['transit_timestamp'].max()
latest_row = df[df['transit_timestamp'] == latest_timestamp]

print("Earliest time:")
print(earliest_row["transit_timestamp"].unique()[0])
print("\nLatest time:")
print(latest_row["transit_timestamp"].unique()[0])

print("\nEarliest row(s):")
print(earliest_row)
print("\nLatest row(s):")
print(latest_row)

Earliest time:
2023-01-01 00:00:00

Latest time:
2024-03-28 18:00:00

Earliest row(s):
     transit_timestamp transit_mode station_complex_id    station_complex  \
3706        2023-01-01       subway                283  Greenpoint Av (G)   

       borough payment_method fare_class_category  ridership transfers  \
3706  Brooklyn           omny    OMNY - Full Fare       90.0       0.0   

                latitude           longitude  \
3706  40.731353759765625  -73.95445251464844   

                                           georeference  \
3706  {'type': 'Point', 'coordinates': [-73.95445251...   

     :@computed_region_kjdx_g34t :@computed_region_yamh_8v7k  \
3706                        2090                         894   

     :@computed_region_wbg7_3whc  
3706                         901  

Latest row(s):
       transit_timestamp transit_mode station_complex_id  \
2976 2024-03-28 18:00:00       subway                 45   
2978 2024-03-28 18:00:00       subway                625  

In [33]:
# Playing around with data
import random

boroughs = df["borough"].unique()
print(boroughs)

# filtered_df = df[(df['borough'] == boroughs[random.randint(0, len(boroughs)-1)])]
borough_filtered = df[(df['borough'] == "Manhattan")]
borough_filtered

['Brooklyn' 'Manhattan' 'Bronx' 'Queens' 'Staten Island']


Unnamed: 0,transit_timestamp,transit_mode,station_complex_id,station_complex,borough,payment_method,fare_class_category,ridership,transfers,latitude,longitude,georeference,:@computed_region_kjdx_g34t,:@computed_region_yamh_8v7k,:@computed_region_wbg7_3whc
47,2024-01-11 01:00:00,subway,231,"Grand St (B,D)",Manhattan,metrocard,Metrocard - Fair Fare,2.0,0.0,40.718266,-73.99375,"{'type': 'Point', 'coordinates': [-73.99375, 4...",2095,749,718
69,2024-01-11 10:00:00,subway,151,"145 St (A,C,B,D)",Manhattan,omny,OMNY - Other,2.0,0.0,40.824783,-73.944214,"{'type': 'Point', 'coordinates': [-73.944214, ...",2095,749,758
71,2024-01-11 11:00:00,subway,605,"168 St (A,C,1)",Manhattan,metrocard,Metrocard - Full Fare,123.0,3.0,40.840717,-73.93956,"{'type': 'Point', 'coordinates': [-73.93956, 4...",2095,749,791
242,2024-03-04 11:00:00,subway,157,"96 St (C,B)",Manhattan,omny,OMNY - Full Fare,126.0,7.0,40.79164,-73.9647,"{'type': 'Point', 'coordinates': [-73.9647, 40...",2095,749,
552,2024-01-25 01:00:00,subway,164,"34 St-Penn Station (A,C,E)",Manhattan,metrocard,Metrocard - Full Fare,30.0,0.0,40.75229,-73.99339,"{'type': 'Point', 'coordinates': [-73.99339, 4...",2095,749,717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4718,2024-02-28 10:00:00,subway,311,86 St (1),Manhattan,metrocard,Metrocard - Full Fare,74.0,0.0,40.788643,-73.97622,"{'type': 'Point', 'coordinates': [-73.97622, 4...",2095,749,751
4747,2024-01-19 03:00:00,subway,304,145 St (1),Manhattan,metrocard,Metrocard - Full Fare,4.0,0.0,40.82655,-73.95036,"{'type': 'Point', 'coordinates': [-73.95036, 4...",2095,749,758
4765,2024-02-28 16:00:00,subway,328,WTC Cortlandt (1),Manhattan,metrocard,Metrocard - Unlimited 7-Day,119.0,0.0,40.711834,-74.01219,"{'type': 'Point', 'coordinates': [-74.01219, 4...",2095,749,722
4786,2024-02-28 19:00:00,subway,477,72 St (Q),Manhattan,metrocard,Metrocard - Full Fare,178.0,1.0,40.7688,-73.95843,"{'type': 'Point', 'coordinates': [-73.95843, 4...",2095,749,744


In [35]:
station_filted = df[(df['station_complex'] == "7 Av (E,B,D)")]
station_filted

Unnamed: 0,transit_timestamp,transit_mode,station_complex_id,station_complex,borough,payment_method,fare_class_category,ridership,transfers,latitude,longitude,georeference,:@computed_region_kjdx_g34t,:@computed_region_yamh_8v7k,:@computed_region_wbg7_3whc
2460,2024-01-27 12:00:00,subway,277,"7 Av (E,B,D)",Manhattan,metrocard,Metrocard - Seniors & Disability,4.0,0.0,40.762863,-73.981636,"{'type': 'Point', 'coordinates': [-73.981636, ...",2095,749,742
4801,2024-01-15 20:00:00,subway,277,"7 Av (E,B,D)",Manhattan,metrocard,Metrocard - Unlimited 30-Day,28.0,0.0,40.762863,-73.981636,"{'type': 'Point', 'coordinates': [-73.981636, ...",2095,749,742


In [42]:
condensed_ridership = df.groupby(['station_complex_id', 'station_complex','transit_timestamp'])['ridership'].sum().reset_index()
condensed_ridership[condensed_ridership['station_complex'] == "34 St-Penn Station (A,C,E)"]

Unnamed: 0,station_complex_id,station_complex,transit_timestamp,ridership
56,164,"34 St-Penn Station (A,C,E)",2024-01-25 01:00:00,30.0


In [52]:
# Preprocessing
final_df = df.dropna(subset=['ridership'])
final_df['hour'] = final_df['transit_timestamp'].dt.hour
final_df['day_of_week'] = final_df['transit_timestamp'].dt.dayofweek
final_df['month'] = final_df['transit_timestamp'].dt.month
final_df


Unnamed: 0,transit_timestamp,transit_mode,station_complex_id,station_complex,borough,payment_method,fare_class_category,ridership,transfers,latitude,longitude,georeference,:@computed_region_kjdx_g34t,:@computed_region_yamh_8v7k,:@computed_region_wbg7_3whc,hour,day_of_week,month
0,2023-01-01 01:00:00,subway,284,Nassau Av (G),Brooklyn,metrocard,Metrocard - Fair Fare,1.0,0.0,40.72463607788086,-73.95127868652344,"{'type': 'Point', 'coordinates': [-73.95127868...",2090,894,901,1,6,1
1,2023-07-26 05:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Full Fare,28.0,0.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901,5,2,7
2,2023-03-27 20:00:00,subway,283,Greenpoint Av (G),Brooklyn,metrocard,Metrocard - Unlimited 7-Day,23.0,0.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901,20,0,3
3,2023-09-11 22:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Full Fare,77.0,0.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901,22,0,9
4,2023-02-22 19:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Full Fare,238.0,2.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901,19,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2023-06-30 08:00:00,subway,283,Greenpoint Av (G),Brooklyn,metrocard,Metrocard - Unlimited 7-Day,31.0,0.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901,8,4,6
4996,2023-01-20 18:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Full Fare,435.0,4.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901,18,4,1
4997,2023-04-18 09:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Seniors & Disability,2.0,0.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901,9,1,4
4998,2023-08-08 15:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Full Fare,222.0,1.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901,15,1,8


In [57]:
# Extracting relavent features and target variable
X = final_df[['hour', 'day_of_week', 'month']]
y = final_df['ridership']

# Split train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [58]:
# Model Training
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

In [59]:
predictions = model.predict(X_test)

In [60]:
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print(f"Root Mean Square Error: {rmse}")

Root Mean Square Error: 160.49605087296024


In [61]:
# Predictions
def predict_ridership(hour, day_of_week, month):
    input_data = np.array([[hour, day_of_week, month]])
    predicted_ridership = model.predict(input_data)
    return predicted_ridership[0]

hour = 8  # 8 AM
day_of_week = 4  # Friday
month = 1  # January
predicted_ridership = predict_ridership(hour, day_of_week, month)
print(f"Predicted ridership: {predicted_ridership}")

Predicted ridership: 80.5425613275613




In [66]:
fridays_in_january_at_8am = final_df[(final_df['day_of_week'] == 4) & (final_df['month'] == 1) & (final_df['hour'] == 8)]
fridays_in_january_at_8am

Unnamed: 0,transit_timestamp,transit_mode,station_complex_id,station_complex,borough,payment_method,fare_class_category,ridership,transfers,latitude,longitude,georeference,:@computed_region_kjdx_g34t,:@computed_region_yamh_8v7k,:@computed_region_wbg7_3whc,hour,day_of_week,month
818,2024-01-26 08:00:00,tram,TRAM2,RI Tramway (Roosevelt),Manhattan,metrocard,Metrocard - Unlimited 7-Day,21.0,0.0,40.75734,-73.95412,"{'type': 'Point', 'coordinates': [-73.95412, 4...",2095,749,800,8,4,1
1721,2023-01-20 08:00:00,subway,284,Nassau Av (G),Brooklyn,metrocard,Metrocard - Fair Fare,4.0,0.0,40.72463607788086,-73.95127868652344,"{'type': 'Point', 'coordinates': [-73.95127868...",2090,894,901,8,4,1
1969,2023-01-27 08:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Seniors & Disability,1.0,0.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901,8,4,1
2313,2024-01-26 08:00:00,subway,434,"3 Av-149 St (2,5)",Bronx,metrocard,Metrocard - Unlimited 30-Day,88.0,0.0,40.81611,-73.917755,"{'type': 'Point', 'coordinates': [-73.917755, ...",2032,307,651,8,4,1
2776,2023-01-27 08:00:00,subway,284,Nassau Av (G),Brooklyn,metrocard,Metrocard - Fair Fare,9.0,0.0,40.72463607788086,-73.95127868652344,"{'type': 'Point', 'coordinates': [-73.95127868...",2090,894,901,8,4,1
3973,2023-01-13 08:00:00,subway,284,Nassau Av (G),Brooklyn,metrocard,Metrocard - Fair Fare,10.0,1.0,40.72463607788086,-73.95127868652344,"{'type': 'Point', 'coordinates': [-73.95127868...",2090,894,901,8,4,1
3998,2024-01-26 08:00:00,subway,138,Canarsie-Rockaway Pkwy (L),Brooklyn,metrocard,Metrocard - Full Fare,49.0,8.0,40.646652,-73.90185,"{'type': 'Point', 'coordinates': [-73.90185, 4...",2090,894,819,8,4,1
4001,2024-01-26 08:00:00,subway,41,"7 Av (B,Q)",Brooklyn,metrocard,Metrocard - Unlimited 7-Day,8.0,0.0,40.67705,-73.97237,"{'type': 'Point', 'coordinates': [-73.97237, 4...",2090,894,895,8,4,1
4003,2024-01-26 08:00:00,subway,44,"Church Av (B,Q)",Brooklyn,metrocard,Metrocard - Students,72.0,7.0,40.650528,-73.96298,"{'type': 'Point', 'coordinates': [-73.96298, 4...",2090,894,810,8,4,1
4501,2023-01-06 08:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Full Fare,332.0,1.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901,8,4,1


In [69]:
final_df.to_json("HistoricalRidership.json", orient='records', lines=True)

print("DataFrame has been successfully saved as JSON.")

DataFrame has been successfully saved as JSON.
