In [2]:
import os

import pandas as pd
from joblib import load

# Location of the serialized model, relative to this notebook
MODEL_PATH = '../models/fiveMillionDatapointModel.joblib'

# Deserialize the model from disk.
# NOTE(review): joblib.load unpickles the file, so only load artifacts that
# come from a trusted source.
model = load(MODEL_PATH)

In [3]:
import numpy as np
from sodapy import Socrata
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [15]:
# Initialize the Socrata client and fetch data.
# The app token is read from the SOCRATA_APP_TOKEN environment variable rather
# than being committed with the notebook. When the variable is unset, None is
# passed and sodapy makes unauthenticated requests (subject to throttling).
# NOTE(review): the token that used to be hard-coded here should be rotated.
client = Socrata("data.ny.gov", os.environ.get("SOCRATA_APP_TOKEN"))
results = client.get("wujg-7c2s", limit=500000)
# Preview only the first records; echoing every fetched row bloats the
# notebook file and buries the narrative.
results[:2]

[{'transit_timestamp': '2024-09-17T05:00:00.000',
  'transit_mode': 'subway',
  'station_complex_id': '392',
  'station_complex': '125 St (4,5,6)',
  'borough': 'Manhattan',
  'payment_method': 'metrocard',
  'fare_class_category': 'Metrocard - Fair Fare',
  'ridership': '13.0',
  'transfers': '0.0',
  'latitude': '40.80414',
  'longitude': '-73.93759',
  'georeference': {'type': 'Point', 'coordinates': [-73.93759, 40.80414]},
  ':@computed_region_kjdx_g34t': '2095',
  ':@computed_region_yamh_8v7k': '749',
  ':@computed_region_wbg7_3whc': '794'},
 {'transit_timestamp': '2024-05-27T06:00:00.000',
  'transit_mode': 'subway',
  'station_complex_id': '419',
  'station_complex': '225 St (2,5)',
  'borough': 'Bronx',
  'payment_method': 'metrocard',
  'fare_class_category': 'Metrocard - Other',
  'ridership': '1.0',
  'transfers': '0.0',
  'latitude': '40.888023',
  'longitude': '-73.860344',
  'georeference': {'type': 'Point', 'coordinates': [-73.860344, 40.888023]},
  ':@computed_region_kj

In [16]:
# Build the DataFrame from the fetched records, then in one chained step:
#   - parse 'transit_timestamp' into datetimes
#   - cast 'ridership' to numeric, turning unparseable values into NaN
df = pd.DataFrame.from_records(results).assign(
    transit_timestamp=lambda frame: pd.to_datetime(frame['transit_timestamp']),
    ridership=lambda frame: pd.to_numeric(frame['ridership'], errors='coerce'),
)
df

Unnamed: 0,transit_timestamp,transit_mode,station_complex_id,station_complex,borough,payment_method,fare_class_category,ridership,transfers,latitude,longitude,georeference,:@computed_region_kjdx_g34t,:@computed_region_yamh_8v7k,:@computed_region_wbg7_3whc
0,2024-09-17 05:00:00,subway,392,"125 St (4,5,6)",Manhattan,metrocard,Metrocard - Fair Fare,13.0,0.0,40.80414,-73.93759,"{'type': 'Point', 'coordinates': [-73.93759, 4...",2095,749,794
1,2024-05-27 06:00:00,subway,419,"225 St (2,5)",Bronx,metrocard,Metrocard - Other,1.0,0.0,40.888023,-73.860344,"{'type': 'Point', 'coordinates': [-73.860344, ...",2032,307,679
2,2024-05-27 06:00:00,subway,427,"West Farms Sq-E Tremont Av (2,5)",Bronx,metrocard,Metrocard - Other,6.0,1.0,40.840294,-73.88005,"{'type': 'Point', 'coordinates': [-73.88005, 4...",2032,307,673
3,2024-05-27 06:00:00,subway,47,"Newkirk Plaza (B,Q)",Brooklyn,omny,OMNY - Full Fare,65.0,3.0,40.635082,-73.96279,"{'type': 'Point', 'coordinates': [-73.96279, 4...",2090,894,810
4,2024-05-27 06:00:00,subway,471,34 St-Hudson Yards (7),Manhattan,metrocard,Metrocard - Fair Fare,3.0,0.0,40.755882,-74.00191,"{'type': 'Point', 'coordinates': [-74.00191, 4...",2095,749,717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,2024-08-10 16:00:00,subway,298,207 St (1),Manhattan,metrocard,Metrocard - Fair Fare,16.0,2.0,40.86462,-73.91882,"{'type': 'Point', 'coordinates': [-73.91882, 4...",2095,749,793
499996,2024-08-10 16:00:00,subway,347,Saratoga Av (3),Brooklyn,metrocard,Metrocard - Fair Fare,8.0,0.0,40.661453,-73.91633,"{'type': 'Point', 'coordinates': [-73.91633, 4...",2090,894,890
499997,2024-08-10 16:00:00,subway,369,Elder Av (6),Bronx,metrocard,Metrocard - Seniors & Disability,7.0,0.0,40.828583,-73.87916,"{'type': 'Point', 'coordinates': [-73.87916, 4...",2032,307,709
499998,2024-08-10 16:00:00,subway,384,Burnside Av (4),Bronx,metrocard,Metrocard - Fair Fare,21.0,0.0,40.853455,-73.907684,"{'type': 'Point', 'coordinates': [-73.907684, ...",2032,307,649


In [17]:
# Order rows from the most recent transit_timestamp to the oldest
ordered_df = df.sort_values(by=['transit_timestamp'], ascending=[False])
ordered_df

Unnamed: 0,transit_timestamp,transit_mode,station_complex_id,station_complex,borough,payment_method,fare_class_category,ridership,transfers,latitude,longitude,georeference,:@computed_region_kjdx_g34t,:@computed_region_yamh_8v7k,:@computed_region_wbg7_3whc
55423,2024-09-17 23:00:00,subway,4,"Broadway (N,W)",Queens,metrocard,Metrocard - Unlimited 30-Day,2.0,0.0,40.76182,-73.92551,"{'type': 'Point', 'coordinates': [-73.92551, 4...",2137,196,878
108268,2024-09-17 23:00:00,subway,373,E 149 St (6),Bronx,metrocard,Metrocard - Unlimited 7-Day,5.0,0.0,40.81212,-73.9041,"{'type': 'Point', 'coordinates': [-73.9041, 40...",2032,307,651
181484,2024-09-17 23:00:00,subway,119,1 Av (L),Manhattan,metrocard,Metrocard - Fair Fare,17.0,0.0,40.730953,-73.98163,"{'type': 'Point', 'coordinates': [-73.98163, 4...",2095,749,724
95000,2024-09-17 22:00:00,subway,236,"Bergen St (F,G)",Brooklyn,omny,OMNY - Seniors & Disability,1.0,0.0,40.686146,-73.99086,"{'type': 'Point', 'coordinates': [-73.99086, 4...",2090,894,880
135288,2024-09-17 22:00:00,subway,66,18 Av (D),Brooklyn,metrocard,Metrocard - Fair Fare,1.0,0.0,40.607952,-74.00174,"{'type': 'Point', 'coordinates': [-74.00174, 4...",2090,894,892
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121650,2024-05-01 01:00:00,subway,393,116 St (6),Manhattan,omny,OMNY - Full Fare,4.0,0.0,40.79863,-73.94162,"{'type': 'Point', 'coordinates': [-73.94162, 4...",2095,749,756
157931,2024-05-01 01:00:00,subway,347,Saratoga Av (3),Brooklyn,metrocard,Metrocard - Other,1.0,0.0,40.661453,-73.91633,"{'type': 'Point', 'coordinates': [-73.91633, 4...",2090,894,890
175513,2024-05-01 00:00:00,subway,51,"Kings Hwy (B,Q)",Brooklyn,metrocard,Metrocard - Other,3.0,0.0,40.60867,-73.95773,"{'type': 'Point', 'coordinates': [-73.95773, 4...",2090,894,812
120881,2024-05-01 00:00:00,subway,103,"Bowery (J,Z)",Manhattan,omny,OMNY - Full Fare,36.0,1.0,40.72028,-73.99391,"{'type': 'Point', 'coordinates': [-73.99391, 4...",2095,749,718


In [5]:
def predict_ridership(station_complex_id, day_of_week, hour):
    """Return the model's ridership prediction for one station/day/hour.

    Args:
        station_complex_id: Value supplied as the 'station_complex_id' feature.
        day_of_week: Value supplied as the 'day_of_week' feature.
        hour: Value supplied as the 'hour' feature.

    Returns:
        The first element of ``model.predict`` for the single-row input.
    """
    feature_names = ('station_complex_id', 'day_of_week', 'hour')
    feature_values = (station_complex_id, day_of_week, hour)
    # One-row frame: each feature name maps to a single-element column
    single_row = pd.DataFrame(
        {name: [value] for name, value in zip(feature_names, feature_values)}
    )
    # The module-level `model` does the actual prediction
    return model.predict(single_row)[0]

# Example usage: predict ridership for one station / day / hour and print it
station_id_example = 164  # Replace with an actual station_complex_id
# NOTE(review): this was labelled "Friday", but the day encoding is not shown here.
# With pandas' Monday=0 convention 6 would be Sunday; confirm against the training features.
day_of_week_example = 6
# 15 is 3 PM if 'hour' is the 24-hour clock hour (previously mislabelled "8 AM") - TODO confirm
hour_example = 15
prediction = predict_ridership(station_id_example, day_of_week_example, hour_example)
print(f"Predicted ridership for station ID {station_id_example} on day {day_of_week_example} at hour {hour_example}: {prediction}")

Predicted ridership for station ID 164 on day 6 at hour 15: 1480.4707055659012
