In [56]:
import pandas as pd
import numpy as np
from sodapy import Socrata
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [11]:
# Initialize the Socrata client and fetch data
client = Socrata("data.ny.gov", None)
results = client.get("wujg-7c2s", limit=5000)
results




[{'transit_timestamp': '2023-01-01T01:00:00.000',
  'transit_mode': 'subway',
  'station_complex_id': '284',
  'station_complex': 'Nassau Av (G)',
  'borough': 'Brooklyn',
  'payment_method': 'metrocard',
  'fare_class_category': 'Metrocard - Fair Fare',
  'ridership': '1.0',
  'transfers': '0.0',
  'latitude': '40.72463607788086',
  'longitude': '-73.95127868652344',
  'georeference': {'type': 'Point',
   'coordinates': [-73.95127868652344, 40.72463607788086]},
  ':@computed_region_kjdx_g34t': '2090',
  ':@computed_region_yamh_8v7k': '894',
  ':@computed_region_wbg7_3whc': '901'},
 {'transit_timestamp': '2023-07-26T05:00:00.000',
  'transit_mode': 'subway',
  'station_complex_id': '283',
  'station_complex': 'Greenpoint Av (G)',
  'borough': 'Brooklyn',
  'payment_method': 'omny',
  'fare_class_category': 'OMNY - Full Fare',
  'ridership': '28.0',
  'transfers': '0.0',
  'latitude': '40.731353759765625',
  'longitude': '-73.95445251464844',
  'georeference': {'type': 'Point',
   'coor

In [15]:

# Convert results to DataFrame
df = pd.DataFrame.from_records(results)

# Convert 'transit_timestamp' to datetime
df['transit_timestamp'] = pd.to_datetime(df['transit_timestamp'])

# Convert 'ridership' to numeric
df['ridership'] = pd.to_numeric(df['ridership'], errors='coerce')

print(df.head())
df

    transit_timestamp transit_mode station_complex_id    station_complex  \
0 2023-01-01 01:00:00       subway                284      Nassau Av (G)   
1 2023-07-26 05:00:00       subway                283  Greenpoint Av (G)   
2 2023-03-27 20:00:00       subway                283  Greenpoint Av (G)   
3 2023-09-11 22:00:00       subway                283  Greenpoint Av (G)   
4 2023-02-22 19:00:00       subway                283  Greenpoint Av (G)   

    borough payment_method          fare_class_category  ridership transfers  \
0  Brooklyn      metrocard        Metrocard - Fair Fare        1.0       0.0   
1  Brooklyn           omny             OMNY - Full Fare       28.0       0.0   
2  Brooklyn      metrocard  Metrocard - Unlimited 7-Day       23.0       0.0   
3  Brooklyn           omny             OMNY - Full Fare       77.0       0.0   
4  Brooklyn           omny             OMNY - Full Fare      238.0       2.0   

             latitude           longitude  \
0   40.7246360778

Unnamed: 0,transit_timestamp,transit_mode,station_complex_id,station_complex,borough,payment_method,fare_class_category,ridership,transfers,latitude,longitude,georeference,:@computed_region_kjdx_g34t,:@computed_region_yamh_8v7k,:@computed_region_wbg7_3whc
0,2023-01-01 01:00:00,subway,284,Nassau Av (G),Brooklyn,metrocard,Metrocard - Fair Fare,1.0,0.0,40.72463607788086,-73.95127868652344,"{'type': 'Point', 'coordinates': [-73.95127868...",2090,894,901
1,2023-07-26 05:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Full Fare,28.0,0.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901
2,2023-03-27 20:00:00,subway,283,Greenpoint Av (G),Brooklyn,metrocard,Metrocard - Unlimited 7-Day,23.0,0.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901
3,2023-09-11 22:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Full Fare,77.0,0.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901
4,2023-02-22 19:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Full Fare,238.0,2.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2023-06-30 08:00:00,subway,283,Greenpoint Av (G),Brooklyn,metrocard,Metrocard - Unlimited 7-Day,31.0,0.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901
4996,2023-01-20 18:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Full Fare,435.0,4.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901
4997,2023-04-18 09:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Seniors & Disability,2.0,0.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901
4998,2023-08-08 15:00:00,subway,283,Greenpoint Av (G),Brooklyn,omny,OMNY - Full Fare,222.0,1.0,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",2090,894,901


In [8]:
def get_average_ridership_by_station_and_time(dataframe, station_id, query_time):
    """
    Retrieve the average ridership for a given station and time, across the same day of the week.
    
    :param dataframe: DataFrame containing the MTA data
    :param station_id: ID or name of the station complex
    :param query_time: datetime or string representing the time to query (e.g., '08:00:00')
    :return: Average ridership for the given time and day of the week
    """
    # Ensure query_time is a datetime object for consistency
    if isinstance(query_time, str):
        query_time = pd.to_datetime(query_time)

    # Filter data for the given station
    station_filtered = dataframe[
        (dataframe['station_complex_id'] == station_id) |
        (dataframe['station_complex'].str.contains(station_id, case=False, na=False))
    ].copy()

    # Safely create new columns using loc
    station_filtered.loc[:, 'day_of_week'] = station_filtered['transit_timestamp'].dt.dayofweek
    station_filtered.loc[:, 'hour'] = station_filtered['transit_timestamp'].dt.hour

    # Filter by day of the week and hour
    day_hour_filtered = station_filtered[
        (station_filtered['day_of_week'] == query_time.dayofweek) &
        (station_filtered['hour'] == query_time.hour)
    ]

    # Calculate the average ridership
    average_ridership = day_hour_filtered['ridership'].mean()

    return average_ridership



In [9]:
# Example usage
station_complex_id = '283'  # Example Station ID or part of the name
query_time = '2021-01-01 08:00:00'  # Example query time
average_ridership = get_average_ridership_by_station_and_time(df, station_complex_id, query_time)
print(f"Average ridership for station {station_complex_id} on the same day of the week and time: {average_ridership}")

Average ridership for station 283 on the same day of the week and time: 33.666666666666664


In [47]:
earliest_timestamp = df['transit_timestamp'].min()
earliest_row = df[df['transit_timestamp'] == earliest_timestamp]

# Get the latest transit_timestamp
latest_timestamp = df['transit_timestamp'].max()
latest_row = df[df['transit_timestamp'] == latest_timestamp]

print("Earliest time:")
print(earliest_row["transit_timestamp"].unique()[0])
print("\nLatest time:")
print(latest_row["transit_timestamp"].unique()[0])

print("\nEarliest row(s):")
print(earliest_row)
print("\nLatest row(s):")
print(latest_row)

Earliest time:
2023-01-01 00:00:00

Latest time:
2024-03-28 18:00:00

Earliest row(s):
     transit_timestamp transit_mode station_complex_id    station_complex  \
3706        2023-01-01       subway                283  Greenpoint Av (G)   

       borough payment_method fare_class_category  ridership transfers  \
3706  Brooklyn           omny    OMNY - Full Fare       90.0       0.0   

                latitude           longitude  \
3706  40.731353759765625  -73.95445251464844   

                                           georeference  \
3706  {'type': 'Point', 'coordinates': [-73.95445251...   

     :@computed_region_kjdx_g34t :@computed_region_yamh_8v7k  \
3706                        2090                         894   

     :@computed_region_wbg7_3whc  
3706                         901  

Latest row(s):
       transit_timestamp transit_mode station_complex_id  \
2976 2024-03-28 18:00:00       subway                 45   
2978 2024-03-28 18:00:00       subway                625  

In [133]:
# Playing around with data
import random

boroughs = df["borough"].unique()
print(boroughs)

# filtered_df = df[(df['borough'] == boroughs[random.randint(0, len(boroughs)-1)])]
borough_filtered = df[(df['borough'] == "Manhattan")]
borough_filtered.head(30)

['Brooklyn' 'Manhattan' 'Bronx' 'Queens' 'Staten Island']


Unnamed: 0,transit_timestamp,transit_mode,station_complex_id,station_complex,borough,payment_method,fare_class_category,ridership,transfers,latitude,longitude,georeference,:@computed_region_kjdx_g34t,:@computed_region_yamh_8v7k,:@computed_region_wbg7_3whc
47,2024-01-11 01:00:00,subway,231,"Grand St (B,D)",Manhattan,metrocard,Metrocard - Fair Fare,2.0,0.0,40.718266,-73.99375,"{'type': 'Point', 'coordinates': [-73.99375, 4...",2095,749,718.0
69,2024-01-11 10:00:00,subway,151,"145 St (A,C,B,D)",Manhattan,omny,OMNY - Other,2.0,0.0,40.824783,-73.944214,"{'type': 'Point', 'coordinates': [-73.944214, ...",2095,749,758.0
71,2024-01-11 11:00:00,subway,605,"168 St (A,C,1)",Manhattan,metrocard,Metrocard - Full Fare,123.0,3.0,40.840717,-73.93956,"{'type': 'Point', 'coordinates': [-73.93956, 4...",2095,749,791.0
242,2024-03-04 11:00:00,subway,157,"96 St (C,B)",Manhattan,omny,OMNY - Full Fare,126.0,7.0,40.79164,-73.9647,"{'type': 'Point', 'coordinates': [-73.9647, 40...",2095,749,
552,2024-01-25 01:00:00,subway,164,"34 St-Penn Station (A,C,E)",Manhattan,metrocard,Metrocard - Full Fare,30.0,0.0,40.75229,-73.99339,"{'type': 'Point', 'coordinates': [-73.99339, 4...",2095,749,717.0
580,2024-01-25 10:00:00,subway,601,"14 St (F,M,1,2,3)/6 Av (L)",Manhattan,metrocard,Metrocard - Unlimited 30-Day,78.0,0.0,40.738228,-73.99621,"{'type': 'Point', 'coordinates': [-73.99621, 4...",2095,749,735.0
596,2024-01-25 10:00:00,subway,614,"59 St-Columbus Circle (A,B,C,D,1)",Manhattan,metrocard,Metrocard - Seniors & Disability,88.0,10.0,40.768295,-73.981735,"{'type': 'Point', 'coordinates': [-73.981735, ...",2095,749,
598,2024-01-25 11:00:00,subway,313,"72 St (1,2,3)",Manhattan,omny,OMNY - Full Fare,563.0,10.0,40.778454,-73.98197,"{'type': 'Point', 'coordinates': [-73.98197, 4...",2095,749,750.0
622,2024-01-25 15:00:00,subway,314,66 St-Lincoln Center (1),Manhattan,omny,OMNY - Seniors & Disability,11.0,1.0,40.77344,-73.98221,"{'type': 'Point', 'coordinates': [-73.98221, 4...",2095,749,750.0
657,2024-01-11 09:00:00,subway,169,"Canal St (A,C,E)",Manhattan,metrocard,Metrocard - Unlimited 30-Day,24.0,0.0,40.720825,-74.005226,"{'type': 'Point', 'coordinates': [-74.005226, ...",2095,749,737.0


In [35]:
station_filted = df[(df['station_complex'] == "7 Av (E,B,D)")]
station_filted

Unnamed: 0,transit_timestamp,transit_mode,station_complex_id,station_complex,borough,payment_method,fare_class_category,ridership,transfers,latitude,longitude,georeference,:@computed_region_kjdx_g34t,:@computed_region_yamh_8v7k,:@computed_region_wbg7_3whc
2460,2024-01-27 12:00:00,subway,277,"7 Av (E,B,D)",Manhattan,metrocard,Metrocard - Seniors & Disability,4.0,0.0,40.762863,-73.981636,"{'type': 'Point', 'coordinates': [-73.981636, ...",2095,749,742
4801,2024-01-15 20:00:00,subway,277,"7 Av (E,B,D)",Manhattan,metrocard,Metrocard - Unlimited 30-Day,28.0,0.0,40.762863,-73.981636,"{'type': 'Point', 'coordinates': [-73.981636, ...",2095,749,742


In [77]:
# Preprocessing
aggregations = {
    'transit_timestamp': 'first',
    'ridership': 'sum',
    'transfers': 'sum',  
    # Columns we are not aggregating
    'transit_mode': 'first',
    'borough': 'first',
    'latitude': 'first',
    'longitude': 'first',
    'georeference': 'first'
}

# Perform the groupby and aggregate
condensed_df = df.groupby(['station_complex_id', 'station_complex', 'transit_timestamp'], as_index=False).agg(aggregations)
condensed_df

Unnamed: 0,station_complex_id,station_complex,transit_timestamp,ridership,transfers,transit_mode,borough,latitude,longitude,georeference
0,1,"Astoria-Ditmars Blvd (N,W)",2024-01-26 22:00:00,10.0,0.0,subway,Queens,40.775036,-73.91203,"{'type': 'Point', 'coordinates': [-73.91203, 4..."
1,1,"Astoria-Ditmars Blvd (N,W)",2024-01-28 23:00:00,11.0,0.0,subway,Queens,40.775036,-73.91203,"{'type': 'Point', 'coordinates': [-73.91203, 4..."
2,1,"Astoria-Ditmars Blvd (N,W)",2024-02-28 09:00:00,651.0,53.0,subway,Queens,40.775036,-73.91203,"{'type': 'Point', 'coordinates': [-73.91203, 4..."
3,10,"49 St (N,R,W)",2024-02-28 04:00:00,1.0,0.0,subway,Manhattan,40.7599,-73.98414,"{'type': 'Point', 'coordinates': [-73.98414, 4..."
4,101,"Marcy Av (M,J,Z)",2024-01-11 07:00:00,30.0,1.0,subway,Brooklyn,40.70836,-73.957756,"{'type': 'Point', 'coordinates': [-73.957756, ..."
...,...,...,...,...,...,...,...,...,...,...
4283,TRAM1,RI Tramway (Manhattan),2024-01-27 10:00:00,7.0,4.0,tram,Manhattan,40.761337,-73.96416,"{'type': 'Point', 'coordinates': [-73.96416, 4..."
4284,TRAM2,RI Tramway (Roosevelt),2024-01-26 08:00:00,21.0,0.0,tram,Manhattan,40.75734,-73.95412,"{'type': 'Point', 'coordinates': [-73.95412, 4..."
4285,TRAM2,RI Tramway (Roosevelt),2024-01-29 00:00:00,2.0,0.0,tram,Manhattan,40.75734,-73.95412,"{'type': 'Point', 'coordinates': [-73.95412, 4..."
4286,TRAM2,RI Tramway (Roosevelt),2024-02-14 12:00:00,1.0,0.0,tram,Manhattan,40.75734,-73.95412,"{'type': 'Point', 'coordinates': [-73.95412, 4..."


In [121]:
# Filtering for subway data
subway_df = condensed_df[condensed_df['transit_mode'] == 'subway']
subway_df

Unnamed: 0,station_complex_id,station_complex,transit_timestamp,ridership,transfers,transit_mode,borough,latitude,longitude,georeference
0,1,"Astoria-Ditmars Blvd (N,W)",2024-01-26 22:00:00,10.0,0.0,subway,Queens,40.775036,-73.91203,"{'type': 'Point', 'coordinates': [-73.91203, 4..."
1,1,"Astoria-Ditmars Blvd (N,W)",2024-01-28 23:00:00,11.0,0.0,subway,Queens,40.775036,-73.91203,"{'type': 'Point', 'coordinates': [-73.91203, 4..."
2,1,"Astoria-Ditmars Blvd (N,W)",2024-02-28 09:00:00,651.0,53.0,subway,Queens,40.775036,-73.91203,"{'type': 'Point', 'coordinates': [-73.91203, 4..."
3,10,"49 St (N,R,W)",2024-02-28 04:00:00,1.0,0.0,subway,Manhattan,40.7599,-73.98414,"{'type': 'Point', 'coordinates': [-73.98414, 4..."
4,101,"Marcy Av (M,J,Z)",2024-01-11 07:00:00,30.0,1.0,subway,Brooklyn,40.70836,-73.957756,"{'type': 'Point', 'coordinates': [-73.957756, ..."
...,...,...,...,...,...,...,...,...,...,...
4277,95,"Gates Av (J,Z)",2024-02-28 06:00:00,19.0,1.0,subway,Brooklyn,40.68963,-73.92227,"{'type': 'Point', 'coordinates': [-73.92227, 4..."
4278,96,Kosciuszko St (J),2024-01-11 02:00:00,1.0,0.0,subway,Brooklyn,40.69334,-73.92882,"{'type': 'Point', 'coordinates': [-73.92882, 4..."
4279,96,Kosciuszko St (J),2024-01-11 05:00:00,5.0,0.0,subway,Brooklyn,40.69334,-73.92882,"{'type': 'Point', 'coordinates': [-73.92882, 4..."
4280,97,"Myrtle Av (M,J,Z)",2024-01-29 00:00:00,3.0,0.0,subway,Brooklyn,40.69721,-73.93565,"{'type': 'Point', 'coordinates': [-73.93565, 4..."


In [122]:
final_df2 = subway_df.dropna(subset=['ridership'])
final_df2['station_complex_id'] = final_df2['station_complex_id'].astype(int)

# Feature Engineering
final_df2['hour'] = final_df2['transit_timestamp'].dt.hour
final_df2['day_of_week'] = final_df2['transit_timestamp'].dt.dayofweek
final_df2


Unnamed: 0,station_complex_id,station_complex,transit_timestamp,ridership,transfers,transit_mode,borough,latitude,longitude,georeference,hour,day_of_week
0,1,"Astoria-Ditmars Blvd (N,W)",2024-01-26 22:00:00,10.0,0.0,subway,Queens,40.775036,-73.91203,"{'type': 'Point', 'coordinates': [-73.91203, 4...",22,4
1,1,"Astoria-Ditmars Blvd (N,W)",2024-01-28 23:00:00,11.0,0.0,subway,Queens,40.775036,-73.91203,"{'type': 'Point', 'coordinates': [-73.91203, 4...",23,6
2,1,"Astoria-Ditmars Blvd (N,W)",2024-02-28 09:00:00,651.0,53.0,subway,Queens,40.775036,-73.91203,"{'type': 'Point', 'coordinates': [-73.91203, 4...",9,2
3,10,"49 St (N,R,W)",2024-02-28 04:00:00,1.0,0.0,subway,Manhattan,40.7599,-73.98414,"{'type': 'Point', 'coordinates': [-73.98414, 4...",4,2
4,101,"Marcy Av (M,J,Z)",2024-01-11 07:00:00,30.0,1.0,subway,Brooklyn,40.70836,-73.957756,"{'type': 'Point', 'coordinates': [-73.957756, ...",7,3
...,...,...,...,...,...,...,...,...,...,...,...,...
4277,95,"Gates Av (J,Z)",2024-02-28 06:00:00,19.0,1.0,subway,Brooklyn,40.68963,-73.92227,"{'type': 'Point', 'coordinates': [-73.92227, 4...",6,2
4278,96,Kosciuszko St (J),2024-01-11 02:00:00,1.0,0.0,subway,Brooklyn,40.69334,-73.92882,"{'type': 'Point', 'coordinates': [-73.92882, 4...",2,3
4279,96,Kosciuszko St (J),2024-01-11 05:00:00,5.0,0.0,subway,Brooklyn,40.69334,-73.92882,"{'type': 'Point', 'coordinates': [-73.92882, 4...",5,3
4280,97,"Myrtle Av (M,J,Z)",2024-01-29 00:00:00,3.0,0.0,subway,Brooklyn,40.69721,-73.93565,"{'type': 'Point', 'coordinates': [-73.93565, 4...",0,0


In [123]:
# Select relevant features for the model, now excluding 'month'
X = final_df2[['station_complex_id', 'day_of_week', 'hour']]
y = final_df2['ridership']

In [124]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
predictions = model.predict(X_test)

In [163]:
# Evaluate the model
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print(f"Root Mean Square Error: {rmse}")

def predict_ridership(station_complex_id, day_of_week, hour):
    # Format the input features as a DataFrame
    input_features = pd.DataFrame({
        'station_complex_id': [station_complex_id],
        'day_of_week': [day_of_week],
        'hour': [hour]
    })
    # Predict ridership
    predicted_ridership = model.predict(input_features)
    return predicted_ridership[0]

# Example usage
station_id_example = 283  # Replace with an actual station_complex_id
day_of_week_example = 6  # Friday
hour_example = 3  # 8 AM
prediction = predict_ridership(station_id_example, day_of_week_example, hour_example)
print(f"Predicted ridership for station ID {station_id_example} on day {day_of_week_example} at hour {hour_example}: {prediction}")

Root Mean Square Error: 156.22188909650438
Predicted ridership for station ID 283 on day 6 at hour 3: 9.06974714273515


In [164]:
actual_data = final_df2[(final_df2['station_complex_id'] == 283) & (final_df2['day_of_week'] == 6) & (final_df2['hour'] == 3)]
actual_data['ridership'].mean()

8.166666666666666

In [138]:
final_df2[final_df2["station_complex_id"] == 283]

Unnamed: 0,station_complex_id,station_complex,transit_timestamp,ridership,transfers,transit_mode,borough,latitude,longitude,georeference,hour,day_of_week
172,283,Greenpoint Av (G),2023-01-01 00:00:00,90.0,0.0,subway,Brooklyn,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",0,6
173,283,Greenpoint Av (G),2023-01-01 01:00:00,10.0,0.0,subway,Brooklyn,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",1,6
174,283,Greenpoint Av (G),2023-01-01 04:00:00,39.0,1.00.0,subway,Brooklyn,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",4,6
175,283,Greenpoint Av (G),2023-01-01 08:00:00,39.0,0.00.0,subway,Brooklyn,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",8,6
176,283,Greenpoint Av (G),2023-01-01 11:00:00,26.0,0.0,subway,Brooklyn,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",11,6
...,...,...,...,...,...,...,...,...,...,...,...,...
3799,283,Greenpoint Av (G),2023-12-07 21:00:00,150.0,0.01.0,subway,Brooklyn,40.731354,-73.95445,"{'type': 'Point', 'coordinates': [-73.95445, 4...",21,3
3800,283,Greenpoint Av (G),2023-12-07 22:00:00,131.0,0.0,subway,Brooklyn,40.731353759765625,-73.95445251464844,"{'type': 'Point', 'coordinates': [-73.95445251...",22,3
3801,283,Greenpoint Av (G),2023-12-13 04:00:00,1.0,0.0,subway,Brooklyn,40.731354,-73.95445,"{'type': 'Point', 'coordinates': [-73.95445, 4...",4,2
3802,283,Greenpoint Av (G),2023-12-13 10:00:00,282.0,1.0,subway,Brooklyn,40.731354,-73.95445,"{'type': 'Point', 'coordinates': [-73.95445, 4...",10,2


In [129]:
errors = abs(predictions - y_test)
mape = 100 * (errors / y_test)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')
mape

Accuracy: -1076.61 %.


1188      178.137149
2299       36.101211
4139     1367.500000
3989      145.090909
3879       51.300595
            ...     
1391       49.237900
3053       20.422139
203     27365.145625
881       331.890688
2456       37.516653
Name: ridership, Length: 856, dtype: float64

In [152]:
# Stations with the most data - NOT the most ridership
station_counts = final_df2['station_complex_id'].value_counts().reset_index()
station_names = final_df2[['station_complex_id', 'station_complex']].drop_duplicates()
station_counts_with_names = station_counts.merge(station_names, on='station_complex_id', how='left')

print(station_counts_with_names.head(25))


    station_complex_id  count                       station_complex
0                  283   3632                     Greenpoint Av (G)
1                  284    136                         Nassau Av (G)
2                  293      6         Van Cortlandt Park-242 St (1)
3                  146      5                            181 St (A)
4                  280      5                  Jamaica-Van Wyck (E)
5                  605      5                        168 St (A,C,1)
6                  237      5                      Carroll St (F,G)
7                  434      5                     3 Av-149 St (2,5)
8                  380      4  Bedford Park Blvd-Lehman College (4)
9                  336      4                         Hoyt St (2,3)
10                  38      4                             86 St (R)
11                 151      4                      145 St (A,C,B,D)
12                 314      4              66 St-Lincoln Center (1)
13                  55      4                  B

In [161]:
top_10_station_ids = station_counts_with_names['station_complex_id'].head(10).tolist()
top_10_station_ids

[283, 284, 293, 146, 280, 605, 237, 434, 380, 336]

In [160]:
# Repeated trials testing

def test_ridership_predictions(station_ids, days_of_week, hours):
    results = []

    for station_id in station_ids:
        for day in days_of_week:
            for hour in hours:
                # Predict ridership
                prediction = predict_ridership(station_id, day, hour)

                # Extract actual data
                actual_data = final_df2[
                    (final_df2['station_complex_id'] == station_id) &
                    (final_df2['day_of_week'] == day) &
                    (final_df2['hour'] == hour)
                ]

                # Compute actual mean and median ridership
                actual_mean = actual_data['ridership'].mean()
                actual_median = actual_data['ridership'].median()

                # Calculate the percentage error between predicted and actual mean
                percentage_error_mean = 100 * abs(prediction - actual_mean) / actual_mean if actual_mean > 0 else None
                # Calculate the percentage error between predicted and actual median
                percentage_error_median = 100 * abs(prediction - actual_median) / actual_median if actual_median > 0 else None

                # Record the results
                results.append({
                    'Station ID': station_id,
                    'Day of Week': day,
                    'Hour': hour,
                    'Predicted Ridership': prediction,
                    'Actual Mean Ridership': actual_mean,
                    'Actual Median Ridership': actual_median,
                    'Percentage Error (Mean)': percentage_error_mean,
                    'Percentage Error (Median)': percentage_error_median
                })

                # Print the results for each prediction
                print(f"Station ID {station_id}, Day {day}, Hour {hour}")
                print(f"Predicted Ridership: {prediction}")
                print(f"Actual Mean Ridership: {actual_mean}")
                print(f"Actual Median Ridership: {actual_median}")
                print(f"Percentage Error (Mean): {percentage_error_mean}%")
                print(f"Percentage Error (Median): {percentage_error_median}%")
                print('-------------------------------------')

    # Convert results to a DataFrame
    results_df = pd.DataFrame(results)

    # Compute overall percentage error
    overall_percentage_error_mean = results_df['Percentage Error (Mean)'].mean()
    overall_percentage_error_median = results_df['Percentage Error (Median)'].mean()

    print(f"Overall Percentage Error (Mean): {overall_percentage_error_mean}%")
    print(f"Overall Percentage Error (Median): {overall_percentage_error_median}%")

    return results_df

# Example usage with a list of station IDs, days, and hours
station_ids = [283, 1, 10]  # Replace with your actual list of station IDs
days_of_week = [0, 1, 2, 3, 4, 5, 6]  # 0 = Monday, 6 = Sunday
hours = [8, 14, 20]  # Example set of hours in 24-hour format

# Call the test function
test_results_df = test_ridership_predictions(station_ids, days_of_week, hours)


Station ID 283, Day 0, Hour 8
Predicted Ridership: 328.31828913217316
Actual Mean Ridership: 308.82608695652175
Actual Median Ridership: 460.0
Percentage Error (Mean): 6.311708433619351%
Percentage Error (Median): 28.626458884310182%
-------------------------------------
Station ID 283, Day 0, Hour 14
Predicted Ridership: 136.03178649931658
Actual Mean Ridership: 131.45
Actual Median Ridership: 163.0
Percentage Error (Mean): 3.485573601610189%
Percentage Error (Median): 16.544916258087987%
-------------------------------------
Station ID 283, Day 0, Hour 20
Predicted Ridership: 81.1809486734157
Actual Mean Ridership: 84.6
Actual Median Ridership: 104.5
Percentage Error (Mean): 4.041431828113825%
Percentage Error (Median): 22.31488165223378%
-------------------------------------
Station ID 283, Day 1, Hour 8
Predicted Ridership: 427.5483520574217
Actual Mean Ridership: 369.75
Actual Median Ridership: 308.0
Percentage Error (Mean): 15.631738217017364%
Percentage Error (Median): 38.814400

In [162]:

def test_ridership_predictions(station_ids, days_of_week, hours):
    results = []

    for station_id in station_ids:
        for day in days_of_week:
            for hour in hours:
                # Predict ridership
                prediction = predict_ridership(station_id, day, hour)

                # Extract actual data
                actual_data = final_df2[
                    (final_df2['station_complex_id'] == station_id) &
                    (final_df2['day_of_week'] == day) &
                    (final_df2['hour'] == hour)
                ]

                # Compute actual mean and median ridership
                actual_mean = actual_data['ridership'].mean()
                actual_median = actual_data['ridership'].median()

                # Calculate the percentage error between predicted and actual mean
                percentage_error_mean = 100 * abs(prediction - actual_mean) / actual_mean if actual_mean > 0 else None
                # Calculate the percentage error between predicted and actual median
                percentage_error_median = 100 * abs(prediction - actual_median) / actual_median if actual_median > 0 else None

                # Record the results
                results.append({
                    'Station ID': station_id,
                    'Day of Week': day,
                    'Hour': hour,
                    'Predicted Ridership': prediction,
                    'Actual Mean Ridership': actual_mean,
                    'Actual Median Ridership': actual_median,
                    'Percentage Error (Mean)': percentage_error_mean,
                    'Percentage Error (Median)': percentage_error_median
                })

                # Print the results for each prediction
                print(f"Station ID {station_id}, Day {day}, Hour {hour}")
                print(f"Predicted Ridership: {prediction}")
                print(f"Actual Mean Ridership: {actual_mean}")
                print(f"Actual Median Ridership: {actual_median}")
                print(f"Percentage Error (Mean): {percentage_error_mean}%")
                print(f"Percentage Error (Median): {percentage_error_median}%")
                print('-------------------------------------')

    # Convert results to a DataFrame
    results_df = pd.DataFrame(results)

    # Compute overall percentage error
    overall_percentage_error_mean = results_df['Percentage Error (Mean)'].mean()
    overall_percentage_error_median = results_df['Percentage Error (Median)'].mean()

    print(f"Overall Percentage Error (Mean): {overall_percentage_error_mean}%")
    print(f"Overall Percentage Error (Median): {overall_percentage_error_median}%")

    return results_df

# Example usage with a list of station IDs, days, and hours
station_ids = top_10_station_ids  # Replace with your actual list of station IDs
days_of_week = [0, 1, 2, 3, 4, 5, 6]  # 0 = Monday, 6 = Sunday
hours = [8, 14, 20]  # Example set of hours in 24-hour format

# Call the test function
test_results_df = test_ridership_predictions(station_ids, days_of_week, hours)

Station ID 283, Day 0, Hour 8
Predicted Ridership: 328.31828913217316
Actual Mean Ridership: 308.82608695652175
Actual Median Ridership: 460.0
Percentage Error (Mean): 6.311708433619351%
Percentage Error (Median): 28.626458884310182%
-------------------------------------
Station ID 283, Day 0, Hour 14
Predicted Ridership: 136.03178649931658
Actual Mean Ridership: 131.45
Actual Median Ridership: 163.0
Percentage Error (Mean): 3.485573601610189%
Percentage Error (Median): 16.544916258087987%
-------------------------------------
Station ID 283, Day 0, Hour 20
Predicted Ridership: 81.1809486734157
Actual Mean Ridership: 84.6
Actual Median Ridership: 104.5
Percentage Error (Mean): 4.041431828113825%
Percentage Error (Median): 22.31488165223378%
-------------------------------------
Station ID 283, Day 1, Hour 8
Predicted Ridership: 427.5483520574217
Actual Mean Ridership: 369.75
Actual Median Ridership: 308.0
Percentage Error (Mean): 15.631738217017364%
Percentage Error (Median): 38.814400

In [167]:
from joblib import dump

# Save the model as a file named 'random_forest_model.joblib' in your current directory
dump(model, 'models/test1.joblib')

['models/test1.joblib']

In [169]:
final_df2.to_json("HistoricalRidership.json", orient='records', lines=True)

print("DataFrame has been successfully saved as JSON.")

DataFrame has been successfully saved as JSON.
