In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

import optuna
import lightgbm as lgb
from lightgbm import LGBMRegressor, early_stopping
import pyspark.sql as sp
from pyspark.sql.functions import col, year, to_timestamp, sum as _sum, desc, date_format, countDistinct, collect_set, round, first
from pyspark.sql import Row
import holidays

In [6]:
spark = sp.SparkSession.builder.appName("RidershipPrediction").getOrCreate()

df = spark.read.parquet("/content/drive/MyDrive/mta_top20_joined_1")
df.printSchema()

root
 |-- station_complex: string (nullable = true)
 |-- transit_timestamp: string (nullable = true)
 |-- station_complex_id: string (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- fare_class_category: string (nullable = true)
 |-- ridership: integer (nullable = true)
 |-- transfers: integer (nullable = true)
 |-- transit_mode: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- temperature_C: string (nullable = true)
 |-- precipitation_mm: string (nullable = true)
 |-- humidity_%: string (nullable = true)



In [7]:
pdf = df.toPandas()

In [8]:
pdf['transit_timestamp'] = pd.to_datetime(pdf['transit_timestamp'])

pdf['hour'] = pdf['transit_timestamp'].dt.hour
pdf['day_of_week'] = pdf['transit_timestamp'].dt.dayofweek
pdf['month'] = pdf['transit_timestamp'].dt.month
pdf['is_weekend'] = pdf['day_of_week'].isin([5, 6]).astype(int)
us_holidays = holidays.US()
pdf['is_holiday'] = pdf['transit_timestamp'].dt.date.apply(lambda x: x in us_holidays).astype(int)

pdf['ridership_lag1'] = pdf.groupby(['station_complex'])['ridership'].shift(1)
pdf['ridership_lag24'] = pdf.groupby(['station_complex'])['ridership'].shift(24)

pdf['temperature_C'] = pd.to_numeric(pdf['temperature_C'], errors='coerce')
pdf['precipitation_mm'] = pd.to_numeric(pdf['precipitation_mm'], errors='coerce')
pdf['humidity_%'] = pd.to_numeric(pdf['humidity_%'], errors='coerce')
pdf['heavy_rain'] = (pdf['precipitation_mm'] > 10).astype(int)
pdf['high_temp'] = (pdf['temperature_C'] > 30).astype(int)

pdf = pdf.fillna(0)

Q1 = pdf['ridership'].quantile(0.25)
Q3 = pdf['ridership'].quantile(0.75)
IQR = Q3 - Q1
pdf = pdf[~((pdf['ridership'] < (Q1 - 1.5 * IQR)) | (pdf['ridership'] > (Q3 + 1.5 * IQR)))]

In [9]:
from sklearn.preprocessing import OneHotEncoder

categorical_cols = ['station_complex', 'station_complex_id', 'fare_class_category', 'payment_method']
encoded_dfs = []

for col in categorical_cols:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded = encoder.fit_transform(pdf[[col]])
    encoded_df = pd.DataFrame(
        encoded,
        columns=[f"{col}_{cat}".replace(' ', '_').replace('/', '_') for cat in encoder.categories_[0]]
    )
    encoded_dfs.append(encoded_df)

# Concatenate encoded columns back with the rest of the data
pdf_encoded = pd.concat([pdf.reset_index(drop=True)] + encoded_dfs, axis=1)

# Drop original categorical columns
pdf_encoded = pdf_encoded.drop(columns=categorical_cols)

# Remove duplicate columns if any
pdf_encoded = pdf_encoded.loc[:, ~pdf_encoded.columns.duplicated()]


In [10]:
# Scale numerical features
numerical_cols = ['hour', 'day_of_week', 'month', 'ridership_lag1', 'ridership_lag24',
                 'temperature_C', 'humidity_%', 'precipitation_mm']
scaler = StandardScaler()
pdf_encoded[numerical_cols] = scaler.fit_transform(pdf_encoded[numerical_cols])

In [11]:
# Define feature columns
feature_cols = (
    ['hour', 'day_of_week', 'month', 'is_weekend', 'is_holiday',
     'ridership_lag1', 'ridership_lag24', 'heavy_rain', 'high_temp'] +
    ['temperature_C', 'humidity_%', 'precipitation_mm'] +
    [col for col in pdf_encoded.columns if any(prefix in col for prefix in categorical_cols)]
)

# Prepare X and y
X = pdf_encoded[feature_cols]
y = np.log1p(pdf_encoded['ridership'])  # log1p transform


In [12]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [13]:
# Clean feature names to remove special characters
X_train.columns = X_train.columns.str.replace(r'[^\w\d_]+', '_', regex=True)
X_val.columns = X_val.columns.str.replace(r'[^\w\d_]+', '_', regex=True)


In [14]:
# Define the objective function for Optuna
def objective(trial):
    params = {
        'objective': 'regression',
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'random_state': 42
    }

    model = LGBMRegressor(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        callbacks=[early_stopping(stopping_rounds=50, verbose=False)]
    )
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))  # Fixed RMSE calculation
    return rmse

# Run Optuna optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # Adjust n_trials based on time available

# Print best parameters
print("Best parameters:", study.best_params)
print("Best RMSE:", study.best_value)

[I 2025-05-05 15:55:09,123] A new study created in memory with name: no-name-3468ad33-00d5-4bcf-bd6f-bd38c8a62dad


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.117209 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 15:58:41,280] Trial 0 finished with value: 0.4418752249803182 and parameters: {'n_estimators': 1595, 'learning_rate': 0.01765578074199816, 'num_leaves': 125, 'max_depth': 5, 'min_child_samples': 42, 'subsample': 0.9595413767270409, 'colsample_bytree': 0.9360299773038585}. Best is trial 0 with value: 0.4418752249803182.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.113048 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 16:01:06,558] Trial 1 finished with value: 0.3276940109554189 and parameters: {'n_estimators': 1033, 'learning_rate': 0.13905188260621404, 'num_leaves': 133, 'max_depth': 14, 'min_child_samples': 54, 'subsample': 0.586155202416271, 'colsample_bytree': 0.7591702397806952}. Best is trial 1 with value: 0.3276940109554189.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.177178 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 16:06:07,891] Trial 2 finished with value: 0.34839817787995686 and parameters: {'n_estimators': 1916, 'learning_rate': 0.03833519152935096, 'num_leaves': 69, 'max_depth': 14, 'min_child_samples': 84, 'subsample': 0.6650329080410722, 'colsample_bytree': 0.6158046884422113}. Best is trial 1 with value: 0.3276940109554189.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.114550 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 16:11:58,866] Trial 3 finished with value: 0.32818218158684065 and parameters: {'n_estimators': 1826, 'learning_rate': 0.2906982174931821, 'num_leaves': 132, 'max_depth': 7, 'min_child_samples': 44, 'subsample': 0.5877784962640498, 'colsample_bytree': 0.9383794320535633}. Best is trial 1 with value: 0.3276940109554189.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.217857 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 16:13:19,804] Trial 4 finished with value: 0.39659088020910377 and parameters: {'n_estimators': 400, 'learning_rate': 0.04324152823038429, 'num_leaves': 96, 'max_depth': 8, 'min_child_samples': 10, 'subsample': 0.7496660806063499, 'colsample_bytree': 0.5067535019736132}. Best is trial 1 with value: 0.3276940109554189.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.113050 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 16:15:51,221] Trial 5 finished with value: 0.3334740798978119 and parameters: {'n_estimators': 1333, 'learning_rate': 0.2598316843073168, 'num_leaves': 50, 'max_depth': 9, 'min_child_samples': 86, 'subsample': 0.9021982829214331, 'colsample_bytree': 0.7293035893721065}. Best is trial 1 with value: 0.3276940109554189.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.236513 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 16:19:40,622] Trial 6 finished with value: 0.39942914306835325 and parameters: {'n_estimators': 1275, 'learning_rate': 0.019907224720217404, 'num_leaves': 39, 'max_depth': 14, 'min_child_samples': 84, 'subsample': 0.9167474025015702, 'colsample_bytree': 0.5348841814880856}. Best is trial 1 with value: 0.3276940109554189.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.112377 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 16:22:57,376] Trial 7 finished with value: 0.4796642890953001 and parameters: {'n_estimators': 1780, 'learning_rate': 0.01810172355074586, 'num_leaves': 69, 'max_depth': 4, 'min_child_samples': 78, 'subsample': 0.6583412521839512, 'colsample_bytree': 0.9629415047116554}. Best is trial 1 with value: 0.3276940109554189.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.205671 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 16:24:59,488] Trial 8 finished with value: 0.35669915106099753 and parameters: {'n_estimators': 1004, 'learning_rate': 0.1186943780372961, 'num_leaves': 35, 'max_depth': 9, 'min_child_samples': 31, 'subsample': 0.9603328899423453, 'colsample_bytree': 0.5643529475897231}. Best is trial 1 with value: 0.3276940109554189.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.212040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 16:28:39,620] Trial 9 finished with value: 0.38441320201332035 and parameters: {'n_estimators': 1081, 'learning_rate': 0.022012869177570477, 'num_leaves': 71, 'max_depth': 8, 'min_child_samples': 73, 'subsample': 0.5991851475334765, 'colsample_bytree': 0.6506471811793437}. Best is trial 1 with value: 0.3276940109554189.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.321441 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 16:30:15,628] Trial 10 finished with value: 0.33705812159891785 and parameters: {'n_estimators': 590, 'learning_rate': 0.1023609652171765, 'num_leaves': 147, 'max_depth': 12, 'min_child_samples': 62, 'subsample': 0.5439245167461, 'colsample_bytree': 0.8210990384113321}. Best is trial 1 with value: 0.3276940109554189.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.122545 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 16:32:33,301] Trial 11 finished with value: 0.34194637893711527 and parameters: {'n_estimators': 815, 'learning_rate': 0.28110739823837994, 'num_leaves': 114, 'max_depth': 6, 'min_child_samples': 48, 'subsample': 0.529882006441701, 'colsample_bytree': 0.8398060910035238}. Best is trial 1 with value: 0.3276940109554189.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.116669 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 16:35:56,485] Trial 12 finished with value: 0.3226488596708352 and parameters: {'n_estimators': 1500, 'learning_rate': 0.13515884321910768, 'num_leaves': 146, 'max_depth': 11, 'min_child_samples': 32, 'subsample': 0.7677824400188878, 'colsample_bytree': 0.8752163405107918}. Best is trial 12 with value: 0.3226488596708352.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.113568 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 16:38:51,811] Trial 13 finished with value: 0.323037388949778 and parameters: {'n_estimators': 1354, 'learning_rate': 0.12705734017992806, 'num_leaves': 149, 'max_depth': 12, 'min_child_samples': 25, 'subsample': 0.8054674931803922, 'colsample_bytree': 0.836044080659577}. Best is trial 12 with value: 0.3226488596708352.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.115731 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 16:42:19,196] Trial 14 finished with value: 0.32512787980738994 and parameters: {'n_estimators': 1506, 'learning_rate': 0.08779138132003532, 'num_leaves': 150, 'max_depth': 11, 'min_child_samples': 22, 'subsample': 0.8242948117607131, 'colsample_bytree': 0.847443380415635}. Best is trial 12 with value: 0.3226488596708352.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.224569 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 16:42:50,588] Trial 15 finished with value: 0.3659222211519191 and parameters: {'n_estimators': 143, 'learning_rate': 0.15715645585334684, 'num_leaves': 100, 'max_depth': 11, 'min_child_samples': 29, 'subsample': 0.7945570993587993, 'colsample_bytree': 0.8879426302459957}. Best is trial 12 with value: 0.3226488596708352.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.225902 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 16:46:19,137] Trial 16 finished with value: 0.3297789240921935 and parameters: {'n_estimators': 1540, 'learning_rate': 0.0751511002627573, 'num_leaves': 113, 'max_depth': 12, 'min_child_samples': 11, 'subsample': 0.7212054802462797, 'colsample_bytree': 0.7584643971564973}. Best is trial 12 with value: 0.3226488596708352.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.114716 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 16:48:51,682] Trial 17 finished with value: 0.3211736908603188 and parameters: {'n_estimators': 1200, 'learning_rate': 0.1775333181373022, 'num_leaves': 141, 'max_depth': 11, 'min_child_samples': 33, 'subsample': 0.810160508440456, 'colsample_bytree': 0.9992946256652143}. Best is trial 17 with value: 0.3211736908603188.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.114952 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 16:50:43,095] Trial 18 finished with value: 0.3272333977123974 and parameters: {'n_estimators': 817, 'learning_rate': 0.20230020372053137, 'num_leaves': 114, 'max_depth': 10, 'min_child_samples': 37, 'subsample': 0.8621086095840502, 'colsample_bytree': 0.9909766753059154}. Best is trial 17 with value: 0.3211736908603188.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.211145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776


[I 2025-05-05 16:54:35,217] Trial 19 finished with value: 0.3343903404655514 and parameters: {'n_estimators': 1671, 'learning_rate': 0.06565508350325513, 'num_leaves': 88, 'max_depth': 15, 'min_child_samples': 98, 'subsample': 0.7083119263912703, 'colsample_bytree': 0.8881699024011235}. Best is trial 17 with value: 0.3211736908603188.


Best parameters: {'n_estimators': 1200, 'learning_rate': 0.1775333181373022, 'num_leaves': 141, 'max_depth': 11, 'min_child_samples': 33, 'subsample': 0.810160508440456, 'colsample_bytree': 0.9992946256652143}
Best RMSE: 0.3211736908603188


In [15]:
best_params = study.best_params
model = LGBMRegressor(**best_params)
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
    callbacks=[
        early_stopping(stopping_rounds=50),
    ]
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.116664 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1131
[LightGBM] [Info] Number of data points in the train set: 992040, number of used features: 66
[LightGBM] [Info] Start training from score 3.125776
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1200]	valid_0's rmse: 0.321458	valid_0's l2: 0.103335


In [16]:
y.describe()


Unnamed: 0,ridership
count,1240051.0
mean,3.125297
std,1.375706
min,0.6931472
25%,2.079442
50%,3.258097
75%,4.248495
max,5.545177


In [17]:
model.booster_.save_model("lightgbm_ridership_model_3.txt")

<lightgbm.basic.Booster at 0x7e22d98d1890>

In [18]:
import sklearn.metrics as metrics

y_pred = np.expm1(model.predict(X_val))  # Undo log1p
y_val_actual = np.expm1(y_val)          # Undo log1p

mse = mean_squared_error(y_val_actual, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_val_actual, y_pred)

print(f"Validation RMSE: {rmse:.2f}")
print(f"Validation MAE: {mae:.2f}")



Validation RMSE: 14.22
Validation MAE: 7.97


In [20]:
pdf_encoded['ridership'].mean()

np.float64(48.5160602265552)

In [22]:
import joblib
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [None]:
def predict_ridership(station, dt, temp, humidity, precipitation, fare_class, payment_method):
    weather_scaled = scaler.transform([[temp, humidity, precipitation]])[0]

    input_dict = {
        'hour': pd.to_datetime(dt).hour,
        'day_of_week': pd.to_datetime(dt).dayofweek,
        'temperature_C': weather_scaled[0],
        'humidity_%': weather_scaled[1],
        'precipitation_mm': weather_scaled[2],
    }

    for col in model.feature_name_:
        if col.startswith('station_complex_'):
            input_dict[col] = 1 if col == f'station_complex_{station}' else 0
        elif col.startswith('fare_class_category_'):
            input_dict[col] = 1 if col == f'fare_class_category_{fare_class}' else 0
        elif col.startswith('payment_method_'):
            input_dict[col] = 1 if col == f'payment_method_{payment_method}' else 0
        elif col not in input_dict:
            input_dict[col] = 0

    input_df = pd.DataFrame([input_dict])
    return np.expm1(model.predict(input_df)[0])  # Undo log1p


In [None]:
def predict_ridership(station, dt, temp, humidity, precipitation, fare_class, payment_method):
    weather_scaled = scaler.transform([[temp, humidity, precipitation]])[0]

    input_dict = {
        'hour': pd.to_datetime(dt).hour,
        'day_of_week': pd.to_datetime(dt).dayofweek,
        'month': pd.to_datetime(dt).month,
        'is_weekend': 1 if pd.to_datetime(dt).dayofweek in [5, 6] else 0,
        'is_holiday': 1 if pd.to_datetime(dt).date() in holidays.US() else 0,
        'ridership_lag1': 0,
        'ridership_lag24': 0,
        'heavy_rain': 1 if precipitation > 10 else 0,
        'high_temp': 1 if temp > 30 else 0,
        'temperature_C': weather_scaled[0],
        'humidity_%': weather_scaled[1],
        'precipitation_mm': weather_scaled[2],
    }

    for col in model.feature_name_:
        if col.startswith('station_complex_'):
            input_dict[col] = 1 if col == f'station_complex_{station}' else 0
        elif col.startswith('fare_class_category_'):
            input_dict[col] = 1 if col == f'fare_class_category_{fare_class}' else 0
        elif col.startswith('payment_method_'):
            input_dict[col] = 1 if col == f'payment_method_{payment_method}' else 0
        elif col not in input_dict:
            input_dict[col] = 0

    input_df = pd.DataFrame([input_dict])
    return np.expm1(model.predict(input_df)[0])

In [23]:
df.select("fare_class_category").distinct().show( truncate = False)

+--------------------------------+
|fare_class_category             |
+--------------------------------+
|Metrocard - Fair Fare           |
|OMNY - Fair Fare                |
|OMNY - Seniors & Disability     |
|OMNY - Full Fare                |
|Metrocard - Unlimited 7-Day     |
|Metrocard - Unlimited 30-Day    |
|Metrocard - Full Fare           |
|Metrocard - Other               |
|Metrocard - Seniors & Disability|
|Metrocard - Students            |
|OMNY - Other                    |
|OMNY - Students                 |
+--------------------------------+

