In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import optuna

In [2]:
Ridership_df= pd.read_csv('data/Ridership.csv')
df= Ridership_df.copy()
df.sample(5)

Unnamed: 0,Year,Month,Day,Week Number,Corridor,Workday,Station,Period,Ridership,N_trains,Covid19
61969,2022,November,10,45,Corridor_6,y,Station_31,Midday,395,1,0
64116,2022,December,25,51,Corridor_3,n,Station_23,Weekend/Holiday,134,2,0
58391,2022,August,24,34,Corridor_3,y,Station_3,Evening,6691,14,0
41438,2021,August,19,33,Corridor_4,y,Station_3,AM Peak,48,2,1
42437,2021,September,9,36,Corridor_2,y,Station_3,AM Peak,225,12,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64369 entries, 0 to 64368
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Year         64369 non-null  int64 
 1   Month        64369 non-null  object
 2   Day          64369 non-null  int64 
 3   Week Number  64369 non-null  int64 
 4   Corridor     64369 non-null  object
 5   Workday      64369 non-null  object
 6   Station      64369 non-null  object
 7   Period       64369 non-null  object
 8   Ridership    64369 non-null  int64 
 9   N_trains     64369 non-null  int64 
 10  Covid19      64369 non-null  int64 
dtypes: int64(6), object(5)
memory usage: 5.4+ MB


In [4]:
df.duplicated().sum()

np.int64(0)

In [5]:
df.isnull().sum()

Year           0
Month          0
Day            0
Week Number    0
Corridor       0
Workday        0
Station        0
Period         0
Ridership      0
N_trains       0
Covid19        0
dtype: int64

In [3]:
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df['Month_Num'] = df['Month'].map(month_mapping)

# Cyclical Day conversion
def convert_day_to_circle(day):
    angle = 2 * np.pi * (day - 1) / 31
    x = np.cos(angle)
    y = np.sin(angle)
    return x, y

df['day_x'], df['day_y'] = zip(*df['Day'].map(convert_day_to_circle))

# Cyclical Week conversion
def convert_week_to_circle(week):
    angle = 2 * np.pi * (week - 1) / 53
    x = np.cos(angle)
    y = np.sin(angle)
    return x, y

df['week_x'], df['week_y'] = zip(*df['Week Number'].map(convert_week_to_circle))


# For Workday, instead of LabelEncoder then OneHotEncoder, let's just let OneHotEncoder handle it directly
categorical_features = ['Month', 'Corridor', 'Workday', 'Station', 'Period']

numerical_features_to_pass_through = [
    'Year', 'N_trains', 'Covid19', 'Ridership',
    'Month_Num', 'day_x', 'day_y', 'week_x', 'week_y']



preprocessor = ColumnTransformer(
    transformers=[
        ('', OneHotEncoder(), categorical_features),
    ],remainder='passthrough')


df = preprocessor.fit_transform(df)
df = df.toarray()
feature_names_out = list(preprocessor.get_feature_names_out())
df = pd.DataFrame(df, columns=[item.split('__')[1] for item in feature_names_out])

df['COVID_Workday'] = df['Covid19'] * df['Workday_y']

In [5]:
def feature_engineering(df):
    # Month mapping
    month_mapping = {
        'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
        'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
    }
    df['Month_Num'] = df['Month'].map(month_mapping)

    # Cyclical Day conversion
    def convert_day_to_circle(day):
        angle = 2 * np.pi * (day - 1) / 31
        x = np.cos(angle)
        y = np.sin(angle)
        return x, y

    df['day_x'], df['day_y'] = zip(*df['Day'].map(convert_day_to_circle))

    # Cyclical Week conversion
    def convert_week_to_circle(week):
        angle = 2 * np.pi * (week - 1) / 53
        x = np.cos(angle)
        y = np.sin(angle)
        return x, y

    df['week_x'], df['week_y'] = zip(*df['Week Number'].map(convert_week_to_circle))

    # Categorical features for OneHotEncoder
    categorical_features = ['Month', 'Corridor', 'Workday', 'Station', 'Period']

    # Column transformer for preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ('', OneHotEncoder(), categorical_features),
        ], remainder='passthrough'
    )

    # Fit and transform the DataFrame
    df_transformed = preprocessor.fit_transform(df)

    # Convert the encoded features to a dense matrix
    df_transformed = df_transformed.toarray()

    # Convert to DataFrame and get feature names
    feature_names_out = list(preprocessor.get_feature_names_out())
    
    # Convert the dense matrix to a DataFrame
    df_transformed = pd.DataFrame(df_transformed, columns=[item.split('__')[1] for item in feature_names_out])

    return df_transformed

In [6]:
df= feature_engineering(df)

In [9]:
df['COVID_Workday'] = df['Covid19'] * df['Workday_y']

In [11]:
columns_to_convert = [
    'Month_April', 'Month_August', 'Month_December', 'Month_February', 
    'Month_January', 'Month_July', 'Month_June', 'Month_March', 
    'Month_May', 'Month_November', 'Month_October', 'Month_September',
    'Corridor_Corridor_1', 'Corridor_Corridor_2', 'Corridor_Corridor_3', 
    'Corridor_Corridor_4', 'Corridor_Corridor_5', 'Corridor_Corridor_6', 'Corridor_Corridor_7',
    'Station_Station_1', 'Station_Station_10', 'Station_Station_11', 'Station_Station_12', 
    'Station_Station_13', 'Station_Station_14', 'Station_Station_15', 'Station_Station_16', 
    'Station_Station_17', 'Station_Station_18', 'Station_Station_19', 'Station_Station_2', 
    'Station_Station_20', 'Station_Station_21', 'Station_Station_22', 'Station_Station_23', 
    'Station_Station_24', 'Station_Station_25', 'Station_Station_26', 'Station_Station_27', 
    'Station_Station_28', 'Station_Station_29', 'Station_Station_3', 'Station_Station_30', 
    'Station_Station_31', 'Station_Station_32', 'Station_Station_33', 'Station_Station_34', 
    'Station_Station_35', 'Station_Station_36', 'Station_Station_37', 'Station_Station_38', 
    'Station_Station_39', 'Station_Station_4', 'Station_Station_40', 'Station_Station_41', 
    'Station_Station_42', 'Station_Station_43', 'Station_Station_44', 'Station_Station_45', 
    'Station_Station_5', 'Station_Station_6', 'Station_Station_7', 'Station_Station_8', 'Station_Station_9',
    'Period_AM Peak', 'Period_Evening', 'Period_Midday', 'Period_PM Peak', 'Period_Weekend/Holiday',
    'Day',
    'Week Number',
    'Workday_n', 'Workday_y',
    'Covid19','COVID_Workday',
    'Month_Num']

columns_to_convert_02=['Year', 'Ridership', 'N_trains']


df[columns_to_convert] = df[columns_to_convert].astype('uint8')
df[columns_to_convert_02] = df[columns_to_convert_02].astype('uint16')

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64369 entries, 0 to 64368
Data columns (total 83 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Month_April             64369 non-null  uint8  
 1   Month_August            64369 non-null  uint8  
 2   Month_December          64369 non-null  uint8  
 3   Month_February          64369 non-null  uint8  
 4   Month_January           64369 non-null  uint8  
 5   Month_July              64369 non-null  uint8  
 6   Month_June              64369 non-null  uint8  
 7   Month_March             64369 non-null  uint8  
 8   Month_May               64369 non-null  uint8  
 9   Month_November          64369 non-null  uint8  
 10  Month_October           64369 non-null  uint8  
 11  Month_September         64369 non-null  uint8  
 12  Corridor_Corridor_1     64369 non-null  uint8  
 13  Corridor_Corridor_2     64369 non-null  uint8  
 14  Corridor_Corridor_3     64369 non-null

## Problem One: 
Assume we want to find a general model for the number of passengers without looking at the number of passengers on previous days and only using the recorded information in the table for the same time period (except for the year and number of trains) to predict the number of required trains based on the number of passengers.

# spliting

In [13]:
X_p01 = df.drop(['Ridership', 'Year', 'N_trains'], axis=1)  
y_p01 = pd.DataFrame(df['Ridership'])

df_for_stratify_p01 = Ridership_df.copy()
stratify_key_p01 = df_for_stratify_p01['Covid19'].astype(str) + '_' + \
                   df_for_stratify_p01['Workday'] + '_' + \
                   df_for_stratify_p01['Period'] + '_' + \
                   df_for_stratify_p01['Corridor']

X_train_p01, X_test_p01, y_train_p01, y_test_p01 = train_test_split(
    X_p01, y_p01,
    test_size=0.2,
    random_state=42,
    stratify=stratify_key_p01 
)

In [14]:
train_data = lgb.Dataset(X_train_p01, label=y_train_p01)
test_data = lgb.Dataset(X_test_p01, label=y_test_p01, reference=train_data)

# Define parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'verbose': -1
}

# Train the model
model_p01 = lgb.train(params,
                 train_data,
                 num_boost_round=1000,
                 valid_sets=[test_data],
                 callbacks=[lgb.early_stopping(stopping_rounds=50)])

# Make predictions
y_pred = model_p01.predict(X_test_p01)

# Evaluate
rmse_f_p01 = np.sqrt(mean_squared_error(y_test_p01, y_pred))
r2_f_p01= r2_score(y_test_p01, y_pred)

print(f"RMSE: {rmse_f_p01:.2f}")
print(f"R²: {r2_f_p01:.4f}")

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 825.099
RMSE: 825.10
R²: 0.8129


In [15]:
def objective(trial):
    # Define parameters
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 10),
        'lambda_l1': trial.suggest_float('lambda_l1', 0, 10),
        'lambda_l2': trial.suggest_float('lambda_l2', 0, 10),
        'verbose': -1,
        'feature_pre_filter': False  # Disable feature pre-filtering
    }

    # Convert data to LightGBM Dataset format inside the objective function
    train_data = lgb.Dataset(X_train_p01, label=y_train_p01)
    test_data = lgb.Dataset(X_test_p01, label=y_test_p01, reference=train_data)

    # Train the model
    model = lgb.train(params,
                      train_data,
                      num_boost_round=1000,
                      valid_sets=[test_data],
                      callbacks=[lgb.early_stopping(stopping_rounds=50)])
    
    # Make predictions
    y_pred = model.predict(X_test_p01)
    
    # Return RMSE as the objective value to minimize
    return np.sqrt(mean_squared_error(y_test_p01, y_pred))

# Create a study and optimize
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Best parameters
best_params = study.best_params
print("Best parameters:", best_params)


[I 2025-07-21 16:29:52,254] A new study created in memory with name: no-name-6c33bd2e-4cdb-43bc-a14e-fe483a36bde6


Training until validation scores don't improve for 50 rounds


[I 2025-07-21 16:29:54,513] Trial 0 finished with value: 1050.2862737363903 and parameters: {'num_leaves': 36, 'learning_rate': 0.04804586357958558, 'min_child_samples': 69, 'max_depth': 4, 'feature_fraction': 0.5908692365697692, 'bagging_fraction': 0.6467671516331035, 'bagging_freq': 4, 'lambda_l1': 3.1183145072715868, 'lambda_l2': 9.607963479519166}. Best is trial 0 with value: 1050.2862737363903.


Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 1050.29
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 16:29:56,614] Trial 1 finished with value: 986.9845532707903 and parameters: {'num_leaves': 31, 'learning_rate': 0.10340654809271028, 'min_child_samples': 45, 'max_depth': 4, 'feature_fraction': 0.8592227279999444, 'bagging_fraction': 0.762974008758165, 'bagging_freq': 6, 'lambda_l1': 2.91068730888814, 'lambda_l2': 1.4463650621852808}. Best is trial 1 with value: 986.9845532707903.


Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 986.985
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 883.305


[I 2025-07-21 16:30:02,488] Trial 2 finished with value: 883.3045046437658 and parameters: {'num_leaves': 92, 'learning_rate': 0.06708765675330731, 'min_child_samples': 61, 'max_depth': 9, 'feature_fraction': 0.8170264850759146, 'bagging_fraction': 0.6756738960023683, 'bagging_freq': 7, 'lambda_l1': 9.486909827397268, 'lambda_l2': 5.5287984322888315}. Best is trial 2 with value: 883.3045046437658.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 1016.02


[I 2025-07-21 16:30:05,091] Trial 3 finished with value: 1016.0192864453284 and parameters: {'num_leaves': 91, 'learning_rate': 0.26813933282100016, 'min_child_samples': 11, 'max_depth': 3, 'feature_fraction': 0.8365978644441383, 'bagging_fraction': 0.9226006746074484, 'bagging_freq': 10, 'lambda_l1': 8.055018662292982, 'lambda_l2': 9.740134908479998}. Best is trial 2 with value: 883.3045046437658.


Training until validation scores don't improve for 50 rounds


[I 2025-07-21 16:30:07,829] Trial 4 finished with value: 1061.8549698279633 and parameters: {'num_leaves': 45, 'learning_rate': 0.19151584960920676, 'min_child_samples': 58, 'max_depth': 3, 'feature_fraction': 0.8227018856378381, 'bagging_fraction': 0.8318715341530971, 'bagging_freq': 7, 'lambda_l1': 5.4199923023887635, 'lambda_l2': 4.077196665029198}. Best is trial 2 with value: 883.3045046437658.


Did not meet early stopping. Best iteration is:
[999]	valid_0's rmse: 1061.85
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 16:30:09,614] Trial 5 finished with value: 872.6736882701654 and parameters: {'num_leaves': 99, 'learning_rate': 0.2250002301820668, 'min_child_samples': 12, 'max_depth': 7, 'feature_fraction': 0.8704870516168004, 'bagging_fraction': 0.72305984223737, 'bagging_freq': 5, 'lambda_l1': 3.030825858618625, 'lambda_l2': 7.840846213362869}. Best is trial 5 with value: 872.6736882701654.


Early stopping, best iteration is:
[255]	valid_0's rmse: 872.674
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 16:30:11,335] Trial 6 finished with value: 887.2673956079824 and parameters: {'num_leaves': 96, 'learning_rate': 0.2285390464797929, 'min_child_samples': 20, 'max_depth': 6, 'feature_fraction': 0.671320452611806, 'bagging_fraction': 0.781384656099416, 'bagging_freq': 6, 'lambda_l1': 5.744853277315112, 'lambda_l2': 5.633921202053997}. Best is trial 5 with value: 872.6736882701654.


Early stopping, best iteration is:
[428]	valid_0's rmse: 887.267
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[559]	valid_0's rmse: 849.924


[I 2025-07-21 16:30:14,340] Trial 7 finished with value: 849.9239337310938 and parameters: {'num_leaves': 97, 'learning_rate': 0.2700431468948863, 'min_child_samples': 66, 'max_depth': 8, 'feature_fraction': 0.8726663897408324, 'bagging_fraction': 0.9751464814983707, 'bagging_freq': 2, 'lambda_l1': 4.527523144410473, 'lambda_l2': 1.1732376491675622}. Best is trial 7 with value: 849.9239337310938.


Training until validation scores don't improve for 50 rounds


[I 2025-07-21 16:30:15,154] Trial 8 finished with value: 865.5360891090161 and parameters: {'num_leaves': 64, 'learning_rate': 0.2777567391146027, 'min_child_samples': 22, 'max_depth': 11, 'feature_fraction': 0.8456104376877219, 'bagging_fraction': 0.7170692961157178, 'bagging_freq': 5, 'lambda_l1': 4.597693718144996, 'lambda_l2': 2.017103445367485}. Best is trial 7 with value: 849.9239337310938.


Early stopping, best iteration is:
[152]	valid_0's rmse: 865.536
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[990]	valid_0's rmse: 911.66


[I 2025-07-21 16:30:17,759] Trial 9 finished with value: 911.6603967142736 and parameters: {'num_leaves': 70, 'learning_rate': 0.2294466376701668, 'min_child_samples': 79, 'max_depth': 5, 'feature_fraction': 0.8390790025653965, 'bagging_fraction': 0.7951619418580643, 'bagging_freq': 3, 'lambda_l1': 7.627433338920429, 'lambda_l2': 2.4600097480979866}. Best is trial 7 with value: 849.9239337310938.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[992]	valid_0's rmse: 843.733


[I 2025-07-21 16:30:20,996] Trial 10 finished with value: 843.7325197402778 and parameters: {'num_leaves': 77, 'learning_rate': 0.14761636694149424, 'min_child_samples': 96, 'max_depth': 9, 'feature_fraction': 0.9784152090339887, 'bagging_fraction': 0.5168837297468503, 'bagging_freq': 0, 'lambda_l1': 1.0521333189269777, 'lambda_l2': 0.05255792852259811}. Best is trial 10 with value: 843.7325197402778.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[997]	valid_0's rmse: 845.798


[I 2025-07-21 16:30:24,146] Trial 11 finished with value: 845.7977511431845 and parameters: {'num_leaves': 79, 'learning_rate': 0.13514229051533172, 'min_child_samples': 98, 'max_depth': 9, 'feature_fraction': 0.9908417564396179, 'bagging_fraction': 0.5401519470784061, 'bagging_freq': 0, 'lambda_l1': 0.20035219614524635, 'lambda_l2': 0.1402739219435013}. Best is trial 10 with value: 843.7325197402778.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's rmse: 834.958


[I 2025-07-21 16:30:27,554] Trial 12 finished with value: 834.9583001319297 and parameters: {'num_leaves': 76, 'learning_rate': 0.1377676308513362, 'min_child_samples': 99, 'max_depth': 10, 'feature_fraction': 0.9960691850925317, 'bagging_fraction': 0.5366211813605977, 'bagging_freq': 0, 'lambda_l1': 0.21666213809758328, 'lambda_l2': 0.04674400508752875}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[981]	valid_0's rmse: 836.606


[I 2025-07-21 16:30:30,777] Trial 13 finished with value: 836.6058296574741 and parameters: {'num_leaves': 54, 'learning_rate': 0.153916016626792, 'min_child_samples': 100, 'max_depth': 12, 'feature_fraction': 0.9906520291202959, 'bagging_fraction': 0.5084322660332472, 'bagging_freq': 0, 'lambda_l1': 0.13248899396441055, 'lambda_l2': 0.19371706437058556}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[995]	valid_0's rmse: 888.68


[I 2025-07-21 16:30:34,065] Trial 14 finished with value: 888.6799013160472 and parameters: {'num_leaves': 50, 'learning_rate': 0.10419753910525056, 'min_child_samples': 83, 'max_depth': 12, 'feature_fraction': 0.9515376384434174, 'bagging_fraction': 0.5938907496736168, 'bagging_freq': 1, 'lambda_l1': 1.5748449223392444, 'lambda_l2': 3.5319732791883247}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds


[I 2025-07-21 16:30:36,241] Trial 15 finished with value: 905.1969494296618 and parameters: {'num_leaves': 53, 'learning_rate': 0.17727184878878702, 'min_child_samples': 85, 'max_depth': 11, 'feature_fraction': 0.7178235498858005, 'bagging_fraction': 0.5896255107898756, 'bagging_freq': 2, 'lambda_l1': 1.470105715723864, 'lambda_l2': 3.2106706637134117}. Best is trial 12 with value: 834.9583001319297.


Early stopping, best iteration is:
[477]	valid_0's rmse: 905.197
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[997]	valid_0's rmse: 884.818


[I 2025-07-21 16:30:38,056] Trial 16 finished with value: 884.8180974805156 and parameters: {'num_leaves': 20, 'learning_rate': 0.09834470953905684, 'min_child_samples': 43, 'max_depth': 12, 'feature_fraction': 0.9208178665007999, 'bagging_fraction': 0.5120471887779736, 'bagging_freq': 0, 'lambda_l1': 0.10875424411777146, 'lambda_l2': 7.025560045066319}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[761]	valid_0's rmse: 909.491


[I 2025-07-21 16:30:41,564] Trial 17 finished with value: 909.491463113254 and parameters: {'num_leaves': 62, 'learning_rate': 0.17391853807046787, 'min_child_samples': 100, 'max_depth': 10, 'feature_fraction': 0.5497103834680752, 'bagging_fraction': 0.5922419301027595, 'bagging_freq': 2, 'lambda_l1': 2.135572348463498, 'lambda_l2': 0.42799837843694405}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[459]	valid_0's rmse: 896.73


[I 2025-07-21 16:30:44,219] Trial 18 finished with value: 896.7303809846226 and parameters: {'num_leaves': 82, 'learning_rate': 0.12997749029381095, 'min_child_samples': 88, 'max_depth': 11, 'feature_fraction': 0.9293123542308573, 'bagging_fraction': 0.6462699443492947, 'bagging_freq': 10, 'lambda_l1': 0.15581607819734603, 'lambda_l2': 1.3706450672478376}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's rmse: 917.695


[I 2025-07-21 16:30:48,293] Trial 19 finished with value: 917.6947222594863 and parameters: {'num_leaves': 68, 'learning_rate': 0.035193698127551146, 'min_child_samples': 74, 'max_depth': 10, 'feature_fraction': 0.7602679141638277, 'bagging_fraction': 0.5572001431463164, 'bagging_freq': 1, 'lambda_l1': 3.71642214207614, 'lambda_l2': 4.406125781309851}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[874]	valid_0's rmse: 919.756


[I 2025-07-21 16:30:52,350] Trial 20 finished with value: 919.7564194469717 and parameters: {'num_leaves': 55, 'learning_rate': 0.07493002226659283, 'min_child_samples': 91, 'max_depth': 12, 'feature_fraction': 0.99463203008729, 'bagging_fraction': 0.5016143471680502, 'bagging_freq': 3, 'lambda_l1': 7.022780801463618, 'lambda_l2': 2.5987816015742906}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[992]	valid_0's rmse: 846.518


[I 2025-07-21 16:30:55,797] Trial 21 finished with value: 846.5175250249788 and parameters: {'num_leaves': 77, 'learning_rate': 0.14805325164306885, 'min_child_samples': 100, 'max_depth': 9, 'feature_fraction': 0.9494599063465817, 'bagging_fraction': 0.5526454415112105, 'bagging_freq': 0, 'lambda_l1': 1.1492408377091643, 'lambda_l2': 0.2332589950118536}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[760]	valid_0's rmse: 903.278


[I 2025-07-21 16:30:59,488] Trial 22 finished with value: 903.2783477134345 and parameters: {'num_leaves': 85, 'learning_rate': 0.16107170860746062, 'min_child_samples': 92, 'max_depth': 10, 'feature_fraction': 0.9982393791472124, 'bagging_fraction': 0.5091168912426699, 'bagging_freq': 1, 'lambda_l1': 1.0041004058229783, 'lambda_l2': 0.9101622670475843}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 856.88


[I 2025-07-21 16:31:02,473] Trial 23 finished with value: 856.8795897293458 and parameters: {'num_leaves': 72, 'learning_rate': 0.13211207711304107, 'min_child_samples': 92, 'max_depth': 8, 'feature_fraction': 0.9039378522843016, 'bagging_fraction': 0.6236214449044006, 'bagging_freq': 0, 'lambda_l1': 2.2325005802342424, 'lambda_l2': 0.0234968844333383}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[752]	valid_0's rmse: 905.861


[I 2025-07-21 16:31:04,959] Trial 24 finished with value: 905.861483960984 and parameters: {'num_leaves': 43, 'learning_rate': 0.19873244271790202, 'min_child_samples': 77, 'max_depth': 7, 'feature_fraction': 0.9622966445037271, 'bagging_fraction': 0.5599141812319001, 'bagging_freq': 1, 'lambda_l1': 0.6569511554129531, 'lambda_l2': 1.779386753036658}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[983]	valid_0's rmse: 863.843


[I 2025-07-21 16:31:09,198] Trial 25 finished with value: 863.8425355930349 and parameters: {'num_leaves': 59, 'learning_rate': 0.11387290865315036, 'min_child_samples': 94, 'max_depth': 11, 'feature_fraction': 0.907449228848076, 'bagging_fraction': 0.8604114514311344, 'bagging_freq': 3, 'lambda_l1': 2.088929905921214, 'lambda_l2': 0.8385817851802992}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[969]	valid_0's rmse: 838.703


[I 2025-07-21 16:31:12,741] Trial 26 finished with value: 838.7025431418322 and parameters: {'num_leaves': 75, 'learning_rate': 0.20788008486403148, 'min_child_samples': 84, 'max_depth': 10, 'feature_fraction': 0.753700652237636, 'bagging_fraction': 0.6894835285393359, 'bagging_freq': 0, 'lambda_l1': 0.8531987407066899, 'lambda_l2': 2.937813244126048}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds


[I 2025-07-21 16:31:14,825] Trial 27 finished with value: 893.7028039059406 and parameters: {'num_leaves': 73, 'learning_rate': 0.24339669582174797, 'min_child_samples': 84, 'max_depth': 10, 'feature_fraction': 0.6553473230737185, 'bagging_fraction': 0.6913140478283806, 'bagging_freq': 2, 'lambda_l1': 1.8666550606215953, 'lambda_l2': 2.9121741567353006}. Best is trial 12 with value: 834.9583001319297.


Early stopping, best iteration is:
[382]	valid_0's rmse: 893.703
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 16:31:16,447] Trial 28 finished with value: 882.0656633448119 and parameters: {'num_leaves': 87, 'learning_rate': 0.20227401549138385, 'min_child_samples': 47, 'max_depth': 12, 'feature_fraction': 0.7857950661506948, 'bagging_fraction': 0.6288473721346775, 'bagging_freq': 1, 'lambda_l1': 0.6282859944551134, 'lambda_l2': 2.0679128455694378}. Best is trial 12 with value: 834.9583001319297.


Early stopping, best iteration is:
[290]	valid_0's rmse: 882.066
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[825]	valid_0's rmse: 890.284


[I 2025-07-21 16:31:19,484] Trial 29 finished with value: 890.2841178426639 and parameters: {'num_leaves': 37, 'learning_rate': 0.17053924122460348, 'min_child_samples': 70, 'max_depth': 11, 'feature_fraction': 0.600761113961642, 'bagging_fraction': 0.6767817831886709, 'bagging_freq': 4, 'lambda_l1': 3.385148036645753, 'lambda_l2': 8.596197555236982}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[950]	valid_0's rmse: 891.477


[I 2025-07-21 16:31:23,609] Trial 30 finished with value: 891.4769289086919 and parameters: {'num_leaves': 64, 'learning_rate': 0.08149939527435322, 'min_child_samples': 37, 'max_depth': 10, 'feature_fraction': 0.5003869789243991, 'bagging_fraction': 0.6051854047047247, 'bagging_freq': 4, 'lambda_l1': 2.6675812681379316, 'lambda_l2': 6.64065457199238}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[993]	valid_0's rmse: 841.366


[I 2025-07-21 16:31:26,986] Trial 31 finished with value: 841.3657465996974 and parameters: {'num_leaves': 74, 'learning_rate': 0.15008313388321154, 'min_child_samples': 95, 'max_depth': 9, 'feature_fraction': 0.9689391271264607, 'bagging_fraction': 0.5385390417807757, 'bagging_freq': 0, 'lambda_l1': 1.0787971298338712, 'lambda_l2': 0.78826681445817}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[998]	valid_0's rmse: 849.077


[I 2025-07-21 16:31:29,930] Trial 32 finished with value: 849.077070742844 and parameters: {'num_leaves': 58, 'learning_rate': 0.19001310743878458, 'min_child_samples': 87, 'max_depth': 8, 'feature_fraction': 0.8875285164252916, 'bagging_fraction': 0.5650915748141196, 'bagging_freq': 0, 'lambda_l1': 0.08471061226803364, 'lambda_l2': 0.8536205253726588}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[912]	valid_0's rmse: 902.733


[I 2025-07-21 16:31:33,735] Trial 33 finished with value: 902.7334089937312 and parameters: {'num_leaves': 68, 'learning_rate': 0.12198000719958227, 'min_child_samples': 80, 'max_depth': 9, 'feature_fraction': 0.9347855703960121, 'bagging_fraction': 0.5373522622842614, 'bagging_freq': 1, 'lambda_l1': 0.7668869127553306, 'lambda_l2': 1.543760496334583}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[645]	valid_0's rmse: 891.718


[I 2025-07-21 16:31:36,941] Trial 34 finished with value: 891.7183940505017 and parameters: {'num_leaves': 74, 'learning_rate': 0.1492061145701436, 'min_child_samples': 72, 'max_depth': 10, 'feature_fraction': 0.6870574267482314, 'bagging_fraction': 0.6484980772238169, 'bagging_freq': 9, 'lambda_l1': 1.4043355216265128, 'lambda_l2': 0.6077102098770482}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[798]	valid_0's rmse: 837.184


[I 2025-07-21 16:31:40,079] Trial 35 finished with value: 837.1843049957104 and parameters: {'num_leaves': 85, 'learning_rate': 0.2952791363820524, 'min_child_samples': 96, 'max_depth': 9, 'feature_fraction': 0.9506992794045015, 'bagging_fraction': 0.8737862195974554, 'bagging_freq': 0, 'lambda_l1': 4.0560039257891445, 'lambda_l2': 4.848203371453419}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[670]	valid_0's rmse: 879.895


[I 2025-07-21 16:31:43,274] Trial 36 finished with value: 879.8952060107067 and parameters: {'num_leaves': 90, 'learning_rate': 0.293177172177121, 'min_child_samples': 100, 'max_depth': 7, 'feature_fraction': 0.7963504602999512, 'bagging_fraction': 0.886876708383117, 'bagging_freq': 2, 'lambda_l1': 3.9090053762344805, 'lambda_l2': 5.176268635294142}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[415]	valid_0's rmse: 853.307


[I 2025-07-21 16:31:45,115] Trial 37 finished with value: 853.3071632467036 and parameters: {'num_leaves': 83, 'learning_rate': 0.2527412178621055, 'min_child_samples': 64, 'max_depth': 11, 'feature_fraction': 0.6142918618566882, 'bagging_fraction': 0.9801828025145498, 'bagging_freq': 1, 'lambda_l1': 9.853981230454727, 'lambda_l2': 3.8968676396520077}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[556]	valid_0's rmse: 893.063


[I 2025-07-21 16:31:47,803] Trial 38 finished with value: 893.0625799230655 and parameters: {'num_leaves': 80, 'learning_rate': 0.20665752326988848, 'min_child_samples': 88, 'max_depth': 8, 'feature_fraction': 0.7140274781472646, 'bagging_fraction': 0.7495791819366333, 'bagging_freq': 9, 'lambda_l1': 6.355864252294252, 'lambda_l2': 6.339312732724121}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 859.928


[I 2025-07-21 16:31:53,475] Trial 39 finished with value: 859.9279997202201 and parameters: {'num_leaves': 94, 'learning_rate': 0.05511960723581756, 'min_child_samples': 90, 'max_depth': 12, 'feature_fraction': 0.8868809647375078, 'bagging_fraction': 0.93972671004717, 'bagging_freq': 7, 'lambda_l1': 2.6142951835767896, 'lambda_l2': 3.873426358641595}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[993]	valid_0's rmse: 952.389


[I 2025-07-21 16:31:55,805] Trial 40 finished with value: 952.3891107458228 and parameters: {'num_leaves': 88, 'learning_rate': 0.24999779071737951, 'min_child_samples': 80, 'max_depth': 4, 'feature_fraction': 0.8096689084384427, 'bagging_fraction': 0.8229257391326145, 'bagging_freq': 3, 'lambda_l1': 5.318568652615222, 'lambda_l2': 4.8138669198785085}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[694]	valid_0's rmse: 840.706


[I 2025-07-21 16:31:58,330] Trial 41 finished with value: 840.7056493137875 and parameters: {'num_leaves': 66, 'learning_rate': 0.29942107441899857, 'min_child_samples': 95, 'max_depth': 9, 'feature_fraction': 0.9574460183647339, 'bagging_fraction': 0.9128530770877628, 'bagging_freq': 0, 'lambda_l1': 9.15311280784517, 'lambda_l2': 2.2428299688013418}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[990]	valid_0's rmse: 839.613


[I 2025-07-21 16:32:01,150] Trial 42 finished with value: 839.6134481321652 and parameters: {'num_leaves': 48, 'learning_rate': 0.28775907841293363, 'min_child_samples': 96, 'max_depth': 9, 'feature_fraction': 0.9686526638535972, 'bagging_fraction': 0.9033988833990396, 'bagging_freq': 0, 'lambda_l1': 8.971815904195722, 'lambda_l2': 2.441700386219417}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds


[I 2025-07-21 16:32:02,937] Trial 43 finished with value: 868.1136168072462 and parameters: {'num_leaves': 47, 'learning_rate': 0.2871313331837105, 'min_child_samples': 58, 'max_depth': 8, 'feature_fraction': 0.9765047169168777, 'bagging_fraction': 0.8361877233315403, 'bagging_freq': 1, 'lambda_l1': 7.997923922301695, 'lambda_l2': 5.772061387564094}. Best is trial 12 with value: 834.9583001319297.


Early stopping, best iteration is:
[516]	valid_0's rmse: 868.114
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 843.826


[I 2025-07-21 16:32:05,278] Trial 44 finished with value: 843.8255222284591 and parameters: {'num_leaves': 35, 'learning_rate': 0.2657013677387825, 'min_child_samples': 96, 'max_depth': 10, 'feature_fraction': 0.9381185313466215, 'bagging_fraction': 0.9532587855901, 'bagging_freq': 0, 'lambda_l1': 4.452349967512044, 'lambda_l2': 3.111431530356371}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[987]	valid_0's rmse: 886.343


[I 2025-07-21 16:32:08,735] Trial 45 finished with value: 886.3426435372365 and parameters: {'num_leaves': 39, 'learning_rate': 0.2200476472507201, 'min_child_samples': 84, 'max_depth': 6, 'feature_fraction': 0.8510624861905249, 'bagging_fraction': 0.8805038750532734, 'bagging_freq': 2, 'lambda_l1': 6.13547634930838, 'lambda_l2': 4.504501003262311}. Best is trial 12 with value: 834.9583001319297.


Training until validation scores don't improve for 50 rounds


[I 2025-07-21 16:32:10,514] Trial 46 finished with value: 822.8148046648784 and parameters: {'num_leaves': 51, 'learning_rate': 0.2763464521237804, 'min_child_samples': 27, 'max_depth': 9, 'feature_fraction': 0.9128801024106324, 'bagging_fraction': 0.8016549723459934, 'bagging_freq': 0, 'lambda_l1': 8.407846944542449, 'lambda_l2': 1.4198530959205615}. Best is trial 46 with value: 822.8148046648784.


Early stopping, best iteration is:
[633]	valid_0's rmse: 822.815
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 16:32:11,344] Trial 47 finished with value: 865.1518020366356 and parameters: {'num_leaves': 53, 'learning_rate': 0.2693755764532454, 'min_child_samples': 30, 'max_depth': 11, 'feature_fraction': 0.9062487728543184, 'bagging_fraction': 0.7478655824674444, 'bagging_freq': 1, 'lambda_l1': 8.565886440116314, 'lambda_l2': 1.2749130583025474}. Best is trial 46 with value: 822.8148046648784.


Early stopping, best iteration is:
[168]	valid_0's rmse: 865.152
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[421]	valid_0's rmse: 831.354


[I 2025-07-21 16:32:13,304] Trial 48 finished with value: 831.3539034195618 and parameters: {'num_leaves': 100, 'learning_rate': 0.25858111725450494, 'min_child_samples': 20, 'max_depth': 8, 'feature_fraction': 0.8746283263852219, 'bagging_fraction': 0.8088279975979993, 'bagging_freq': 0, 'lambda_l1': 7.210962490893447, 'lambda_l2': 1.3827501169804381}. Best is trial 46 with value: 822.8148046648784.


Training until validation scores don't improve for 50 rounds


[I 2025-07-21 16:32:14,872] Trial 49 finished with value: 878.7385451551069 and parameters: {'num_leaves': 97, 'learning_rate': 0.23694845609382892, 'min_child_samples': 8, 'max_depth': 6, 'feature_fraction': 0.8805695053308413, 'bagging_fraction': 0.7921449989646442, 'bagging_freq': 6, 'lambda_l1': 7.124253594403106, 'lambda_l2': 1.7109766946726916}. Best is trial 46 with value: 822.8148046648784.


Early stopping, best iteration is:
[352]	valid_0's rmse: 878.739
Best parameters: {'num_leaves': 51, 'learning_rate': 0.2763464521237804, 'min_child_samples': 27, 'max_depth': 9, 'feature_fraction': 0.9128801024106324, 'bagging_fraction': 0.8016549723459934, 'bagging_freq': 0, 'lambda_l1': 8.407846944542449, 'lambda_l2': 1.4198530959205615}


In [16]:
# Train final model with all data and best parameters
lgb_model_p01 = lgb.train(best_params,
                       lgb.Dataset(pd.concat([X_train_p01, X_test_p01]), 
                                 label=pd.concat([y_train_p01, y_test_p01])),
                       num_boost_round=1000)

y_pred_01 = lgb_model_p01.predict(X_test_p01)

# Evaluate
rmse_p01 = np.sqrt(mean_squared_error(y_test_p01, y_pred_01 ))
r2_p01 = r2_score(y_test_p01, y_pred_01 )

print(f"RMSE: {rmse_p01 :.2f}")
print(f"R²: {r2_p01 :.4f}")

RMSE: 606.44
R²: 0.8989


In [17]:
y_pred_train_p01 = pd.DataFrame(y_pred_01, columns=['Predicted_Passengers']).astype(int)

train_capacity = 600  
y_pred_train_p01['Predicted_Trains'] = np.ceil(y_pred_train_p01['Predicted_Passengers'] / train_capacity).astype(int)

# Add a Safety/Comfort Buffer (e.g., 5% more capacity)
# This allocates slightly more trains than the absolute minimum required.
safety_buffer_percentage = 0.05 
y_pred_train_p01['Buffered_Predicted_Trains'] = np.ceil(
    y_pred_train_p01['Predicted_Passengers'] * (1 + safety_buffer_percentage) / train_capacity).astype(int)

# 4. Ensure a Minimum Number of Trains (e.g., at least 1 train, even for 0 passengers)
minimum_trains_required = 1
y_pred_train_p01['Final_Predicted_Trains'] = y_pred_train_p01['Buffered_Predicted_Trains'].apply(
    lambda x: max(x, minimum_trains_required))

y_pred_train_p01.sample(10)

Unnamed: 0,Predicted_Passengers,Predicted_Trains,Buffered_Predicted_Trains,Final_Predicted_Trains
8469,334,1,1,1
6406,447,1,1,1
1580,182,1,1,1
2964,308,1,1,1
4557,2381,4,5,5
4535,7164,12,13,13
9722,444,1,1,1
8172,233,1,1,1
9912,931,2,2,2
6276,4512,8,8,8


In [18]:
df.sample(5)

Unnamed: 0,Month_April,Month_August,Month_December,Month_February,Month_January,Month_July,Month_June,Month_March,Month_May,Month_November,...,Week Number,Ridership,N_trains,Covid19,Month_Num,day_x,day_y,week_x,week_y,COVID_Workday
32472,0,0,1,0,0,0,0,0,0,0,...,52,44,1,1,12,-0.250653,-0.968077,0.972023,-0.234886,1
28009,0,0,0,0,0,0,0,0,0,0,...,37,1322,7,1,9,-0.050649,0.998717,-0.430065,-0.902798,1
57184,0,0,0,0,0,1,0,0,0,0,...,30,3370,9,0,7,0.688967,-0.724793,-0.956401,-0.292057,0
53697,0,0,0,0,0,0,0,0,1,0,...,19,583,7,0,5,-0.440394,0.897805,-0.533823,0.845596,0
39569,0,0,0,0,0,1,0,0,0,0,...,26,9,1,1,7,0.97953,0.201299,-0.984231,0.17689,1


# Problem Two:
 Assume we need to forecast the number of passengers for different time periods in order to allocate the appropriate number of trains one week in advance. (In this case, we are allowed to use information from previous time periods, but using the year and number of trains columns is still not permitted.)

In [36]:
def preprocess_and_split_data_p02(df, target_column='Ridership', train_size=0.8):
    # Convert Year, Month_Num, and Day to a datetime object
    df['Date'] = pd.to_datetime(
        df['Year'].astype(str) + '-' +
        df['Month_Num'].astype(str) + '-' +
        df['Day'].astype(str),
        errors='coerce'  # Handle potential invalid dates by setting them to NaT
    )

    # Drop rows with invalid dates
    df.dropna(subset=['Date'], inplace=True)

    # Sort by Date
    df.sort_values(by=['Date'], inplace=True)

    # Reset index
    df.reset_index(drop=True, inplace=True)

    # Define features (X) and target (y)
    X = df.drop(columns=[target_column, 'Year', 'N_trains', 'Date'])
    y = df[target_column]

    # Calculate split point
    split_point = int(len(df) * train_size)

    # Split the data into training and testing sets
    X_train = X.iloc[:split_point]
    y_train = y.iloc[:split_point]
    
    X_test = X.iloc[split_point:]
    y_test = y.iloc[split_point:]

    # Print shapes of the resulting datasets
    print(f"\nTrain set features (X_train) shape: {X_train.shape}")
    print(f"Test set features (X_test) shape: {X_test.shape}")
    print(f"Train set target (y_train) shape: {y_train.shape}")
    print(f"Test set target (y_test) shape: {y_test.shape}")

    return X_train, X_test, y_train, y_test

In [37]:
X_train_p02, X_test_p02, y_train_p02, y_test_p02 = preprocess_and_split_data_p02(df)


Train set features (X_train) shape: (51495, 80)
Test set features (X_test) shape: (12874, 80)
Train set target (y_train) shape: (51495,)
Test set target (y_test) shape: (12874,)


# Problem Three:
 Assume we need to forecast the number of passengers for different time periods in order to allocate a more accurate number of trains for the next day. (In this case, we are allowed to use information from previous time periods, but using the year and number of trains columns is still not permitted.)
