In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import optuna

In [2]:
Ridership_df= pd.read_csv('data/Ridership.csv')
df= Ridership_df.copy()
df.sample(5)

Unnamed: 0,Year,Month,Day,Week Number,Corridor,Workday,Station,Period,Ridership,N_trains,Covid19
3646,2019,March,8,10,Corridor_7,y,Station_22,PM Peak,74,2,0
51995,2022,March,30,13,Corridor_7,y,Station_21,AM Peak,778,4,0
42055,2021,September,1,35,Corridor_4,y,Station_8,AM Peak,249,2,1
26843,2020,August,5,32,Corridor_2,y,Station_3,AM Peak,103,3,1
5011,2019,April,8,15,Corridor_3,y,Station_4,PM Peak,965,6,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64369 entries, 0 to 64368
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Year         64369 non-null  int64 
 1   Month        64369 non-null  object
 2   Day          64369 non-null  int64 
 3   Week Number  64369 non-null  int64 
 4   Corridor     64369 non-null  object
 5   Workday      64369 non-null  object
 6   Station      64369 non-null  object
 7   Period       64369 non-null  object
 8   Ridership    64369 non-null  int64 
 9   N_trains     64369 non-null  int64 
 10  Covid19      64369 non-null  int64 
dtypes: int64(6), object(5)
memory usage: 5.4+ MB


In [4]:
df.duplicated().sum()

np.int64(0)

In [5]:
df.isnull().sum()

Year           0
Month          0
Day            0
Week Number    0
Corridor       0
Workday        0
Station        0
Period         0
Ridership      0
N_trains       0
Covid19        0
dtype: int64

In [6]:
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df['Month_Num'] = df['Month'].map(month_mapping)

# Cyclical Day conversion
def convert_day_to_circle(day):
    angle = 2 * np.pi * (day - 1) / 31
    x = np.cos(angle)
    y = np.sin(angle)
    return x, y

df['day_x'], df['day_y'] = zip(*df['Day'].map(convert_day_to_circle))

# Cyclical Week conversion
def convert_week_to_circle(week):
    angle = 2 * np.pi * (week - 1) / 53
    x = np.cos(angle)
    y = np.sin(angle)
    return x, y

df['week_x'], df['week_y'] = zip(*df['Week Number'].map(convert_week_to_circle))


# For Workday, instead of LabelEncoder then OneHotEncoder, let's just let OneHotEncoder handle it directly
categorical_features = ['Month', 'Corridor', 'Workday', 'Station', 'Period']

numerical_features_to_pass_through = [
    'Year', 'N_trains', 'Covid19', 'Ridership',
    'Month_Num', 'day_x', 'day_y', 'week_x', 'week_y']



preprocessor = ColumnTransformer(
    transformers=[
        ('', OneHotEncoder(), categorical_features),
    ],remainder='passthrough')


df = preprocessor.fit_transform(df)
df = df.toarray()
feature_names_out = list(preprocessor.get_feature_names_out())
df = pd.DataFrame(df, columns=[item.split('__')[1] for item in feature_names_out])

df['COVID_Workday'] = df['Covid19'] * df['Workday_y']

In [7]:
columns_to_convert = [
    'Month_April', 'Month_August', 'Month_December', 'Month_February', 
    'Month_January', 'Month_July', 'Month_June', 'Month_March', 
    'Month_May', 'Month_November', 'Month_October', 'Month_September',
    'Corridor_Corridor_1', 'Corridor_Corridor_2', 'Corridor_Corridor_3', 
    'Corridor_Corridor_4', 'Corridor_Corridor_5', 'Corridor_Corridor_6', 'Corridor_Corridor_7',
    'Station_Station_1', 'Station_Station_10', 'Station_Station_11', 'Station_Station_12', 
    'Station_Station_13', 'Station_Station_14', 'Station_Station_15', 'Station_Station_16', 
    'Station_Station_17', 'Station_Station_18', 'Station_Station_19', 'Station_Station_2', 
    'Station_Station_20', 'Station_Station_21', 'Station_Station_22', 'Station_Station_23', 
    'Station_Station_24', 'Station_Station_25', 'Station_Station_26', 'Station_Station_27', 
    'Station_Station_28', 'Station_Station_29', 'Station_Station_3', 'Station_Station_30', 
    'Station_Station_31', 'Station_Station_32', 'Station_Station_33', 'Station_Station_34', 
    'Station_Station_35', 'Station_Station_36', 'Station_Station_37', 'Station_Station_38', 
    'Station_Station_39', 'Station_Station_4', 'Station_Station_40', 'Station_Station_41', 
    'Station_Station_42', 'Station_Station_43', 'Station_Station_44', 'Station_Station_45', 
    'Station_Station_5', 'Station_Station_6', 'Station_Station_7', 'Station_Station_8', 'Station_Station_9',
    'Period_AM Peak', 'Period_Evening', 'Period_Midday', 'Period_PM Peak', 'Period_Weekend/Holiday',
    'Day',
    'Week Number',
    'Workday_n', 'Workday_y',
    'Covid19','COVID_Workday',
    'Month_Num']

columns_to_convert_02=['Year', 'Ridership', 'N_trains']


df[columns_to_convert] = df[columns_to_convert].astype('uint8')
df[columns_to_convert_02] = df[columns_to_convert_02].astype('uint16')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64369 entries, 0 to 64368
Data columns (total 83 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Month_April             64369 non-null  uint8  
 1   Month_August            64369 non-null  uint8  
 2   Month_December          64369 non-null  uint8  
 3   Month_February          64369 non-null  uint8  
 4   Month_January           64369 non-null  uint8  
 5   Month_July              64369 non-null  uint8  
 6   Month_June              64369 non-null  uint8  
 7   Month_March             64369 non-null  uint8  
 8   Month_May               64369 non-null  uint8  
 9   Month_November          64369 non-null  uint8  
 10  Month_October           64369 non-null  uint8  
 11  Month_September         64369 non-null  uint8  
 12  Corridor_Corridor_1     64369 non-null  uint8  
 13  Corridor_Corridor_2     64369 non-null  uint8  
 14  Corridor_Corridor_3     64369 non-null

# spliting

In [9]:
X = df.drop(['Ridership', 'Year', 'N_trains'], axis=1)  
y = pd.DataFrame(df['Ridership'])

df_for_stratify = Ridership_df.copy()
stratify_key = df_for_stratify['Covid19'].astype(str) + '_' + \
               df_for_stratify['Workday'] + '_' + \
               df_for_stratify['Period'] + '_' + \
               df_for_stratify['Corridor']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=stratify_key 
)

## Problem One: 
Assume we want to find a general model for the number of passengers without looking at the number of passengers on previous days and only using the recorded information in the table for the same time period (except for the year and number of trains) to predict the number of required trains based on the number of passengers.

In [26]:
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Define parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'verbose': -1
}

# Train the model
model_p01 = lgb.train(params,
                 train_data,
                 num_boost_round=1000,
                 valid_sets=[test_data],
                 callbacks=[lgb.early_stopping(stopping_rounds=50)])

# Make predictions
y_pred = model_p01.predict(X_test)

# Evaluate
rmse_f_p01 = np.sqrt(mean_squared_error(y_test, y_pred))
r2_f_p01= r2_score(y_test, y_pred)

print(f"RMSE: {rmse_f_p01:.2f}")
print(f"R²: {r2_f_p01:.4f}")

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 825.099
RMSE: 825.10
R²: 0.8129


In [10]:

def objective(trial):
    # Define parameters
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 10),
        'lambda_l1': trial.suggest_float('lambda_l1', 0, 10),
        'lambda_l2': trial.suggest_float('lambda_l2', 0, 10),
        'verbose': -1,
        'feature_pre_filter': False  # Disable feature pre-filtering
    }

    # Convert data to LightGBM Dataset format inside the objective function
    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

    # Train the model
    model = lgb.train(params,
                      train_data,
                      num_boost_round=1000,
                      valid_sets=[test_data],
                      callbacks=[lgb.early_stopping(stopping_rounds=50)])
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Return RMSE as the objective value to minimize
    return np.sqrt(mean_squared_error(y_test, y_pred))

# Create a study and optimize
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Best parameters
best_params = study.best_params
print("Best parameters:", best_params)


[I 2025-07-21 02:56:42,652] A new study created in memory with name: no-name-72727980-4bb5-4e57-b266-a3b90fab66a9


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[655]	valid_0's rmse: 983.857


[I 2025-07-21 02:56:44,671] Trial 0 finished with value: 983.8571349781687 and parameters: {'num_leaves': 58, 'learning_rate': 0.2661990120612658, 'min_child_samples': 56, 'max_depth': 4, 'feature_fraction': 0.8360333006471041, 'bagging_fraction': 0.5103394541520616, 'bagging_freq': 6, 'lambda_l1': 5.591061258626268, 'lambda_l2': 8.857246617297712}. Best is trial 0 with value: 983.8571349781687.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 1043.23


[I 2025-07-21 02:56:46,896] Trial 1 finished with value: 1043.2277685454872 and parameters: {'num_leaves': 81, 'learning_rate': 0.057583280963296644, 'min_child_samples': 99, 'max_depth': 4, 'feature_fraction': 0.5713756717175975, 'bagging_fraction': 0.7716852580905575, 'bagging_freq': 1, 'lambda_l1': 3.341962948158439, 'lambda_l2': 7.293504490264829}. Best is trial 0 with value: 983.8571349781687.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 943.184


[I 2025-07-21 02:56:53,562] Trial 2 finished with value: 943.1838133867976 and parameters: {'num_leaves': 89, 'learning_rate': 0.01046816636972897, 'min_child_samples': 5, 'max_depth': 8, 'feature_fraction': 0.716688169116225, 'bagging_fraction': 0.9766701903619178, 'bagging_freq': 4, 'lambda_l1': 9.24931866887565, 'lambda_l2': 3.9613382107264385}. Best is trial 2 with value: 943.1838133867976.


Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:56:56,138] Trial 3 finished with value: 1102.8935294787166 and parameters: {'num_leaves': 74, 'learning_rate': 0.08273220149772832, 'min_child_samples': 78, 'max_depth': 3, 'feature_fraction': 0.7374072803763847, 'bagging_fraction': 0.7783778174692266, 'bagging_freq': 2, 'lambda_l1': 3.636207046397465, 'lambda_l2': 1.2843664607580685}. Best is trial 2 with value: 943.1838133867976.


Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 1102.89
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:56:57,751] Trial 4 finished with value: 861.5723877513592 and parameters: {'num_leaves': 60, 'learning_rate': 0.19171172649158505, 'min_child_samples': 14, 'max_depth': 12, 'feature_fraction': 0.7368131190567629, 'bagging_fraction': 0.6996534915511409, 'bagging_freq': 4, 'lambda_l1': 0.18547925646878904, 'lambda_l2': 3.335894479009104}. Best is trial 4 with value: 861.5723877513592.


Early stopping, best iteration is:
[285]	valid_0's rmse: 861.572
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[849]	valid_0's rmse: 903.439


[I 2025-07-21 02:57:00,674] Trial 5 finished with value: 903.4390124848439 and parameters: {'num_leaves': 22, 'learning_rate': 0.16723484317777218, 'min_child_samples': 17, 'max_depth': 5, 'feature_fraction': 0.9431351488966067, 'bagging_fraction': 0.7041540050957, 'bagging_freq': 10, 'lambda_l1': 8.458697669755091, 'lambda_l2': 3.1554937695219367}. Best is trial 4 with value: 861.5723877513592.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 928.94


[I 2025-07-21 02:57:06,894] Trial 6 finished with value: 928.9395072409113 and parameters: {'num_leaves': 97, 'learning_rate': 0.027621224497156212, 'min_child_samples': 79, 'max_depth': 9, 'feature_fraction': 0.8185151275235125, 'bagging_fraction': 0.6399666813569402, 'bagging_freq': 9, 'lambda_l1': 2.679577233159854, 'lambda_l2': 8.434920202790334}. Best is trial 4 with value: 861.5723877513592.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[453]	valid_0's rmse: 906.214


[I 2025-07-21 02:57:10,791] Trial 7 finished with value: 906.2136570835175 and parameters: {'num_leaves': 91, 'learning_rate': 0.20448781442627334, 'min_child_samples': 95, 'max_depth': 12, 'feature_fraction': 0.690661050733, 'bagging_fraction': 0.6130419007563181, 'bagging_freq': 4, 'lambda_l1': 1.7462256116546593, 'lambda_l2': 9.317700642097494}. Best is trial 4 with value: 861.5723877513592.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[729]	valid_0's rmse: 888.623


[I 2025-07-21 02:57:14,434] Trial 8 finished with value: 888.6230802811186 and parameters: {'num_leaves': 41, 'learning_rate': 0.1216954797054653, 'min_child_samples': 65, 'max_depth': 10, 'feature_fraction': 0.9407026751351559, 'bagging_fraction': 0.6156818055516668, 'bagging_freq': 5, 'lambda_l1': 8.508237927046865, 'lambda_l2': 1.4716025788494513}. Best is trial 4 with value: 861.5723877513592.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 862.653


[I 2025-07-21 02:57:19,045] Trial 9 finished with value: 862.6530461021206 and parameters: {'num_leaves': 47, 'learning_rate': 0.03343597149366443, 'min_child_samples': 18, 'max_depth': 10, 'feature_fraction': 0.781905088113543, 'bagging_fraction': 0.7606254550429518, 'bagging_freq': 3, 'lambda_l1': 9.644858819138845, 'lambda_l2': 1.9182278419105159}. Best is trial 4 with value: 861.5723877513592.


Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:57:20,748] Trial 10 finished with value: 849.8905788926279 and parameters: {'num_leaves': 66, 'learning_rate': 0.27401165139173655, 'min_child_samples': 31, 'max_depth': 12, 'feature_fraction': 0.5519726776041285, 'bagging_fraction': 0.9002177049964568, 'bagging_freq': 7, 'lambda_l1': 0.025670283621728324, 'lambda_l2': 5.582735587974376}. Best is trial 10 with value: 849.8905788926279.


Early stopping, best iteration is:
[285]	valid_0's rmse: 849.891
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:57:22,451] Trial 11 finished with value: 873.7752008810859 and parameters: {'num_leaves': 66, 'learning_rate': 0.29365757058749986, 'min_child_samples': 37, 'max_depth': 12, 'feature_fraction': 0.5351122682062671, 'bagging_fraction': 0.9139197657757654, 'bagging_freq': 7, 'lambda_l1': 0.45851284899171607, 'lambda_l2': 6.047881492180237}. Best is trial 10 with value: 849.8905788926279.


Early stopping, best iteration is:
[283]	valid_0's rmse: 873.775
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:57:24,355] Trial 12 finished with value: 858.2687453839886 and parameters: {'num_leaves': 55, 'learning_rate': 0.23266979384818148, 'min_child_samples': 34, 'max_depth': 12, 'feature_fraction': 0.6133566191763793, 'bagging_fraction': 0.855631399929505, 'bagging_freq': 7, 'lambda_l1': 0.34953387442188966, 'lambda_l2': 5.4017147352986195}. Best is trial 10 with value: 849.8905788926279.


Early stopping, best iteration is:
[374]	valid_0's rmse: 858.269
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[635]	valid_0's rmse: 883.762


[I 2025-07-21 02:57:27,348] Trial 13 finished with value: 883.7622730708085 and parameters: {'num_leaves': 47, 'learning_rate': 0.24004193088371797, 'min_child_samples': 34, 'max_depth': 6, 'feature_fraction': 0.6234779018537024, 'bagging_fraction': 0.863046746539553, 'bagging_freq': 8, 'lambda_l1': 5.688356956021298, 'lambda_l2': 5.442143028000854}. Best is trial 10 with value: 849.8905788926279.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[691]	valid_0's rmse: 875.678


[I 2025-07-21 02:57:29,660] Trial 14 finished with value: 875.6779679869738 and parameters: {'num_leaves': 31, 'learning_rate': 0.22944056069603486, 'min_child_samples': 36, 'max_depth': 10, 'feature_fraction': 0.5002620087492152, 'bagging_fraction': 0.8605877555544297, 'bagging_freq': 7, 'lambda_l1': 1.7169619592192809, 'lambda_l2': 6.331388581753094}. Best is trial 10 with value: 849.8905788926279.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[524]	valid_0's rmse: 843.805


[I 2025-07-21 02:57:32,515] Trial 15 finished with value: 843.8048339608916 and parameters: {'num_leaves': 71, 'learning_rate': 0.2892649222872754, 'min_child_samples': 48, 'max_depth': 11, 'feature_fraction': 0.634771214684498, 'bagging_fraction': 0.9734957652444538, 'bagging_freq': 8, 'lambda_l1': 0.08897241071292772, 'lambda_l2': 4.599697279766318}. Best is trial 15 with value: 843.8048339608916.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[787]	valid_0's rmse: 850.137


[I 2025-07-21 02:57:36,659] Trial 16 finished with value: 850.1373464297582 and parameters: {'num_leaves': 70, 'learning_rate': 0.29801972053425463, 'min_child_samples': 50, 'max_depth': 7, 'feature_fraction': 0.6423537343920831, 'bagging_fraction': 0.9986553889709472, 'bagging_freq': 10, 'lambda_l1': 4.664998244939724, 'lambda_l2': 4.3312139140704184}. Best is trial 15 with value: 843.8048339608916.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[453]	valid_0's rmse: 859.28


[I 2025-07-21 02:57:39,457] Trial 17 finished with value: 859.2801249894301 and parameters: {'num_leaves': 80, 'learning_rate': 0.2605252429837147, 'min_child_samples': 48, 'max_depth': 11, 'feature_fraction': 0.5699348051004169, 'bagging_fraction': 0.9223705087692728, 'bagging_freq': 8, 'lambda_l1': 1.4428313771043613, 'lambda_l2': 7.31790780353578}. Best is trial 15 with value: 843.8048339608916.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 854.545


[I 2025-07-21 02:57:42,827] Trial 18 finished with value: 854.5452540774025 and parameters: {'num_leaves': 78, 'learning_rate': 0.13789828707721496, 'min_child_samples': 63, 'max_depth': 8, 'feature_fraction': 0.6539781066141234, 'bagging_fraction': 0.9318120759721166, 'bagging_freq': 0, 'lambda_l1': 6.628045622712361, 'lambda_l2': 2.5839833350017543}. Best is trial 15 with value: 843.8048339608916.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[347]	valid_0's rmse: 864.407


[I 2025-07-21 02:57:45,269] Trial 19 finished with value: 864.4065742261042 and parameters: {'num_leaves': 66, 'learning_rate': 0.2702333748883794, 'min_child_samples': 26, 'max_depth': 11, 'feature_fraction': 0.5610869631763907, 'bagging_fraction': 0.8121511383524684, 'bagging_freq': 6, 'lambda_l1': 2.559544774054152, 'lambda_l2': 7.367046429943851}. Best is trial 15 with value: 843.8048339608916.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[488]	valid_0's rmse: 850.398


[I 2025-07-21 02:57:48,237] Trial 20 finished with value: 850.3982510710329 and parameters: {'num_leaves': 85, 'learning_rate': 0.19583374219662536, 'min_child_samples': 44, 'max_depth': 9, 'feature_fraction': 0.6750187132089422, 'bagging_fraction': 0.9586684517663225, 'bagging_freq': 9, 'lambda_l1': 1.1136234797581726, 'lambda_l2': 4.631537851497487}. Best is trial 15 with value: 843.8048339608916.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[566]	valid_0's rmse: 865.025


[I 2025-07-21 02:57:50,825] Trial 21 finished with value: 865.024893110191 and parameters: {'num_leaves': 70, 'learning_rate': 0.2880593758460317, 'min_child_samples': 54, 'max_depth': 7, 'feature_fraction': 0.6101147364323507, 'bagging_fraction': 0.9926870387805398, 'bagging_freq': 10, 'lambda_l1': 6.887768660987535, 'lambda_l2': 4.38738821511175}. Best is trial 15 with value: 843.8048339608916.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[466]	valid_0's rmse: 871.298


[I 2025-07-21 02:57:53,171] Trial 22 finished with value: 871.2979657836858 and parameters: {'num_leaves': 69, 'learning_rate': 0.2954280258267808, 'min_child_samples': 44, 'max_depth': 7, 'feature_fraction': 0.5068595277928758, 'bagging_fraction': 0.9986012368826664, 'bagging_freq': 9, 'lambda_l1': 4.323880300898197, 'lambda_l2': 3.7566714015460594}. Best is trial 15 with value: 843.8048339608916.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[981]	valid_0's rmse: 882.766


[I 2025-07-21 02:57:56,903] Trial 23 finished with value: 882.7658015520182 and parameters: {'num_leaves': 60, 'learning_rate': 0.2527122317561441, 'min_child_samples': 65, 'max_depth': 6, 'feature_fraction': 0.6477118591713412, 'bagging_fraction': 0.888207351334356, 'bagging_freq': 8, 'lambda_l1': 2.387028208342221, 'lambda_l2': 6.270834110370611}. Best is trial 15 with value: 843.8048339608916.


Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:57:58,473] Trial 24 finished with value: 853.167978618532 and parameters: {'num_leaves': 50, 'learning_rate': 0.2804615540234675, 'min_child_samples': 26, 'max_depth': 11, 'feature_fraction': 0.5871590009574678, 'bagging_fraction': 0.9342930071385114, 'bagging_freq': 10, 'lambda_l1': 4.080164294021999, 'lambda_l2': 4.562614032660076}. Best is trial 15 with value: 843.8048339608916.


Early stopping, best iteration is:
[349]	valid_0's rmse: 853.168
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[448]	valid_0's rmse: 868.993


[I 2025-07-21 02:58:01,103] Trial 25 finished with value: 868.9929369754667 and parameters: {'num_leaves': 75, 'learning_rate': 0.22016245029361314, 'min_child_samples': 56, 'max_depth': 9, 'feature_fraction': 0.6937514639045946, 'bagging_fraction': 0.823555991796831, 'bagging_freq': 6, 'lambda_l1': 0.8642285295951438, 'lambda_l2': 0.3736283805477969}. Best is trial 15 with value: 843.8048339608916.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[878]	valid_0's rmse: 873.661


[I 2025-07-21 02:58:04,445] Trial 26 finished with value: 873.6607218036038 and parameters: {'num_leaves': 66, 'learning_rate': 0.298399778443111, 'min_child_samples': 76, 'max_depth': 6, 'feature_fraction': 0.7782455934530353, 'bagging_fraction': 0.9573468120232611, 'bagging_freq': 9, 'lambda_l1': 0.04880978657202441, 'lambda_l2': 5.125078304264479}. Best is trial 15 with value: 843.8048339608916.


Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:58:05,933] Trial 27 finished with value: 862.7156915217361 and parameters: {'num_leaves': 99, 'learning_rate': 0.25223294494542275, 'min_child_samples': 27, 'max_depth': 11, 'feature_fraction': 0.5432404462435332, 'bagging_fraction': 0.8956670760418394, 'bagging_freq': 8, 'lambda_l1': 5.1444935130362435, 'lambda_l2': 2.840984159019382}. Best is trial 15 with value: 843.8048339608916.


Early stopping, best iteration is:
[215]	valid_0's rmse: 862.716
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[973]	valid_0's rmse: 865.672


[I 2025-07-21 02:58:09,152] Trial 28 finished with value: 865.6717478902982 and parameters: {'num_leaves': 40, 'learning_rate': 0.1042957778317261, 'min_child_samples': 46, 'max_depth': 8, 'feature_fraction': 0.6425043226776426, 'bagging_fraction': 0.9588435525755111, 'bagging_freq': 7, 'lambda_l1': 2.098649247487905, 'lambda_l2': 5.7649396852140455}. Best is trial 15 with value: 843.8048339608916.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's rmse: 937.246


[I 2025-07-21 02:58:11,642] Trial 29 finished with value: 937.2464386622785 and parameters: {'num_leaves': 56, 'learning_rate': 0.1747367730286482, 'min_child_samples': 50, 'max_depth': 5, 'feature_fraction': 0.8446878442582597, 'bagging_fraction': 0.5368570098368879, 'bagging_freq': 5, 'lambda_l1': 6.769333233110119, 'lambda_l2': 6.80207195201157}. Best is trial 15 with value: 843.8048339608916.


Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:58:13,199] Trial 30 finished with value: 891.5065828035705 and parameters: {'num_leaves': 61, 'learning_rate': 0.26613318790296026, 'min_child_samples': 61, 'max_depth': 10, 'feature_fraction': 0.5955327312964859, 'bagging_fraction': 0.8120547094989748, 'bagging_freq': 10, 'lambda_l1': 3.1138423563622997, 'lambda_l2': 8.264136175973665}. Best is trial 15 with value: 843.8048339608916.


Early stopping, best iteration is:
[280]	valid_0's rmse: 891.507
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[486]	valid_0's rmse: 849.116


[I 2025-07-21 02:58:16,123] Trial 31 finished with value: 849.1163788353568 and parameters: {'num_leaves': 86, 'learning_rate': 0.21621642273311642, 'min_child_samples': 41, 'max_depth': 9, 'feature_fraction': 0.674153195288135, 'bagging_fraction': 0.9585969361889566, 'bagging_freq': 9, 'lambda_l1': 0.945284234815812, 'lambda_l2': 4.663682681630995}. Best is trial 15 with value: 843.8048339608916.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[479]	valid_0's rmse: 850.908


[I 2025-07-21 02:58:18,713] Trial 32 finished with value: 850.9084352475811 and parameters: {'num_leaves': 83, 'learning_rate': 0.27466889644969533, 'min_child_samples': 42, 'max_depth': 9, 'feature_fraction': 0.6510918753006529, 'bagging_fraction': 0.9615777768116683, 'bagging_freq': 9, 'lambda_l1': 0.9352908338838355, 'lambda_l2': 3.940057238613943}. Best is trial 15 with value: 843.8048339608916.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[684]	valid_0's rmse: 852.951


[I 2025-07-21 02:58:22,329] Trial 33 finished with value: 852.9514362253278 and parameters: {'num_leaves': 88, 'learning_rate': 0.2129579866492445, 'min_child_samples': 40, 'max_depth': 7, 'feature_fraction': 0.6747034923249018, 'bagging_fraction': 0.9980839779261568, 'bagging_freq': 8, 'lambda_l1': 0.8944912987876171, 'lambda_l2': 4.8473063924088375}. Best is trial 15 with value: 843.8048339608916.


Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:58:24,219] Trial 34 finished with value: 843.8397330824913 and parameters: {'num_leaves': 73, 'learning_rate': 0.24688199091132257, 'min_child_samples': 29, 'max_depth': 11, 'feature_fraction': 0.708761414865754, 'bagging_fraction': 0.8943980743039872, 'bagging_freq': 10, 'lambda_l1': 5.918173423127503, 'lambda_l2': 4.141413303853034}. Best is trial 15 with value: 843.8048339608916.


Early stopping, best iteration is:
[275]	valid_0's rmse: 843.84
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:58:25,925] Trial 35 finished with value: 842.8389280560076 and parameters: {'num_leaves': 93, 'learning_rate': 0.245392873453174, 'min_child_samples': 30, 'max_depth': 11, 'feature_fraction': 0.7100021442013916, 'bagging_fraction': 0.9167559913609313, 'bagging_freq': 6, 'lambda_l1': 5.692828409904595, 'lambda_l2': 3.621145658263968}. Best is trial 35 with value: 842.8389280560076.


Early stopping, best iteration is:
[214]	valid_0's rmse: 842.839
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:58:27,008] Trial 36 finished with value: 838.6675091435462 and parameters: {'num_leaves': 93, 'learning_rate': 0.24351497340962985, 'min_child_samples': 6, 'max_depth': 11, 'feature_fraction': 0.7338151938118295, 'bagging_fraction': 0.8813701701599211, 'bagging_freq': 6, 'lambda_l1': 6.005108443531905, 'lambda_l2': 3.628766231966392}. Best is trial 36 with value: 838.6675091435462.


Early stopping, best iteration is:
[164]	valid_0's rmse: 838.668
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:58:28,173] Trial 37 finished with value: 845.8252341281382 and parameters: {'num_leaves': 92, 'learning_rate': 0.24341733214479963, 'min_child_samples': 7, 'max_depth': 11, 'feature_fraction': 0.7226290890858346, 'bagging_fraction': 0.8483973412598094, 'bagging_freq': 6, 'lambda_l1': 5.9267046426117265, 'lambda_l2': 2.367919248425615}. Best is trial 36 with value: 838.6675091435462.


Early stopping, best iteration is:
[181]	valid_0's rmse: 845.825
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:58:30,031] Trial 38 finished with value: 858.5660535556415 and parameters: {'num_leaves': 94, 'learning_rate': 0.17425049091105083, 'min_child_samples': 12, 'max_depth': 10, 'feature_fraction': 0.7598411139882839, 'bagging_fraction': 0.7209045339073406, 'bagging_freq': 5, 'lambda_l1': 6.311018640699918, 'lambda_l2': 3.267648111461025}. Best is trial 36 with value: 838.6675091435462.


Early stopping, best iteration is:
[252]	valid_0's rmse: 858.566
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:58:31,209] Trial 39 finished with value: 842.5622484983697 and parameters: {'num_leaves': 95, 'learning_rate': 0.2506277608886138, 'min_child_samples': 20, 'max_depth': 11, 'feature_fraction': 0.8637301579664416, 'bagging_fraction': 0.889337796341469, 'bagging_freq': 3, 'lambda_l1': 7.8565407407451815, 'lambda_l2': 3.6141911137553038}. Best is trial 36 with value: 838.6675091435462.


Early stopping, best iteration is:
[155]	valid_0's rmse: 842.562
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:58:32,659] Trial 40 finished with value: 848.6431832882404 and parameters: {'num_leaves': 95, 'learning_rate': 0.1922956760335177, 'min_child_samples': 17, 'max_depth': 12, 'feature_fraction': 0.8700766716985966, 'bagging_fraction': 0.8320098062044938, 'bagging_freq': 2, 'lambda_l1': 7.663801596805344, 'lambda_l2': 3.4994473521605896}. Best is trial 36 with value: 838.6675091435462.


Early stopping, best iteration is:
[167]	valid_0's rmse: 848.643
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:58:34,097] Trial 41 finished with value: 837.2056886853657 and parameters: {'num_leaves': 76, 'learning_rate': 0.250849406682031, 'min_child_samples': 21, 'max_depth': 11, 'feature_fraction': 0.9767211508156621, 'bagging_fraction': 0.8818308225567638, 'bagging_freq': 3, 'lambda_l1': 7.378377417340787, 'lambda_l2': 4.0173440796917985}. Best is trial 41 with value: 837.2056886853657.


Early stopping, best iteration is:
[214]	valid_0's rmse: 837.206
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:58:35,661] Trial 42 finished with value: 846.9319497678129 and parameters: {'num_leaves': 98, 'learning_rate': 0.22670562958926996, 'min_child_samples': 21, 'max_depth': 10, 'feature_fraction': 0.9742277945248129, 'bagging_fraction': 0.7918368358181964, 'bagging_freq': 3, 'lambda_l1': 7.423958695947416, 'lambda_l2': 2.9599001803837597}. Best is trial 41 with value: 837.2056886853657.


Early stopping, best iteration is:
[189]	valid_0's rmse: 846.932
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:58:36,729] Trial 43 finished with value: 837.1862592817336 and parameters: {'num_leaves': 90, 'learning_rate': 0.261836276253665, 'min_child_samples': 9, 'max_depth': 11, 'feature_fraction': 0.9052160300960379, 'bagging_fraction': 0.8767748559147107, 'bagging_freq': 3, 'lambda_l1': 8.612949417644426, 'lambda_l2': 2.135518064261738}. Best is trial 43 with value: 837.1862592817336.


Early stopping, best iteration is:
[142]	valid_0's rmse: 837.186
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:58:37,670] Trial 44 finished with value: 836.1760266027898 and parameters: {'num_leaves': 89, 'learning_rate': 0.2578803143180711, 'min_child_samples': 5, 'max_depth': 12, 'feature_fraction': 0.9018788274497007, 'bagging_fraction': 0.8793640652943012, 'bagging_freq': 3, 'lambda_l1': 8.805271349038133, 'lambda_l2': 1.2329278373918835}. Best is trial 44 with value: 836.1760266027898.


Early stopping, best iteration is:
[125]	valid_0's rmse: 836.176
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:58:38,641] Trial 45 finished with value: 828.4729631192948 and parameters: {'num_leaves': 89, 'learning_rate': 0.20390086009875197, 'min_child_samples': 6, 'max_depth': 12, 'feature_fraction': 0.8962738465783344, 'bagging_fraction': 0.8743303157965518, 'bagging_freq': 3, 'lambda_l1': 8.584710446458965, 'lambda_l2': 0.6685415475057028}. Best is trial 45 with value: 828.4729631192948.


Early stopping, best iteration is:
[127]	valid_0's rmse: 828.473
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:58:39,833] Trial 46 finished with value: 826.0919618221408 and parameters: {'num_leaves': 89, 'learning_rate': 0.20404032634999653, 'min_child_samples': 5, 'max_depth': 12, 'feature_fraction': 0.9111908365749342, 'bagging_fraction': 0.8685835230778026, 'bagging_freq': 2, 'lambda_l1': 9.119218773197982, 'lambda_l2': 0.7001947797129606}. Best is trial 46 with value: 826.0919618221408.


Early stopping, best iteration is:
[147]	valid_0's rmse: 826.092
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:58:41,065] Trial 47 finished with value: 846.1516538051301 and parameters: {'num_leaves': 88, 'learning_rate': 0.16197465142086465, 'min_child_samples': 11, 'max_depth': 12, 'feature_fraction': 0.9136320554813635, 'bagging_fraction': 0.7773613500300914, 'bagging_freq': 2, 'lambda_l1': 8.88421197127932, 'lambda_l2': 0.005994214883159654}. Best is trial 46 with value: 826.0919618221408.


Early stopping, best iteration is:
[152]	valid_0's rmse: 846.152
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:58:42,138] Trial 48 finished with value: 847.6034507757207 and parameters: {'num_leaves': 78, 'learning_rate': 0.2053498891104058, 'min_child_samples': 9, 'max_depth': 12, 'feature_fraction': 0.9896234768875556, 'bagging_fraction': 0.7470685377893002, 'bagging_freq': 1, 'lambda_l1': 9.661167163936657, 'lambda_l2': 1.1833875118496695}. Best is trial 46 with value: 826.0919618221408.


Early stopping, best iteration is:
[139]	valid_0's rmse: 847.603
Training until validation scores don't improve for 50 rounds


[I 2025-07-21 02:58:43,309] Trial 49 finished with value: 826.8852339347029 and parameters: {'num_leaves': 82, 'learning_rate': 0.14988409244644552, 'min_child_samples': 5, 'max_depth': 12, 'feature_fraction': 0.9054527736795613, 'bagging_fraction': 0.8666852990946773, 'bagging_freq': 4, 'lambda_l1': 9.21690540293889, 'lambda_l2': 0.8250564187001804}. Best is trial 46 with value: 826.0919618221408.


Early stopping, best iteration is:
[201]	valid_0's rmse: 826.885
Best parameters: {'num_leaves': 89, 'learning_rate': 0.20404032634999653, 'min_child_samples': 5, 'max_depth': 12, 'feature_fraction': 0.9111908365749342, 'bagging_fraction': 0.8685835230778026, 'bagging_freq': 2, 'lambda_l1': 9.119218773197982, 'lambda_l2': 0.7001947797129606}


In [19]:
# Train final model with all data and best parameters
lgb_model_p01 = lgb.train(best_params,
                       lgb.Dataset(pd.concat([X_train, X_test]), 
                                 label=pd.concat([y_train, y_test])),
                       num_boost_round=1000)

y_pred_01 = lgb_model_p01.predict(X_test)

# Evaluate
rmse_p01 = np.sqrt(mean_squared_error(y_test, y_pred_01 ))
r2_p01 = r2_score(y_test, y_pred_01 )

print(f"RMSE: {rmse_p01 :.2f}")
print(f"R²: {r2_p01 :.4f}")

RMSE: 508.88
R²: 0.9288


In [24]:
y_pred_train_p01 = pd.DataFrame(y_pred_01, columns=['Predicted_Passengers']).astype(int)

train_capacity = 600  
y_pred_train_p01['Predicted_Trains'] = np.ceil(y_pred_train_p01['Predicted_Passengers'] / train_capacity).astype(int)

# Add a Safety/Comfort Buffer (e.g., 5% more capacity)
# This allocates slightly more trains than the absolute minimum required.
safety_buffer_percentage = 0.05 
y_pred_train_p01['Buffered_Predicted_Trains'] = np.ceil(
    y_pred_train_p01['Predicted_Passengers'] * (1 + safety_buffer_percentage) / train_capacity).astype(int)

# 4. Ensure a Minimum Number of Trains (e.g., at least 1 train, even for 0 passengers)
minimum_trains_required = 1
y_pred_train_p01['Final_Predicted_Trains'] = y_pred_train_p01['Buffered_Predicted_Trains'].apply(
    lambda x: max(x, minimum_trains_required))

y_pred_train_p01.sample(10)

Unnamed: 0,Predicted_Passengers,Predicted_Trains,Buffered_Predicted_Trains,Final_Predicted_Trains
11431,184,1,1,1
6038,207,1,1,1
1118,5362,9,10,10
9304,-162,0,0,1
1367,340,1,1,1
12594,281,1,1,1
6748,238,1,1,1
8771,55,1,1,1
5212,1072,2,2,2
149,1745,3,4,4
