In [17]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, KFold, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

In [2]:
# Load the raw ridership table. Ridership_df stays as loaded (it is used again
# later as the source of the target column); df is the working copy that the
# feature-engineering cells modify.
Ridership_df= pd.read_csv('data/Ridership.csv')
df= Ridership_df.copy()
# Preview five randomly chosen rows (no seed, so the rows differ between runs).
df.sample(5)

Unnamed: 0,Year,Month,Day,Week Number,Corridor,Workday,Station,Period,Ridership,N_trains,Covid19
11186,2019,August,20,34,Corridor_1,y,Station_1,Midday,899,1,0
31823,2020,December,8,50,Corridor_4,y,Station_3,Midday,87,6,1
9262,2019,July,9,28,Corridor_6,y,Station_20,AM Peak,3342,4,0
47016,2021,December,7,49,Corridor_7,y,Station_3,Evening,324,7,1
14528,2019,October,25,43,Corridor_4,y,Station_8,AM Peak,1102,2,0


In [3]:
# Summarise column names, non-null counts and dtypes of the raw working copy.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64369 entries, 0 to 64368
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Year         64369 non-null  int64 
 1   Month        64369 non-null  object
 2   Day          64369 non-null  int64 
 3   Week Number  64369 non-null  int64 
 4   Corridor     64369 non-null  object
 5   Workday      64369 non-null  object
 6   Station      64369 non-null  object
 7   Period       64369 non-null  object
 8   Ridership    64369 non-null  int64 
 9   N_trains     64369 non-null  int64 
 10  Covid19      64369 non-null  int64 
dtypes: int64(6), object(5)
memory usage: 5.4+ MB


In [4]:
# Count fully duplicated rows.
df.duplicated().sum()

np.int64(0)

In [5]:
# Count missing values per column.
df.isnull().sum()

Year           0
Month          0
Day            0
Week Number    0
Corridor       0
Workday        0
Station        0
Period         0
Ridership      0
N_trains       0
Covid19        0
dtype: int64

In [6]:
# Month names in calendar order; a name's position (starting at 1) is its number.
month_names = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
month_mapping = {name: number for number, name in enumerate(month_names, start=1)}

# Numeric month column derived from the textual 'Month' column.
df['Month_Num'] = df['Month'].map(month_mapping)

In [7]:
def convert_day_to_circle(day, period=31):
    """Encode a day of the month as a point on the unit circle.

    Day 1 maps to angle 0 and each following day advances the angle by
    ``2*pi / period``, so the end of the cycle lies next to its start.

    Parameters
    ----------
    day : int or array-like
        1-based day of the month.
    period : int or float, default 31
        Number of steps in one full turn. The default reproduces the original
        fixed 31-day cycle.

    Returns
    -------
    tuple
        ``(x, y)`` = ``(cos(angle), sin(angle))``.

    Raises
    ------
    ValueError
        If ``period`` is not strictly positive.
    """
    if period <= 0:
        raise ValueError(f"period must be positive, got {period!r}")
    angle = 2 * np.pi * (day - 1) / period  # same operation order as the 31-day original
    return np.cos(angle), np.sin(angle)

def convert_week_to_circle(week, period=53):
    """Encode a week number as a point on the unit circle.

    Week 1 maps to angle 0 and each following week advances the angle by
    ``2*pi / period``, so the last week of the cycle lies next to the first.

    Parameters
    ----------
    week : int or array-like
        1-based week number.
    period : int or float, default 53
        Number of steps in one full turn. The default reproduces the original
        fixed 53-week cycle.

    Returns
    -------
    tuple
        ``(x, y)`` = ``(cos(angle), sin(angle))``.

    Raises
    ------
    ValueError
        If ``period`` is not strictly positive.
    """
    if period <= 0:
        raise ValueError(f"period must be positive, got {period!r}")
    angle = 2 * np.pi * (week - 1) / period  # same operation order as the 53-week original
    return np.cos(angle), np.sin(angle)

# Add (x, y) unit-circle coordinates for the day of month and the week number.
# Each encoder returns one (x, y) tuple per row; zip(*...) regroups them into
# one sequence of x values and one of y values.
for source_column, prefix, to_circle in (
        ('Day', 'day', convert_day_to_circle),
        ('Week Number', 'week', convert_week_to_circle)):
    df[f'{prefix}_x'], df[f'{prefix}_y'] = zip(*df[source_column].map(to_circle))

In [8]:
categorical_features = ['Month', 'Corridor', 'Workday', 'Station', 'Period']

# One-hot encode the categorical columns and pass every other column through.
# - sparse_threshold=0 makes the transformer always return a dense array, so
#   the result no longer depends on the density heuristic that decided whether
#   `.toarray()` was available.
# - verbose_feature_names_out=False yields plain names ('Month_April', 'Year',
#   ...) directly, instead of '<transformer>__<feature>' names that had to be
#   split on '__' (which breaks for any feature name containing '__').
preprocessor = ColumnTransformer(
        transformers=[('onehot', OneHotEncoder(), categorical_features)],
        remainder='passthrough',
        sparse_threshold=0,
        verbose_feature_names_out=False)

# NOTE: this cell replaces df with the encoded frame, so it cannot be re-run
# without first re-running the cells that build the un-encoded df.
encoded_values = preprocessor.fit_transform(df)
feature_names_out = list(preprocessor.get_feature_names_out())
df = pd.DataFrame(encoded_values, columns=feature_names_out)

In [9]:
# Interaction feature: product of the Covid19 flag and the Workday_y indicator.
df['COVID_Workday'] = df['Covid19'] * df['Workday_y']

# Labels of the one-hot columns, in the same order as the encoded frame.
month_labels = ['April', 'August', 'December', 'February', 'January', 'July',
                'June', 'March', 'May', 'November', 'October', 'September']
period_labels = ['AM Peak', 'Evening', 'Midday', 'PM Peak', 'Weekend/Holiday']
# Station numbers 1-45 ordered as strings: 1, 10, 11, ..., 19, 2, 20, ...
station_numbers = sorted(range(1, 46), key=str)

# Columns downcast to uint8: all indicator columns plus the small integer columns.
columns_to_convert = (
    [f'Month_{label}' for label in month_labels]
    + [f'Corridor_Corridor_{number}' for number in range(1, 8)]
    + [f'Station_Station_{number}' for number in station_numbers]
    + [f'Period_{label}' for label in period_labels]
    + ['Day', 'Week Number', 'Workday_n', 'Workday_y',
       'Covid19', 'COVID_Workday', 'Month_Num']
)

# Columns downcast to uint16.
columns_to_convert_02 = ['Year', 'Ridership', 'N_trains']

# The remaining columns (day_x, day_y, week_x, week_y) are left as they are.
for column_group, target_dtype in ((columns_to_convert, 'uint8'),
                                   (columns_to_convert_02, 'uint16')):
    df[column_group] = df[column_group].astype(target_dtype)


In [10]:
# Re-check column dtypes after encoding and downcasting.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64369 entries, 0 to 64368
Data columns (total 83 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Month_April             64369 non-null  uint8  
 1   Month_August            64369 non-null  uint8  
 2   Month_December          64369 non-null  uint8  
 3   Month_February          64369 non-null  uint8  
 4   Month_January           64369 non-null  uint8  
 5   Month_July              64369 non-null  uint8  
 6   Month_June              64369 non-null  uint8  
 7   Month_March             64369 non-null  uint8  
 8   Month_May               64369 non-null  uint8  
 9   Month_November          64369 non-null  uint8  
 10  Month_October           64369 non-null  uint8  
 11  Month_September         64369 non-null  uint8  
 12  Corridor_Corridor_1     64369 non-null  uint8  
 13  Corridor_Corridor_2     64369 non-null  uint8  
 14  Corridor_Corridor_3     64369 non-null

In [11]:
# Preview ten randomly chosen rows of the encoded frame.
df.sample(10)

Unnamed: 0,Month_April,Month_August,Month_December,Month_February,Month_January,Month_July,Month_June,Month_March,Month_May,Month_November,...,Week Number,Ridership,N_trains,Covid19,Month_Num,day_x,day_y,week_x,week_y,COVID_Workday
19437,0,0,0,0,1,0,0,0,0,0,...,5,918,1,1,1,0.820763,-0.571268,0.889657,0.456629,1
36005,0,0,0,0,0,0,0,1,0,0,...,12,58,1,1,3,0.347305,-0.937752,0.263587,0.964636,1
60540,0,0,0,0,0,0,0,0,0,0,...,41,781,7,0,10,-0.440394,0.897805,0.029633,-0.999561,0
24378,0,0,0,0,0,0,0,0,1,0,...,21,157,4,1,5,-0.440394,-0.897805,-0.717507,0.696551,1
34315,0,0,0,1,0,0,0,0,0,0,...,6,38,1,1,2,-0.050649,0.998717,0.829406,0.558647,1
14029,0,0,0,0,0,0,0,0,0,0,...,42,20,1,0,10,-0.994869,0.101168,0.147647,-0.98904,0
34951,0,0,0,1,0,0,0,0,0,0,...,8,50,1,1,2,0.347305,-0.937752,0.674983,0.737833,1
43949,0,0,0,0,0,0,0,0,0,0,...,40,1238,24,1,10,0.151428,0.988468,-0.088796,-0.99605,1
56328,0,0,0,0,0,1,0,0,0,0,...,28,333,5,0,7,-0.440394,0.897805,-0.998244,-0.059241,0
23682,0,0,0,0,0,0,0,0,1,0,...,18,245,19,1,5,0.918958,0.394356,-0.430065,0.902798,0


In [12]:
def split_the_timeseries_data(Ridership_df, df, test_fraction=0.2):
    """Split features and target into train/test sets, station by station.

    For each station the rows are kept in their existing order; the last
    ``test_fraction`` of them go to the test set and the rest to the training
    set. The per-station pieces are then concatenated.

    NOTE(review): this is a chronological hold-out only if each station's rows
    are already ordered by date in the input - confirm against the data file.

    Parameters
    ----------
    Ridership_df : pandas.DataFrame
        Raw table providing the ``Station`` column (station names) and the
        ``Ridership`` target column.
    df : pandas.DataFrame
        Encoded table holding one ``'Station_' + <station name>`` indicator
        column per station. It must share its index with ``Ridership_df``,
        because the station mask computed on ``df`` also selects the target.
    test_fraction : float, default 0.2
        Share of each station's rows placed in the test set (rounded down).

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Rows of ``df`` for the training and test sets.
    y_train, y_test : pandas.Series
        Matching ``Ridership`` values taken from ``Ridership_df``.

    Raises
    ------
    ValueError
        If ``test_fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= test_fraction <= 1:
        raise ValueError(f"test_fraction must be within [0, 1], got {test_fraction!r}")

    X_train, X_test, y_train, y_test = [], [], [], []

    for station in Ridership_df['Station'].unique():
        station_mask = df['Station_' + station] == 1
        station_data = df[station_mask]
        target_data = Ridership_df.loc[station_mask, 'Ridership']

        test_size = int(len(station_data) * test_fraction)
        # Slice by an explicit position. The previous `[:-test_size]` /
        # `[-test_size:]` form broke when test_size was 0 (`[:-0]` is empty and
        # `[-0:]` is everything), sending small stations entirely to the test set.
        split_at = len(station_data) - test_size

        X_train.append(station_data.iloc[:split_at])
        X_test.append(station_data.iloc[split_at:])
        y_train.append(target_data.iloc[:split_at])
        y_test.append(target_data.iloc[split_at:])

    if not X_train:
        # No stations at all: return empty pieces instead of letting
        # pd.concat fail on an empty list.
        empty_X = df.iloc[:0]
        empty_y = Ridership_df['Ridership'].iloc[:0]
        return empty_X, empty_X.copy(), empty_y, empty_y.copy()

    return pd.concat(X_train), pd.concat(X_test), pd.concat(y_train), pd.concat(y_test)
# Build the per-station split: features come from the encoded df, the target
# from the raw Ridership_df.
X_train, X_test, y_train, y_test = split_the_timeseries_data(Ridership_df, df)

## Problem One: 
Assume we want a general model for the number of passengers (ridership) that does not look at the number of passengers on previous days. The model uses only the information recorded in the table for the same time period, excluding the year and the number of trains. The predicted number of passengers is then used to determine the number of trains required.

In [13]:
# Problem-one features: everything except the target itself, the year and the
# number of trains.
excluded_columns_p01 = ['Ridership', 'Year', 'N_trains']
X_train_p01 = X_train.drop(columns=excluded_columns_p01)
X_test_p01 = X_test.drop(columns=excluded_columns_p01)

# Independent copies of the targets for this problem.
y_train_p01 = y_train.copy()
y_test_p01 = y_test.copy()

In [None]:
# Candidate values for each hyper-parameter (same ranges as before, stored as
# plain Python lists of rounded values).
param_grid_xgb = {
    'n_estimators': np.arange(50, 1000, 10, dtype=int).tolist(),
    'learning_rate': np.arange(0.01, 0.3, 0.01).round(2).tolist(),
    'max_depth': np.arange(3, 10, 1).tolist(),
    'subsample': np.arange(0.01, 1, 0.01).round(2).tolist(),
    'colsample_bytree': np.arange(0.5, 1, 0.1).round(1).tolist(),
    }

# An exhaustive grid over these ranges is roughly 9.5 million combinations
# (times the number of folds), which cannot finish. Sample a fixed budget of
# random combinations instead; raise n_iter for a more thorough search.
# The training rows are concatenated station by station, so the folds are
# shuffled; otherwise a validation fold would hold stations that are missing
# from the corresponding training folds.
grid_xgb = RandomizedSearchCV(
    XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=1),
    param_distributions=param_grid_xgb,
    n_iter=50,
    scoring='neg_root_mean_squared_error',
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,          # parallelise across candidates; each model uses one thread
    random_state=42)

grid_xgb.fit(X_train_p01, y_train_p01)

print(grid_xgb.best_params_)
print(grid_xgb.best_estimator_)

In [15]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Cast every feature column to float (pandas maps `float` to float64) so the
# model receives a uniformly numeric matrix.
X_train_p01 = X_train_p01.astype(float)
X_test_p01 = X_test_p01.astype(float)

# Hyper-parameters of the regressor, gathered in one place for easy tuning.
xgb_settings = {
    'objective': 'reg:squarederror',  # squared-error regression objective
    'n_estimators': 1000,             # number of boosting rounds
    'learning_rate': 0.05,            # step size shrinkage
    'max_depth': 6,                   # maximum depth of a tree
    'subsample': 0.7,                 # share of training rows sampled per tree
    'colsample_bytree': 0.7,          # share of columns sampled per tree
    'random_state': 42,
    'n_jobs': -1,                     # use all available CPU cores
}
xgb_model = XGBRegressor(**xgb_settings)

# Fit on the training split.
print("Training the XGBoost Regressor model...")
xgb_model.fit(X_train_p01, y_train_p01)
print("Model training complete.")

# Predict the held-out split and score the predictions.
y_pred_p01 = xgb_model.predict(X_test_p01)
rmse = np.sqrt(mean_squared_error(y_test_p01, y_pred_p01))
r2 = r2_score(y_test_p01, y_pred_p01)

print("\nModel Performance Evaluation:")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R²): {r2:.4f}")

# Optional: inspect feature importances.
# print("\nFeature Importances:")
# feature_importances = pd.Series(xgb_model.feature_importances_, index=X_train_p01.columns)
# print(feature_importances.sort_values(ascending=False))

In [14]:
# from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# from sklearn.linear_model import LinearRegression, Ridge
# from sklearn.svm import SVR
# from sklearn.neural_network import MLPRegressor
# from sklearn.metrics import mean_squared_error, r2_score
# from math import sqrt
# import numpy as np

# # Dictionary to store model performance
# model_performance = {}

# # Initialize models
# models = {
#     'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
#     'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
#     'Linear Regression': LinearRegression(),
#     'Ridge Regression': Ridge(alpha=1.0),
#     'Support Vector Regression': SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1),
#     'Neural Network': MLPRegressor(hidden_layer_sizes=(100,50), 
#                                   max_iter=1000, 
#                                   random_state=42)
# }

# # Train and evaluate each model
# for name, model in models.items():
#     print(f"Training {name}...")
#     model.fit(X_train_p01, y_train_p01)
    
#     # Predictions
#     y_pred = model.predict(X_test_p01)
    
#     # Calculate metrics
#     rmse = sqrt(mean_squared_error(y_test_p01, y_pred))
#     r2 = r2_score(y_test_p01, y_pred)
    
#     # Store performance
#     model_performance[name] = {
#         'RMSE': rmse,
#         'R2': r2
#     }
    
#     print(f"{name} - RMSE: {rmse:.2f}, R²: {r2:.4f}\n")

# # Find the best performing model based on RMSE
# best_model_name = min(model_performance.items(), key=lambda x: x[1]['RMSE'])[0]
# print(f"\nBest performing model: {best_model_name}")
# print(f"RMSE: {model_performance[best_model_name]['RMSE']:.2f}")
# print(f"R²: {model_performance[best_model_name]['R2']:.4f}")

# # Feature importance for tree-based models
# if hasattr(models[best_model_name], 'feature_importances_'):
#     print("\nFeature Importances:")
#     importances = models[best_model_name].feature_importances_
#     features = X_train_p01.columns
#     importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
#     importance_df = importance_df.sort_values('Importance', ascending=False)
#     print(importance_df.head(10))