In [None]:
import pandas as pd
import numpy as np
%pip install xgboost

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.svm import SVR

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

Defaulting to user installation because normal site-packages is not writeable


In [20]:
# Read the processed train and test CSVs, parsing the pickup timestamp column
# as datetimes and using it as the index of each frame.

train_data_path = "../data/processed/train.csv"
test_data_path = "../data/processed/test.csv"

train_df, test_df = (
    pd.read_csv(csv_path, parse_dates=["tpep_pickup_datetime"]).set_index("tpep_pickup_datetime")
    for csv_path in (train_data_path, test_data_path)
)

# display the training frame as the cell output
train_df

Unnamed: 0_level_0,lag_1,lag_2,lag_3,lag_4,region,total_pickups,avg_pickups,day_of_week
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-01-01 01:00:00,160.0,149.0,120.0,58.0,0,187,161.0,4
2016-01-01 01:15:00,187.0,160.0,149.0,120.0,0,194,175.0,4
2016-01-01 01:30:00,194.0,187.0,160.0,149.0,0,180,177.0,4
2016-01-01 01:45:00,180.0,194.0,187.0,160.0,0,197,185.0,4
2016-01-01 02:00:00,197.0,180.0,194.0,187.0,0,185,185.0,4
...,...,...,...,...,...,...,...,...
2016-02-29 22:45:00,15.0,9.0,11.0,11.0,29,12,12.0,0
2016-02-29 23:00:00,12.0,15.0,9.0,11.0,29,17,14.0,0
2016-02-29 23:15:00,17.0,12.0,15.0,9.0,29,15,14.0,0
2016-02-29 23:30:00,15.0,17.0,12.0,15.0,29,15,15.0,0


In [21]:
# missing values in the training data

train_df.isna().sum()

lag_1            0
lag_2            0
lag_3            0
lag_4            0
region           0
total_pickups    0
avg_pickups      0
day_of_week      0
dtype: int64

In [22]:
# missing values in the test data

test_df.isna().sum()

lag_1            0
lag_2            0
lag_3            0
lag_4            0
region           0
total_pickups    0
avg_pickups      0
day_of_week      0
dtype: int64

In [23]:
# Separate the training target (total_pickups) from the remaining feature columns.

y_train = train_df["total_pickups"]

X_train = train_df.drop(columns="total_pickups")

In [24]:
X_train.head()

Unnamed: 0_level_0,lag_1,lag_2,lag_3,lag_4,region,avg_pickups,day_of_week
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-01 01:00:00,160.0,149.0,120.0,58.0,0,161.0,4
2016-01-01 01:15:00,187.0,160.0,149.0,120.0,0,175.0,4
2016-01-01 01:30:00,194.0,187.0,160.0,149.0,0,177.0,4
2016-01-01 01:45:00,180.0,194.0,187.0,160.0,0,185.0,4
2016-01-01 02:00:00,197.0,180.0,194.0,187.0,0,185.0,4


In [25]:
# Separate the test target (total_pickups) from the remaining feature columns.

y_test = test_df["total_pickups"]

X_test = test_df.drop(columns="total_pickups")

In [26]:
X_test.head()

Unnamed: 0_level_0,lag_1,lag_2,lag_3,lag_4,region,avg_pickups,day_of_week
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-03-01 00:00:00,36.0,44.0,31.0,29.0,0,39.0,1
2016-03-01 00:15:00,41.0,36.0,44.0,31.0,0,37.0,1
2016-03-01 00:30:00,35.0,41.0,36.0,44.0,0,41.0,1
2016-03-01 00:45:00,47.0,35.0,41.0,36.0,0,38.0,1
2016-03-01 01:00:00,34.0,47.0,35.0,41.0,0,35.0,1


In [27]:
from sklearn import set_config

set_config(transform_output="pandas")

In [None]:
# Build the feature encoder: one-hot encode the two categorical columns
# (dropping the first level of each) and pass every other column through.

encoder = ColumnTransformer(
    transformers=[
        (
            "ohe",
            OneHotEncoder(drop="first", sparse_output=False),
            ["region", "day_of_week"],
        ),
    ],
    remainder="passthrough",
    n_jobs=-1,
    force_int_remainder_cols=False,
)

In [29]:
encoder

In [30]:
# encode the train and test data
# The encoder is fitted on the training features only; the test features are
# then transformed with that already-fitted encoder.

X_train_encoded = encoder.fit_transform(X_train)
X_test_encoded = encoder.transform(X_test)

In [None]:
%pip install optuna
import optuna
import tqdm 

Defaulting to user installation because normal site-packages is not writeable


In [32]:
def objective(trial):
    """Optuna objective: choose a model family, fit it, and return its MAPE.

    The trial first picks one of five regressor families and then, for the
    families that expose tunable settings here, samples their hyperparameters.
    The model is fitted on ``X_train_encoded`` / ``y_train`` and scored on
    ``X_test_encoded`` / ``y_test`` (notebook-level names from earlier cells).

    NOTE(review): the loss is computed on the test split, so the model family
    and hyperparameters end up tuned against the same data that is later used
    to report the test error. Consider scoring on a validation split carved
    out of the training period instead.

    Args:
        trial: the Optuna trial used to sample the search space.

    Returns:
        Mean absolute percentage error of the fitted model on the test split.

    Raises:
        ValueError: if the sampled model name has no matching branch.
    """

    # model name search space
    list_of_models = ["LR", "RIDGE",  "RF", "GBR", "XGBR"]
    model_name = trial.suggest_categorical("model_name", list_of_models)

    if model_name == "LR":
        # no hyperparameters are tuned for plain linear regression
        model = LinearRegression()

    elif model_name == "RIDGE":
        ridge_alpha = trial.suggest_float("ridge_alpha",1,50)
        model = Ridge(alpha=ridge_alpha)

    elif model_name == "RF":
        n_estimators_rf = trial.suggest_int("n_estimators_rf",10,100,step=10)
        max_depth_rf = trial.suggest_int("max_depth_rf",3,10)
        model = RandomForestRegressor(n_estimators=n_estimators_rf,
                                      max_depth=max_depth_rf,
                                      random_state=42, n_jobs=-1)

    elif model_name == "GBR":
        n_estimators_gb = trial.suggest_int("n_estimators_gb",10,100,step=10)
        learning_rate_gb = trial.suggest_float("learning_rate_gb",1e-4,1e-1)
        model = GradientBoostingRegressor(n_estimators=n_estimators_gb,
                                          learning_rate=learning_rate_gb,
                                          random_state=42)

    elif model_name == "XGBR":
        n_estimators_xgb = trial.suggest_int("n_estimators_xgb",10,100,step=10)
        learning_rate_xgb = trial.suggest_float("learning_rate_xgb",1e-4,1e-1)
        max_depth_xgb = trial.suggest_int("max_depth_xgb",3,10)
        # seed pinned like the other tree ensembles above so every family is
        # configured with the same random_state
        model = XGBRegressor(n_estimators=n_estimators_xgb,
                             learning_rate=learning_rate_xgb,
                             max_depth=max_depth_xgb,
                             random_state=42)

    else:
        # fail with a clear message instead of an UnboundLocalError on `model`
        # if a name is added to list_of_models without a matching branch
        raise ValueError(f"Unsupported model name: {model_name!r}")

    # fit on the data
    model.fit(X_train_encoded,y_train)

    # get the predictions
    y_pred = model.predict(X_test_encoded)

    # calculate the loss
    loss = mean_absolute_percentage_error(y_test, y_pred)

    return loss

In [33]:
# create a study object
# The sampler is seeded (42, like the estimators in this notebook) so the
# suggested hyperparameters do not depend on an unseeded RNG.
# NOTE(review): trials run concurrently can still complete in a different
# order between runs, so exact trial sequences may differ.
study = optuna.create_study(
    study_name="model_selection",
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

[I 2025-02-28 20:05:50,509] A new study created in memory with name: model_selection


In [34]:
# optimize the objective function
# Runs 50 trials of `objective`. n_jobs=-1 lets trials run concurrently, which
# is why the log below reports trials finishing out of numerical order.
study.optimize(func=objective, n_trials=50, n_jobs=-1)

[I 2025-02-28 20:05:51,363] Trial 5 finished with value: 0.07931857683364948 and parameters: {'model_name': 'RIDGE', 'ridge_alpha': 11.02717437859067}. Best is trial 5 with value: 0.07931857683364948.
[I 2025-02-28 20:05:51,419] Trial 1 finished with value: 0.07934790285463848 and parameters: {'model_name': 'LR'}. Best is trial 5 with value: 0.07931857683364948.
[I 2025-02-28 20:05:51,430] Trial 4 finished with value: 0.07930491376082585 and parameters: {'model_name': 'RIDGE', 'ridge_alpha': 16.7133764765578}. Best is trial 4 with value: 0.07930491376082585.
[I 2025-02-28 20:05:51,492] Trial 8 finished with value: 0.07934790285463848 and parameters: {'model_name': 'LR'}. Best is trial 4 with value: 0.07930491376082585.
[I 2025-02-28 20:05:51,512] Trial 9 finished with value: 0.07925548993853726 and parameters: {'model_name': 'RIDGE', 'ridge_alpha': 40.211406403122325}. Best is trial 9 with value: 0.07925548993853726.
[I 2025-02-28 20:05:51,568] Trial 10 finished with value: 0.079288871

In [35]:
# best value

study.best_value

0.07923766803194288

In [36]:
# best parameters

study.best_params

{'model_name': 'RIDGE', 'ridge_alpha': 49.936263310118676}

In [37]:
# model value counts

study.trials_dataframe()['params_model_name'].value_counts()

params_model_name
RIDGE    19
GBR      11
XGBR     11
RF        5
LR        4
Name: count, dtype: int64

In [38]:
from optuna.visualization import (
    plot_optimization_history, 
    plot_parallel_coordinate, 
    plot_param_importances
)

In [39]:
plot_optimization_history(study)

In [40]:
plot_parallel_coordinate(study, params=["model_name"])

In [41]:
plot_parallel_coordinate(study, params=["ridge_alpha"])

In [42]:
# Fit a linear regression on the encoded training data and report its
# mean absolute percentage error (MAPE) on both splits.

lr = LinearRegression().fit(X_train_encoded, y_train)

# training split: predictions and MAPE
y_pred_train = lr.predict(X_train_encoded)
mape_train = mean_absolute_percentage_error(y_train, y_pred_train)

# test split: predictions and MAPE
y_pred_test = lr.predict(X_test_encoded)
mape_test = mean_absolute_percentage_error(y_test, y_pred_test)

print("The training error is ", mape_train)
print("The test error is ", mape_test)

The training error is  0.08778013304566863
The test error is  0.07934790285463848


In [43]:
# Fit a ridge regression with alpha=50 on the encoded training data and report
# its mean absolute percentage error (MAPE) on both splits.
# NOTE(review): alpha=50 appears to be the rounded best `ridge_alpha` from the
# model-selection study output above — confirm if the study is re-run.

ridge = Ridge(alpha=50, random_state=42).fit(X_train_encoded, y_train)

# training split: predictions and MAPE
y_pred_train = ridge.predict(X_train_encoded)
mape_train = mean_absolute_percentage_error(y_train, y_pred_train)

# test split: predictions and MAPE
y_pred_test = ridge.predict(X_test_encoded)
mape_test = mean_absolute_percentage_error(y_test, y_pred_test)

print("The training error is ", mape_train)
print("The test error is ", mape_test)

The training error is  0.08755473541879122
The test error is  0.0792375565878739


In [44]:
def tune_ridge(trial):
    """Optuna objective that tunes only the Ridge ``alpha``.

    Samples ``alpha`` between 30 and 100, fits a Ridge model on
    ``X_train_encoded`` / ``y_train`` and returns its mean absolute percentage
    error on ``X_test_encoded`` / ``y_test`` (notebook-level names).
    """
    # hyperparameter space
    sampled_alpha = trial.suggest_float("alpha", 30, 100)

    # build and train the candidate model in one step (fit returns the estimator)
    candidate = Ridge(alpha=sampled_alpha, random_state=42).fit(X_train_encoded, y_train)

    # score the candidate's predictions for the test features
    return mean_absolute_percentage_error(y_test, candidate.predict(X_test_encoded))
        

In [45]:
# create study
# The sampler is seeded (42, like the estimators in this notebook) so the
# suggested alphas do not depend on an unseeded RNG.
# NOTE(review): trials run concurrently can still complete in a different
# order between runs, so exact trial sequences may differ.

study = optuna.create_study(
    study_name="tune_model",
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

[I 2025-02-28 20:06:40,525] A new study created in memory with name: tune_model


In [46]:
# optimize
# Runs 100 trials of `tune_ridge` with n_jobs=-1 and a progress bar.

study.optimize(func=tune_ridge, n_trials=100, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-02-28 20:06:40,956] Trial 1 finished with value: 0.07921031715203952 and parameters: {'alpha': 66.67705562301495}. Best is trial 1 with value: 0.07921031715203952.
[I 2025-02-28 20:06:41,054] Trial 0 finished with value: 0.0792428784329709 and parameters: {'alpha': 46.99358594033973}. Best is trial 1 with value: 0.07921031715203952.
[I 2025-02-28 20:06:41,461] Trial 5 finished with value: 0.07917688196523001 and parameters: {'alpha': 89.98730182618453}. Best is trial 5 with value: 0.07917688196523001.
[I 2025-02-28 20:06:41,472] Trial 3 finished with value: 0.07920240504599312 and parameters: {'alpha': 71.93693905423434}. Best is trial 5 with value: 0.07917688196523001.
[I 2025-02-28 20:06:41,542] Trial 2 finished with value: 0.07923652747402643 and parameters: {'alpha': 50.59079140250324}. Best is trial 5 with value: 0.07917688196523001.
[I 2025-02-28 20:06:41,566] Trial 12 finished with value: 0.07918597692361315 and parameters: {'alpha': 83.30717340687372}. Best is trial 5 w

In [47]:
# best parameters

study.best_params

{'alpha': 99.99248902029561}

In [48]:
# best value

study.best_value

0.07916407478061059

In [49]:
# Fit a ridge regression with alpha=100 on the encoded training data and report
# its mean absolute percentage error (MAPE) on both splits.
# NOTE(review): the tuned alpha reported above (~99.99) sits at the upper bound
# of the 30-100 search range used by tune_ridge, so the optimum may lie beyond it.

ridge = Ridge(alpha=100, random_state=42).fit(X_train_encoded, y_train)

# training split: predictions and MAPE
y_pred_train = ridge.predict(X_train_encoded)
mape_train = mean_absolute_percentage_error(y_train, y_pred_train)

# test split: predictions and MAPE
y_pred_test = ridge.predict(X_test_encoded)
mape_test = mean_absolute_percentage_error(y_test, y_pred_test)

print("The training error is ", mape_train)
print("The test error is ", mape_test)

The training error is  0.08739324289532295
The test error is  0.07916406556071182
