In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one. Later cells rely on this to show a DataFrame from a non-final line.
InteractiveShell.ast_node_interactivity = "all" 

In [2]:
import sys
import os

# Make the parent of the current working directory importable so that the
# `src` package used by later cells resolves.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Guard the append so re-running this cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


In [3]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

# Load the tabular dataset from the project's transformed-data directory.
df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")
# Bare expression: rendered as the cell output (pandas truncates the display).
df

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,target
0,12,0,0,0,0,0,4,0,0,0,...,0,6,0,0,0,0,0,2024-01-29 00:00:00,HB101,2
1,10,0,0,0,0,0,6,0,0,0,...,0,11,0,0,0,0,0,2024-01-29 12:00:00,HB101,19
2,0,0,0,0,0,0,16,0,0,0,...,0,15,0,0,0,0,0,2024-01-30 00:00:00,HB101,1
3,29,0,0,0,0,0,11,0,0,0,...,0,9,0,0,0,0,0,2024-01-30 12:00:00,HB101,30
4,0,0,0,0,0,0,19,0,0,0,...,0,13,0,0,0,0,0,2024-01-31 00:00:00,HB101,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56103,7,0,0,0,0,0,3,0,0,0,...,0,7,0,0,0,0,0,2024-12-29 12:00:00,JC116,12
56104,0,0,0,0,0,0,7,0,0,0,...,0,8,0,0,0,0,0,2024-12-30 00:00:00,JC116,2
56105,19,0,0,0,0,0,16,0,0,0,...,0,3,0,0,0,0,0,2024-12-30 12:00:00,JC116,26
56106,1,0,0,0,0,0,5,0,0,0,...,0,8,0,0,0,0,0,2024-12-31 00:00:00,JC116,0


In [4]:
from datetime import datetime

from src.data_utils import split_time_series_data

# Hold-out split around a fixed cutoff timestamp; "target" is separated out as
# the label. (The exact split rule lives in split_time_series_data.)
train_test_cutoff = datetime(2024, 9, 1)
X_train, y_train, X_test, y_test = split_time_series_data(
    df, cutoff_date=train_test_cutoff, target_column="target"
)

# Report the shape of each split, in the order train X/y, test X/y.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(35856, 674)
(35856,)
(20252, 674)
(20252,)


In [5]:
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    """Add the mean of the four weekly lag columns as a new feature.

    The feature ``average_rides_last_4_weeks`` is the row-wise mean of the
    columns ``rides_t-168``, ``rides_t-336``, ``rides_t-504`` and
    ``rides_t-672`` (i.e. 7*24, 14*24, 21*24 and 28*24 steps back; presumably
    hourly lags, so 1 to 4 weeks ago).

    Args:
        X: Feature frame that contains the four lag columns listed above.

    Returns:
        A new DataFrame with every column of ``X`` plus
        ``average_rides_last_4_weeks``. ``X`` itself is left untouched, so the
        caller's train/test frames are not modified as a side effect of
        fitting or predicting with a pipeline that uses this function.

    Raises:
        ValueError: If any of the four required lag columns is missing.
    """
    steps_per_week = 7 * 24
    last_4_weeks_columns = [
        f"rides_t-{weeks_back * steps_per_week}" for weeks_back in range(1, 5)
    ]

    # Fail early with a clear message instead of a pandas KeyError.
    for col in last_4_weeks_columns:
        if col not in X.columns:
            raise ValueError(f"Missing required column: {col}")

    # `assign` returns a copy, so the input frame is never mutated.
    return X.assign(
        average_rides_last_4_weeks=X[last_4_weeks_columns].mean(axis=1)
    )

In [6]:
from sklearn.preprocessing import FunctionTransformer

# Wrap the plain function as a scikit-learn transformer so it can be used as a
# pipeline step. validate=False passes the DataFrame through unchanged (no
# conversion to an array), which the function needs for column lookups.
add_feature_average_rides_last_4_weeks = FunctionTransformer(
    func=average_rides_last_4_weeks,
    validate=False,
)

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives an ``hour`` feature.

    It reads the datetime column ``pickup_hour``, stores its hour component in
    a new ``hour`` column, and removes the ``pickup_hour`` and
    ``pickup_location_id`` columns from the result.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``hour`` added and the raw keys dropped."""
        with_hour = X.assign(hour=X["pickup_hour"].dt.hour)
        return with_hour.drop(columns=["pickup_hour", "pickup_location_id"])


add_temporal_features = TemporalFeatureEngineer()


In [8]:
import lightgbm as lgb

from sklearn.pipeline import make_pipeline

# Baseline pipeline: 4-week average feature -> hour feature / key-column drop
# -> LightGBM regressor with default hyperparameters. make_pipeline names the
# last step "lgbmregressor", which the hyperparameter search refers to.
pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor()
)
# Show the training features before fitting.
X_train
# The stray trailing backslash after this call was removed: it was a line
# continuation that only parsed because the following line happened to be blank.
pipeline.fit(X_train, y_train)



Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,12,0,0,0,0,0,4,0,0,0,...,0,0,6,0,0,0,0,0,2024-01-29 00:00:00,HB101
1,10,0,0,0,0,0,6,0,0,0,...,0,0,11,0,0,0,0,0,2024-01-29 12:00:00,HB101
2,0,0,0,0,0,0,16,0,0,0,...,0,0,15,0,0,0,0,0,2024-01-30 00:00:00,HB101
3,29,0,0,0,0,0,11,0,0,0,...,0,0,9,0,0,0,0,0,2024-01-30 12:00:00,HB101
4,0,0,0,0,0,0,19,0,0,0,...,0,0,13,0,0,0,0,0,2024-01-31 00:00:00,HB101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35851,30,0,0,0,0,0,39,0,0,0,...,0,0,13,0,0,0,0,0,2024-08-29 12:00:00,JC116
35852,1,0,0,0,0,0,7,0,0,0,...,0,0,25,0,0,0,0,0,2024-08-30 00:00:00,JC116
35853,42,0,0,0,0,0,18,0,0,0,...,0,0,5,0,0,0,0,0,2024-08-30 12:00:00,JC116
35854,9,0,0,0,0,0,14,0,0,0,...,0,0,25,0,0,0,0,0,2024-08-31 00:00:00,JC116


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001021 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10982
[LightGBM] [Info] Number of data points in the train set: 35856, number of used features: 114
[LightGBM] [Info] Start training from score 7.468178


In [9]:

import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error

# Search space for the LGBMRegressor step of `pipeline`. Keys follow the
# "<step name>__<parameter>" convention used by scikit-learn pipelines.
param_distributions = {
    "lgbmregressor__num_leaves": [2, 50, 70, 256],
    # Further dimensions, currently disabled:
    # "lgbmregressor__max_depth": [-1, 10, 20, 30],
    # "lgbmregressor__learning_rate": [0.01, 0.05, 0.1, 0.2],
    # "lgbmregressor__n_estimators": [100, 200, 500, 1000],
    # "lgbmregressor__min_child_samples": [10, 20, 30, 50],
    # "lgbmregressor__subsample": [0.6, 0.8, 1.0],
    # "lgbmregressor__colsample_bytree": [0.6, 0.8, 1.0],
    # "lgbmregressor__reg_alpha": [0, 0.1, 0.5, 1.0],
    # "lgbmregressor__reg_lambda": [0, 0.1, 0.5, 1.0],
    # "lgbmregressor__feature_fraction": [0.6, 0.7, 0.8, 0.9, 1.0], 
    # "lgbmregressor__bagging_fraction": [0.6, 0.7, 0.8, 0.9, 1.0], 
    # "lgbmregressor__bagging_freq": [1, 5, 10],
}

# Size of the search space. Every entry above is a finite list, so it is the
# product of the list lengths. NOTE: if a scipy distribution is ever added
# here it has no len(); set n_iter directly in that case.
n_candidates = int(np.prod([len(values) for values in param_distributions.values()]))

# Initialize the RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    # Sample at most 5 settings, but never more than exist. Asking for 5 out
    # of a 4-point space made scikit-learn warn and silently run only 4.
    n_iter=min(5, n_candidates),
    scoring="neg_mean_absolute_error",  # Use MAE as the scoring metric
    cv=3,  # 3-fold cross-validation
    verbose=2,
    random_state=42,
)

# Fit the RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

# Get the best parameters and the best score
print("Best Parameters:", random_search.best_params_)
print("Best Score (Negative MAE):", random_search.best_score_)



Fitting 3 folds for each of 4 candidates, totalling 12 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000658 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9262
[LightGBM] [Info] Number of data points in the train set: 23904, number of used features: 114
[LightGBM] [Info] Start training from score 6.411898
[CV] END ........................lgbmregressor__num_leaves=2; total time=   0.3s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002385 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10970
[LightGBM] [Info] Number of data points in the train set: 23904, number of used features: 114
[LightGBM] [Info] Start training from score 8.241424
[CV] END ........................lgbmregressor__num_leaves=2; total time=   0.2s
[LightGBM] [Info] Auto-choo

Best Parameters: {'lgbmregressor__num_leaves': 50}
Best Score (Negative MAE): -2.4976047909169337


In [10]:
# Pipeline refit by the search with the best sampled hyperparameters.
best_model = random_search.best_estimator_
# Score it on the held-out test split.
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("Test Set MAE:", mae)

Test Set MAE: 2.572529041000948


In [11]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
import os
# Load environment variables from a local .env file (presumably the
# tracking credentials; confirm against src.experiment_utils).
load_dotenv() 

mlflow = set_mlflow_tracking()
# Log the tuned estimator. The original call passed `pipeline`, which is the
# default-hyperparameter fit from the earlier cell (RandomizedSearchCV works on
# clones), so the logged artifact did not match the recorded MAE or the
# experiment name. `best_model` is the model that produced `mae`.
log_model_to_mlflow(best_model, X_test, "LGBMRegressor_with_Hyperparameter_50leaves", "mean_absolute_error", score=mae)

True

INFO:src.experiment_utils:MLflow tracking URI set to: https://dagshub.com/ajitkumarsenthil5/citibike_prediciton_aml
INFO:httpx:HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"


INFO:dagshub:Accessing as ajitkumarsenthil5
INFO:httpx:HTTP Request: GET https://dagshub.com/api/v1/repos/ajitkumarsenthil5/citibike_prediciton_aml "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"


INFO:dagshub:Initialized MLflow to track repo "ajitkumarsenthil5/citibike_prediciton_aml"


INFO:dagshub:Repository ajitkumarsenthil5/citibike_prediciton_aml initialized!
INFO:src.experiment_utils:Dagshub initialized with MLflow integration.
INFO:src.experiment_utils:Using MLflow tracking URI: https://dagshub.com/ajitkumarsenthil5/citibike_prediciton_aml.mlflow
INFO:src.experiment_utils:Experiment set to: LGBMRegressor_with_Hyperparameter_50leaves
INFO:src.experiment_utils:Logged mean_absolute_error: 2.572529041000948
INFO:src.experiment_utils:Model signature inferred.
Registered model 'Pipeline' already exists. Creating a new version of this model...
2025/05/10 14:37:46 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Pipeline, version 2
Created version '2' of model 'Pipeline'.
INFO:src.experiment_utils:Model logged and registered as: Pipeline


🏃 View run omniscient-grub-921 at: https://dagshub.com/ajitkumarsenthil5/citibike_prediciton_aml.mlflow/#/experiments/4/runs/3a3e0484bd834c26965ff79ea151b096
🧪 View experiment at: https://dagshub.com/ajitkumarsenthil5/citibike_prediciton_aml.mlflow/#/experiments/4


<mlflow.models.model.ModelInfo at 0x3a43acc10>