In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

In [2]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))


In [3]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")
df

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,target
0,12,0,0,0,0,0,4,0,0,0,...,0,6,0,0,0,0,0,2024-01-29 00:00:00,HB101,2
1,10,0,0,0,0,0,6,0,0,0,...,0,11,0,0,0,0,0,2024-01-29 12:00:00,HB101,19
2,0,0,0,0,0,0,16,0,0,0,...,0,15,0,0,0,0,0,2024-01-30 00:00:00,HB101,1
3,29,0,0,0,0,0,11,0,0,0,...,0,9,0,0,0,0,0,2024-01-30 12:00:00,HB101,30
4,0,0,0,0,0,0,19,0,0,0,...,0,13,0,0,0,0,0,2024-01-31 00:00:00,HB101,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56103,7,0,0,0,0,0,3,0,0,0,...,0,7,0,0,0,0,0,2024-12-29 12:00:00,JC116,12
56104,0,0,0,0,0,0,7,0,0,0,...,0,8,0,0,0,0,0,2024-12-30 00:00:00,JC116,2
56105,19,0,0,0,0,0,16,0,0,0,...,0,3,0,0,0,0,0,2024-12-30 12:00:00,JC116,26
56106,1,0,0,0,0,0,5,0,0,0,...,0,8,0,0,0,0,0,2024-12-31 00:00:00,JC116,0


In [4]:
from datetime import datetime

from src.data_utils import split_time_series_data

X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2024, 9, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(35856, 674)
(35856,)
(20252, 674)
(20252,)


In [6]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA

class PCA_for_trides(BaseEstimator, RegressorMixin):
    def __init__(self, use_pca=False, n_components=10, random_state=42):
        """
        Args:
            use_pca: whether to apply PCA or just use raw features
            n_components: number of PCA components (only used if use_pca=True)
        """
        self.use_pca = use_pca
        self.n_components = n_components
        self.random_state = random_state
        
        # Internal objects
        self.scaler = None
        self.pca = None
        self.model = None
        self.feature_cols = None

    def fit(self, X, y):
       
        self.feature_cols = [col for col in X.columns if col.startswith("rides_t-")]
        X_filtered = X[self.feature_cols]
        
        # Scale
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X_filtered)

        # Optional PCA
        if self.use_pca:
            self.pca = PCA(n_components=self.n_components, random_state=self.random_state)
            X_transformed = self.pca.fit_transform(X_scaled)
        else:
            X_transformed = X_scaled

        # Train model
        self.model = RandomForestRegressor(random_state=self.random_state)
        self.model.fit(X_transformed, y)
        return self

    def predict(self, X):
        # Expect full DataFrame
        X_filtered = X[self.feature_cols]
        X_scaled = self.scaler.transform(X_filtered)

        if self.use_pca:
            X_transformed = self.pca.transform(X_scaled)
        else:
            X_transformed = X_scaled

        return self.model.predict(X_transformed)


In [10]:
model = PCA_for_trides(use_pca = False)
model.fit(X_train,y_train)

In [11]:
y_pred = model.predict(X_test)
from sklearn.metrics import mean_absolute_error
test_mae = mean_absolute_error(y_test, y_pred)
print(f"Test MAE: {test_mae:.2f}")


Test MAE: 2.65


In [15]:
model = PCA_for_trides(use_pca=True)
model.fit(X_train,y_train)


y_pred = model.predict(X_test)

from sklearn.metrics import mean_absolute_error
test_mae = mean_absolute_error(y_test, y_pred)
print(f"Test MAE: {test_mae:.2f}")

Test MAE: 3.10


In [16]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
sys.path
load_dotenv() 
log_model_to_mlflow(model, X_test, experiment_name="RandomRegressor_feature_reduced_without_pca", metric_name="mean_absolute_error", score=test_mae)

['/Library/Frameworks/Python.framework/Versions/3.9/lib/python39.zip',
 '/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9',
 '/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/lib-dynload',
 '',
 '/Users/ajit/Desktop/Citibike_prediction_aml/Final_proj_Aml/lib/python3.9/site-packages',
 '/Users/ajit/Desktop/Citibike_prediction_aml']

True

INFO:httpx:HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"


INFO:dagshub:Accessing as ajitkumarsenthil5
INFO:httpx:HTTP Request: GET https://dagshub.com/api/v1/repos/ajitkumarsenthil5/citibike_prediciton_aml "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"


INFO:dagshub:Initialized MLflow to track repo "ajitkumarsenthil5/citibike_prediciton_aml"


INFO:dagshub:Repository ajitkumarsenthil5/citibike_prediciton_aml initialized!
INFO:src.experiment_utils:Dagshub initialized with MLflow integration.
INFO:src.experiment_utils:Using MLflow tracking URI: https://dagshub.com/ajitkumarsenthil5/citibike_prediciton_aml.mlflow
2025/05/10 13:36:35 INFO mlflow.tracking.fluent: Experiment with name 'RandomRegressor_feature_reduced_without_pca' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: RandomRegressor_feature_reduced_without_pca
INFO:src.experiment_utils:Logged mean_absolute_error: 3.102079300809797
INFO:src.experiment_utils:Model signature inferred.
Successfully registered model 'PCA_for_trides'.
2025/05/10 13:39:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: PCA_for_trides, version 1
Created version '1' of model 'PCA_for_trides'.
INFO:src.experiment_utils:Model logged and registered as: PCA_for_trides


🏃 View run dapper-goose-526 at: https://dagshub.com/ajitkumarsenthil5/citibike_prediciton_aml.mlflow/#/experiments/2/runs/443547afcb7e435cb4ce365adede7009
🧪 View experiment at: https://dagshub.com/ajitkumarsenthil5/citibike_prediciton_aml.mlflow/#/experiments/2


<mlflow.models.model.ModelInfo at 0x173778a90>