In [2]:
import warnings

# Silence every warning category for the remainder of the kernel session
warnings.simplefilter('ignore')

In [3]:
import sys
import os
import pandas as pd

# Resolve the project root: the parent of the current working directory
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Location of the `src` package (not referenced again in the cells shown here)
src_path = os.path.join(project_root, 'src')

# Put the project root on the import path so that `src.*` modules resolve.
# The membership check keeps this cell idempotent: re-running it no longer
# stacks duplicate entries onto sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

# Print sys.path to debug
print(sys.path)

# Verify the Python executable being used
print(sys.executable)

# This import has to come after the sys.path update above, otherwise the
# `src` package would not be found
from src.paths import TRANSFORMED_DATA_DIR

# Load the pre-computed tabular dataset and display it as the cell output
df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
df

['/Users/eugene/Github/taxi_demand_predictor/notebooks', '/Users/eugene/opt/anaconda3/lib/python39.zip', '/Users/eugene/opt/anaconda3/lib/python3.9', '/Users/eugene/opt/anaconda3/lib/python3.9/lib-dynload', '', '/Users/eugene/.local/lib/python3.9/site-packages', '/Users/eugene/opt/anaconda3/lib/python3.9/site-packages', '/Users/eugene/opt/anaconda3/lib/python3.9/site-packages/aeosa', '/Users/eugene/Github/taxi_demand_predictor']
/Users/eugene/opt/anaconda3/bin/python


Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,0.0,3.0,...,11.0,7.0,4.0,3.0,4.0,9.0,19.0,2022-01-29,4,17.0
1,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,5.0,4.0,10.0,7.0,5.0,9.0,10.0,2022-01-30,4,9.0
2,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2.0,3.0,...,8.0,7.0,8.0,5.0,5.0,10.0,0.0,2022-01-31,4,3.0
3,1.0,1.0,0.0,0.0,0.0,3.0,2.0,3.0,4.0,5.0,...,3.0,16.0,7.0,1.0,0.0,1.0,3.0,2022-02-01,4,3.0
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,1.0,2.0,...,3.0,8.0,3.0,0.0,4.0,4.0,3.0,2022-02-02,4,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-12-27,199,0.0
88290,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-12-28,199,0.0
88291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-12-29,199,0.0
88292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-12-30,199,0.0


In [4]:
from datetime import datetime
from src.data_split import train_test_split

# Split the tabular data around a cutoff date into train/test features and targets
# (the exact split rule is implemented in src.data_split.train_test_split)
split_cutoff = datetime(2022, 6, 30)
X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=split_cutoff,
    target_column_name='target_rides_next_hour',
)

# Report the shape of every split, one per line
print(
    f'{X_train.shape=}',
    f'{y_train.shape=}',
    f'{X_test.shape=}',
    f'{y_test.shape=}',
    sep='\n',
)

X_train.shape=(39824, 674)
y_train.shape=(39824,)
X_test.shape=(48470, 674)
y_test.shape=(48470,)


In [5]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import random

from src.model import get_pipeline

def objective() -> tuple[float, dict]:
    """
    Sample one random hyperparameter configuration and score it with 3-fold
    cross-validation on the notebook-level `X_train` / `y_train`.

    For every fold a fresh pipeline is built with `get_pipeline(**hyperparams)`,
    fitted on the fold's training rows and scored with mean absolute error on
    the fold's validation rows.

    Returns:
        A `(mean_mae, hyperparams)` tuple: the MAE averaged over the folds and
        the hyperparameter dict that was evaluated.
    """
    # Draw one point from the manually defined search space.
    # NOTE(review): if these are forwarded to LightGBM, `bagging_fraction` only
    # takes effect when `bagging_freq` is non-zero — confirm get_pipeline sets it.
    hyperparams = {
        'metrics': 'mae',
        'verbose': -1,
        'num_leaves': random.randint(2, 256),
        'feature_fraction': random.uniform(0.2, 1.0),
        'bagging_fraction': random.uniform(0.2, 1.0),
        'min_child_samples': random.randint(3, 100),
    }

    print(f"Evaluating hyperparameters: {hyperparams}")

    # NOTE(review): KFold without shuffling cuts the rows into contiguous
    # blocks. The variable used to be called `tss`, which suggests a
    # TimeSeriesSplit may have been intended — confirm which is wanted.
    kfold = KFold(n_splits=3)
    fold_maes = []

    for train_index, val_index in kfold.split(X_train):

        # Split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]

        # Train a fresh model on this fold
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)

        # Evaluate the model on the held-out rows
        y_pred = pipeline.predict(X_val_)
        fold_maes.append(mean_absolute_error(y_val_, y_pred))

    # Return the mean score (as a plain float) together with the hyperparameters
    return float(np.mean(fold_maes)), hyperparams

def random_search(num_trials: int = 10) -> tuple[float, dict]:
    """
    Run `objective()` `num_trials` times and keep the trial with the lowest
    mean MAE. The mean MAE of every trial is printed as it completes.

    Args:
        num_trials: Number of random hyperparameter configurations to
            evaluate. Must be at least 1.

    Returns:
        A `(best_score, best_hyperparams)` tuple for the best trial.

    Raises:
        ValueError: If `num_trials` is smaller than 1. Without a trial there is
            no best configuration, and returning `(inf, None)` would only fail
            later when the hyperparameters are unpacked.
    """
    if num_trials < 1:
        raise ValueError(f'num_trials must be >= 1, got {num_trials}')

    best_score = float('inf')
    best_hyperparams = None

    for _ in range(num_trials):
        mean_score, hyperparams = objective()
        print(f'Mean MAE: {mean_score}')

        # Lower MAE is better
        if mean_score < best_score:
            best_score = mean_score
            best_hyperparams = hyperparams

    return best_score, best_hyperparams

In [6]:
# Seed Python's `random` module, which `objective` samples hyperparameters from,
# so the configurations drawn by the search are the same on every run
random.seed(42)

# Run the random search
best_score, best_params = random_search(num_trials=10)
print(f'Best MAE: {best_score}')
print(f'Best Hyperparameters: {best_params}')

Evaluating hyperparameters: {'metrics': 'mae', 'verbose': -1, 'num_leaves': 185, 'feature_fraction': 0.6931880883043648, 'bagging_fraction': 0.7092586523475284, 'min_child_samples': 32}
Mean MAE: 3.4844169514254126
Evaluating hyperparameters: {'metrics': 'mae', 'verbose': -1, 'num_leaves': 229, 'feature_fraction': 0.8247776203571695, 'bagging_fraction': 0.9275862372964114, 'min_child_samples': 100}
Mean MAE: 3.7512649615968865
Evaluating hyperparameters: {'metrics': 'mae', 'verbose': -1, 'num_leaves': 175, 'feature_fraction': 0.8326938551178935, 'bagging_fraction': 0.30779713014254795, 'min_child_samples': 10}
Mean MAE: 3.1764913072928747
Evaluating hyperparameters: {'metrics': 'mae', 'verbose': -1, 'num_leaves': 96, 'feature_fraction': 0.9502953489141897, 'bagging_fraction': 0.8972146631903171, 'min_child_samples': 42}
Mean MAE: 3.4018298794299278
Evaluating hyperparameters: {'metrics': 'mae', 'verbose': -1, 'num_leaves': 98, 'feature_fraction': 0.8422328801367645, 'bagging_fraction':

In [7]:

# Build a pipeline from the best hyperparameters found by the random search
# and fit it on the full training split
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

In [8]:
# Score the fitted pipeline on the test split and report its MAE
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f'{test_mae=:.4f}')

test_mae=2.7973


In [9]:
from src.plot import plot_one_sample

# Plot test example 100 together with the model's predictions
predictions_series = pd.Series(predictions)
plot_one_sample(
    features=X_test,
    targets=y_test,
    example_id=100,
    predictions=predictions_series,
)

In [10]:
# Plot test example 250 together with the model's predictions
predictions_series = pd.Series(predictions)
plot_one_sample(
    features=X_test,
    targets=y_test,
    example_id=250,
    predictions=predictions_series,
)