In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import joblib

import numpy as np
import pandas as pd
import plotly.express as px

from typing import List, Optional

from datetime import date, datetime, timedelta
from pytz import timezone

import optuna
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error

import hopsworks
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

In [3]:
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

In [4]:
import src.config as config

In [5]:
# Connect to Hopsworks
project = hopsworks.login(project=config.HOPSWORKS_PROJECT_NAME, api_key_value=config.HOPSWORKS_API_KEY)

# Get the feature store handle for the project
feature_store = project.get_feature_store()

# Get the feature group
feature_group = feature_store.get_or_create_feature_group(name=config.FEATURE_GROUP_NAME, 
                                                          version=config.FEATURE_GROUP_VERSION,
                                                          description='Timeseries data with hourly frequency', 
                                                          primary_key=['pickup_location_id', 'pickup_ts'], 
                                                          event_time='pickup_hour')

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/699541
Connected. Call `.close()` to terminate connection gracefully.


In [6]:
# Create the feature view
try:
    #
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all()
    )
except:
    print(f'Feature View already exists')

# Get the feature view
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME, 
    version=config.FEATURE_VIEW_VERSION
    )

Feature View already exists


In [7]:
ts_data, _ = feature_view.training_data(
    description='Timeseries data with hourly frequency',
)


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (11.64s) 



In [8]:
# Drop the pickup_ts column
ts_data.drop('pickup_ts', axis=1, inplace=True)

#
ts_data.sort_values(by=['pickup_location_id', 'pickup_hour'], inplace=True)
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
380997,2023-01-01 00:00:00+00:00,0,1
1958924,2023-01-01 01:00:00+00:00,0,1
2579941,2023-01-01 02:00:00+00:00,0,1
2599272,2023-01-01 03:00:00+00:00,0,1
2954848,2023-01-01 04:00:00+00:00,0,1
...,...,...,...
276766,2024-05-08 19:00:00+00:00,5,265
1870901,2024-05-08 20:00:00+00:00,5,265
1912279,2024-05-08 21:00:00+00:00,6,265
308546,2024-05-08 22:00:00+00:00,4,265


In [9]:
def plot_ts(
        ts_data: pd.DataFrame,
        locations: Optional[List[int]] = None
        ):
    
    '''
    Plot time-series data for the specified locations
    '''

    ts_data_to_plot = ts_data[ts_data.pickup_location_id.isin(locations)] if locations else ts_data

    fig = px.line(
        ts_data_to_plot,
        x='pickup_hour',
        y='rides',
        color='pickup_location_id',
        template='none'
    )

    fig.show()

In [10]:
plot_ts(ts_data, locations=[265])

In [11]:
from src.data import transform_timeseries_data_into_features_targets

In [12]:
features, targets = transform_timeseries_data_into_features_targets(
    ts_data, 
    input_sequence_length=24*28, # 1 month
    step_size=23)

features_and_targets = features.copy()
features_and_targets['target_rides_next_hour'] = targets

100%|██████████| 265/265 [02:10<00:00,  2.03it/s]


In [13]:
print(f'{features_and_targets.shape=}')

features_and_targets.shape=(129055, 675)


In [14]:
features_and_targets

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-01-29 00:00:00+00:00,1,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,0.0,...,1.0,2.0,1.0,1.0,0.0,0.0,0.0,2023-01-29 23:00:00+00:00,1,0.0
2,2.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,2.0,1.0,2.0,0.0,1.0,0.0,2023-01-30 22:00:00+00:00,1,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,2.0,2.0,0.0,0.0,0.0,2023-01-31 21:00:00+00:00,1,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2.0,1.0,0.0,2.0,1.0,1.0,0.0,2023-02-01 20:00:00+00:00,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129050,1.0,7.0,3.0,5.0,10.0,4.0,6.0,1.0,1.0,2.0,...,1.0,3.0,2.0,4.0,2.0,1.0,4.0,2024-05-04 22:00:00+00:00,265,5.0
129051,3.0,7.0,6.0,4.0,1.0,4.0,0.0,1.0,2.0,2.0,...,1.0,3.0,3.0,3.0,5.0,3.0,1.0,2024-05-05 21:00:00+00:00,265,2.0
129052,2.0,1.0,2.0,3.0,2.0,6.0,0.0,0.0,1.0,0.0,...,0.0,14.0,4.0,3.0,7.0,4.0,5.0,2024-05-06 20:00:00+00:00,265,2.0
129053,1.0,1.0,1.0,1.0,2.0,4.0,3.0,1.0,1.0,0.0,...,7.0,6.0,4.0,12.0,9.0,10.0,11.0,2024-05-07 19:00:00+00:00,265,8.0


In [15]:
from src.data_split import train_test_split

In [16]:
cutoff_date = pd.to_datetime(datetime.utcnow() - timedelta(days=28)) 
cutoff_date = cutoff_date.tz_localize('UTC')
print(f'{cutoff_date=}')

X_train, y_train, X_test, y_test = train_test_split(
    features_and_targets, 
    cutoff_date,
    target_column_name='target_rides_next_hour'
)

cutoff_date=Timestamp('2024-04-12 05:08:39.467815+0000', tz='UTC')


In [17]:
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(121635, 674)
y_train.shape=(121635,)
X_test.shape=(7420, 674)
y_test.shape=(7420,)


In [18]:
from src.model import get_pipeline

In [19]:
import warnings

warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

In [20]:
def objective(trial: optuna.trial.Trial) -> float:

    '''
    Define the objective function to optimize the hyperparameters of the model
    Based on a time series split 
    '''

    # Define the hyperparameters to optimize
    hyperparameters = {
        'metric': 'mae',
        'verbose': -1,
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.2, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.2, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 3, 100),
    }

    # Sort the training data by pickup_hour
    # Allows TimeSeriesSplit to split the data in a consistent method
    X_train_sorted = X_train.sort_values(by='pickup_hour').reset_index(drop=True)
    y_train_sorted = y_train.reindex_like(X_train_sorted)

    tss = TimeSeriesSplit(n_splits=2)
    scores = []

    for train_index, val_index in tss.split(X_train):
        # Split the data into training and validation sets
        X_train_, X_val_ = X_train_sorted.iloc[train_index, :], X_train_sorted.iloc[val_index, :]
        y_train_, y_val_ = y_train_sorted.iloc[train_index], y_train_sorted.iloc[val_index]

        # Train the model
        pipeline = get_pipeline(**hyperparameters)
        pipeline.fit(X_train_, y_train_)

        # Evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)
        scores.append(mae)

    return np.array(scores).mean()

In [21]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

[I 2024-05-10 00:08:41,814] A new study created in memory with name: no-name-19a51735-89a8-4384-890c-1fa06034ddae




[I 2024-05-10 00:09:33,298] Trial 0 finished with value: 27.20853285726467 and parameters: {'num_leaves': 91, 'feature_fraction': 0.7196048758428861, 'bagging_fraction': 0.32723610232358685, 'min_child_samples': 37}. Best is trial 0 with value: 27.20853285726467.




[I 2024-05-10 00:09:55,590] Trial 1 finished with value: 27.369248357597023 and parameters: {'num_leaves': 33, 'feature_fraction': 0.2763185584273332, 'bagging_fraction': 0.4339502826014176, 'min_child_samples': 21}. Best is trial 0 with value: 27.20853285726467.




[I 2024-05-10 00:10:42,739] Trial 2 finished with value: 28.032150440723115 and parameters: {'num_leaves': 228, 'feature_fraction': 0.2249275758904717, 'bagging_fraction': 0.4497286941395879, 'min_child_samples': 9}. Best is trial 0 with value: 27.20853285726467.




[I 2024-05-10 00:11:34,950] Trial 3 finished with value: 27.816608865266012 and parameters: {'num_leaves': 199, 'feature_fraction': 0.4558396640425139, 'bagging_fraction': 0.8770995613349863, 'min_child_samples': 83}. Best is trial 0 with value: 27.20853285726467.




[I 2024-05-10 00:12:06,785] Trial 4 finished with value: 27.050824511187272 and parameters: {'num_leaves': 30, 'feature_fraction': 0.6966355127451305, 'bagging_fraction': 0.9546376341057936, 'min_child_samples': 39}. Best is trial 4 with value: 27.050824511187272.




[I 2024-05-10 00:13:14,751] Trial 5 finished with value: 27.761034074899886 and parameters: {'num_leaves': 185, 'feature_fraction': 0.866814883705199, 'bagging_fraction': 0.8805917572916904, 'min_child_samples': 40}. Best is trial 4 with value: 27.050824511187272.




[I 2024-05-10 00:13:53,814] Trial 6 finished with value: 27.45076824424202 and parameters: {'num_leaves': 114, 'feature_fraction': 0.4674501959317791, 'bagging_fraction': 0.8961015525048128, 'min_child_samples': 25}. Best is trial 4 with value: 27.050824511187272.




[I 2024-05-10 00:15:05,730] Trial 7 finished with value: 27.877407940781747 and parameters: {'num_leaves': 198, 'feature_fraction': 0.9637874060888507, 'bagging_fraction': 0.6381744080360963, 'min_child_samples': 18}. Best is trial 4 with value: 27.050824511187272.




[I 2024-05-10 00:16:15,256] Trial 8 finished with value: 27.672320226751317 and parameters: {'num_leaves': 180, 'feature_fraction': 0.8000656953029044, 'bagging_fraction': 0.5601203148922349, 'min_child_samples': 27}. Best is trial 4 with value: 27.050824511187272.




[I 2024-05-10 00:16:54,173] Trial 9 finished with value: 27.25264915610485 and parameters: {'num_leaves': 80, 'feature_fraction': 0.5023209502735595, 'bagging_fraction': 0.7023344635990645, 'min_child_samples': 85}. Best is trial 4 with value: 27.050824511187272.




[I 2024-05-10 00:17:12,859] Trial 10 finished with value: 27.17635612333425 and parameters: {'num_leaves': 2, 'feature_fraction': 0.6520990488667865, 'bagging_fraction': 0.9882339559086029, 'min_child_samples': 64}. Best is trial 4 with value: 27.050824511187272.




[I 2024-05-10 00:17:35,806] Trial 11 finished with value: 27.324737071406588 and parameters: {'num_leaves': 4, 'feature_fraction': 0.6578331929073326, 'bagging_fraction': 0.9778762201960115, 'min_child_samples': 62}. Best is trial 4 with value: 27.050824511187272.




[I 2024-05-10 00:18:08,964] Trial 12 finished with value: 27.224777816053013 and parameters: {'num_leaves': 43, 'feature_fraction': 0.583765891118757, 'bagging_fraction': 0.7734782374957219, 'min_child_samples': 59}. Best is trial 4 with value: 27.050824511187272.




[I 2024-05-10 00:18:32,994] Trial 13 finished with value: 27.258988530223235 and parameters: {'num_leaves': 4, 'feature_fraction': 0.7368318087008862, 'bagging_fraction': 0.9830380809330197, 'min_child_samples': 69}. Best is trial 4 with value: 27.050824511187272.




[I 2024-05-10 00:19:11,920] Trial 14 finished with value: 27.167009830371626 and parameters: {'num_leaves': 52, 'feature_fraction': 0.6132844576436366, 'bagging_fraction': 0.7788014484161325, 'min_child_samples': 45}. Best is trial 4 with value: 27.050824511187272.




[I 2024-05-10 00:19:48,151] Trial 15 finished with value: 27.26982111367117 and parameters: {'num_leaves': 64, 'feature_fraction': 0.5635825635943459, 'bagging_fraction': 0.7880029800999367, 'min_child_samples': 44}. Best is trial 4 with value: 27.050824511187272.




[I 2024-05-10 00:20:30,111] Trial 16 finished with value: 27.65593568144162 and parameters: {'num_leaves': 135, 'feature_fraction': 0.37339275843412434, 'bagging_fraction': 0.7616653388581033, 'min_child_samples': 49}. Best is trial 4 with value: 27.050824511187272.




[I 2024-05-10 00:21:23,280] Trial 17 finished with value: 27.55356918925811 and parameters: {'num_leaves': 134, 'feature_fraction': 0.8862615085026028, 'bagging_fraction': 0.5911289201965422, 'min_child_samples': 34}. Best is trial 4 with value: 27.050824511187272.




[I 2024-05-10 00:21:58,171] Trial 18 finished with value: 27.11981971129523 and parameters: {'num_leaves': 42, 'feature_fraction': 0.7703831070810327, 'bagging_fraction': 0.8734994982215805, 'min_child_samples': 75}. Best is trial 4 with value: 27.050824511187272.




[I 2024-05-10 00:22:46,437] Trial 19 finished with value: 27.40926969680613 and parameters: {'num_leaves': 103, 'feature_fraction': 0.7843154081455777, 'bagging_fraction': 0.8787435412212713, 'min_child_samples': 77}. Best is trial 4 with value: 27.050824511187272.




[I 2024-05-10 00:23:20,485] Trial 20 finished with value: 27.25179325124313 and parameters: {'num_leaves': 29, 'feature_fraction': 0.9799154184658131, 'bagging_fraction': 0.6876823786637887, 'min_child_samples': 71}. Best is trial 4 with value: 27.050824511187272.




[I 2024-05-10 00:24:01,626] Trial 21 finished with value: 27.243190308899436 and parameters: {'num_leaves': 69, 'feature_fraction': 0.6586957743036196, 'bagging_fraction': 0.7910460555004666, 'min_child_samples': 97}. Best is trial 4 with value: 27.050824511187272.




[I 2024-05-10 00:24:36,433] Trial 22 finished with value: 27.12051235527222 and parameters: {'num_leaves': 48, 'feature_fraction': 0.8470449453730732, 'bagging_fraction': 0.9255834615142853, 'min_child_samples': 54}. Best is trial 4 with value: 27.050824511187272.




[I 2024-05-10 00:25:05,612] Trial 23 finished with value: 26.97135134358193 and parameters: {'num_leaves': 24, 'feature_fraction': 0.8688118773076964, 'bagging_fraction': 0.20009220589739762, 'min_child_samples': 54}. Best is trial 23 with value: 26.97135134358193.




[I 2024-05-10 00:25:35,379] Trial 24 finished with value: 27.19956242702602 and parameters: {'num_leaves': 26, 'feature_fraction': 0.9198220983575367, 'bagging_fraction': 0.24001736464799783, 'min_child_samples': 99}. Best is trial 23 with value: 26.97135134358193.




[I 2024-05-10 00:26:03,938] Trial 25 finished with value: 26.98911732152562 and parameters: {'num_leaves': 21, 'feature_fraction': 0.7501567376388406, 'bagging_fraction': 0.3764414291486636, 'min_child_samples': 55}. Best is trial 23 with value: 26.97135134358193.




[I 2024-05-10 00:26:32,043] Trial 26 finished with value: 26.96451027752412 and parameters: {'num_leaves': 23, 'feature_fraction': 0.6996004185777196, 'bagging_fraction': 0.20246127811359504, 'min_child_samples': 52}. Best is trial 26 with value: 26.96451027752412.




[I 2024-05-10 00:26:59,417] Trial 27 finished with value: 27.041328682191242 and parameters: {'num_leaves': 18, 'feature_fraction': 0.8281700845228631, 'bagging_fraction': 0.20370442189339374, 'min_child_samples': 56}. Best is trial 26 with value: 26.96451027752412.




[I 2024-05-10 00:28:15,298] Trial 28 finished with value: 28.117095438185583 and parameters: {'num_leaves': 256, 'feature_fraction': 0.9258171678413272, 'bagging_fraction': 0.30271850768695163, 'min_child_samples': 51}. Best is trial 26 with value: 26.96451027752412.




[I 2024-05-10 00:28:56,819] Trial 29 finished with value: 27.19525282765025 and parameters: {'num_leaves': 83, 'feature_fraction': 0.7315747259360699, 'bagging_fraction': 0.36983297511070307, 'min_child_samples': 32}. Best is trial 26 with value: 26.96451027752412.




[I 2024-05-10 00:29:45,645] Trial 30 finished with value: 27.52360471069539 and parameters: {'num_leaves': 150, 'feature_fraction': 0.7284053901117141, 'bagging_fraction': 0.2947935116446667, 'min_child_samples': 47}. Best is trial 26 with value: 26.96451027752412.




[I 2024-05-10 00:30:10,384] Trial 31 finished with value: 27.13502908230199 and parameters: {'num_leaves': 11, 'feature_fraction': 0.8189881890008165, 'bagging_fraction': 0.202336513350877, 'min_child_samples': 57}. Best is trial 26 with value: 26.96451027752412.




[I 2024-05-10 00:30:38,734] Trial 32 finished with value: 26.95730462174685 and parameters: {'num_leaves': 21, 'feature_fraction': 0.8622913233553622, 'bagging_fraction': 0.38058309732212986, 'min_child_samples': 66}. Best is trial 32 with value: 26.95730462174685.




[I 2024-05-10 00:31:15,483] Trial 33 finished with value: 27.14285442245689 and parameters: {'num_leaves': 61, 'feature_fraction': 0.8927378115129233, 'bagging_fraction': 0.4401351932004283, 'min_child_samples': 69}. Best is trial 32 with value: 26.95730462174685.




[I 2024-05-10 00:31:43,999] Trial 34 finished with value: 26.97908931474352 and parameters: {'num_leaves': 26, 'feature_fraction': 0.7666968322470994, 'bagging_fraction': 0.37485530140064793, 'min_child_samples': 64}. Best is trial 32 with value: 26.95730462174685.




[I 2024-05-10 00:32:15,362] Trial 35 finished with value: 27.243825214735363 and parameters: {'num_leaves': 37, 'feature_fraction': 0.9992936702563844, 'bagging_fraction': 0.4911510704482203, 'min_child_samples': 65}. Best is trial 32 with value: 26.95730462174685.




[I 2024-05-10 00:32:56,347] Trial 36 finished with value: 27.370367518804898 and parameters: {'num_leaves': 92, 'feature_fraction': 0.6879542951921044, 'bagging_fraction': 0.2728646301547923, 'min_child_samples': 82}. Best is trial 32 with value: 26.95730462174685.




[I 2024-05-10 00:33:31,387] Trial 37 finished with value: 27.39478907258586 and parameters: {'num_leaves': 58, 'feature_fraction': 0.9289001713143151, 'bagging_fraction': 0.3517425930129181, 'min_child_samples': 39}. Best is trial 32 with value: 26.95730462174685.




[I 2024-05-10 00:33:58,497] Trial 38 finished with value: 26.94863798303215 and parameters: {'num_leaves': 19, 'feature_fraction': 0.8608206393043076, 'bagging_fraction': 0.5153861087184175, 'min_child_samples': 90}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:34:37,847] Trial 39 finished with value: 27.271623413331902 and parameters: {'num_leaves': 75, 'feature_fraction': 0.8622089070590296, 'bagging_fraction': 0.5255064507418766, 'min_child_samples': 89}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:35:03,318] Trial 40 finished with value: 26.987545906237337 and parameters: {'num_leaves': 18, 'feature_fraction': 0.9458460986655031, 'bagging_fraction': 0.25206249350378834, 'min_child_samples': 5}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:35:34,605] Trial 41 finished with value: 27.06112135156414 and parameters: {'num_leaves': 34, 'feature_fraction': 0.7989199181304447, 'bagging_fraction': 0.4824181568714802, 'min_child_samples': 91}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:35:59,322] Trial 42 finished with value: 27.069219471504397 and parameters: {'num_leaves': 14, 'feature_fraction': 0.8617523342706863, 'bagging_fraction': 0.33615981019339325, 'min_child_samples': 77}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:36:28,812] Trial 43 finished with value: 27.098384843345343 and parameters: {'num_leaves': 29, 'feature_fraction': 0.6933603306413922, 'bagging_fraction': 0.4067738885868958, 'min_child_samples': 61}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:37:01,501] Trial 44 finished with value: 27.090919803072943 and parameters: {'num_leaves': 46, 'feature_fraction': 0.8219336212793985, 'bagging_fraction': 0.31497916292654876, 'min_child_samples': 64}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:37:20,432] Trial 45 finished with value: 27.364209523439254 and parameters: {'num_leaves': 3, 'feature_fraction': 0.7710540418565476, 'bagging_fraction': 0.40661532751211443, 'min_child_samples': 42}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:37:43,334] Trial 46 finished with value: 27.97215804553118 and parameters: {'num_leaves': 166, 'feature_fraction': 0.2193869369298509, 'bagging_fraction': 0.2329639942861702, 'min_child_samples': 15}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:38:03,581] Trial 47 finished with value: 27.298224548958927 and parameters: {'num_leaves': 38, 'feature_fraction': 0.3559585615229568, 'bagging_fraction': 0.5529755438256958, 'min_child_samples': 73}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:38:37,763] Trial 48 finished with value: 27.212739963514256 and parameters: {'num_leaves': 54, 'feature_fraction': 0.6248016906480673, 'bagging_fraction': 0.4059345166205804, 'min_child_samples': 66}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:39:23,678] Trial 49 finished with value: 27.560721220861236 and parameters: {'num_leaves': 112, 'feature_fraction': 0.8900756144445856, 'bagging_fraction': 0.4863846841567302, 'min_child_samples': 83}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:39:45,170] Trial 50 finished with value: 27.06856555677863 and parameters: {'num_leaves': 15, 'feature_fraction': 0.5220174081800405, 'bagging_fraction': 0.2803002591813323, 'min_child_samples': 53}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:40:13,610] Trial 51 finished with value: 27.0757949456057 and parameters: {'num_leaves': 22, 'feature_fraction': 0.9593421619945287, 'bagging_fraction': 0.25326802515571784, 'min_child_samples': 59}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:40:37,742] Trial 52 finished with value: 27.07522182321788 and parameters: {'num_leaves': 11, 'feature_fraction': 0.9355497941460135, 'bagging_fraction': 0.22724203159211875, 'min_child_samples': 5}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:41:06,782] Trial 53 finished with value: 26.989953985686647 and parameters: {'num_leaves': 30, 'feature_fraction': 0.89447734161809, 'bagging_fraction': 0.26375587807769874, 'min_child_samples': 35}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:41:25,934] Trial 54 finished with value: 27.365624293448377 and parameters: {'num_leaves': 3, 'feature_fraction': 0.8455088977738126, 'bagging_fraction': 0.3609708888733106, 'min_child_samples': 13}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:41:58,130] Trial 55 finished with value: 27.215444005785415 and parameters: {'num_leaves': 46, 'feature_fraction': 0.9512788238768517, 'bagging_fraction': 0.3254846725186318, 'min_child_samples': 25}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:42:24,847] Trial 56 finished with value: 26.97597675231465 and parameters: {'num_leaves': 21, 'feature_fraction': 0.7652162833598319, 'bagging_fraction': 0.6005078711687917, 'min_child_samples': 48}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:43:21,900] Trial 57 finished with value: 27.789424940932943 and parameters: {'num_leaves': 212, 'feature_fraction': 0.7629776792771636, 'bagging_fraction': 0.5974484632048344, 'min_child_samples': 50}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:43:50,903] Trial 58 finished with value: 27.131162639845456 and parameters: {'num_leaves': 36, 'feature_fraction': 0.7964423546307366, 'bagging_fraction': 0.6726212848446931, 'min_child_samples': 45}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:44:26,670] Trial 59 finished with value: 27.194709864275126 and parameters: {'num_leaves': 70, 'feature_fraction': 0.70739395290593, 'bagging_fraction': 0.5623308984649901, 'min_child_samples': 29}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:44:54,379] Trial 60 finished with value: 27.18286863733443 and parameters: {'num_leaves': 26, 'feature_fraction': 0.6632718288394386, 'bagging_fraction': 0.6277169539035511, 'min_child_samples': 68}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:45:21,164] Trial 61 finished with value: 27.03889004126178 and parameters: {'num_leaves': 16, 'feature_fraction': 0.8247841408595433, 'bagging_fraction': 0.21900973769843282, 'min_child_samples': 79}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:45:46,284] Trial 62 finished with value: 26.960158215622037 and parameters: {'num_leaves': 9, 'feature_fraction': 0.882179097664653, 'bagging_fraction': 0.6348714728205739, 'min_child_samples': 94}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:46:09,858] Trial 63 finished with value: 27.175486688554827 and parameters: {'num_leaves': 8, 'feature_fraction': 0.8707160512099779, 'bagging_fraction': 0.6395174162316801, 'min_child_samples': 95}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:46:38,333] Trial 64 finished with value: 26.997655715220546 and parameters: {'num_leaves': 24, 'feature_fraction': 0.7859018125739557, 'bagging_fraction': 0.731546594178545, 'min_child_samples': 89}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:47:09,899] Trial 65 finished with value: 27.020495165070095 and parameters: {'num_leaves': 41, 'feature_fraction': 0.7469857706917807, 'bagging_fraction': 0.4648474239354975, 'min_child_samples': 94}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:47:43,532] Trial 66 finished with value: 27.351053154393732 and parameters: {'num_leaves': 51, 'feature_fraction': 0.9083161789969914, 'bagging_fraction': 0.5260281088778955, 'min_child_samples': 47}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:48:13,544] Trial 67 finished with value: 27.02809883561674 and parameters: {'num_leaves': 32, 'feature_fraction': 0.8410307014249534, 'bagging_fraction': 0.660436937968951, 'min_child_samples': 60}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:48:36,941] Trial 68 finished with value: 27.128155516006146 and parameters: {'num_leaves': 10, 'feature_fraction': 0.7191695933429504, 'bagging_fraction': 0.7158744497385416, 'min_child_samples': 86}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:49:05,429] Trial 69 finished with value: 27.125475002775083 and parameters: {'num_leaves': 24, 'feature_fraction': 0.9774008190529191, 'bagging_fraction': 0.521655706727016, 'min_child_samples': 100}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:49:22,080] Trial 70 finished with value: 27.179143097628156 and parameters: {'num_leaves': 2, 'feature_fraction': 0.8109409863747027, 'bagging_fraction': 0.6225207404968517, 'min_child_samples': 42}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:49:48,096] Trial 71 finished with value: 27.059357702607922 and parameters: {'num_leaves': 17, 'feature_fraction': 0.874672255081786, 'bagging_fraction': 0.29010343320186927, 'min_child_samples': 57}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:50:14,743] Trial 72 finished with value: 27.028921655692372 and parameters: {'num_leaves': 21, 'feature_fraction': 0.9471372554107789, 'bagging_fraction': 0.25159361784957623, 'min_child_samples': 3}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:50:49,234] Trial 73 finished with value: 27.161893530904393 and parameters: {'num_leaves': 41, 'feature_fraction': 0.9138279222034191, 'bagging_fraction': 0.5794097566990816, 'min_child_samples': 21}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:51:19,097] Trial 74 finished with value: 27.11002775498666 and parameters: {'num_leaves': 32, 'feature_fraction': 0.8468914191932532, 'bagging_fraction': 0.2042372327100073, 'min_child_samples': 52}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:51:42,871] Trial 75 finished with value: 27.025769886638734 and parameters: {'num_leaves': 11, 'feature_fraction': 0.9999698136800126, 'bagging_fraction': 0.38783357676341973, 'min_child_samples': 63}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:52:17,710] Trial 76 finished with value: 27.184622613452643 and parameters: {'num_leaves': 56, 'feature_fraction': 0.6792635403464186, 'bagging_fraction': 0.4535877564918421, 'min_child_samples': 73}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:52:44,124] Trial 77 finished with value: 27.150258593171387 and parameters: {'num_leaves': 17, 'feature_fraction': 0.6313371848794607, 'bagging_fraction': 0.3395717292757748, 'min_child_samples': 80}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:53:06,683] Trial 78 finished with value: 27.29037398883133 and parameters: {'num_leaves': 8, 'feature_fraction': 0.9024521937973814, 'bagging_fraction': 0.4281662007689839, 'min_child_samples': 93}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:53:35,416] Trial 79 finished with value: 27.090789315547923 and parameters: {'num_leaves': 25, 'feature_fraction': 0.7677619546110466, 'bagging_fraction': 0.24365642247201771, 'min_child_samples': 87}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:54:08,501] Trial 80 finished with value: 27.075855147223848 and parameters: {'num_leaves': 48, 'feature_fraction': 0.8621388181175659, 'bagging_fraction': 0.8310408702317822, 'min_child_samples': 48}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:54:35,275] Trial 81 finished with value: 27.027407398367437 and parameters: {'num_leaves': 20, 'feature_fraction': 0.8040311197587298, 'bagging_fraction': 0.39077989859285134, 'min_child_samples': 53}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:55:05,710] Trial 82 finished with value: 27.07904180730504 and parameters: {'num_leaves': 35, 'feature_fraction': 0.7495003614100649, 'bagging_fraction': 0.27164388880525403, 'min_child_samples': 56}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:55:29,836] Trial 83 finished with value: 27.077778306659408 and parameters: {'num_leaves': 28, 'feature_fraction': 0.5682634000535176, 'bagging_fraction': 0.3073960469767994, 'min_child_samples': 67}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:55:55,949] Trial 84 finished with value: 27.077394999244113 and parameters: {'num_leaves': 18, 'feature_fraction': 0.7091153104921731, 'bagging_fraction': 0.3764144908654892, 'min_child_samples': 56}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:56:19,027] Trial 85 finished with value: 27.203135751446265 and parameters: {'num_leaves': 9, 'feature_fraction': 0.7820118646256199, 'bagging_fraction': 0.5107339356114625, 'min_child_samples': 37}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:56:51,041] Trial 86 finished with value: 27.08861541641351 and parameters: {'num_leaves': 41, 'feature_fraction': 0.8336837146953845, 'bagging_fraction': 0.42749086937435216, 'min_child_samples': 63}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:57:07,430] Trial 87 finished with value: 27.18550220093038 and parameters: {'num_leaves': 2, 'feature_fraction': 0.9775818853493429, 'bagging_fraction': 0.2151516228157817, 'min_child_samples': 98}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:57:47,441] Trial 88 finished with value: 27.33473006388975 and parameters: {'num_leaves': 89, 'feature_fraction': 0.7378142905669693, 'bagging_fraction': 0.3417808623781553, 'min_child_samples': 71}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:58:16,446] Trial 89 finished with value: 27.01538393961447 and parameters: {'num_leaves': 29, 'feature_fraction': 0.880135124507468, 'bagging_fraction': 0.5500021519906905, 'min_child_samples': 50}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:58:52,827] Trial 90 finished with value: 27.122102506904817 and parameters: {'num_leaves': 66, 'feature_fraction': 0.670253723353837, 'bagging_fraction': 0.3140107636552203, 'min_child_samples': 59}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:59:22,541] Trial 91 finished with value: 27.160937737349606 and parameters: {'num_leaves': 32, 'feature_fraction': 0.9308099897072067, 'bagging_fraction': 0.26637812155541524, 'min_child_samples': 31}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 00:59:49,707] Trial 92 finished with value: 27.010836584791694 and parameters: {'num_leaves': 22, 'feature_fraction': 0.8985770676357544, 'bagging_fraction': 0.23031777157441405, 'min_child_samples': 37}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 01:00:15,144] Trial 93 finished with value: 27.01492381979704 and parameters: {'num_leaves': 16, 'feature_fraction': 0.8884673559082322, 'bagging_fraction': 0.6138282199855369, 'min_child_samples': 11}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 01:00:48,183] Trial 94 finished with value: 26.960223160667475 and parameters: {'num_leaves': 35, 'feature_fraction': 0.8602528362532522, 'bagging_fraction': 0.25869668325198203, 'min_child_samples': 46}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 01:01:04,381] Trial 95 finished with value: 27.245153814368123 and parameters: {'num_leaves': 8, 'feature_fraction': 0.24418093353854292, 'bagging_fraction': 0.20301556419354203, 'min_child_samples': 41}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 01:02:09,942] Trial 96 finished with value: 27.96223560075085 and parameters: {'num_leaves': 246, 'feature_fraction': 0.8218262360490962, 'bagging_fraction': 0.646120213390962, 'min_child_samples': 54}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 01:03:01,694] Trial 97 finished with value: 27.58535890974641 and parameters: {'num_leaves': 146, 'feature_fraction': 0.8490917847126868, 'bagging_fraction': 0.2882308099481158, 'min_child_samples': 49}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 01:03:32,708] Trial 98 finished with value: 27.111666832028142 and parameters: {'num_leaves': 36, 'feature_fraction': 0.791360609182423, 'bagging_fraction': 0.2515517909908591, 'min_child_samples': 46}. Best is trial 38 with value: 26.94863798303215.




[I 2024-05-10 01:03:57,536] Trial 99 finished with value: 27.112137280251105 and parameters: {'num_leaves': 13, 'feature_fraction': 0.6015977560267526, 'bagging_fraction': 0.57489238898797, 'min_child_samples': 19}. Best is trial 38 with value: 26.94863798303215.


In [22]:
best_parameters = study.best_trial.params
print(f'{best_parameters=}')

best_parameters={'num_leaves': 19, 'feature_fraction': 0.8608206393043076, 'bagging_fraction': 0.5153861087184175, 'min_child_samples': 90}


In [23]:
pipeline = get_pipeline(**best_parameters)
pipeline.fit(X_train, y_train)



In [24]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae:.2f}')

2.89


In [25]:
from src.paths import MODELS_DIR

In [26]:
joblib.dump(pipeline, MODELS_DIR / 'model.pkl')

['/Users/cmartinez/Documents/08_projects/taxi_demand_prediction_solution/models/model.pkl']

In [27]:
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

In [28]:
model_registry = project.get_model_registry()

Connected. Call `.close()` to terminate connection gracefully.


In [29]:
model = model_registry.sklearn.create_model(
    name='taxi_demand_prediction',
    metrics={'test_mae': test_mae},
    description='LightGBM regression model with hyperparameter tuning to predict the number of rides in the next hour',
    input_example=X_train.sample(),
    model_schema=model_schema
)

In [31]:
model.save(str(MODELS_DIR / 'model.pkl'))

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/239295 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/4461 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/58136 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/699541/models/taxi_demand_prediction/1


Model(name: 'taxi_demand_prediction', version: 1)