In [1]:
# Enable autoreload
%reload_ext autoreload
%autoreload 2

In [2]:
# import warnings
# warnings.filterwarnings("ignore")

In [3]:
import src.config as config

In [4]:
import hopsworks

In [5]:
# connect to project
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY
)

# connect to feature store
feature_store = project.get_feature_store()

# connect to the feature group
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION
)

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/837781
Connected. Call `.close()` to terminate connection gracefully.


In [6]:
# create feature view if it doesn't exist yet
try:
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all()
    )
except:
    print('Feature view already existed. Skip creation.')

# get feature view
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION
)

Feature view already existed. Skip creation.


In [7]:
ts_date, _ = feature_view.training_data(
    description='Time-series hourly taxi rides',
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (410.10s) 



In [8]:
ts_date.sort_values(by=['pickup_location_id', 'pickup_hour'], inplace=True)
ts_date

Unnamed: 0,pickup_hour,rides,pickup_location_id
4990547,2022-01-01 00:00:00+00:00,0,1
1221545,2022-01-01 01:00:00+00:00,0,1
4477920,2022-01-01 02:00:00+00:00,0,1
3300579,2022-01-01 03:00:00+00:00,0,1
2797477,2022-01-01 04:00:00+00:00,1,1
...,...,...,...
2682341,2024-04-30 19:00:00+00:00,1,265
2508614,2024-04-30 20:00:00+00:00,1,265
682208,2024-04-30 21:00:00+00:00,2,265
4517952,2024-04-30 22:00:00+00:00,5,265


In [9]:
from src.data import transform_ts_data_into_features_and_target

features, targets = transform_ts_data_into_features_and_target(
    ts_date,
    input_seq_len=24*28, # one month
    step_size=23
)

features_and_target = features.copy()
features_and_target['target_rides_next_hour'] = targets

print(f'{features_and_target.shape=}')

  0%|          | 0/246 [00:00<?, ?it/s]




  0%|          | 1/246 [00:00<03:20,  1.22it/s]




  1%|          | 2/246 [00:01<02:44,  1.49it/s]




  1%|          | 3/246 [00:02<02:37,  1.55it/s]




  2%|▏         | 4/246 [00:02<02:32,  1.59it/s]




  2%|▏         | 5/246 [00:03<02:32,  1.58it/s]




  2%|▏         | 6/246 [00:03<02:28,  1.61it/s]




  3%|▎         | 7/246 [00:04<02:27,  1.62it/s]




  3%|▎         | 8/246 [00:05<02:27,  1.61it/s]




  4%|▎         | 9/246 [00:06<02:52,  1.38it/s]




  4%|▍         | 10/246 [00:06<03:00,  1.31it/s]




  4%|▍         | 11/246 [00:07<03:08,  1.24it/s]




  5%|▍         | 12/246 [00:08<03:24,  1.14it/s]




  5%|▌         | 13/246 [00:09<03:21,  1.15it/s]




  6%|▌         | 14/246 [00:10<03:12,  1.21it/s]




  6%|▌         | 15/246 [00:11<03:19,  1.16it/s]




  7%|▋         | 16/246 [00:12<03:05,  1.24it/s]




  7%|▋         | 17/246 [00:12<02:54,  1.32it/s]




  7%|▋         | 18/246 [00:13<02:49,  1.34it/s]




  8%|▊         | 19/246 [00:14<03:01,  1.25it/s]




  8%|▊         | 20/246 [00:15<03:03,  1.23it/s]




  9%|▊         | 21/246 [00:15<02:52,  1.30it/s]




  9%|▉         | 22/246 [00:16<02:43,  1.37it/s]




  9%|▉         | 23/246 [00:17<02:37,  1.42it/s]




 10%|▉         | 24/246 [00:17<02:34,  1.44it/s]




 10%|█         | 25/246 [00:18<02:30,  1.46it/s]




 11%|█         | 26/246 [00:19<02:32,  1.44it/s]




 11%|█         | 27/246 [00:19<02:30,  1.46it/s]




 11%|█▏        | 28/246 [00:20<02:27,  1.48it/s]




 12%|█▏        | 29/246 [00:21<02:25,  1.50it/s]




 12%|█▏        | 30/246 [00:21<02:32,  1.41it/s]




 13%|█▎        | 31/246 [00:22<02:54,  1.23it/s]




 13%|█▎        | 32/246 [00:23<02:50,  1.26it/s]




 13%|█▎        | 33/246 [00:24<02:50,  1.25it/s]




 14%|█▍        | 34/246 [00:25<02:46,  1.28it/s]




 14%|█▍        | 35/246 [00:26<02:46,  1.27it/s]




 15%|█▍        | 36/246 [00:26<02:42,  1.29it/s]




 15%|█▌        | 37/246 [00:27<02:50,  1.22it/s]




 15%|█▌        | 38/246 [00:28<02:44,  1.26it/s]




 16%|█▌        | 39/246 [00:29<02:43,  1.27it/s]




 16%|█▋        | 40/246 [00:29<02:38,  1.30it/s]




 17%|█▋        | 41/246 [00:30<02:33,  1.34it/s]




 17%|█▋        | 42/246 [00:31<02:40,  1.27it/s]




 17%|█▋        | 43/246 [00:32<02:41,  1.26it/s]




 18%|█▊        | 44/246 [00:33<02:36,  1.29it/s]




 18%|█▊        | 45/246 [00:33<02:38,  1.27it/s]




 19%|█▊        | 46/246 [00:34<02:37,  1.27it/s]




 19%|█▉        | 47/246 [00:35<02:34,  1.29it/s]




 20%|█▉        | 48/246 [00:36<02:32,  1.30it/s]




 20%|█▉        | 49/246 [00:37<02:36,  1.26it/s]




 20%|██        | 50/246 [00:37<02:30,  1.30it/s]




 21%|██        | 51/246 [00:38<02:26,  1.33it/s]




 21%|██        | 52/246 [00:39<02:24,  1.35it/s]




 22%|██▏       | 53/246 [00:39<02:23,  1.34it/s]




 22%|██▏       | 54/246 [00:40<02:21,  1.36it/s]




 22%|██▏       | 55/246 [00:41<02:19,  1.37it/s]




 23%|██▎       | 56/246 [00:42<02:19,  1.36it/s]




 23%|██▎       | 57/246 [00:42<02:18,  1.36it/s]




 24%|██▎       | 58/246 [00:43<02:17,  1.37it/s]




 24%|██▍       | 59/246 [00:44<02:17,  1.36it/s]




 24%|██▍       | 60/246 [00:45<02:16,  1.37it/s]




 25%|██▍       | 61/246 [00:45<02:15,  1.37it/s]




 25%|██▌       | 62/246 [00:46<02:14,  1.37it/s]




 26%|██▌       | 63/246 [00:47<02:16,  1.35it/s]




 26%|██▌       | 64/246 [00:48<02:14,  1.35it/s]




 26%|██▋       | 65/246 [00:48<02:13,  1.35it/s]




 27%|██▋       | 66/246 [00:49<02:14,  1.34it/s]




 27%|██▋       | 67/246 [00:50<02:13,  1.34it/s]




 28%|██▊       | 68/246 [00:51<02:12,  1.35it/s]




 28%|██▊       | 69/246 [00:51<02:12,  1.33it/s]




 28%|██▊       | 70/246 [00:52<02:12,  1.33it/s]




 29%|██▉       | 71/246 [00:53<02:14,  1.30it/s]




 29%|██▉       | 72/246 [00:54<02:13,  1.30it/s]




 30%|██▉       | 73/246 [00:54<02:11,  1.31it/s]




 30%|███       | 74/246 [00:55<02:10,  1.32it/s]




 30%|███       | 75/246 [00:56<02:08,  1.33it/s]




 31%|███       | 76/246 [00:57<02:08,  1.32it/s]




 31%|███▏      | 77/246 [00:57<02:07,  1.32it/s]




 32%|███▏      | 78/246 [00:58<02:09,  1.30it/s]




 32%|███▏      | 79/246 [01:00<02:40,  1.04it/s]




 33%|███▎      | 80/246 [01:01<02:52,  1.04s/it]




 33%|███▎      | 81/246 [01:02<02:55,  1.06s/it]




 33%|███▎      | 82/246 [01:03<02:43,  1.00it/s]




 34%|███▎      | 83/246 [01:04<02:41,  1.01it/s]




 34%|███▍      | 84/246 [01:05<02:30,  1.08it/s]




 35%|███▍      | 85/246 [01:05<02:23,  1.12it/s]




 35%|███▍      | 86/246 [01:06<02:18,  1.15it/s]




 35%|███▌      | 87/246 [01:07<02:16,  1.17it/s]




 36%|███▌      | 88/246 [01:08<02:12,  1.19it/s]




 36%|███▌      | 89/246 [01:09<02:12,  1.18it/s]




 37%|███▋      | 90/246 [01:10<02:14,  1.16it/s]




 37%|███▋      | 91/246 [01:10<02:10,  1.18it/s]




 37%|███▋      | 92/246 [01:11<02:11,  1.17it/s]




 38%|███▊      | 93/246 [01:12<02:10,  1.18it/s]




 38%|███▊      | 94/246 [01:13<02:07,  1.19it/s]




 39%|███▊      | 95/246 [01:14<02:06,  1.19it/s]




 39%|███▉      | 96/246 [01:15<02:04,  1.20it/s]




 39%|███▉      | 97/246 [01:15<02:03,  1.21it/s]




 40%|███▉      | 98/246 [01:16<02:08,  1.15it/s]




 40%|████      | 99/246 [01:17<02:07,  1.16it/s]




 41%|████      | 100/246 [01:18<02:04,  1.17it/s]




 41%|████      | 101/246 [01:19<02:03,  1.18it/s]




 41%|████▏     | 102/246 [01:20<02:01,  1.19it/s]




 42%|████▏     | 103/246 [01:20<02:00,  1.19it/s]




 42%|████▏     | 104/246 [01:21<01:59,  1.19it/s]




 43%|████▎     | 105/246 [01:22<01:57,  1.20it/s]




 43%|████▎     | 106/246 [01:23<02:01,  1.15it/s]




 43%|████▎     | 107/246 [01:24<01:59,  1.16it/s]




 44%|████▍     | 108/246 [01:25<01:59,  1.16it/s]




 44%|████▍     | 109/246 [01:26<01:58,  1.16it/s]




 45%|████▍     | 110/246 [01:27<01:58,  1.15it/s]




 45%|████▌     | 111/246 [01:27<01:56,  1.16it/s]




 46%|████▌     | 112/246 [01:28<01:56,  1.15it/s]




 46%|████▌     | 113/246 [01:29<01:55,  1.15it/s]




 46%|████▋     | 114/246 [01:30<01:57,  1.12it/s]




 47%|████▋     | 115/246 [01:31<01:58,  1.11it/s]




 47%|████▋     | 116/246 [01:32<01:56,  1.11it/s]




 48%|████▊     | 117/246 [01:33<01:54,  1.12it/s]




 48%|████▊     | 118/246 [01:34<01:53,  1.13it/s]




 48%|████▊     | 119/246 [01:35<01:51,  1.14it/s]




 49%|████▉     | 120/246 [01:35<01:51,  1.13it/s]




 49%|████▉     | 121/246 [01:36<01:53,  1.10it/s]




 50%|████▉     | 122/246 [01:37<01:50,  1.12it/s]




 50%|█████     | 123/246 [01:38<01:50,  1.11it/s]




 50%|█████     | 124/246 [01:39<01:48,  1.12it/s]




 51%|█████     | 125/246 [01:40<01:48,  1.12it/s]




 51%|█████     | 126/246 [01:41<01:46,  1.13it/s]




 52%|█████▏    | 127/246 [01:42<01:45,  1.13it/s]




 52%|█████▏    | 128/246 [01:43<01:45,  1.12it/s]




 52%|█████▏    | 129/246 [01:44<01:47,  1.09it/s]




 53%|█████▎    | 130/246 [01:44<01:46,  1.09it/s]




 53%|█████▎    | 131/246 [01:45<01:45,  1.09it/s]




 54%|█████▎    | 132/246 [01:46<01:43,  1.10it/s]




 54%|█████▍    | 133/246 [01:47<01:43,  1.09it/s]




 54%|█████▍    | 134/246 [01:48<01:41,  1.11it/s]




 55%|█████▍    | 135/246 [01:49<01:40,  1.11it/s]




 55%|█████▌    | 136/246 [01:50<01:39,  1.10it/s]




 56%|█████▌    | 137/246 [01:51<01:46,  1.02it/s]




 56%|█████▌    | 138/246 [01:52<01:43,  1.04it/s]




 57%|█████▋    | 139/246 [01:53<01:41,  1.06it/s]




 57%|█████▋    | 140/246 [01:54<01:39,  1.07it/s]




 57%|█████▋    | 141/246 [01:55<01:38,  1.07it/s]




 58%|█████▊    | 142/246 [01:56<01:36,  1.08it/s]




 58%|█████▊    | 143/246 [01:57<01:37,  1.05it/s]




 59%|█████▊    | 144/246 [01:58<01:36,  1.06it/s]




 59%|█████▉    | 145/246 [01:58<01:30,  1.12it/s]




 59%|█████▉    | 146/246 [01:59<01:30,  1.10it/s]




 60%|█████▉    | 147/246 [02:00<01:30,  1.10it/s]




 60%|██████    | 148/246 [02:01<01:29,  1.10it/s]




 61%|██████    | 149/246 [02:02<01:29,  1.08it/s]




 61%|██████    | 150/246 [02:03<01:28,  1.08it/s]




 61%|██████▏   | 151/246 [02:04<01:30,  1.05it/s]




 62%|██████▏   | 152/246 [02:05<01:28,  1.07it/s]




 62%|██████▏   | 153/246 [02:06<01:26,  1.07it/s]




 63%|██████▎   | 154/246 [02:07<01:26,  1.07it/s]




 63%|██████▎   | 155/246 [02:08<01:26,  1.06it/s]




 63%|██████▎   | 156/246 [02:09<01:24,  1.06it/s]




 64%|██████▍   | 157/246 [02:10<01:24,  1.06it/s]




 64%|██████▍   | 158/246 [02:11<01:25,  1.03it/s]




 65%|██████▍   | 159/246 [02:12<01:23,  1.04it/s]




 65%|██████▌   | 160/246 [02:13<01:21,  1.05it/s]




 65%|██████▌   | 161/246 [02:14<01:21,  1.04it/s]




 66%|██████▌   | 162/246 [02:14<01:20,  1.04it/s]




 66%|██████▋   | 163/246 [02:15<01:19,  1.05it/s]




 67%|██████▋   | 164/246 [02:16<01:20,  1.02it/s]




 67%|██████▋   | 165/246 [02:17<01:18,  1.03it/s]




 67%|██████▋   | 166/246 [02:18<01:20,  1.00s/it]




 68%|██████▊   | 167/246 [02:19<01:18,  1.01it/s]




 68%|██████▊   | 168/246 [02:20<01:16,  1.03it/s]




 69%|██████▊   | 169/246 [02:21<01:14,  1.03it/s]




 69%|██████▉   | 170/246 [02:22<01:13,  1.04it/s]




 70%|██████▉   | 171/246 [02:23<01:11,  1.05it/s]




 70%|██████▉   | 172/246 [02:24<01:11,  1.04it/s]




 70%|███████   | 173/246 [02:25<01:11,  1.02it/s]




 71%|███████   | 174/246 [02:26<01:10,  1.02it/s]




 71%|███████   | 175/246 [02:27<01:08,  1.04it/s]




 72%|███████▏  | 176/246 [02:28<01:07,  1.04it/s]




 72%|███████▏  | 177/246 [02:29<01:07,  1.03it/s]




 72%|███████▏  | 178/246 [02:30<01:05,  1.04it/s]




 73%|███████▎  | 179/246 [02:31<01:04,  1.03it/s]




 73%|███████▎  | 180/246 [02:32<01:03,  1.04it/s]




 74%|███████▎  | 181/246 [02:33<01:04,  1.00it/s]




 74%|███████▍  | 182/246 [02:34<01:02,  1.02it/s]




 74%|███████▍  | 183/246 [02:35<01:01,  1.03it/s]




 75%|███████▍  | 184/246 [02:36<01:00,  1.02it/s]




 75%|███████▌  | 185/246 [02:37<01:07,  1.11s/it]




 76%|███████▌  | 186/246 [02:39<01:09,  1.15s/it]




 76%|███████▌  | 187/246 [02:40<01:05,  1.11s/it]




 76%|███████▋  | 188/246 [02:41<01:02,  1.08s/it]




 77%|███████▋  | 189/246 [02:42<00:59,  1.05s/it]




 77%|███████▋  | 190/246 [02:43<00:57,  1.02s/it]




 78%|███████▊  | 191/246 [02:44<00:55,  1.01s/it]




 78%|███████▊  | 192/246 [02:45<00:56,  1.04s/it]




 78%|███████▊  | 193/246 [02:46<00:54,  1.03s/it]




 79%|███████▉  | 194/246 [02:47<00:55,  1.06s/it]




 79%|███████▉  | 195/246 [02:48<00:59,  1.17s/it]




 80%|███████▉  | 196/246 [02:49<00:56,  1.13s/it]




 80%|████████  | 197/246 [02:51<00:58,  1.18s/it]




 80%|████████  | 198/246 [02:52<00:55,  1.15s/it]




 81%|████████  | 199/246 [02:53<00:54,  1.15s/it]




 81%|████████▏ | 200/246 [02:54<00:51,  1.13s/it]




 82%|████████▏ | 201/246 [02:55<00:50,  1.11s/it]




 82%|████████▏ | 202/246 [02:56<00:48,  1.09s/it]




 83%|████████▎ | 203/246 [02:57<00:46,  1.08s/it]




 83%|████████▎ | 204/246 [02:58<00:45,  1.09s/it]




 83%|████████▎ | 205/246 [02:59<00:44,  1.09s/it]




 84%|████████▎ | 206/246 [03:00<00:45,  1.13s/it]




 84%|████████▍ | 207/246 [03:02<00:44,  1.14s/it]




 85%|████████▍ | 208/246 [03:03<00:42,  1.13s/it]




 85%|████████▍ | 209/246 [03:04<00:41,  1.13s/it]




 85%|████████▌ | 210/246 [03:05<00:40,  1.12s/it]




 86%|████████▌ | 211/246 [03:06<00:38,  1.11s/it]




 86%|████████▌ | 212/246 [03:07<00:37,  1.09s/it]




 87%|████████▋ | 213/246 [03:08<00:35,  1.09s/it]




 87%|████████▋ | 214/246 [03:09<00:35,  1.10s/it]




 87%|████████▋ | 215/246 [03:10<00:33,  1.07s/it]




 88%|████████▊ | 216/246 [03:11<00:32,  1.07s/it]




 88%|████████▊ | 217/246 [03:14<00:41,  1.42s/it]




 89%|████████▊ | 218/246 [03:15<00:37,  1.35s/it]




 89%|████████▉ | 219/246 [03:16<00:34,  1.27s/it]




 89%|████████▉ | 220/246 [03:17<00:32,  1.26s/it]




 90%|████████▉ | 221/246 [03:19<00:34,  1.40s/it]




 90%|█████████ | 222/246 [03:20<00:32,  1.34s/it]




 91%|█████████ | 223/246 [03:21<00:29,  1.27s/it]




 91%|█████████ | 224/246 [03:22<00:27,  1.24s/it]




 91%|█████████▏| 225/246 [03:24<00:27,  1.29s/it]




 92%|█████████▏| 226/246 [03:25<00:26,  1.33s/it]




 92%|█████████▏| 227/246 [03:26<00:25,  1.32s/it]




 93%|█████████▎| 228/246 [03:28<00:24,  1.34s/it]




 93%|█████████▎| 229/246 [03:29<00:22,  1.31s/it]




 93%|█████████▎| 230/246 [03:31<00:25,  1.61s/it]




 94%|█████████▍| 231/246 [03:33<00:24,  1.66s/it]




 94%|█████████▍| 232/246 [03:34<00:21,  1.56s/it]




 95%|█████████▍| 233/246 [03:36<00:18,  1.46s/it]




 95%|█████████▌| 234/246 [03:37<00:17,  1.43s/it]




 96%|█████████▌| 235/246 [03:38<00:15,  1.37s/it]




 96%|█████████▌| 236/246 [03:40<00:13,  1.32s/it]




 96%|█████████▋| 237/246 [03:41<00:13,  1.50s/it]




 97%|█████████▋| 238/246 [03:43<00:11,  1.48s/it]




 97%|█████████▋| 239/246 [03:44<00:10,  1.43s/it]




 98%|█████████▊| 240/246 [03:45<00:08,  1.38s/it]




 98%|█████████▊| 241/246 [03:47<00:06,  1.36s/it]




 98%|█████████▊| 242/246 [03:48<00:05,  1.29s/it]




 99%|█████████▉| 243/246 [03:49<00:03,  1.25s/it]




 99%|█████████▉| 244/246 [03:50<00:02,  1.23s/it]




100%|█████████▉| 245/246 [03:51<00:01,  1.23s/it]




100%|██████████| 246/246 [03:53<00:00,  1.06it/s]


features_and_target.shape=(210258, 675)


In [14]:
from datetime import date, datetime, timedelta
import pytz
import pandas as pd
from src.data_split import train_test_split

# training data -> from Jan 2022 till 2 months ago
# test data -> last 2 months
utc_zone = pytz.utc
cutoff_date = datetime.now(utc_zone) - timedelta(days=28*3) # 3 months
cutoff_date = pd.to_datetime(cutoff_date).replace(tzinfo=None)

print(f'{cutoff_date=}')

features_and_target['pickup_hour'] = pd.to_datetime(features_and_target['pickup_hour'])
features_and_target['pickup_hour'] = features_and_target['pickup_hour'].dt.tz_localize(None)

X_train, y_train, X_test, y_test = train_test_split(
    features_and_target,
    cutoff_date,
    target_column_name='target_rides_next_hour'
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

cutoff_date=Timestamp('2024-04-10 18:33:45.362788')
X_train.shape=(205125, 674)
y_train.shape=(205125,)
X_test.shape=(5133, 674)
y_test.shape=(5133,)


In [19]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),   
    }
    
    # sort X_train by `pikup_hour` inplace
    # so the TimeSeriesSplit will split the data in a consistent way
    X_train.sort_values('pickup_hour', inplace=True)

    tss = TimeSeriesSplit(n_splits=2)
    scores = []
    for train_index, val_index in tss.split(X_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)
   
    # Return the mean score
    return np.array(scores).mean()

In [24]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

[I 2024-07-04 02:47:57,618] A new study created in memory with name: no-name-58bbad0c-d44a-4d96-b5b2-d7aaf1616ffe


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

[I 2024-07-04 02:48:57,951] Trial 0 finished with value: 31.25755832747671 and parameters: {'num_leaves': 206, 'feature_fraction': 0.503843225513062, 'bagging_fraction': 0.4274938136706287, 'min_child_samples': 57}. Best is trial 0 with value: 31.25755832747671.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

[I 2024-07-04 02:49:40,433] Trial 1 finished with value: 30.33356232066064 and parameters: {'num_leaves': 51, 'feature_fraction': 0.8221580914399422, 'bagging_fraction': 0.872869572561856, 'min_child_samples': 5}. Best is trial 1 with value: 30.33356232066064.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

[I 2024-07-04 02:50:40,491] Trial 2 finished with value: 30.812542868363423 and parameters: {'num_leaves': 136, 'feature_fraction': 0.8520494083904135, 'bagging_fraction': 0.3658731234111312, 'min_child_samples': 8}. Best is trial 1 with value: 30.33356232066064.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

[I 2024-07-04 02:51:14,662] Trial 3 finished with value: 29.624553966564946 and parameters: {'num_leaves': 10, 'feature_fraction': 0.5618137623632447, 'bagging_fraction': 0.3612311130107352, 'min_child_samples': 6}. Best is trial 3 with value: 29.624553966564946.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

[I 2024-07-04 02:52:25,064] Trial 4 finished with value: 31.08717249854095 and parameters: {'num_leaves': 165, 'feature_fraction': 0.5244700167020164, 'bagging_fraction': 0.8214870263322527, 'min_child_samples': 57}. Best is trial 3 with value: 29.624553966564946.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

[I 2024-07-04 02:53:16,163] Trial 5 finished with value: 30.688360764014618 and parameters: {'num_leaves': 127, 'feature_fraction': 0.6517971084469757, 'bagging_fraction': 0.2651375528917216, 'min_child_samples': 7}. Best is trial 3 with value: 29.624553966564946.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

[I 2024-07-04 02:54:20,950] Trial 6 finished with value: 30.81420133328175 and parameters: {'num_leaves': 160, 'feature_fraction': 0.4498400719388017, 'bagging_fraction': 0.20919524126258696, 'min_child_samples': 32}. Best is trial 3 with value: 29.624553966564946.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

[I 2024-07-04 02:54:44,335] Trial 7 finished with value: 29.744679285511094 and parameters: {'num_leaves': 11, 'feature_fraction': 0.6168327163876273, 'bagging_fraction': 0.82815843414411, 'min_child_samples': 94}. Best is trial 3 with value: 29.624553966564946.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

[I 2024-07-04 02:55:25,456] Trial 8 finished with value: 30.71209415184438 and parameters: {'num_leaves': 133, 'feature_fraction': 0.6064803263016537, 'bagging_fraction': 0.6113568693139948, 'min_child_samples': 14}. Best is trial 3 with value: 29.624553966564946.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

[I 2024-07-04 02:56:31,032] Trial 9 finished with value: 31.250736335854455 and parameters: {'num_leaves': 220, 'feature_fraction': 0.7792931280591351, 'bagging_fraction': 0.6431713610662572, 'min_child_samples': 26}. Best is trial 3 with value: 29.624553966564946.


In [25]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'num_leaves': 10, 'feature_fraction': 0.5618137623632447, 'bagging_fraction': 0.3612311130107352, 'min_child_samples': 6}


In [26]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.142067 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171893
[LightGBM] [Info] Number of data points in the train set: 205125, number of used features: 676
[LightGBM] [Info] Start training from score 18.263576


In [27]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=33.1820


In [28]:
import joblib
from src.paths import MODELS_DIR

joblib.dump(pipeline, MODELS_DIR / 'model.pkl')

['/Users/eugene/Github/taxi_demand_predictor/models/model.pkl']

In [29]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

In [31]:
model_registry = project.get_model_registry()

model = model_registry.sklearn.create_model(
    name="taxi_demand_predictor_next_hour",
    metrics={"test_mae": test_mae},
    description="LightGBM regressor with a bit of hyper-parameter tuning",
    input_example=X_train.sample(),
    model_schema=model_schema
)

model.save(str(MODELS_DIR / 'model.pkl'))

Connected. Call `.close()` to terminate connection gracefully.


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/162929 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/3394 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/58131 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/837781/models/taxi_demand_predictor_next_hour/1


Model(name: 'taxi_demand_predictor_next_hour', version: 1)