# Pickup to Delivery Overall

In [81]:
import os
import sys
import datetime
import pandas as pd
pd.options.display.float_format = '{:.2f}'.format
import numpy as np
import pickle

sys.path.insert(0, os.path.expanduser('./'))
import query_runner as qr
import utils
from estimator import BaselineModel_sum, BaselineModel_mean, LinearModel_encode_timestamps_dummy_variables, LinearModelSGD_encode_timestamps_dummy_variables, LinearModel_encode_timestamps_cyclical

In [2]:
base_query_path = './queries/'
dwh_config, livedb_config, parameters_config = utils.load_config(config_file='./config.ini')
datalake_connection = qr.create_connection(db='datalake')
#monolith_connection = qr.create_connection(user=livedb_config['monolith_username'], password=livedb_config['monolith_password'], db='livedb')
#dispatching_db_connection = qr.create_connection(user=livedb_config['dispatching_db_username'], password=livedb_config['dispatching_db_password'], db='dispatchingdb')

INFO:trino.auth:keyring module not found. OAuth2 token will not be stored in keyring.


In [3]:
start_date = parameters_config['start_date']
end_date = parameters_config['end_date']
country_code = parameters_config['country_code']
cities = parameters_config['cities']

print(f'Start date: {start_date} | End date: {end_date} | Countries: {country_code} | Cities: {cities}')

Start date: 2024-09-30 | End date: 2024-10-20 | Countries: ES | Cities: 'MAD', 'BCN', 'SEV', 'ALC'


In [4]:
parameters = {
    'start_date': start_date,
    'end_date': end_date,
    'country_code': country_code,
    'cities': cities
}

## Load the dataset

In [5]:
data = pd.read_parquet("data/parquet/dataframe.parquet")

## Hyperparameters

In [6]:
test_set_perc = 0.1
days_for_test = 7
k_cv = 5

## Database split

As we have partitioned the data by city and creation date, we can use this information to split the data. This will help to avoid data leakage, as we will not have data from the future in the training set.
This is much better than just sorting the data by the creation timestamp and taking 10% of the dataset as test set, as we did before.

In [7]:
# We take the last week of the dataset to test the model
begin_test_date = pd.to_datetime(end_date) - pd.Timedelta(days=days_for_test-1)
begin_test_date = begin_test_date.strftime("%Y-%m-%d")
print(f'Start date: {start_date} | Begin test date: {begin_test_date} | End date: {end_date}')

Start date: 2024-09-30 | Begin test date: 2024-10-14 | End date: 2024-10-20


In [8]:
X_train = pd.read_parquet("data/parquet/dataframe.parquet/", filters=[('creation_date', '<', begin_test_date)])
X_train.head()

Unnamed: 0,country_code,order_id,courier_id,creation_timestamp,activation_timestamp,transport,pickup_timestamp,delivery_entering_timestamp,delivery_timestamp,pickup_latitude,...,time_zone,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,creation_date,city_code
0,ES,100901000206,169065712,2024-09-30 19:00:49+00:00,2024-09-30 19:00:52+00:00,CAR,2024-09-30 19:20:41.337000+00:00,2024-09-30 19:30:56+00:00,2024-09-30 19:38:11+00:00,38.370686,...,Europe/Madrid,0.669695,-0.008135,0.669284,-0.008486,3147.419444,3147.419444,4371.550539,2024-09-30,ALC
1,ES,100901487351,173633632,2024-09-30 21:54:58+00:00,2024-09-30 21:54:59+00:00,MOTORBIKE,2024-09-30 21:58:40.678000+00:00,2024-09-30 22:12:29+00:00,2024-09-30 22:16:54+00:00,38.342865,...,Europe/Madrid,0.669209,-0.008538,0.669514,-0.008481,1959.901021,1959.901021,2228.776817,2024-09-30,ALC
2,ES,100901615508,2320936,2024-09-30 23:17:55+00:00,2024-09-30 23:17:55+00:00,CAR,2024-09-30 23:25:52.013000+00:00,2024-09-30 23:34:12+00:00,2024-09-30 23:35:27+00:00,38.425022,...,Europe/Madrid,0.670643,-0.007077,0.67032,-0.007526,3043.077176,3043.077176,4304.543962,2024-09-30,ALC
3,ES,100900153723,174681565,2024-09-30 12:50:30+00:00,2024-09-30 12:50:31+00:00,MOTORBIKE,2024-09-30 12:51:54.134000+00:00,2024-09-30 12:53:27+00:00,2024-09-30 12:56:56+00:00,38.387657,...,Europe/Madrid,0.669991,-0.007173,0.669942,-0.007219,384.769891,384.769891,538.640403,2024-09-30,ALC
4,ES,100900235936,142421923,2024-09-30 13:23:50+00:00,2024-09-30 13:23:51+00:00,CAR,2024-09-30 13:49:36.097000+00:00,2024-09-30 13:57:33+00:00,2024-09-30 14:01:33+00:00,38.408985,...,Europe/Madrid,0.670363,-0.00744,0.669999,-0.007465,2321.097615,2321.097615,2443.128342,2024-09-30,ALC


In [9]:
# Check that there are no nulls deriving from a wrong writing of parquet files (appending instead of overwriting)
X_train.isnull().sum().sum()

np.int64(0)

In [10]:
y_train = X_train['delivery_entering_timestamp'] - X_train['pickup_timestamp']
y_train = pd.Series(y_train, name='pickup_to_delivery')
y_train

0        0 days 00:10:14.663000
1        0 days 00:13:48.322000
2        0 days 00:08:19.987000
3        0 days 00:01:32.866000
4        0 days 00:07:56.903000
                  ...          
678746   0 days 00:03:53.637000
678747   0 days 00:03:57.223000
678748   0 days 00:06:54.631000
678749   0 days 00:23:10.249000
678750   0 days 00:05:43.235000
Name: pickup_to_delivery, Length: 678751, dtype: timedelta64[ns]

In [11]:
y_train.isnull().sum().sum()

np.int64(0)

In [12]:
X_test = pd.read_parquet("data/parquet/dataframe.parquet", filters=[('creation_date', '>=', begin_test_date)])
X_test.head()

Unnamed: 0,country_code,order_id,courier_id,creation_timestamp,activation_timestamp,transport,pickup_timestamp,delivery_entering_timestamp,delivery_timestamp,pickup_latitude,...,time_zone,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,creation_date,city_code
0,ES,100932167582,10191824,2024-10-14 15:51:51+00:00,2024-10-14 15:51:52+00:00,MOTORBIKE,2024-10-14 16:00:14.017000+00:00,2024-10-14 16:06:55+00:00,2024-10-14 16:08:56+00:00,38.36011,...,Europe/Madrid,0.66951,-0.008658,0.669291,-0.008891,1817.533836,1817.533836,2562.964399,2024-10-14,ALC
1,ES,100932256816,50618208,2024-10-14 16:36:30+00:00,2024-10-14 16:36:31+00:00,CAR,2024-10-14 16:45:53.061000+00:00,2024-10-14 16:53:56+00:00,2024-10-14 16:56:21+00:00,38.354233,...,Europe/Madrid,0.669408,-0.008239,0.66949,-0.008349,757.291206,757.291206,1071.85653,2024-10-14,ALC
2,ES,100932496959,170202435,2024-10-14 18:26:33+00:00,2024-10-14 18:26:34+00:00,MOTORBIKE,2024-10-14 18:35:05.220000+00:00,2024-10-14 18:41:32+00:00,2024-10-14 18:43:20+00:00,38.429874,...,Europe/Madrid,0.670728,-0.006992,0.670951,-0.00657,2541.011986,2541.011986,3531.016499,2024-10-14,ALC
3,ES,100932664161,166731686,2024-10-14 19:30:57+00:00,2024-10-14 19:30:59+00:00,CAR,2024-10-14 19:58:06.210000+00:00,2024-10-14 20:03:31+00:00,2024-10-14 20:04:48+00:00,38.39809,...,Europe/Madrid,0.670173,-0.007554,0.670373,-0.007686,1432.008005,1432.008005,1931.87208,2024-10-14,ALC
4,ES,100932854728,168741567,2024-10-14 20:35:29+00:00,2024-10-14 20:35:30+00:00,CAR,2024-10-14 20:57:11.030000+00:00,2024-10-14 21:00:16+00:00,2024-10-14 21:04:55+00:00,38.355713,...,Europe/Madrid,0.669433,-0.008372,0.669508,-0.008358,477.653498,477.653498,544.529748,2024-10-14,ALC


In [13]:
X_test.isnull().sum().sum()

np.int64(0)

In [14]:
y_test = (X_test['delivery_entering_timestamp'] - X_test['pickup_timestamp']).dt.total_seconds()
y_test = pd.Series(y_test, dtype=np.float64, name='pickup_to_delivery')
y_test

0         400.983
1         482.939
2         386.780
3         324.790
4         184.970
           ...   
349864    632.295
349865    557.103
349866    304.409
349867    522.050
349868    291.687
Name: pickup_to_delivery, Length: 349869, dtype: float64

In [15]:
y_test.isnull().sum().sum()

np.int64(0)

In [16]:
print("Train datasets shapes: ", X_train.shape, y_train.shape)
print("Test datasets shapes: ", X_test.shape, y_test.shape)

Train datasets shapes:  (678751, 23) (678751,)
Test datasets shapes:  (349869, 23) (349869,)


## Baseline Models

### BaselineModel_sum

In [17]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-04-16 16:57:26.564021


In [18]:
model_bl_sum = BaselineModel_sum()
model_bl_sum.fit(X_train, y_train)

<estimator.BaselineModel_sum at 0x12733c8c0>

In [19]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_bl_sum_time = end - start
print(f"Time elapsed: {model_bl_sum_time}")

End time: 2025-04-16 16:57:27.142632
Time elapsed: 0:00:00.578611


In [20]:
X_test_expanded = X_test.copy()
X_test_expanded['y_test_predicted'] = model_bl_sum.predict(X_test)
X_test_expanded['y_test'] = (X_test_expanded['delivery_entering_timestamp'] - X_test_expanded['pickup_timestamp']).dt.total_seconds()
X_test_expanded['diff'] = X_test_expanded['y_test_predicted'] - X_test_expanded['y_test']
X_test_expanded

Unnamed: 0,country_code,order_id,courier_id,creation_timestamp,activation_timestamp,transport,pickup_timestamp,delivery_entering_timestamp,delivery_timestamp,pickup_latitude,...,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,creation_date,city_code,y_test_predicted,y_test,diff
0,ES,100932167582,10191824,2024-10-14 15:51:51+00:00,2024-10-14 15:51:52+00:00,MOTORBIKE,2024-10-14 16:00:14.017000+00:00,2024-10-14 16:06:55+00:00,2024-10-14 16:08:56+00:00,38.360110,...,0.669291,-0.008891,1817.533836,1817.533836,2562.964399,2024-10-14,ALC,581.137630,400.983,180.154630
1,ES,100932256816,50618208,2024-10-14 16:36:30+00:00,2024-10-14 16:36:31+00:00,CAR,2024-10-14 16:45:53.061000+00:00,2024-10-14 16:53:56+00:00,2024-10-14 16:56:21+00:00,38.354233,...,0.669490,-0.008349,757.291206,757.291206,1071.856530,2024-10-14,ALC,213.693356,482.939,-269.245644
2,ES,100932496959,170202435,2024-10-14 18:26:33+00:00,2024-10-14 18:26:34+00:00,MOTORBIKE,2024-10-14 18:35:05.220000+00:00,2024-10-14 18:41:32+00:00,2024-10-14 18:43:20+00:00,38.429874,...,0.670951,-0.006570,2541.011986,2541.011986,3531.016499,2024-10-14,ALC,812.462279,386.780,425.682279
3,ES,100932664161,166731686,2024-10-14 19:30:57+00:00,2024-10-14 19:30:59+00:00,CAR,2024-10-14 19:58:06.210000+00:00,2024-10-14 20:03:31+00:00,2024-10-14 20:04:48+00:00,38.398090,...,0.670373,-0.007686,1432.008005,1432.008005,1931.872080,2024-10-14,ALC,404.085764,324.790,79.295764
4,ES,100932854728,168741567,2024-10-14 20:35:29+00:00,2024-10-14 20:35:30+00:00,CAR,2024-10-14 20:57:11.030000+00:00,2024-10-14 21:00:16+00:00,2024-10-14 21:04:55+00:00,38.355713,...,0.669508,-0.008358,477.653498,477.653498,544.529748,2024-10-14,ALC,134.784846,184.970,-50.185154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349864,ES,100947015472,176969368,2024-10-21 00:57:33+00:00,2024-10-21 00:57:34+00:00,BICYCLE,2024-10-21 01:12:35.705000+00:00,2024-10-21 01:23:08+00:00,2024-10-21 01:32:38+00:00,37.381660,...,0.652244,-0.104294,2098.386481,2098.386481,2925.829704,2024-10-20,SEV,788.860487,632.295,156.565487
349865,ES,100944621553,159448683,2024-10-20 10:33:25+00:00,2024-10-20 10:33:26+00:00,CAR,2024-10-20 11:01:33.897000+00:00,2024-10-20 11:10:51+00:00,2024-10-20 11:12:43+00:00,37.384180,...,0.652789,-0.104114,2080.630360,2080.630360,2601.943885,2024-10-20,SEV,587.114811,557.103,30.011811
349866,ES,100945007795,145859188,2024-10-20 13:32:20+00:00,2024-10-20 13:32:21+00:00,BICYCLE,2024-10-20 13:39:11.591000+00:00,2024-10-20 13:44:16+00:00,2024-10-20 13:48:51+00:00,37.377640,...,0.652323,-0.104863,622.006487,622.006487,824.278938,2024-10-20,SEV,233.835066,304.409,-70.573934
349867,ES,100945051115,177805460,2024-10-20 13:44:47+00:00,2024-10-20 13:44:49+00:00,MOTORBIKE,2024-10-20 13:53:20.950000+00:00,2024-10-20 14:02:03+00:00,2024-10-20 14:02:46+00:00,37.350040,...,0.652305,-0.105367,2980.407072,2980.407072,3963.162256,2024-10-20,SEV,952.954309,522.050,430.904309


In [21]:
model_bl_sum.predict(X_test.iloc[0])

0    581.13763
Name: pickup_to_delivery_predicted, dtype: float64

### Evaluation pipeline

In [22]:
model_bl_sum_eval = model_bl_sum.evaluate(X_test, y_test)
model_bl_sum_eval

y_hat <class 'pandas.core.series.Series'> float64
y_test <class 'pandas.core.series.Series'> float64


(np.float64(145.6280391037287), np.float64(52902.38788391615))

### BaselineModel_mean

In [23]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-04-16 16:58:06.270081


In [24]:
model_bl_mean = BaselineModel_mean()
model_bl_mean.fit(X_train, y_train)

<estimator.BaselineModel_mean at 0x12733d9a0>

In [25]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_bl_mean_time = end - start
print(f"Time elapsed: {model_bl_mean_time}")

End time: 2025-04-16 16:58:06.604539
Time elapsed: 0:00:00.334458


In [26]:
X_test_expanded = X_test.copy()
X_test_expanded['y_test_predicted'] = model_bl_mean.predict(X_test)
X_test_expanded['y_test'] = (X_test_expanded['delivery_entering_timestamp'] - X_test_expanded['pickup_timestamp']).dt.total_seconds()
X_test_expanded['diff'] = X_test_expanded['y_test_predicted'] - X_test_expanded['y_test']
X_test_expanded

Unnamed: 0,country_code,order_id,courier_id,creation_timestamp,activation_timestamp,transport,pickup_timestamp,delivery_entering_timestamp,delivery_timestamp,pickup_latitude,...,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,creation_date,city_code,y_test_predicted,y_test,diff
0,ES,100932167582,10191824,2024-10-14 15:51:51+00:00,2024-10-14 15:51:52+00:00,MOTORBIKE,2024-10-14 16:00:14.017000+00:00,2024-10-14 16:06:55+00:00,2024-10-14 16:08:56+00:00,38.360110,...,0.669291,-0.008891,1817.533836,1817.533836,2562.964399,2024-10-14,ALC,408.650082,400.983,7.667082
1,ES,100932256816,50618208,2024-10-14 16:36:30+00:00,2024-10-14 16:36:31+00:00,CAR,2024-10-14 16:45:53.061000+00:00,2024-10-14 16:53:56+00:00,2024-10-14 16:56:21+00:00,38.354233,...,0.669490,-0.008349,757.291206,757.291206,1071.856530,2024-10-14,ALC,305.301592,482.939,-177.637408
2,ES,100932496959,170202435,2024-10-14 18:26:33+00:00,2024-10-14 18:26:34+00:00,MOTORBIKE,2024-10-14 18:35:05.220000+00:00,2024-10-14 18:41:32+00:00,2024-10-14 18:43:20+00:00,38.429874,...,0.670951,-0.006570,2541.011986,2541.011986,3531.016499,2024-10-14,ALC,571.315227,386.780,184.535227
3,ES,100932664161,166731686,2024-10-14 19:30:57+00:00,2024-10-14 19:30:59+00:00,CAR,2024-10-14 19:58:06.210000+00:00,2024-10-14 20:03:31+00:00,2024-10-14 20:04:48+00:00,38.398090,...,0.670373,-0.007686,1432.008005,1432.008005,1931.872080,2024-10-14,ALC,577.313351,324.790,252.523351
4,ES,100932854728,168741567,2024-10-14 20:35:29+00:00,2024-10-14 20:35:30+00:00,CAR,2024-10-14 20:57:11.030000+00:00,2024-10-14 21:00:16+00:00,2024-10-14 21:04:55+00:00,38.355713,...,0.669508,-0.008358,477.653498,477.653498,544.529748,2024-10-14,ALC,192.565782,184.970,7.595782
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349864,ES,100947015472,176969368,2024-10-21 00:57:33+00:00,2024-10-21 00:57:34+00:00,BICYCLE,2024-10-21 01:12:35.705000+00:00,2024-10-21 01:23:08+00:00,2024-10-21 01:32:38+00:00,37.381660,...,0.652244,-0.104294,2098.386481,2098.386481,2925.829704,2024-10-20,SEV,689.618639,632.295,57.323639
349865,ES,100944621553,159448683,2024-10-20 10:33:25+00:00,2024-10-20 10:33:26+00:00,CAR,2024-10-20 11:01:33.897000+00:00,2024-10-20 11:10:51+00:00,2024-10-20 11:12:43+00:00,37.384180,...,0.652789,-0.104114,2080.630360,2080.630360,2601.943885,2024-10-20,SEV,838.805147,557.103,281.702147
349866,ES,100945007795,145859188,2024-10-20 13:32:20+00:00,2024-10-20 13:32:21+00:00,BICYCLE,2024-10-20 13:39:11.591000+00:00,2024-10-20 13:44:16+00:00,2024-10-20 13:48:51+00:00,37.377640,...,0.652323,-0.104863,622.006487,622.006487,824.278938,2024-10-20,SEV,204.417666,304.409,-99.991334
349867,ES,100945051115,177805460,2024-10-20 13:44:47+00:00,2024-10-20 13:44:49+00:00,MOTORBIKE,2024-10-20 13:53:20.950000+00:00,2024-10-20 14:02:03+00:00,2024-10-20 14:02:46+00:00,37.350040,...,0.652305,-0.105367,2980.407072,2980.407072,3963.162256,2024-10-20,SEV,670.107797,522.050,148.057797


In [27]:
model_bl_mean.predict(X_test.iloc[0])

0    408.650082
Name: pickup_to_delivery_predicted, dtype: float64

### Evaluation pipeline

In [28]:
model_bl_mean_eval = model_bl_mean.evaluate(X_test, y_test)
model_bl_mean_eval

(np.float64(161.23709324791398), np.float64(64837.86641313063))

## Linear Model

In [29]:
# Train on a small subset to check if the model is working
X_train_smaller = X_train.head(1000)
y_train_smaller = y_train.head(1000)

### LinearModel_encodingdummyvariables

In [30]:
model_linear_smaller = LinearModel_encode_timestamps_dummy_variables()
model_linear_smaller.fit(X_train_smaller, y_train_smaller)

INFO:root:Starting to encode variables
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel_encode_timestamps_dummy_variables at 0x12733ebd0>

In [31]:
X_test_expanded = X_train_smaller.copy()
X_test_expanded['y_test_predicted'] = model_linear_smaller.predict(X_test_expanded)
X_test_expanded['y_test'] = (X_test_expanded['delivery_entering_timestamp'] - X_test_expanded['pickup_timestamp']).dt.total_seconds()
X_test_expanded['diff'] = X_test_expanded['y_test_predicted'] - X_test_expanded['y_test']
X_test_expanded

Unnamed: 0,country_code,order_id,courier_id,creation_timestamp,activation_timestamp,transport,pickup_timestamp,delivery_entering_timestamp,delivery_timestamp,pickup_latitude,...,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,creation_date,city_code,time,velocity,y_test_predicted,y_test,diff
0,ES,100901000206,169065712,2024-09-30 19:00:49+00:00,2024-09-30 19:00:52+00:00,CAR,2024-09-30 19:20:41.337000+00:00,2024-09-30 19:30:56+00:00,2024-09-30 19:38:11+00:00,38.370686,...,3147.419444,3147.419444,4371.550539,2024-09-30,ALC,614.663,5.120561,6.146630e+11,614.663,6.146630e+11
1,ES,100901487351,173633632,2024-09-30 21:54:58+00:00,2024-09-30 21:54:59+00:00,MOTORBIKE,2024-09-30 21:58:40.678000+00:00,2024-09-30 22:12:29+00:00,2024-09-30 22:16:54+00:00,38.342865,...,1959.901021,1959.901021,2228.776817,2024-09-30,ALC,828.322,2.366110,8.283220e+11,828.322,8.283220e+11
2,ES,100901615508,2320936,2024-09-30 23:17:55+00:00,2024-09-30 23:17:55+00:00,CAR,2024-09-30 23:25:52.013000+00:00,2024-09-30 23:34:12+00:00,2024-09-30 23:35:27+00:00,38.425022,...,3043.077176,3043.077176,4304.543962,2024-09-30,ALC,499.987,6.086313,4.999870e+11,499.987,4.999870e+11
3,ES,100900153723,174681565,2024-09-30 12:50:30+00:00,2024-09-30 12:50:31+00:00,MOTORBIKE,2024-09-30 12:51:54.134000+00:00,2024-09-30 12:53:27+00:00,2024-09-30 12:56:56+00:00,38.387657,...,384.769891,384.769891,538.640403,2024-09-30,ALC,92.866,4.143281,9.286600e+10,92.866,9.286600e+10
4,ES,100900235936,142421923,2024-09-30 13:23:50+00:00,2024-09-30 13:23:51+00:00,CAR,2024-09-30 13:49:36.097000+00:00,2024-09-30 13:57:33+00:00,2024-09-30 14:01:33+00:00,38.408985,...,2321.097615,2321.097615,2443.128342,2024-09-30,ALC,476.903,4.867022,4.769030e+11,476.903,4.769030e+11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,ES,100901315231,146195464,2024-09-30 20:49:51+00:00,2024-09-30 20:49:52+00:00,CAR,2024-09-30 20:59:02.847000+00:00,2024-09-30 21:04:33+00:00,2024-09-30 21:05:24+00:00,38.362377,...,1485.176423,1485.176423,2054.798497,2024-09-30,ALC,330.153,4.498449,3.301530e+11,330.153,3.301530e+11
996,ES,100901448857,141248791,2024-09-30 21:38:14+00:00,2024-09-30 21:38:15+00:00,MOTORBIKE,2024-09-30 21:46:44.524000+00:00,2024-09-30 22:00:39+00:00,2024-09-30 22:01:02+00:00,38.362377,...,6872.882288,6872.882288,9730.640108,2024-09-30,ALC,834.476,8.236165,8.344760e+11,834.476,8.344760e+11
997,ES,100901466311,4623155,2024-09-30 21:45:32+00:00,2024-09-30 21:45:33+00:00,BICYCLE,2024-09-30 21:59:36.356000+00:00,2024-09-30 22:11:08+00:00,2024-09-30 22:15:15+00:00,38.363693,...,2490.755136,2490.755136,3220.661498,2024-09-30,ALC,691.644,3.601210,6.916440e+11,691.644,6.916440e+11
998,ES,100901492373,168086136,2024-09-30 21:57:16+00:00,2024-09-30 21:57:17+00:00,CAR,2024-09-30 22:13:17.522000+00:00,2024-09-30 22:20:54+00:00,2024-09-30 22:24:49+00:00,38.345390,...,1772.502372,1772.502372,2455.201727,2024-09-30,ALC,456.478,3.882996,4.564780e+11,456.478,4.564780e+11


In [32]:
model_linear_smaller.predict(X_test.iloc[[0]])

array([-0.07148238])

#### Train on 3 days of data

In [33]:
# We take the last week of the dataset to test the model
days_for_train = 3
days_for_test = 1
end_train_date = pd.to_datetime(start_date) + pd.Timedelta(days=days_for_train)
end_train_date = end_train_date.strftime("%Y-%m-%d")
print(f'Start date: {start_date} | End test date: {end_train_date} | Test date: {end_date}')

Start date: 2024-09-30 | End test date: 2024-10-03 | Test date: 2024-10-20


In [34]:
X_train_3d = pd.read_parquet("data/parquet/dataframe.parquet/", filters=[('creation_date', '<', end_train_date)])
y_train_3d = X_train_3d['delivery_entering_timestamp'] - X_train_3d['pickup_timestamp']
y_train_3d = pd.Series(y_train_3d, name='pickup_to_delivery')
X_test_3d = pd.read_parquet("data/parquet/dataframe.parquet", filters=[('creation_date', '=', end_date)])
y_test_3d = (X_test_3d['delivery_entering_timestamp'] - X_test_3d['pickup_timestamp']).dt.total_seconds()
y_test_3d = pd.Series(y_test_3d, dtype=np.float64, name='pickup_to_delivery')
print("Train datasets 3d shapes: ", X_train_3d.shape, y_train_3d.shape)
print("Test datasets 3d shapes: ", X_test_3d.shape, y_test_3d.shape)

Train datasets 3d shapes:  (130457, 23) (130457,)
Test datasets 3d shapes:  (56149, 23) (56149,)


In [35]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-04-16 16:58:49.742949


In [36]:
model_linear_3d = LinearModel_encode_timestamps_dummy_variables()
model_linear_3d.fit(X_train_3d, y_train_3d)

INFO:root:Starting to encode variables
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel_encode_timestamps_dummy_variables at 0x12733f1d0>

In [37]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_3d_time = end - start
print(f"Time elapsed: {model_linear_3d_time}")

End time: 2025-04-16 16:58:50.310613
Time elapsed: 0:00:00.567664


In [38]:
model_linear_3d_eval = model_linear_3d.evaluate(X_test_3d, y_test_3d)
model_linear_3d_eval

(np.float64(158877065133.5584), np.float64(5.021871082909959e+22))

#### Train on a week of data

In [39]:
# We take the last week of the dataset to test the model
days_for_train = 7
days_for_test = 3
end_train_date = pd.to_datetime(start_date) + pd.Timedelta(days=days_for_train)
end_train_date = end_train_date.strftime("%Y-%m-%d")
begin_test_date = pd.to_datetime(end_date) - pd.Timedelta(days=days_for_test-1)
begin_test_date = begin_test_date.strftime("%Y-%m-%d")
print(f'Start date: {start_date} | End test date: {end_train_date} | Begin test date: {begin_test_date} | End date: {end_date}')

Start date: 2024-09-30 | End test date: 2024-10-07 | Begin test date: 2024-10-18 | End date: 2024-10-20


In [40]:
X_train_7d = pd.read_parquet("data/parquet/dataframe.parquet/", filters=[('creation_date', '<', end_train_date)])
y_train_7d = X_train_7d['delivery_entering_timestamp'] - X_train_7d['pickup_timestamp']
y_train_7d = pd.Series(y_train_7d, name='pickup_to_delivery')
X_test_7d = pd.read_parquet("data/parquet/dataframe.parquet", filters=[('creation_date', '>=', begin_test_date)])
y_test_7d = (X_test_7d['delivery_entering_timestamp'] - X_test_7d['pickup_timestamp']).dt.total_seconds()
y_test_7d = pd.Series(y_test_7d, dtype=np.float64, name='pickup_to_delivery')
print("Train datasets 7d shapes: ", X_train_7d.shape, y_train_7d.shape)
print("Test datasets 7d shapes: ", X_test_7d.shape, y_test_7d.shape)

Train datasets 7d shapes:  (345313, 23) (345313,)
Test datasets 7d shapes:  (169275, 23) (169275,)


In [41]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-04-16 16:58:50.745966


In [44]:
model_linear_7d = LinearModel_encode_timestamps_dummy_variables()
model_linear_7d.fit(X_train_7d, y_train_7d)

INFO:root:Starting to encode variables
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel_encode_timestamps_dummy_variables at 0x1273a07a0>

In [45]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_7d_time = end - start
print(f"Time elapsed: {model_linear_7d_time}")

End time: 2025-04-16 17:01:22.740468
Time elapsed: 0:02:31.994502


In [46]:
model_linear_7d_eval = model_linear_7d.evaluate(X_test_7d, y_test_7d)
model_linear_7d_eval

(np.float64(368782322771.5103), np.float64(1.7798054292709185e+23))

#### Train on full data

In [47]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-04-16 17:01:23.175929


In [48]:
model_linear = LinearModel_encode_timestamps_dummy_variables()
model_linear.fit(X_train, y_train)

INFO:root:Starting to encode variables
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel_encode_timestamps_dummy_variables at 0x12733ea20>

In [49]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_time = end - start
print(f"Time elapsed: {model_linear_time}")

End time: 2025-04-16 17:01:25.688226
Time elapsed: 0:00:02.512297


In [50]:
model_linear_eval = model_linear.evaluate(X_test, y_test)
model_linear_eval

(np.float64(448.9642176416416), np.float64(282205.13412409485))

### LinearModelSGD_encodingdummyvariables

In [51]:
model_linear_SGD_smaller = LinearModelSGD_encode_timestamps_dummy_variables()
model_linear_SGD_smaller.fit(X_train_smaller, y_train_smaller)

INFO:root:Starting to encode variables
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModelSGD_encode_timestamps_dummy_variables at 0x1273a2ab0>

In [52]:
X_test_expanded = X_train_smaller.copy()
X_test_expanded['y_test_predicted'] = model_linear_SGD_smaller.predict(X_test_expanded)
X_test_expanded['y_test'] = (X_test_expanded['delivery_entering_timestamp'] - X_test_expanded['pickup_timestamp']).dt.total_seconds()
X_test_expanded['diff'] = X_test_expanded['y_test_predicted'] - X_test_expanded['y_test']
X_test_expanded

Unnamed: 0,country_code,order_id,courier_id,creation_timestamp,activation_timestamp,transport,pickup_timestamp,delivery_entering_timestamp,delivery_timestamp,pickup_latitude,...,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,creation_date,city_code,time,velocity,y_test_predicted,y_test,diff
0,ES,100901000206,169065712,2024-09-30 19:00:49+00:00,2024-09-30 19:00:52+00:00,CAR,2024-09-30 19:20:41.337000+00:00,2024-09-30 19:30:56+00:00,2024-09-30 19:38:11+00:00,38.370686,...,3147.419444,3147.419444,4371.550539,2024-09-30,ALC,614.663,5.120561,6.147100e+11,614.663,6.147100e+11
1,ES,100901487351,173633632,2024-09-30 21:54:58+00:00,2024-09-30 21:54:59+00:00,MOTORBIKE,2024-09-30 21:58:40.678000+00:00,2024-09-30 22:12:29+00:00,2024-09-30 22:16:54+00:00,38.342865,...,1959.901021,1959.901021,2228.776817,2024-09-30,ALC,828.322,2.366110,8.282348e+11,828.322,8.282347e+11
2,ES,100901615508,2320936,2024-09-30 23:17:55+00:00,2024-09-30 23:17:55+00:00,CAR,2024-09-30 23:25:52.013000+00:00,2024-09-30 23:34:12+00:00,2024-09-30 23:35:27+00:00,38.425022,...,3043.077176,3043.077176,4304.543962,2024-09-30,ALC,499.987,6.086313,5.000207e+11,499.987,5.000207e+11
3,ES,100900153723,174681565,2024-09-30 12:50:30+00:00,2024-09-30 12:50:31+00:00,MOTORBIKE,2024-09-30 12:51:54.134000+00:00,2024-09-30 12:53:27+00:00,2024-09-30 12:56:56+00:00,38.387657,...,384.769891,384.769891,538.640403,2024-09-30,ALC,92.866,4.143281,9.290172e+10,92.866,9.290172e+10
4,ES,100900235936,142421923,2024-09-30 13:23:50+00:00,2024-09-30 13:23:51+00:00,CAR,2024-09-30 13:49:36.097000+00:00,2024-09-30 13:57:33+00:00,2024-09-30 14:01:33+00:00,38.408985,...,2321.097615,2321.097615,2443.128342,2024-09-30,ALC,476.903,4.867022,4.769593e+11,476.903,4.769592e+11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,ES,100901315231,146195464,2024-09-30 20:49:51+00:00,2024-09-30 20:49:52+00:00,CAR,2024-09-30 20:59:02.847000+00:00,2024-09-30 21:04:33+00:00,2024-09-30 21:05:24+00:00,38.362377,...,1485.176423,1485.176423,2054.798497,2024-09-30,ALC,330.153,4.498449,3.301438e+11,330.153,3.301438e+11
996,ES,100901448857,141248791,2024-09-30 21:38:14+00:00,2024-09-30 21:38:15+00:00,MOTORBIKE,2024-09-30 21:46:44.524000+00:00,2024-09-30 22:00:39+00:00,2024-09-30 22:01:02+00:00,38.362377,...,6872.882288,6872.882288,9730.640108,2024-09-30,ALC,834.476,8.236165,8.344987e+11,834.476,8.344987e+11
997,ES,100901466311,4623155,2024-09-30 21:45:32+00:00,2024-09-30 21:45:33+00:00,BICYCLE,2024-09-30 21:59:36.356000+00:00,2024-09-30 22:11:08+00:00,2024-09-30 22:15:15+00:00,38.363693,...,2490.755136,2490.755136,3220.661498,2024-09-30,ALC,691.644,3.601210,6.915882e+11,691.644,6.915882e+11
998,ES,100901492373,168086136,2024-09-30 21:57:16+00:00,2024-09-30 21:57:17+00:00,CAR,2024-09-30 22:13:17.522000+00:00,2024-09-30 22:20:54+00:00,2024-09-30 22:24:49+00:00,38.345390,...,1772.502372,1772.502372,2455.201727,2024-09-30,ALC,456.478,3.882996,4.564319e+11,456.478,4.564319e+11


In [53]:
model_linear_SGD_smaller.predict(X_test.iloc[[0]])

array([-2.20773416e+10])

#### Train on 3 days of data

In [54]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-04-16 17:01:26.748369


In [55]:
model_linear_SGD_3d = LinearModelSGD_encode_timestamps_dummy_variables()
model_linear_SGD_3d.fit(X_train_3d, y_train_3d)

INFO:root:Starting to encode variables
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModelSGD_encode_timestamps_dummy_variables at 0x1273a3e00>

In [56]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_SGD_3d_time = end - start
print(f"Time elapsed: {model_linear_SGD_3d_time}")

End time: 2025-04-16 17:01:28.675566
Time elapsed: 0:00:01.927197


In [57]:
model_linear_SGD_3d_eval = model_linear_SGD_3d.evaluate(X_test_3d, y_test_3d)
model_linear_SGD_3d_eval

(np.float64(247012095679.81055), np.float64(1.047144264142921e+23))

#### Train on a week of data

In [58]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-04-16 17:01:28.890246


In [59]:
model_linear_SGD_7d = LinearModelSGD_encode_timestamps_dummy_variables()
model_linear_SGD_7d.fit(X_train_7d, y_train_7d)

INFO:root:Starting to encode variables
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModelSGD_encode_timestamps_dummy_variables at 0x12736fb00>

In [60]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_SGD_7d_time = end - start
print(f"Time elapsed: {model_linear_SGD_7d_time}")

End time: 2025-04-16 17:01:36.343159
Time elapsed: 0:00:07.452913


In [61]:
model_linear_SGD_7d_eval = model_linear_SGD_7d.evaluate(X_test_7d, y_test_7d)
model_linear_SGD_7d_eval

(np.float64(294733988279.1774), np.float64(1.2478339271760248e+23))

#### Train on full data

In [62]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-04-16 17:01:36.864852


In [63]:
model_linear_SGD = LinearModelSGD_encode_timestamps_dummy_variables()
model_linear_SGD.fit(X_train, y_train)

INFO:root:Starting to encode variables
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModelSGD_encode_timestamps_dummy_variables at 0x126ff7b30>

In [64]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_SGD_time = end - start
print(f"Time elapsed: {model_linear_SGD_time}")

End time: 2025-04-16 17:01:41.089952
Time elapsed: 0:00:04.225100


In [65]:
model_linear_SGD_eval = model_linear_SGD.evaluate(X_test, y_test)
model_linear_SGD_eval

(np.float64(1977032564.3140514), np.float64(4.542000142776821e+18))

### LinearModel_cyclical_encoding

In [66]:
model_linear_cyclical_smaller = LinearModel_encode_timestamps_cyclical()
model_linear_cyclical_smaller.fit(X_train_smaller, y_train_smaller)

INFO:root:Starting to encode variables
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel_encode_timestamps_cyclical at 0x126ff7530>

In [67]:
X_test_expanded = X_train_smaller.copy()
X_test_expanded['y_test_predicted'] = model_linear_cyclical_smaller.predict(X_test_expanded)
X_test_expanded['y_test'] = (X_test_expanded['delivery_entering_timestamp'] - X_test_expanded['pickup_timestamp']).dt.total_seconds()
X_test_expanded['diff'] = X_test_expanded['y_test_predicted'] - X_test_expanded['y_test']
X_test_expanded

Unnamed: 0,country_code,order_id,courier_id,creation_timestamp,activation_timestamp,transport,pickup_timestamp,delivery_entering_timestamp,delivery_timestamp,pickup_latitude,...,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,creation_date,city_code,time,velocity,y_test_predicted,y_test,diff
0,ES,100901000206,169065712,2024-09-30 19:00:49+00:00,2024-09-30 19:00:52+00:00,CAR,2024-09-30 19:20:41.337000+00:00,2024-09-30 19:30:56+00:00,2024-09-30 19:38:11+00:00,38.370686,...,3147.419444,3147.419444,4371.550539,2024-09-30,ALC,614.663,5.120561,6.146630e+11,614.663,6.146630e+11
1,ES,100901487351,173633632,2024-09-30 21:54:58+00:00,2024-09-30 21:54:59+00:00,MOTORBIKE,2024-09-30 21:58:40.678000+00:00,2024-09-30 22:12:29+00:00,2024-09-30 22:16:54+00:00,38.342865,...,1959.901021,1959.901021,2228.776817,2024-09-30,ALC,828.322,2.366110,8.283220e+11,828.322,8.283220e+11
2,ES,100901615508,2320936,2024-09-30 23:17:55+00:00,2024-09-30 23:17:55+00:00,CAR,2024-09-30 23:25:52.013000+00:00,2024-09-30 23:34:12+00:00,2024-09-30 23:35:27+00:00,38.425022,...,3043.077176,3043.077176,4304.543962,2024-09-30,ALC,499.987,6.086313,4.999870e+11,499.987,4.999870e+11
3,ES,100900153723,174681565,2024-09-30 12:50:30+00:00,2024-09-30 12:50:31+00:00,MOTORBIKE,2024-09-30 12:51:54.134000+00:00,2024-09-30 12:53:27+00:00,2024-09-30 12:56:56+00:00,38.387657,...,384.769891,384.769891,538.640403,2024-09-30,ALC,92.866,4.143281,9.286600e+10,92.866,9.286600e+10
4,ES,100900235936,142421923,2024-09-30 13:23:50+00:00,2024-09-30 13:23:51+00:00,CAR,2024-09-30 13:49:36.097000+00:00,2024-09-30 13:57:33+00:00,2024-09-30 14:01:33+00:00,38.408985,...,2321.097615,2321.097615,2443.128342,2024-09-30,ALC,476.903,4.867022,4.769030e+11,476.903,4.769030e+11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,ES,100901315231,146195464,2024-09-30 20:49:51+00:00,2024-09-30 20:49:52+00:00,CAR,2024-09-30 20:59:02.847000+00:00,2024-09-30 21:04:33+00:00,2024-09-30 21:05:24+00:00,38.362377,...,1485.176423,1485.176423,2054.798497,2024-09-30,ALC,330.153,4.498449,3.301530e+11,330.153,3.301530e+11
996,ES,100901448857,141248791,2024-09-30 21:38:14+00:00,2024-09-30 21:38:15+00:00,MOTORBIKE,2024-09-30 21:46:44.524000+00:00,2024-09-30 22:00:39+00:00,2024-09-30 22:01:02+00:00,38.362377,...,6872.882288,6872.882288,9730.640108,2024-09-30,ALC,834.476,8.236165,8.344760e+11,834.476,8.344760e+11
997,ES,100901466311,4623155,2024-09-30 21:45:32+00:00,2024-09-30 21:45:33+00:00,BICYCLE,2024-09-30 21:59:36.356000+00:00,2024-09-30 22:11:08+00:00,2024-09-30 22:15:15+00:00,38.363693,...,2490.755136,2490.755136,3220.661498,2024-09-30,ALC,691.644,3.601210,6.916440e+11,691.644,6.916440e+11
998,ES,100901492373,168086136,2024-09-30 21:57:16+00:00,2024-09-30 21:57:17+00:00,CAR,2024-09-30 22:13:17.522000+00:00,2024-09-30 22:20:54+00:00,2024-09-30 22:24:49+00:00,38.345390,...,1772.502372,1772.502372,2455.201727,2024-09-30,ALC,456.478,3.882996,4.564780e+11,456.478,4.564780e+11


In [68]:
model_linear_cyclical_smaller.predict(X_test.iloc[[0]])

array([-0.0773096])

#### Train on full data

In [69]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-04-16 17:01:42.605014


In [70]:
model_linear_cyclical = LinearModel_encode_timestamps_cyclical()
model_linear_cyclical.fit(X_train, y_train)

INFO:root:Starting to encode variables
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel_encode_timestamps_cyclical at 0x12815c740>

In [71]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_cyclical_time = end - start
print(f"Time elapsed: {model_linear_cyclical_time}")

End time: 2025-04-16 17:01:51.661304
Time elapsed: 0:00:09.056290


In [72]:
model_linear_cyclical_eval = model_linear_cyclical.evaluate(X_test, y_test)
model_linear_cyclical_eval

(np.float64(448.9668320386565), np.float64(282204.5640406292))

### LinearModel

Compare performance of the model with and without standardization

## Models comparison

In [82]:
models = ['model_bl_sum' , 'model_bl_mean', 'model_linear_3d', 'model_linear_7d', 'model_linear', 'model_linear_SGD_3d', 'model_linear_SGD_7d', 'model_linear_SGD', 'model_linear_cyclical']

data = []
for model in models:
    time_value = eval(model + '_time')
    print(f"Time to fit {model}: {time_value}")
    eval_value = eval(model + '_eval')
    print(f"Evaluation of {model}: {eval_value}")
    data.append([model, time_value / datetime.timedelta(milliseconds=1), eval_value[0], eval_value[1]])

models_eval = pd.DataFrame(data, columns=['Model', 'Training time [ms]', 'MAE', 'MSE'])
models_eval

Time to fit model_bl_sum: 0:00:00.578611
Evaluation of model_bl_sum: (np.float64(145.6280391037287), np.float64(52902.38788391615))
Time to fit model_bl_mean: 0:00:00.334458
Evaluation of model_bl_mean: (np.float64(161.23709324791398), np.float64(64837.86641313063))
Time to fit model_linear_3d: 0:00:00.567664
Evaluation of model_linear_3d: (np.float64(158877065133.5584), np.float64(5.021871082909959e+22))
Time to fit model_linear_7d: 0:02:31.994502
Evaluation of model_linear_7d: (np.float64(368782322771.5103), np.float64(1.7798054292709185e+23))
Time to fit model_linear: 0:00:02.512297
Evaluation of model_linear: (np.float64(448.9642176416416), np.float64(282205.13412409485))
Time to fit model_linear_SGD_3d: 0:00:01.927197
Evaluation of model_linear_SGD_3d: (np.float64(247012095679.81055), np.float64(1.047144264142921e+23))
Time to fit model_linear_SGD_7d: 0:00:07.452913
Evaluation of model_linear_SGD_7d: (np.float64(294733988279.1774), np.float64(1.2478339271760248e+23))
Time to fit m

Unnamed: 0,Model,Training time [ms],MAE,MSE
0,model_bl_sum,578.61,145.63,52902.39
1,model_bl_mean,334.46,161.24,64837.87
2,model_linear_3d,567.66,158877065133.56,5.021871082909959e+22
3,model_linear_7d,151994.5,368782322771.51,1.7798054292709185e+23
4,model_linear,2512.3,448.96,282205.13
5,model_linear_SGD_3d,1927.2,247012095679.81,1.047144264142921e+23
6,model_linear_SGD_7d,7452.91,294733988279.18,1.2478339271760248e+23
7,model_linear_SGD,4225.1,1977032564.31,4.5420001427768207e+18
8,model_linear_cyclical,9056.29,448.97,282204.56
