# Pickup to Delivery Overall

In [1]:
import os
import sys
import datetime
import pandas as pd
pd.options.display.float_format = '{:.2f}'.format
import numpy as np
import pickle

sys.path.insert(0, os.path.expanduser('./'))
import query_runner as qr
import utils
from estimator import BaselineModel_sum, BaselineModel_mean, LinearModel

In [2]:
base_query_path = './queries/'
dwh_config, livedb_config, parameters_config = utils.load_config(config_file='./config.ini')
datalake_connection = qr.create_connection(db='datalake')
#monolith_connection = qr.create_connection(user=livedb_config['monolith_username'], password=livedb_config['monolith_password'], db='livedb')
#dispatching_db_connection = qr.create_connection(user=livedb_config['dispatching_db_username'], password=livedb_config['dispatching_db_password'], db='dispatchingdb')

INFO:trino.auth:keyring module not found. OAuth2 token will not be stored in keyring.


In [3]:
start_date = parameters_config['start_date']
end_date = parameters_config['end_date']
country_code = parameters_config['country_code']
cities = parameters_config['cities']

print(f'Start date: {start_date} | End date: {end_date} | Countries: {country_code} | Cities: {cities}')

Start date: 2024-09-30 | End date: 2024-10-20 | Countries: ES | Cities: 'MAD', 'BCN', 'SEV', 'ALC'


In [4]:
parameters = {
    'start_date': start_date,
    'end_date': end_date,
    'country_code': country_code,
    'cities': cities
}

## Load the dataset

In [5]:
data = pd.read_parquet("data/parquet/dataframe.parquet")

## Hyperparameters

In [6]:
test_set_perc = 0.1
days_for_test = 7
k_cv = 5

## Database split

As we have partitioned the data by city and creation date, we can use this information to split the data. This will help to avoid data leakage, as we will not have data from the future in the training set.
This is much better than just sorting the data by the creation timestamp and taking 10% of the dataset as test set, as we did before.

In [7]:
# We take the last week of the dataset to test the model
begin_test_date = pd.to_datetime(end_date) - pd.Timedelta(days=days_for_test-1)
begin_test_date = begin_test_date.strftime("%Y-%m-%d")
print(f'Start date: {start_date} | Begin test date: {begin_test_date} | End date: {end_date}')

Start date: 2024-09-30 | Begin test date: 2024-10-14 | End date: 2024-10-20


In [8]:
data_train = pd.read_parquet("data/parquet/dataframe.parquet/", filters=[('creation_date', '<', begin_test_date)])
data_train.head()

Unnamed: 0,country_code,order_id,courier_id,creation_timestamp,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,delivery_timestamp,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,creation_date,city_code
0,ES,100901465337,169809943,2024-09-30 21:45:07+00:00,2024-09-30 21:45:08+00:00,MOTORBIKE,38.36,-0.49,38.38,-0.49,...,2024-09-30 22:10:01+00:00,0.67,-0.01,0.67,-0.01,2355.55,2355.55,2561.9,2024-09-30,ALC
1,ES,100901489101,9516729,2024-09-30 21:55:46+00:00,2024-09-30 21:55:47+00:00,CAR,38.37,-0.42,38.36,-0.42,...,2024-09-30 22:09:18+00:00,0.67,-0.01,0.67,-0.01,673.66,673.66,903.52,2024-09-30,ALC
2,ES,100899913152,172855743,2024-09-30 10:30:55+00:00,2024-09-30 10:30:57+00:00,MOTORBIKE,38.35,-0.49,38.35,-0.49,...,2024-09-30 10:49:56+00:00,0.67,-0.01,0.67,-0.01,614.69,614.69,622.46,2024-09-30,ALC
3,ES,100900447439,170201413,2024-09-30 14:46:15+00:00,2024-09-30 14:46:16+00:00,CAR,38.37,-0.47,38.36,-0.44,...,2024-09-30 15:11:36+00:00,0.67,-0.01,0.67,-0.01,2962.48,2962.48,3332.04,2024-09-30,ALC
4,ES,100900529830,176424631,2024-09-30 15:22:32+00:00,2024-09-30 15:22:33+00:00,MOTORBIKE,38.35,-0.48,38.35,-0.5,...,2024-09-30 15:56:07+00:00,0.67,-0.01,0.67,-0.01,1708.76,1708.76,1711.24,2024-09-30,ALC


In [9]:
# Check that there are no nulls deriving from a wrong writing of parquet files (appending instead of overwriting)
data_train.isnull().sum().sum()

np.int64(0)

We will only use the feature `activation_timestamp`, as we are simulating being Glovo Jarvis engine that has to estimate the PDO time in order to decide to assign an order to a specific courier. The column `creation_timestamp` is threfore redundant and we will not include it in the model.

In [10]:
train_columns = ['country_code', 'city_code', 'order_id', 'courier_id', 'activation_timestamp', 'transport', 'pickup_latitude', 'pickup_longitude',
                 'delivery_latitude', 'delivery_longitude', 'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad', 'delivery_latitude_rad',
                 'delivery_longitude_rad', 'pd_distance_haversine_m', 'pd_distance_haversine_m_sk', 'pd_distance_manhattan_m']
X_train = data_train[train_columns].copy()
X_train.head()

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,time_zone,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m
0,ES,ALC,100901465337,169809943,2024-09-30 21:45:08+00:00,MOTORBIKE,38.36,-0.49,38.38,-0.49,Europe/Madrid,0.67,-0.01,0.67,-0.01,2355.55,2355.55,2561.9
1,ES,ALC,100901489101,9516729,2024-09-30 21:55:47+00:00,CAR,38.37,-0.42,38.36,-0.42,Europe/Madrid,0.67,-0.01,0.67,-0.01,673.66,673.66,903.52
2,ES,ALC,100899913152,172855743,2024-09-30 10:30:57+00:00,MOTORBIKE,38.35,-0.49,38.35,-0.49,Europe/Madrid,0.67,-0.01,0.67,-0.01,614.69,614.69,622.46
3,ES,ALC,100900447439,170201413,2024-09-30 14:46:16+00:00,CAR,38.37,-0.47,38.36,-0.44,Europe/Madrid,0.67,-0.01,0.67,-0.01,2962.48,2962.48,3332.04
4,ES,ALC,100900529830,176424631,2024-09-30 15:22:33+00:00,MOTORBIKE,38.35,-0.48,38.35,-0.5,Europe/Madrid,0.67,-0.01,0.67,-0.01,1708.76,1708.76,1711.24


In [11]:
y_train = (data_train['delivery_timestamp'] - data_train['pickup_timestamp']).dt.total_seconds()
y_train = pd.Series(y_train, name='pickup_to_delivery')
y_train.head()

0   564.31
1   464.86
2   511.03
3   906.38
4   680.04
Name: pickup_to_delivery, dtype: float64

In [12]:
y_train.isnull().sum().sum()

np.int64(0)

In [13]:
data_test = pd.read_parquet("data/parquet/dataframe.parquet", filters=[('creation_date', '>=', begin_test_date)])
data_test.head()

Unnamed: 0,country_code,order_id,courier_id,creation_timestamp,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,delivery_timestamp,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,creation_date,city_code
0,ES,100933187613,176241347,2024-10-14 23:24:53+00:00,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,2024-10-14 23:45:58+00:00,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,2024-10-14,ALC
1,ES,100933198445,10191824,2024-10-14 23:37:45+00:00,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,2024-10-15 00:04:30+00:00,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,2024-10-14,ALC
2,ES,100931749423,169099229,2024-10-14 12:57:37+00:00,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,2024-10-14 13:22:05+00:00,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,2024-10-14,ALC
3,ES,100931867858,177428955,2024-10-14 13:44:05+00:00,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,2024-10-14 14:26:28+00:00,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,2024-10-14,ALC
4,ES,100931917343,3548605,2024-10-14 14:03:19+00:00,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,2024-10-14 14:28:00+00:00,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,2024-10-14,ALC


In [14]:
X_test = data_test[train_columns].copy()
X_test.head()

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,time_zone,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,Europe/Madrid,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,Europe/Madrid,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,Europe/Madrid,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,Europe/Madrid,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,Europe/Madrid,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43


In [15]:
X_test.isnull().sum().sum()

np.int64(0)

In [16]:
y_test = (data_test['delivery_timestamp'] - data_test['pickup_timestamp']).dt.total_seconds()
y_test = pd.Series(y_test, dtype=np.float64, name='pickup_to_delivery')
y_test.head()

0    319.68
1    811.65
2    614.70
3   1111.15
4    806.57
Name: pickup_to_delivery, dtype: float64

In [17]:
y_test.isnull().sum().sum()

np.int64(0)

In [18]:
print("Train datasets shapes: ", X_train.shape, y_train.shape)
print("Test datasets shapes: ", X_test.shape, y_test.shape)

Train datasets shapes:  (688815, 18) (688815,)
Test datasets shapes:  (354799, 18) (354799,)


## Baseline Models

### BaselineModel_sum

In [19]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-05-19 18:28:41.987450


In [20]:
model_bl_sum = BaselineModel_sum()
model_bl_sum.fit(X_train, y_train)

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)


<estimator.BaselineModel_sum at 0x121b829f0>

In [21]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_bl_sum_time = end - start
print(f"Time elapsed: {model_bl_sum_time}")

End time: 2025-05-19 18:28:42.418530
Time elapsed: 0:00:00.431080


In [22]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_bl_sum.predict(X_test.iloc[0]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 345.14675182584176
Diff: 25.46975182584174


In [23]:
results_dataset = X_test
results_dataset['y_test'] = y_test
results_dataset['y_test_predicted'] = model_bl_sum.predict(X_test)
results_dataset['diff'] = results_dataset['y_test_predicted'] - results_dataset['y_test']
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,y_test,y_test_predicted,diff
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,345.15,25.47
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,344.1,-467.55
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,862.35,247.65
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,1425.92,314.76
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,266.35,-540.22
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,2713.04,1607.68
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,364.35,73.8
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,1097.86,343.07
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,855.01,-621.79
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,916.85,294.82


### Evaluation pipeline

In [24]:
model_bl_sum_eval = model_bl_sum.evaluate(X_test, y_test)
model_bl_sum_eval

y_hat <class 'pandas.core.series.Series'> float64
y_test <class 'pandas.core.series.Series'> float64


(np.float64(256.3865388687307), np.float64(179628.8794954057))

### BaselineModel_mean

In [25]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-05-19 18:29:23.593411


In [26]:
model_bl_mean = BaselineModel_mean()
model_bl_mean.fit(X_train, y_train)

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)


<estimator.BaselineModel_mean at 0x121bcff50>

In [27]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_bl_mean_time = end - start
print(f"Time elapsed: {model_bl_mean_time}")

End time: 2025-05-19 18:29:23.893220
Time elapsed: 0:00:00.299809


In [28]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_bl_mean.predict(X_test.iloc[0]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 292.3004304172709
Diff: -27.376569582729132


In [29]:
results_dataset = X_test
results_dataset['y_test2'] = y_test
results_dataset['y_test_predicted'] = model_bl_mean.predict(X_test)
results_dataset['diff'] = results_dataset['y_test_predicted'] - results_dataset['y_test']
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,y_test,y_test_predicted,diff,y_test2
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,292.3,-27.38,319.68
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,281.54,-530.11,811.65
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,659.83,45.13,614.7
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,1207.59,96.44,1111.15
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,203.8,-602.77,806.57
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,2219.77,1114.41,1105.37
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,298.11,7.55,290.56
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,840.03,85.24,754.79
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,724.09,-752.71,1476.8
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,750.15,128.12,622.03


### Evaluation pipeline

In [30]:
model_bl_mean_eval = model_bl_mean.evaluate(X_test, y_test)
model_bl_mean_eval

(np.float64(239.81356923333743), np.float64(157993.39824654086))

## Linear Model

In [31]:
# Train on a small subset to check if the model is working
X_train_smaller = X_train.head(1000)
y_train_smaller = y_train.head(1000)

### Linear Model encoding dummy variables

In [32]:
model_linear_smaller = LinearModel(model_type='linear', encoding='dummy')
model_linear_smaller.fit(X_train_smaller, y_train_smaller)

INFO:root:Train datasets shapes: X: (1000, 19), y: (1000,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (1000, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x121bccad0>

In [33]:
y_test_0 = y_train_smaller.loc[0]
y_test_0_pred = model_linear_smaller.predict(X_train_smaller.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 564.308
Predicted delivery time: 781.9825627692044
Diff: 217.67456276920439


#### Train on 3 days of data

In [34]:
# We take the last week of the dataset to test the model
days_for_train = 3
days_for_test = 1
end_train_date = pd.to_datetime(start_date) + pd.Timedelta(days=days_for_train)
end_train_date = end_train_date.strftime("%Y-%m-%d")
print(f'Start date: {start_date} | End test date: {end_train_date} | Test date: {end_date}')

Start date: 2024-09-30 | End test date: 2024-10-03 | Test date: 2024-10-20


In [35]:
data_train_3d = pd.read_parquet("data/parquet/dataframe.parquet/", filters=[('creation_date', '<', end_train_date)])
X_train_3d = data_train_3d[train_columns].copy()
y_train_3d = y_train[X_train_3d.index]
y_train_3d = pd.Series(y_train_3d, name='pickup_to_delivery')
data_test_3d = pd.read_parquet("data/parquet/dataframe.parquet", filters=[('creation_date', '=', end_date)])
X_test_3d = data_test_3d[train_columns].copy()
y_test_3d = y_test[X_test_3d.index]
y_test_3d = pd.Series(y_test_3d, dtype=np.float64, name='pickup_to_delivery')
print("Train datasets 3d shapes: ", X_train_3d.shape, y_train_3d.shape)
print("Test datasets 3d shapes: ", X_test_3d.shape, y_test_3d.shape)

Train datasets 3d shapes:  (132533, 18) (132533,)
Test datasets 3d shapes:  (56857, 18) (56857,)


In [36]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-05-19 18:29:59.959567


In [37]:
model_linear_3d = LinearModel(model_type='linear', encoding='dummy')
model_linear_3d.fit(X_train_3d, y_train_3d)

INFO:root:Train datasets shapes: X: (132533, 18), y: (132533,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (132533, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x121bcfec0>

In [38]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_3d_time = end - start
print(f"Time elapsed: {model_linear_3d_time}")

End time: 2025-05-19 18:30:00.247373
Time elapsed: 0:00:00.287806


In [39]:
model_linear_3d_eval = model_linear_3d.evaluate(X_test_3d, y_test_3d)
model_linear_3d_eval

(np.float64(4850.462204065004), np.float64(23788905.201602723))

#### Train on a week of data

In [40]:
# We take the last week of the dataset to test the model
days_for_train = 7
days_for_test = 3
end_train_date = pd.to_datetime(start_date) + pd.Timedelta(days=days_for_train)
end_train_date = end_train_date.strftime("%Y-%m-%d")
begin_test_date = pd.to_datetime(end_date) - pd.Timedelta(days=days_for_test-1)
begin_test_date = begin_test_date.strftime("%Y-%m-%d")
print(f'Start date: {start_date} | End test date: {end_train_date} | Begin test date: {begin_test_date} | End date: {end_date}')

Start date: 2024-09-30 | End test date: 2024-10-07 | Begin test date: 2024-10-18 | End date: 2024-10-20


In [41]:
data_train_7d = pd.read_parquet("data/parquet/dataframe.parquet/", filters=[('creation_date', '<', end_train_date)])
X_train_7d = data_train_7d[train_columns].copy()
y_train_7d = y_train[X_train_7d.index]
y_train_7d = pd.Series(y_train_7d, name='pickup_to_delivery')
data_test_7d = pd.read_parquet("data/parquet/dataframe.parquet", filters=[('creation_date', '>=', begin_test_date)])
X_test_7d = data_test_7d[train_columns].copy()
y_test_7d = y_test[X_test_7d.index]
y_test_7d = pd.Series(y_test_7d, dtype=np.float64, name='pickup_to_delivery')
print("Train datasets 7d shapes: ", X_train_7d.shape, y_train_7d.shape)
print("Test datasets 7d shapes: ", X_test_7d.shape, y_test_7d.shape)

Train datasets 7d shapes:  (350485, 18) (350485,)
Test datasets 7d shapes:  (171489, 18) (171489,)


In [42]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-05-19 18:30:00.810731


In [43]:
model_linear_7d = LinearModel(model_type='linear', encoding='dummy')
model_linear_7d.fit(X_train_7d, y_train_7d)

INFO:root:Train datasets shapes: X: (350485, 18), y: (350485,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (350485, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x121b82390>

In [44]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_7d_time = end - start
print(f"Time elapsed: {model_linear_7d_time}")

End time: 2025-05-19 18:30:01.600832
Time elapsed: 0:00:00.790101


In [45]:
model_linear_7d_eval = model_linear_7d.evaluate(X_test_7d, y_test_7d)
model_linear_7d_eval

(np.float64(300.4289466948376), np.float64(198128.49677930487))

#### Train on full data

In [46]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-05-19 18:30:01.823958


In [47]:
model_linear = LinearModel(model_type='linear', encoding='dummy')
model_linear.fit(X_train, y_train)

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 27)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x12e6282f0>

In [48]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_time = end - start
print(f"Time elapsed: {model_linear_time}")

End time: 2025-05-19 18:30:03.283152
Time elapsed: 0:00:01.459194


In [49]:
results_dataset = X_test
results_dataset['y_test'] = y_test
results_dataset['y_test_predicted'] = model_linear.predict(X_test)
results_dataset['diff'] = results_dataset['y_test_predicted'] - results_dataset['y_test']
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,y_test,y_test_predicted,diff,y_test2
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,425.07,105.39,319.68
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,413.37,-398.27,811.65
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,731.36,116.66,614.7
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,894.0,-217.16,1111.15
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,399.77,-406.79,806.57
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,1540.69,435.33,1105.37
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,437.36,146.8,290.56
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,861.22,106.43,754.79
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,628.16,-848.64,1476.8
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,694.49,72.46,622.03


In [50]:
model_linear_eval = model_linear.evaluate(X_test, y_test)
model_linear_eval

(np.float64(183.02056888939788), np.float64(114183.02259706463))

### Linear Model SGD encoding dummy variables

In [51]:
model_linear_SGD_smaller = LinearModel(model_type='SGD', encoding='dummy')
model_linear_SGD_smaller.fit(X_train_smaller, y_train_smaller)

INFO:root:Train datasets shapes: X: (1000, 19), y: (1000,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (1000, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x12e62acc0>

In [52]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_linear_SGD_smaller.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: -3035.0966353792974
Diff: -3354.7736353792975


#### Train on 3 days of data

In [53]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-05-19 18:30:04.305174


In [54]:
model_linear_SGD_3d = LinearModel(model_type='sgd', encoding='dummy')
model_linear_SGD_3d.fit(X_train_3d, y_train_3d)

INFO:root:Train datasets shapes: X: (132533, 18), y: (132533,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (132533, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x12e62a990>

In [55]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_SGD_3d_time = end - start
print(f"Time elapsed: {model_linear_SGD_3d_time}")

End time: 2025-05-19 18:30:06.216894
Time elapsed: 0:00:01.911720


In [56]:
model_linear_SGD_3d_eval = model_linear_SGD_3d.evaluate(X_test_3d, y_test_3d)
model_linear_SGD_3d_eval

(np.float64(4719.729311191085), np.float64(22534708.082978602))

#### Train on a week of data

In [57]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-05-19 18:30:06.374374


In [58]:
model_linear_SGD_7d = LinearModel(model_type='SGD', encoding='dummy')
model_linear_SGD_7d.fit(X_train_7d, y_train_7d)

INFO:root:Train datasets shapes: X: (350485, 18), y: (350485,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (350485, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x12e629fa0>

In [59]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_SGD_7d_time = end - start
print(f"Time elapsed: {model_linear_SGD_7d_time}")

End time: 2025-05-19 18:30:11.283852
Time elapsed: 0:00:04.909478


In [60]:
model_linear_SGD_7d_eval = model_linear_SGD_7d.evaluate(X_test_7d, y_test_7d)
model_linear_SGD_7d_eval

(np.float64(515.492814868408), np.float64(426420.5839580514))

#### Train on full data

In [61]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-05-19 18:30:11.664485


In [62]:
model_linear_SGD = LinearModel(model_type='sgd', encoding='dummy')
model_linear_SGD.fit(X_train, y_train)

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 27)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x12e628050>

In [63]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_SGD_time = end - start
print(f"Time elapsed: {model_linear_SGD_time}")

End time: 2025-05-19 18:30:16.511512
Time elapsed: 0:00:04.847027


In [64]:
results_dataset = X_test
results_dataset['y_test'] = y_test
results_dataset['y_test_predicted'] = model_linear_SGD.predict(X_test)
results_dataset['diff'] = results_dataset['y_test_predicted'] - results_dataset['y_test']
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,y_test,y_test_predicted,diff,y_test2
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,416.75,97.08,319.68
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,388.64,-423.01,811.65
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,712.25,97.55,614.7
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,854.13,-257.03,1111.15
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,369.99,-436.58,806.57
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,1535.39,430.02,1105.37
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,446.62,156.06,290.56
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,793.45,38.66,754.79
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,608.87,-867.93,1476.8
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,661.65,39.62,622.03


In [65]:
model_linear_SGD_eval = model_linear_SGD.evaluate(X_test, y_test)
model_linear_SGD_eval

(np.float64(179.2541895061415), np.float64(116573.86706896461))

### LinearModel cyclical encoding

In [66]:
model_linear_cyclical_smaller = LinearModel(model_type='linear', encoding='cyclical')
model_linear_cyclical_smaller.fit(X_train_smaller, y_train_smaller)

INFO:root:Train datasets shapes: X: (1000, 19), y: (1000,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (1000, 33)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x12e62bad0>

In [67]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_linear_cyclical_smaller.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: -2221.862971879542
Diff: -2541.539971879542


#### Train on full data

In [68]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-05-19 18:30:18.204591


In [69]:
model_linear_cyclical = LinearModel(model_type='linear', encoding='cyclical')
model_linear_cyclical.fit(X_train, y_train)

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 34)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x12e62ab40>

In [70]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_cyclical_time = end - start
print(f"Time elapsed: {model_linear_cyclical_time}")

End time: 2025-05-19 18:30:20.927735
Time elapsed: 0:00:02.723144


In [71]:
results_dataset = X_test
results_dataset['y_test'] = y_test
results_dataset['y_test_predicted'] = model_linear_cyclical.predict(X_test)
results_dataset['diff'] = results_dataset['y_test_predicted'] - results_dataset['y_test']
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,y_test,y_test_predicted,diff,y_test2
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,384.36,64.68,319.68
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,370.8,-440.85,811.65
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,734.68,119.98,614.7
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,899.01,-212.15,1111.15
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,402.98,-403.58,806.57
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,1550.03,444.67,1105.37
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,437.06,146.5,290.56
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,847.54,92.75,754.79
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,602.05,-874.75,1476.8
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,661.98,39.94,622.03


In [72]:
model_linear_cyclical_eval = model_linear_cyclical.evaluate(X_test, y_test)
model_linear_cyclical_eval

(np.float64(183.28648174613807), np.float64(122189.95765097688))

### LinearModel

Compare performance of the model with and without standardization

## Models comparison

In [73]:
models = ['model_bl_sum' , 'model_bl_mean', 'model_linear_3d', 'model_linear_7d', 'model_linear', 'model_linear_SGD_3d', 'model_linear_SGD_7d', 'model_linear_SGD', 'model_linear_cyclical']

data = []
for model in models:
    time_value = eval(model + '_time')
    print(f"Time to fit {model}: {time_value}")
    eval_value = eval(model + '_eval')
    print(f"Evaluation of {model}: {eval_value}")
    data.append([model, time_value / datetime.timedelta(milliseconds=1), eval_value[0], eval_value[1]])

models_eval = pd.DataFrame(data, columns=['Model', 'Training time [ms]', 'MAE', 'MSE'])
models_eval

Time to fit model_bl_sum: 0:00:00.431080
Evaluation of model_bl_sum: (np.float64(256.3865388687307), np.float64(179628.8794954057))
Time to fit model_bl_mean: 0:00:00.299809
Evaluation of model_bl_mean: (np.float64(239.81356923333743), np.float64(157993.39824654086))
Time to fit model_linear_3d: 0:00:00.287806
Evaluation of model_linear_3d: (np.float64(4850.462204065004), np.float64(23788905.201602723))
Time to fit model_linear_7d: 0:00:00.790101
Evaluation of model_linear_7d: (np.float64(300.4289466948376), np.float64(198128.49677930487))
Time to fit model_linear: 0:00:01.459194
Evaluation of model_linear: (np.float64(183.02056888939788), np.float64(114183.02259706463))
Time to fit model_linear_SGD_3d: 0:00:01.911720
Evaluation of model_linear_SGD_3d: (np.float64(4719.729311191085), np.float64(22534708.082978602))
Time to fit model_linear_SGD_7d: 0:00:04.909478
Evaluation of model_linear_SGD_7d: (np.float64(515.492814868408), np.float64(426420.5839580514))
Time to fit model_linear_SGD

Unnamed: 0,Model,Training time [ms],MAE,MSE
0,model_bl_sum,431.08,256.39,179628.88
1,model_bl_mean,299.81,239.81,157993.4
2,model_linear_3d,287.81,4850.46,23788905.2
3,model_linear_7d,790.1,300.43,198128.5
4,model_linear,1459.19,183.02,114183.02
5,model_linear_SGD_3d,1911.72,4719.73,22534708.08
6,model_linear_SGD_7d,4909.48,515.49,426420.58
7,model_linear_SGD,4847.03,179.25,116573.87
8,model_linear_cyclical,2723.14,183.29,122189.96
