# Pickup to Delivery Overall

In [1]:
import os
import sys
import datetime
import pandas as pd
pd.options.display.float_format = '{:.2f}'.format
import numpy as np

sys.path.insert(0, os.path.expanduser('./'))
import query_runner as qr
import utils
from estimator import BaselineModel_sum, BaselineModel_mean, LinearModel, RegressionTreeMethod

In [2]:
dwh_config, livedb_config, parameters_config = utils.load_config(config_file='./config.ini')
start_date = parameters_config['start_date']
end_date = parameters_config['end_date']
country_code = parameters_config['country_code']
cities = parameters_config['cities']

print(f'Start date: {start_date} | End date: {end_date} | Countries: {country_code} | Cities: {cities}')

Start date: 2024-09-30 | End date: 2024-10-20 | Countries: ES | Cities: 'MAD', 'BCN', 'SEV', 'ALC'


## Load the dataset

In [3]:
data = pd.read_parquet("data/parquet/dataframe.parquet")

## Hyperparameters

In [4]:
test_set_perc = 0.1
days_for_test = 7
k_cv = 5

## Database split

As we have partitioned the data by city and creation date, we can use this information to split the data. This will help to avoid data leakage, as we will not have data from the future in the training set.
This is much better than just sorting the data by the creation timestamp and taking 10% of the dataset as test set, as we did before.

In [5]:
# We take the last week of the dataset to test the model
begin_test_date = pd.to_datetime(end_date) - pd.Timedelta(days=days_for_test-1)
begin_test_date = begin_test_date.strftime("%Y-%m-%d")
print(f'Start date: {start_date} | Begin test date: {begin_test_date} | End date: {end_date}')

Start date: 2024-09-30 | Begin test date: 2024-10-14 | End date: 2024-10-20


In [6]:
data_train = pd.read_parquet("data/parquet/dataframe.parquet/", filters=[('creation_date', '<', begin_test_date)])
data_train.head()

Unnamed: 0,country_code,order_id,courier_id,creation_timestamp,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,delivery_timestamp,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,creation_date,city_code
0,ES,100901465337,169809943,2024-09-30 21:45:07+00:00,2024-09-30 21:45:08+00:00,MOTORBIKE,38.36,-0.49,38.38,-0.49,...,2024-09-30 22:10:01+00:00,0.67,-0.01,0.67,-0.01,2355.55,2355.55,2561.9,2024-09-30,ALC
1,ES,100901489101,9516729,2024-09-30 21:55:46+00:00,2024-09-30 21:55:47+00:00,CAR,38.37,-0.42,38.36,-0.42,...,2024-09-30 22:09:18+00:00,0.67,-0.01,0.67,-0.01,673.66,673.66,903.52,2024-09-30,ALC
2,ES,100899913152,172855743,2024-09-30 10:30:55+00:00,2024-09-30 10:30:57+00:00,MOTORBIKE,38.35,-0.49,38.35,-0.49,...,2024-09-30 10:49:56+00:00,0.67,-0.01,0.67,-0.01,614.69,614.69,622.46,2024-09-30,ALC
3,ES,100900447439,170201413,2024-09-30 14:46:15+00:00,2024-09-30 14:46:16+00:00,CAR,38.37,-0.47,38.36,-0.44,...,2024-09-30 15:11:36+00:00,0.67,-0.01,0.67,-0.01,2962.48,2962.48,3332.04,2024-09-30,ALC
4,ES,100900529830,176424631,2024-09-30 15:22:32+00:00,2024-09-30 15:22:33+00:00,MOTORBIKE,38.35,-0.48,38.35,-0.5,...,2024-09-30 15:56:07+00:00,0.67,-0.01,0.67,-0.01,1708.76,1708.76,1711.24,2024-09-30,ALC


In [7]:
# Check that there are no nulls deriving from a wrong writing of parquet files (appending instead of overwriting)
data_train.isnull().sum().sum()

np.int64(0)

We will only use the feature `activation_timestamp`, as we are simulating being Glovo Jarvis engine that has to estimate the PDO time in order to decide to assign an order to a specific courier. The column `creation_timestamp` is threfore redundant and we will not include it in the model.

In [8]:
train_columns = ['country_code', 'city_code', 'order_id', 'courier_id', 'activation_timestamp', 'transport', 'pickup_latitude', 'pickup_longitude',
                 'delivery_latitude', 'delivery_longitude', 'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad', 'delivery_latitude_rad',
                 'delivery_longitude_rad', 'pd_distance_haversine_m', 'pd_distance_haversine_m_sk', 'pd_distance_manhattan_m']
X_train = data_train[train_columns].copy()
X_train.head()

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,time_zone,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m
0,ES,ALC,100901465337,169809943,2024-09-30 21:45:08+00:00,MOTORBIKE,38.36,-0.49,38.38,-0.49,Europe/Madrid,0.67,-0.01,0.67,-0.01,2355.55,2355.55,2561.9
1,ES,ALC,100901489101,9516729,2024-09-30 21:55:47+00:00,CAR,38.37,-0.42,38.36,-0.42,Europe/Madrid,0.67,-0.01,0.67,-0.01,673.66,673.66,903.52
2,ES,ALC,100899913152,172855743,2024-09-30 10:30:57+00:00,MOTORBIKE,38.35,-0.49,38.35,-0.49,Europe/Madrid,0.67,-0.01,0.67,-0.01,614.69,614.69,622.46
3,ES,ALC,100900447439,170201413,2024-09-30 14:46:16+00:00,CAR,38.37,-0.47,38.36,-0.44,Europe/Madrid,0.67,-0.01,0.67,-0.01,2962.48,2962.48,3332.04
4,ES,ALC,100900529830,176424631,2024-09-30 15:22:33+00:00,MOTORBIKE,38.35,-0.48,38.35,-0.5,Europe/Madrid,0.67,-0.01,0.67,-0.01,1708.76,1708.76,1711.24


In [9]:
y_train = (data_train['delivery_timestamp'] - data_train['pickup_timestamp']).dt.total_seconds()
y_train = pd.Series(y_train, name='pickup_to_delivery')
y_train.head()

0   564.31
1   464.86
2   511.03
3   906.38
4   680.04
Name: pickup_to_delivery, dtype: float64

In [10]:
y_train.isnull().sum().sum()

np.int64(0)

In [11]:
data_test = pd.read_parquet("data/parquet/dataframe.parquet", filters=[('creation_date', '>=', begin_test_date)])
data_test.head()

Unnamed: 0,country_code,order_id,courier_id,creation_timestamp,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,delivery_timestamp,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,creation_date,city_code
0,ES,100933187613,176241347,2024-10-14 23:24:53+00:00,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,2024-10-14 23:45:58+00:00,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,2024-10-14,ALC
1,ES,100933198445,10191824,2024-10-14 23:37:45+00:00,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,2024-10-15 00:04:30+00:00,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,2024-10-14,ALC
2,ES,100931749423,169099229,2024-10-14 12:57:37+00:00,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,2024-10-14 13:22:05+00:00,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,2024-10-14,ALC
3,ES,100931867858,177428955,2024-10-14 13:44:05+00:00,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,2024-10-14 14:26:28+00:00,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,2024-10-14,ALC
4,ES,100931917343,3548605,2024-10-14 14:03:19+00:00,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,2024-10-14 14:28:00+00:00,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,2024-10-14,ALC


In [12]:
X_test = data_test[train_columns].copy()
X_test.head()

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,time_zone,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,Europe/Madrid,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,Europe/Madrid,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,Europe/Madrid,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,Europe/Madrid,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,Europe/Madrid,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43


In [13]:
X_test.isnull().sum().sum()

np.int64(0)

In [14]:
y_test = (data_test['delivery_timestamp'] - data_test['pickup_timestamp']).dt.total_seconds()
y_test = pd.Series(y_test, dtype=np.float64, name='pickup_to_delivery')
y_test.head()

0    319.68
1    811.65
2    614.70
3   1111.15
4    806.57
Name: pickup_to_delivery, dtype: float64

In [15]:
y_test.isnull().sum().sum()

np.int64(0)

In [16]:
print("Train datasets shapes: ", X_train.shape, y_train.shape)
print("Test datasets shapes: ", X_test.shape, y_test.shape)

Train datasets shapes:  (688815, 18) (688815,)
Test datasets shapes:  (354799, 18) (354799,)


## Baseline Models

### BaselineModel_sum

In [17]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-08-27 17:24:28.265212


In [18]:
model_bl_sum = BaselineModel_sum()
model_bl_sum.fit(X_train, y_train)

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)


<estimator.BaselineModel_sum at 0x11d7eedb0>

In [19]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_bl_sum_time = end - start
print(f"Time elapsed: {model_bl_sum_time}")

End time: 2025-08-27 17:24:28.719116
Time elapsed: 0:00:00.453904


In [20]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_bl_sum.predict(X_test.iloc[0]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 345.14675182584176
Diff: 25.46975182584174


In [21]:
results_dataset = model_bl_sum.test(X_test, y_test)
results_dataset.head(20)


Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,345.15,25.47
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,344.1,-467.55
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,862.35,247.65
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,1425.92,314.76
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,266.35,-540.22
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,2713.04,1607.68
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,364.35,73.8
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,1097.86,343.07
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,855.01,-621.79
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,916.85,294.82


### Evaluation pipeline

In [22]:
model_bl_sum_eval = model_bl_sum.evaluate(X_test, y_test)
print(f"Baseline Model (Sum) Evaluation: MAE: {model_bl_sum_eval[0]}, MSE: {model_bl_sum_eval[1]}, R2: {model_bl_sum_eval[2]}")

Baseline Model (Sum) Evaluation: MAE: 256.3865388687307, MSE: 179628.8794954057, R2: -0.13471163707838207


### BaselineModel_mean

In [23]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-08-27 17:25:09.989418


In [24]:
model_bl_mean = BaselineModel_mean()
model_bl_mean.fit(X_train, y_train)

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)


<estimator.BaselineModel_mean at 0x11d7efb90>

In [25]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_bl_mean_time = end - start
print(f"Time elapsed: {model_bl_mean_time}")

End time: 2025-08-27 17:25:10.199723
Time elapsed: 0:00:00.210305


In [26]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_bl_mean.predict(X_test.iloc[0]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 292.3004304172709
Diff: -27.376569582729132


In [27]:
results_dataset = model_bl_mean.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,292.3,-27.38
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,281.54,-530.11
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,659.83,45.13
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,1207.59,96.44
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,203.8,-602.77
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,2219.77,1114.41
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,298.11,7.55
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,840.03,85.24
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,724.09,-752.71
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,750.15,128.12


### Evaluation pipeline

In [28]:
model_bl_mean_eval = model_bl_mean.evaluate(X_test, y_test)
print(f"Baseline Model (Mean) Evaluation: MAE: {model_bl_mean_eval[0]}, MSE: {model_bl_mean_eval[1]}, R2: {model_bl_mean_eval[2]}")

Baseline Model (Mean) Evaluation: MAE: 239.81356923333743, MSE: 157993.39824654086, R2: 0.001959216827968735


## Linear Models

In [29]:
# Train on a small subset to check if the model is working
X_train_smaller = X_train.head(1000)
y_train_smaller = y_train.head(1000)

### Linear Model encoding dummy variables

In [30]:
model_linear_smaller = LinearModel(model_type='linear', encoding='dummy')
model_linear_smaller.fit(X_train_smaller, y_train_smaller)

INFO:root:Train datasets shapes: X: (1000, 18), y: (1000,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (1000, 25)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x11d7ee1b0>

In [31]:
y_test_0 = y_train_smaller.loc[0]
y_test_0_pred = model_linear_smaller.predict(X_train_smaller.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 564.308
Predicted delivery time: 782.3062544539571
Diff: 217.9982544539571


#### Train on 3 days of data

In [32]:
# We take the last week of the dataset to test the model
days_for_train = 3
days_for_test = 1
end_train_date = pd.to_datetime(start_date) + pd.Timedelta(days=days_for_train)
end_train_date = end_train_date.strftime("%Y-%m-%d")
print(f'Start date: {start_date} | End test date: {end_train_date} | Test date: {end_date}')

Start date: 2024-09-30 | End test date: 2024-10-03 | Test date: 2024-10-20


In [33]:
data_train_3d = pd.read_parquet("data/parquet/dataframe.parquet/", filters=[('creation_date', '<', end_train_date)])
X_train_3d = data_train_3d[train_columns].copy()
y_train_3d = y_train[X_train_3d.index]
y_train_3d = pd.Series(y_train_3d, name='pickup_to_delivery')
data_test_3d = pd.read_parquet("data/parquet/dataframe.parquet", filters=[('creation_date', '=', end_date)])
X_test_3d = data_test_3d[train_columns].copy()
y_test_3d = y_test[X_test_3d.index]
y_test_3d = pd.Series(y_test_3d, dtype=np.float64, name='pickup_to_delivery')
print("Train datasets 3d shapes: ", X_train_3d.shape, y_train_3d.shape)
print("Test datasets 3d shapes: ", X_test_3d.shape, y_test_3d.shape)

Train datasets 3d shapes:  (132533, 18) (132533,)
Test datasets 3d shapes:  (56857, 18) (56857,)


In [34]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-08-27 17:25:48.362478


In [35]:
model_linear_3d = LinearModel(model_type='linear', encoding='dummy')
model_linear_3d.fit(X_train_3d, y_train_3d)

INFO:root:Train datasets shapes: X: (132533, 18), y: (132533,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (132533, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x11cbb7b30>

In [36]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_3d_time = end - start
print(f"Time elapsed: {model_linear_3d_time}")

End time: 2025-08-27 17:25:48.744282
Time elapsed: 0:00:00.381804


In [37]:
model_linear_3d_eval = model_linear_3d.evaluate(X_test_3d, y_test_3d)
print(f"Linear Model Evaluation: MAE: {model_linear_3d_eval[0]}, MSE: {model_linear_3d_eval[1]}, R2: {model_linear_3d_eval[2]}")

Linear Model Evaluation: MAE: 4850.462204065004, MSE: 23788905.201602723, R2: -161.6035400083139


#### Train on a week of data

In [38]:
# We take the last week of the dataset to test the model
days_for_train = 7
days_for_test = 3
end_train_date = pd.to_datetime(start_date) + pd.Timedelta(days=days_for_train)
end_train_date = end_train_date.strftime("%Y-%m-%d")
begin_test_date = pd.to_datetime(end_date) - pd.Timedelta(days=days_for_test-1)
begin_test_date = begin_test_date.strftime("%Y-%m-%d")
print(f'Start date: {start_date} | End test date: {end_train_date} | Begin test date: {begin_test_date} | End date: {end_date}')

Start date: 2024-09-30 | End test date: 2024-10-07 | Begin test date: 2024-10-18 | End date: 2024-10-20


In [39]:
data_train_7d = pd.read_parquet("data/parquet/dataframe.parquet/", filters=[('creation_date', '<', end_train_date)])
X_train_7d = data_train_7d[train_columns].copy()
y_train_7d = y_train[X_train_7d.index]
y_train_7d = pd.Series(y_train_7d, name='pickup_to_delivery')
data_test_7d = pd.read_parquet("data/parquet/dataframe.parquet", filters=[('creation_date', '>=', begin_test_date)])
X_test_7d = data_test_7d[train_columns].copy()
y_test_7d = y_test[X_test_7d.index]
y_test_7d = pd.Series(y_test_7d, dtype=np.float64, name='pickup_to_delivery')
print("Train datasets 7d shapes: ", X_train_7d.shape, y_train_7d.shape)
print("Test datasets 7d shapes: ", X_test_7d.shape, y_test_7d.shape)

Train datasets 7d shapes:  (350485, 18) (350485,)
Test datasets 7d shapes:  (171489, 18) (171489,)


In [40]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-08-27 17:25:49.246967


In [41]:
model_linear_7d = LinearModel(model_type='linear', encoding='dummy')
model_linear_7d.fit(X_train_7d, y_train_7d)

INFO:root:Train datasets shapes: X: (350485, 18), y: (350485,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (350485, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x11d8ad490>

In [42]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_7d_time = end - start
print(f"Time elapsed: {model_linear_7d_time}")

End time: 2025-08-27 17:25:49.940197
Time elapsed: 0:00:00.693230


In [43]:
model_linear_7d_eval = model_linear_7d.evaluate(X_test_7d, y_test_7d)
print(f"Linear Model Evaluation: MAE: {model_linear_7d_eval[0]}, MSE: {model_linear_7d_eval[1]}, R2: {model_linear_7d_eval[2]}")

Linear Model Evaluation: MAE: 300.4289466948376, MSE: 198128.49677930487, R2: -0.3521967613710395


#### Train on full data

In [44]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-08-27 17:25:50.222485


In [45]:
model_linear = LinearModel(model_type='linear', encoding='dummy')
model_linear.fit(X_train, y_train)

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x11cc44530>

In [46]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_time = end - start
print(f"Time elapsed: {model_linear_time}")

End time: 2025-08-27 17:25:51.488528
Time elapsed: 0:00:01.266043


In [47]:
results_dataset = model_linear.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,420.65,100.98
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,408.47,-403.18
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,721.01,106.31
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,883.34,-227.82
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,394.27,-412.3
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,1520.05,414.69
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,432.11,141.55
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,849.76,94.97
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,620.67,-856.13
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,685.94,63.91


In [48]:
model_linear_eval = model_linear.evaluate(X_test, y_test)
print(f"Linear Model Evaluation: MAE: {model_linear_eval[0]}, MSE: {model_linear_eval[1]}, R2: {model_linear_eval[2]}")

Linear Model Evaluation: MAE: 182.0412700940976, MSE: 114165.7164465233, R2: 0.2788177080925668


### LinearModel cyclical encoding

In [49]:
model_linear_cyclical_smaller = LinearModel(model_type='linear', encoding='cyclical')
model_linear_cyclical_smaller.fit(X_train_smaller, y_train_smaller)

INFO:root:Train datasets shapes: X: (1000, 18), y: (1000,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (1000, 32)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x11d84eb10>

In [50]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_linear_cyclical_smaller.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: -2139.862067198381
Diff: -2459.539067198381


#### Train on full data

In [51]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-08-27 17:25:52.542157


In [52]:
model_linear_cyclical = LinearModel(model_type='linear', encoding='cyclical')
model_linear_cyclical.fit(X_train, y_train)

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 33)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x11d7eee70>

In [53]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_cyclical_time = end - start
print(f"Time elapsed: {model_linear_cyclical_time}")

End time: 2025-08-27 17:25:54.961655
Time elapsed: 0:00:02.419498


In [54]:
results_dataset = model_linear_cyclical.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,379.98,60.3
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,365.9,-445.75
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,724.55,109.85
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,888.75,-222.41
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,397.78,-408.78
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,1529.67,424.31
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,432.03,141.48
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,836.3,81.51
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,594.73,-882.07
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,653.6,31.57


In [55]:
model_linear_cyclical_eval = model_linear_cyclical.evaluate(X_test, y_test)
print(f"Linear Model (Cyclical) Evaluation: MAE: {model_linear_cyclical_eval[0]}, MSE: {model_linear_cyclical_eval[1]}, R2: {model_linear_cyclical_eval[2]}")

Linear Model (Cyclical) Evaluation: MAE: 183.8933876667445, MSE: 122851.23978315908, R2: 0.22395145032888386


### Linear Model with normalization

Compare performance of the model with and without normalizatio or standardization.

With the `minmax` option, the scaled data has zero mean and unit variance.

In [56]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-08-27 17:25:56.694506


In [57]:
model_linear_minmaxscaler = LinearModel(model_type='linear', encoding='dummy', standardize='minmax')
model_linear_minmaxscaler.fit(X_train, y_train)

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x11d84e570>

In [58]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_minmaxscaler_time = end - start
print(f"Time elapsed: {model_linear_minmaxscaler_time}")

End time: 2025-08-27 17:25:58.565403
Time elapsed: 0:00:01.870897


In [59]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_linear_minmaxscaler.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 420.67509782687284
Diff: 100.99809782687282


In [60]:
results_dataset = model_linear_minmaxscaler.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,420.68,101.0
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,408.23,-403.42
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,721.81,107.11
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,883.1,-228.05
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,394.18,-412.39
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,1519.92,414.55
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,432.15,141.59
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,849.69,94.9
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,620.67,-856.13
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,685.77,63.74


In [61]:
model_linear_minmaxscaler_eval = model_linear_minmaxscaler.evaluate(X_test, y_test)
print(f"Linear Model (MinMaxScaler) Evaluation: MAE: {model_linear_minmaxscaler_eval[0]}, MSE: {model_linear_minmaxscaler_eval[1]}, R2: {model_linear_minmaxscaler_eval[2]}")

Linear Model (MinMaxScaler) Evaluation: MAE: 182.03595679938633, MSE: 114165.23142214997, R2: 0.27882077198074684


### Linear Model with standardization

With the `stdscaler` option, the scaled data is transformed to have values from 0 and 1.

In [62]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-08-27 17:26:00.153193


In [63]:
model_linear_stdscaler = LinearModel(model_type='linear', encoding='dummy', standardize='stdscaler')
model_linear_stdscaler.fit(X_train, y_train)

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x11d84c4d0>

In [64]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_stdscaler_time = end - start
print(f"Time elapsed: {model_linear_stdscaler_time}")

End time: 2025-08-27 17:26:02.296678
Time elapsed: 0:00:02.143485


In [65]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_linear_stdscaler.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 419.6766356595232
Diff: 99.99963565952316


In [66]:
results_dataset = model_linear_stdscaler.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,419.71,100.03
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,407.47,-404.18
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,719.1,104.4
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,881.32,-229.84
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,393.18,-413.39
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,1517.07,411.7
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,431.02,140.46
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,847.79,93.0
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,619.22,-857.58
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,684.18,62.15


In [67]:
model_linear_stdscaler_eval = model_linear_stdscaler.evaluate(X_test, y_test)
print(f"Linear Model StandardScaler Evaluation: MAE: {model_linear_stdscaler_eval[0]}, MSE: {model_linear_stdscaler_eval[1]}, R2: {model_linear_stdscaler_eval[2]}")

Linear Model StandardScaler Evaluation: MAE: 182.07986354136003, MSE: 114166.148808296, R2: 0.27881497687286383


### Linear Model SGD encoding dummy variables

#### Train on full data

In [68]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-08-27 17:26:04.043154


In [69]:
model_linear_SGD_stdscaler = LinearModel(model_type='sgd', encoding='dummy', standardize='stdscaler')
model_linear_SGD_stdscaler.fit(X_train, y_train)

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x11d84f380>

In [70]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_SGD_stdscaler_time = end - start
print(f"Time elapsed: {model_linear_SGD_stdscaler_time}")

End time: 2025-08-27 17:26:17.035567
Time elapsed: 0:00:12.992413


In [71]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_linear_SGD_stdscaler.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 449.02609161580585
Diff: 129.34909161580583


In [72]:
results_dataset = model_linear_SGD_stdscaler.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,449.03,129.35
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,421.36,-390.28
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,730.67,115.97
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,865.08,-246.07
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,392.08,-414.49
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,1545.65,440.29
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,457.69,167.13
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,828.41,73.62
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,633.79,-843.01
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,685.24,63.21


In [73]:
model_linear_SGD_stdscaler_eval = model_linear_SGD_stdscaler.evaluate(X_test, y_test)
print(f"Linear Model SGD & StandardScaler Evaluation: MAE: {model_linear_SGD_stdscaler_eval[0]}, MSE: {model_linear_SGD_stdscaler_eval[1]}, R2: {model_linear_SGD_stdscaler_eval[2]}")

Linear Model SGD & StandardScaler Evaluation: MAE: 182.29408420495903, MSE: 114987.55842078445, R2: 0.27362615061777085


## Tree Models

### Decision Tree

In [74]:
start = datetime.datetime.now()
model_decision_tree = RegressionTreeMethod(model_type = 'tree', encoding = 'dummy')
model_decision_tree.fit(X_train, y_train)
end = datetime.datetime.now()
model_decision_tree_time = end - start
model_decision_tree_eval = model_decision_tree.evaluate(X_test, y_test)
print(f"Decision Tree Model Evaluation: MAE: {model_decision_tree_eval[0]}, MSE: {model_decision_tree_eval[1]}, R2: {model_decision_tree_eval[2]}")

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Decision Tree Model Evaluation: MAE: 307.18445949396715, MSE: 334059.0220369923, R2: -1.1102434132040218


In [75]:
#model_decision_tree.plot_feature_importance(X_test)

In [76]:
start = datetime.datetime.now()
model_decision_tree_cyclical = RegressionTreeMethod(model_type = 'tree', encoding = 'cyclical')
model_decision_tree_cyclical.fit(X_train, y_train)
end = datetime.datetime.now()
model_decision_tree_cyclical_time = end - start
print(f"Time elapsed: {model_decision_tree_cyclical_time}")

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 33)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Time elapsed: 0:00:27.653087


In [77]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_decision_tree_cyclical.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 635.64
Diff: 315.96299999999997


In [78]:
results_dataset = model_decision_tree_cyclical.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,635.64,315.96
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,323.08,-488.57
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,643.97,29.27
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,1328.83,217.68
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,924.9,118.33
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,921.89,-183.48
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,337.02,46.46
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,552.51,-202.28
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,739.54,-737.26
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,507.04,-114.99


In [79]:
model_decision_tree_cyclical_eval = model_decision_tree_cyclical.evaluate(X_test, y_test)
print(f"Decision Tree Model (Cyclical) Evaluation: MAE: {model_decision_tree_cyclical_eval[0]}, MSE: {model_decision_tree_cyclical_eval[1]}, R2: {model_decision_tree_cyclical_eval[2]}")

Decision Tree Model (Cyclical) Evaluation: MAE: 315.3852834365373, MSE: 407345.8102699504, R2: -1.5731944246763363


### Check different tree parameters for a better training

In [80]:
model_decision_tree_5 = RegressionTreeMethod(model_type = 'tree', encoding = 'dummy', min_samples_leaf=5)
model_decision_tree_5.fit(X_train, y_train)
model_decision_tree_5_eval = model_decision_tree_5.evaluate(X_test, y_test)
print(f"Decision Tree Model Evaluation: MAE: {model_decision_tree_5_eval[0]}, MSE: {model_decision_tree_5_eval[1]}, R2: {model_decision_tree_5_eval[2]}")

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Decision Tree Model Evaluation: MAE: 307.34979799266637, MSE: 317633.49067121645, R2: -1.006483696847173


In [81]:
model_decision_tree_10 = RegressionTreeMethod(model_type = 'tree', encoding = 'cyclical', max_depth=50, min_samples_leaf=5)
model_decision_tree_10.fit(X_train, y_train)
model_decision_tree_10_eval = model_decision_tree_10.evaluate(X_test, y_test)
print(f"Decision Tree Model Evaluation: MAE: {model_decision_tree_10_eval[0]}, MSE: {model_decision_tree_10_eval[1]}, R2: {model_decision_tree_10_eval[2]}")

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 33)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Decision Tree Model Evaluation: MAE: 319.2986037615664, MSE: 476675.3340539908, R2: -2.011147484628843


In [82]:
model_decision_tree.model.get_depth(), model_decision_tree_5.model.get_depth(), model_decision_tree_10.model.get_depth()

(60, 60, 61)

### Random Forest

In [83]:
start = datetime.datetime.now()
model_random_forest = RegressionTreeMethod(model_type = 'randomforest', encoding = 'dummy', n_estimators = 100)
model_random_forest.fit(X_train, y_train)
end = datetime.datetime.now()
model_random_forest_time = end - start
model_random_forest_eval = model_random_forest.evaluate(X_test, y_test)
print(f"Random Forest Model Evaluation: MAE: {model_random_forest_eval[0]}, MSE: {model_random_forest_eval[1]}, R2: {model_random_forest_eval[2]}")

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Random Forest Model Evaluation: MAE: 176.00919993739058, MSE: 108745.40041718313, R2: 0.3130577239096899


In [84]:
start = datetime.datetime.now()
model_random_forest_cyclical = RegressionTreeMethod(model_type = 'randomforest', encoding = 'cyclical', n_estimators = 100)
model_random_forest_cyclical.fit(X_train, y_train)
end = datetime.datetime.now()
model_random_forest_cyclical_time = end - start
print(f"Time elapsed: {model_random_forest_cyclical_time}")

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 33)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Time elapsed: 0:25:28.365826


In [85]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_random_forest_cyclical.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 438.89101077252747
Diff: 119.21401077252744


In [86]:
#model_random_forest_cyclical.plot_feature_importance(X_test)

In [87]:
results_dataset = model_random_forest_cyclical.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,438.89,119.21
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,467.24,-344.41
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,1161.13,546.43
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,1030.38,-80.77
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,480.92,-325.64
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,1227.51,122.14
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,449.64,159.08
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,989.13,234.34
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,912.48,-564.32
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,571.1,-50.93


In [88]:
model_random_forest_cyclical_eval = model_random_forest_cyclical.evaluate(X_test, y_test)
print(f"Random Forest Model (Cyclical) Evaluation: MAE: {model_random_forest_cyclical_eval[0]}, MSE: {model_random_forest_cyclical_eval[1]}, R2: {model_random_forest_cyclical_eval[2]}")

Random Forest Model (Cyclical) Evaluation: MAE: 176.37317458763113, MSE: 109028.49319875834, R2: 0.3112694331040653


### Gradient Boosting

In [89]:
start = datetime.datetime.now()
model_gradient_boosting = RegressionTreeMethod(model_type = 'gradientboosting', encoding = 'dummy', n_estimators = 100)
model_gradient_boosting.fit(X_train, y_train)
end = datetime.datetime.now()
model_gradient_boosting_time = end - start
model_gradient_boosting_eval = model_gradient_boosting.evaluate(X_test, y_test)
print(f"Gradient Boosting Model Evaluation: MAE: {model_gradient_boosting_eval[0]}, MSE: {model_gradient_boosting_eval[1]}, R2: {model_gradient_boosting_eval[2]}")

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Gradient Boosting Model Evaluation: MAE: 172.27876558692302, MSE: 108658.14138616895, R2: 0.31360893726808814


In [90]:
start = datetime.datetime.now()
model_gradient_boosting_cyclical = RegressionTreeMethod(model_type = 'gradientboosting', encoding = 'cyclical', n_estimators = 100)
model_gradient_boosting_cyclical.fit(X_train, y_train)
end = datetime.datetime.now()
model_gradient_boosting_cyclical_time = end - start
print(f"Time elapsed: {model_gradient_boosting_cyclical_time}")

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 33)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Time elapsed: 0:06:23.552123


In [91]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_gradient_boosting_cyclical.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 507.67944289709357
Diff: 188.00244289709354


In [92]:
results_dataset = model_gradient_boosting_cyclical.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,507.68,188.0
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,447.03,-364.62
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,786.28,171.58
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,1026.04,-85.11
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,483.6,-322.96
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,1266.77,161.4
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,503.9,213.35
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,871.33,116.54
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,738.78,-738.02
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,616.05,-5.98


In [93]:
model_gradient_boosting_cyclical_eval = model_gradient_boosting_cyclical.evaluate(X_test, y_test)
print(f"Gradient Boosting Model (Cyclical) Evaluation: MAE: {model_gradient_boosting_cyclical_eval[0]}, MSE: {model_gradient_boosting_cyclical_eval[1]}, R2: {model_gradient_boosting_cyclical_eval[2]}")

Gradient Boosting Model (Cyclical) Evaluation: MAE: 171.96872845554776, MSE: 108338.17495975613, R2: 0.31563015806813377


### Hist Gradient Boosting

In [94]:
start = datetime.datetime.now()
model_hist_gradient_boosting = RegressionTreeMethod(model_type = 'histgradientboosting', encoding = 'dummy', max_iter = 100)
model_hist_gradient_boosting.fit(X_train, y_train)
end = datetime.datetime.now()
model_hist_gradient_boosting_time = end - start
model_hist_gradient_boosting_eval = model_hist_gradient_boosting.evaluate(X_test, y_test)
print(f"Histogram-based Gradient Boosting Model Evaluation: MAE: {model_hist_gradient_boosting_eval[0]}, MSE: {model_hist_gradient_boosting_eval[1]}, R2: {model_hist_gradient_boosting_eval[2]}")

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Histogram-based Gradient Boosting Model Evaluation: MAE: 169.80459183125302, MSE: 107121.5132879574, R2: 0.323315782792039


In [95]:
start = datetime.datetime.now()
model_hist_gradient_boosting_cyclical = RegressionTreeMethod(model_type = 'histgradientboosting', encoding = 'cyclical', max_iter = 100)
model_hist_gradient_boosting_cyclical.fit(X_train, y_train)
end = datetime.datetime.now()
model_hist_gradient_boosting_cyclical_time = end - start
print(f"Time elapsed: {model_hist_gradient_boosting_cyclical_time}")

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 33)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Time elapsed: 0:00:07.379329


In [96]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_hist_gradient_boosting_cyclical.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 484.2198944215369
Diff: 164.54289442153686


In [97]:
results_dataset = model_hist_gradient_boosting_cyclical.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,484.22,164.54
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,442.05,-369.6
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,719.73,105.03
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,1033.74,-77.42
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,470.62,-335.95
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,1287.71,182.34
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,525.59,235.03
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,846.82,92.03
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,791.69,-685.12
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,604.7,-17.33


In [98]:
model_hist_gradient_boosting_cyclical_eval = model_hist_gradient_boosting_cyclical.evaluate(X_test, y_test)
print(f"Histogram-based Gradient Boosting Model (Cyclical) Evaluation: MAE: {model_hist_gradient_boosting_cyclical_eval[0]}, MSE: {model_hist_gradient_boosting_cyclical_eval[1]}, R2: {model_hist_gradient_boosting_cyclical_eval[2]}")

Histogram-based Gradient Boosting Model (Cyclical) Evaluation: MAE: 169.75857192164725, MSE: 107108.95961638018, R2: 0.3233950840562162


### XGBoost

In [99]:
start = datetime.datetime.now()
model_xgboost_cyclical = RegressionTreeMethod(model_type = 'xgboost', encoding = 'cyclical', max_iter = 100)
model_xgboost_cyclical.fit(X_train, y_train)
end = datetime.datetime.now()
model_xgboost_cyclical_time = end - start
print(f"Time elapsed: {model_xgboost_cyclical_time}")

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 33)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Time elapsed: 0:00:04.298468


In [100]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_xgboost_cyclical.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 468.0061340332031
Diff: 148.3291340332031


In [101]:
results_dataset = model_xgboost_cyclical.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,468.01,148.33
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,457.76,-353.89
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,693.46,78.76
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,1061.96,-49.2
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,595.05,-211.51
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,1188.57,83.2
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,430.51,139.95
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,801.09,46.3
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,813.98,-662.82
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,527.6,-94.43


In [102]:
model_xgboost_cyclical_eval = model_xgboost_cyclical.evaluate(X_test, y_test)
print(f"XGBoost Model (Cyclical) Evaluation: MAE: {model_xgboost_cyclical_eval[0]}, MSE: {model_xgboost_cyclical_eval[1]}, R2: {model_xgboost_cyclical_eval[2]}")

XGBoost Model (Cyclical) Evaluation: MAE: 173.0681940397628, MSE: 119798.20588628371, R2: 0.24323739756025797


### LightGBM

In [103]:
start = datetime.datetime.now()
model_lightgbm_cyclical = RegressionTreeMethod(model_type = 'lightgbm', encoding = 'cyclical', max_iter = 100)
model_lightgbm_cyclical.fit(X_train, y_train)
end = datetime.datetime.now()
model_lightgbm_cyclical_time = end - start
print(f"Time elapsed: {model_lightgbm_cyclical_time}")

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 33)
INFO:root:Finished to encode variables. Starting to fit the model


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014604 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3607
[LightGBM] [Info] Number of data points in the train set: 688815, number of used features: 31
[LightGBM] [Info] Start training from score 634.533801


INFO:root:Finished training the model


Time elapsed: 0:00:03.366575


In [104]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_lightgbm_cyclical.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 492.6770805335888
Diff: 173.0000805335888


In [105]:
results_dataset = model_lightgbm_cyclical.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,492.68,173.0
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,434.17,-377.48
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,689.61,74.91
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,956.55,-154.6
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,504.12,-302.44
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,1256.78,151.41
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,485.25,194.69
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,835.2,80.41
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,752.87,-723.93
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,600.76,-21.27


In [106]:
model_lightgbm_cyclical_eval = model_lightgbm_cyclical.evaluate(X_test, y_test)
print(f"LightGBM Model (Cyclical) Evaluation: MAE: {model_lightgbm_cyclical_eval[0]}, MSE: {model_lightgbm_cyclical_eval[1]}, R2: {model_lightgbm_cyclical_eval[2]}")

LightGBM Model (Cyclical) Evaluation: MAE: 166.86057519297412, MSE: 106484.46410690958, R2: 0.32734000830163856


### CatBoost

In [107]:
start = datetime.datetime.now()
model_catboost_cyclical = RegressionTreeMethod(model_type = 'catboost', encoding = 'cyclical', max_iter = 100)
model_catboost_cyclical.fit(X_train, y_train)
end = datetime.datetime.now()
model_catboost_cyclical_time = end - start
print(f"Time elapsed: {model_catboost_cyclical_time}")

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 33)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Time elapsed: 0:00:05.607861


In [108]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_catboost_cyclical.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 463.73797033780795
Diff: 144.06097033780793


In [109]:
results_dataset = model_catboost_cyclical.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,463.74,144.06
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,415.77,-395.87
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,722.98,108.28
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,1061.89,-49.26
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,485.75,-320.81
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,1193.47,88.11
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,456.77,166.21
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,821.22,66.43
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,837.34,-639.46
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,580.18,-41.85


In [110]:
model_catboost_cyclical_eval = model_catboost_cyclical.evaluate(X_test, y_test)
print(f"CatBoost Model (Cyclical) Evaluation: MAE: {model_catboost_cyclical_eval[0]}, MSE: {model_catboost_cyclical_eval[1]}, R2: {model_catboost_cyclical_eval[2]}")

CatBoost Model (Cyclical) Evaluation: MAE: 168.78190118332205, MSE: 108256.70451898566, R2: 0.31614480503069264


## Models comparison

In [111]:
models = ['model_bl_sum' , 'model_bl_mean', 'model_linear_3d', 'model_linear_7d', 'model_linear', 'model_linear_cyclical',
          'model_linear_minmaxscaler', 'model_linear_stdscaler', 'model_linear_SGD_stdscaler', 'model_decision_tree',
          'model_decision_tree_cyclical', 'model_random_forest', 'model_random_forest_cyclical', 'model_gradient_boosting',
          'model_gradient_boosting_cyclical', 'model_hist_gradient_boosting', 'model_hist_gradient_boosting_cyclical',
          'model_xgboost_cyclical', 'model_lightgbm_cyclical', 'model_catboost_cyclical']

data = []
for model in models:
    time_value = eval(model + '_time')
    eval_value = eval(model + '_eval')
    data.append([model, time_value, eval_value[0], eval_value[1], eval_value[2]])

models_eval = pd.DataFrame(data, columns=['Model', 'Training time', 'MAE', 'MSE', 'R2'])
models_eval['Training time'] = models_eval['Training time'].apply(
    lambda x: (datetime.datetime.min + x).strftime('%H:%M.%f')
)
models_eval

Unnamed: 0,Model,Training time,MAE,MSE,R2
0,model_bl_sum,00:00.453904,256.39,179628.88,-0.13
1,model_bl_mean,00:00.210305,239.81,157993.4,0.0
2,model_linear_3d,00:00.381804,4850.46,23788905.2,-161.6
3,model_linear_7d,00:00.693230,300.43,198128.5,-0.35
4,model_linear,00:00.266043,182.04,114165.72,0.28
5,model_linear_cyclical,00:00.419498,183.89,122851.24,0.22
6,model_linear_minmaxscaler,00:00.870897,182.04,114165.23,0.28
7,model_linear_stdscaler,00:00.143485,182.08,114166.15,0.28
8,model_linear_SGD_stdscaler,00:00.992413,182.29,114987.56,0.27
9,model_decision_tree,00:00.727999,307.18,334059.02,-1.11
