# Pickup to Delivery Overall

In [1]:
import os
import sys
import datetime
import pandas as pd
pd.options.display.float_format = '{:.2f}'.format
import numpy as np

sys.path.insert(0, os.path.expanduser('./'))
import query_runner as qr
import utils
from estimator import BaselineModel_sum, BaselineModel_mean, LinearModel, RegressionTreeMethod

In [2]:
dwh_config, livedb_config, parameters_config = utils.load_config(config_file='./config.ini')
start_date = parameters_config['start_date']
end_date = parameters_config['end_date']
country_code = parameters_config['country_code']
cities = parameters_config['cities']

print(f'Start date: {start_date} | End date: {end_date} | Countries: {country_code} | Cities: {cities}')

Start date: 2024-09-30 | End date: 2024-10-20 | Countries: ES | Cities: 'MAD', 'BCN', 'SEV', 'ALC'


## Load the dataset

In [3]:
data = pd.read_parquet("data/parquet/dataframe.parquet")

## Hyperparameters

In [4]:
test_set_perc = 0.1
days_for_test = 7
k_cv = 5

## Database split

As we have partitioned the data by city and creation date, we can use this information to split the data. This will help to avoid data leakage, as we will not have data from the future in the training set.
This is much better than just sorting the data by the creation timestamp and taking 10% of the dataset as test set, as we did before.

In [5]:
# We take the last week of the dataset to test the model
begin_test_date = pd.to_datetime(end_date) - pd.Timedelta(days=days_for_test-1)
begin_test_date = begin_test_date.strftime("%Y-%m-%d")
print(f'Start date: {start_date} | Begin test date: {begin_test_date} | End date: {end_date}')

Start date: 2024-09-30 | Begin test date: 2024-10-14 | End date: 2024-10-20


In [6]:
data_train = pd.read_parquet("data/parquet/dataframe.parquet/", filters=[('creation_date', '<', begin_test_date)])
data_train.head()

Unnamed: 0,country_code,order_id,courier_id,creation_timestamp,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,delivery_timestamp,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,creation_date,city_code
0,ES,100901465337,169809943,2024-09-30 21:45:07+00:00,2024-09-30 21:45:08+00:00,MOTORBIKE,38.36,-0.49,38.38,-0.49,...,2024-09-30 22:10:01+00:00,0.67,-0.01,0.67,-0.01,2355.55,2355.55,2561.9,2024-09-30,ALC
1,ES,100901489101,9516729,2024-09-30 21:55:46+00:00,2024-09-30 21:55:47+00:00,CAR,38.37,-0.42,38.36,-0.42,...,2024-09-30 22:09:18+00:00,0.67,-0.01,0.67,-0.01,673.66,673.66,903.52,2024-09-30,ALC
2,ES,100899913152,172855743,2024-09-30 10:30:55+00:00,2024-09-30 10:30:57+00:00,MOTORBIKE,38.35,-0.49,38.35,-0.49,...,2024-09-30 10:49:56+00:00,0.67,-0.01,0.67,-0.01,614.69,614.69,622.46,2024-09-30,ALC
3,ES,100900447439,170201413,2024-09-30 14:46:15+00:00,2024-09-30 14:46:16+00:00,CAR,38.37,-0.47,38.36,-0.44,...,2024-09-30 15:11:36+00:00,0.67,-0.01,0.67,-0.01,2962.48,2962.48,3332.04,2024-09-30,ALC
4,ES,100900529830,176424631,2024-09-30 15:22:32+00:00,2024-09-30 15:22:33+00:00,MOTORBIKE,38.35,-0.48,38.35,-0.5,...,2024-09-30 15:56:07+00:00,0.67,-0.01,0.67,-0.01,1708.76,1708.76,1711.24,2024-09-30,ALC


In [7]:
# Check that there are no nulls deriving from a wrong writing of parquet files (appending instead of overwriting)
data_train.isnull().sum().sum()

np.int64(0)

We will only use the feature `activation_timestamp`, as we are simulating being Glovo Jarvis engine that has to estimate the PDO time in order to decide to assign an order to a specific courier. The column `creation_timestamp` is threfore redundant and we will not include it in the model.

In [8]:
train_columns = ['country_code', 'city_code', 'order_id', 'courier_id', 'activation_timestamp', 'transport', 'pickup_latitude', 'pickup_longitude',
                 'delivery_latitude', 'delivery_longitude', 'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad', 'delivery_latitude_rad',
                 'delivery_longitude_rad', 'pd_distance_haversine_m', 'pd_distance_haversine_m_sk', 'pd_distance_manhattan_m']
X_train = data_train[train_columns].copy()
X_train.head()

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,time_zone,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m
0,ES,ALC,100901465337,169809943,2024-09-30 21:45:08+00:00,MOTORBIKE,38.36,-0.49,38.38,-0.49,Europe/Madrid,0.67,-0.01,0.67,-0.01,2355.55,2355.55,2561.9
1,ES,ALC,100901489101,9516729,2024-09-30 21:55:47+00:00,CAR,38.37,-0.42,38.36,-0.42,Europe/Madrid,0.67,-0.01,0.67,-0.01,673.66,673.66,903.52
2,ES,ALC,100899913152,172855743,2024-09-30 10:30:57+00:00,MOTORBIKE,38.35,-0.49,38.35,-0.49,Europe/Madrid,0.67,-0.01,0.67,-0.01,614.69,614.69,622.46
3,ES,ALC,100900447439,170201413,2024-09-30 14:46:16+00:00,CAR,38.37,-0.47,38.36,-0.44,Europe/Madrid,0.67,-0.01,0.67,-0.01,2962.48,2962.48,3332.04
4,ES,ALC,100900529830,176424631,2024-09-30 15:22:33+00:00,MOTORBIKE,38.35,-0.48,38.35,-0.5,Europe/Madrid,0.67,-0.01,0.67,-0.01,1708.76,1708.76,1711.24


In [9]:
y_train = (data_train['delivery_timestamp'] - data_train['pickup_timestamp']).dt.total_seconds()
y_train = pd.Series(y_train, name='pickup_to_delivery')
y_train.head()

0   564.31
1   464.86
2   511.03
3   906.38
4   680.04
Name: pickup_to_delivery, dtype: float64

In [10]:
y_train.isnull().sum().sum()

np.int64(0)

In [11]:
data_test = pd.read_parquet("data/parquet/dataframe.parquet", filters=[('creation_date', '>=', begin_test_date)])
data_test.head()

Unnamed: 0,country_code,order_id,courier_id,creation_timestamp,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,delivery_timestamp,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,creation_date,city_code
0,ES,100933187613,176241347,2024-10-14 23:24:53+00:00,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,2024-10-14 23:45:58+00:00,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,2024-10-14,ALC
1,ES,100933198445,10191824,2024-10-14 23:37:45+00:00,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,2024-10-15 00:04:30+00:00,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,2024-10-14,ALC
2,ES,100931749423,169099229,2024-10-14 12:57:37+00:00,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,2024-10-14 13:22:05+00:00,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,2024-10-14,ALC
3,ES,100931867858,177428955,2024-10-14 13:44:05+00:00,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,2024-10-14 14:26:28+00:00,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,2024-10-14,ALC
4,ES,100931917343,3548605,2024-10-14 14:03:19+00:00,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,2024-10-14 14:28:00+00:00,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,2024-10-14,ALC


In [12]:
X_test = data_test[train_columns].copy()
X_test.head()

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,time_zone,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,Europe/Madrid,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,Europe/Madrid,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,Europe/Madrid,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,Europe/Madrid,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,Europe/Madrid,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43


In [13]:
X_test.isnull().sum().sum()

np.int64(0)

In [14]:
y_test = (data_test['delivery_timestamp'] - data_test['pickup_timestamp']).dt.total_seconds()
y_test = pd.Series(y_test, dtype=np.float64, name='pickup_to_delivery')
y_test.head()

0    319.68
1    811.65
2    614.70
3   1111.15
4    806.57
Name: pickup_to_delivery, dtype: float64

In [15]:
y_test.isnull().sum().sum()

np.int64(0)

In [16]:
print("Train datasets shapes: ", X_train.shape, y_train.shape)
print("Test datasets shapes: ", X_test.shape, y_test.shape)

Train datasets shapes:  (688815, 18) (688815,)
Test datasets shapes:  (354799, 18) (354799,)


## Baseline Models

### BaselineModel_sum

In [17]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-08-27 15:17:49.965162


In [18]:
model_bl_sum = BaselineModel_sum()
model_bl_sum.fit(X_train, y_train)

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)


<estimator.BaselineModel_sum at 0x1286975c0>

In [19]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_bl_sum_time = end - start
print(f"Time elapsed: {model_bl_sum_time}")

End time: 2025-08-27 15:17:50.313045
Time elapsed: 0:00:00.347883


In [20]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_bl_sum.predict(X_test.iloc[0]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 345.14675182584176
Diff: 25.46975182584174


In [21]:
results_dataset = model_bl_sum.test(X_test, y_test)
results_dataset.head(20)


Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,345.15,25.47
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,344.1,-467.55
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,862.35,247.65
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,1425.92,314.76
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,266.35,-540.22
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,2713.04,1607.68
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,364.35,73.8
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,1097.86,343.07
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,855.01,-621.79
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,916.85,294.82


### Evaluation pipeline

In [22]:
model_bl_sum_eval = model_bl_sum.evaluate(X_test, y_test)
print(f"Baseline Model (Sum) Evaluation: MAE: {model_bl_sum_eval[0]}, MSE: {model_bl_sum_eval[1]}, R2: {model_bl_sum_eval[2]}")

Baseline Model (Sum) Evaluation: MAE: 256.3865388687307, MSE: 179628.8794954057, R2: -0.13471163707838207


### BaselineModel_mean

In [23]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-08-27 15:18:27.284563


In [24]:
model_bl_mean = BaselineModel_mean()
model_bl_mean.fit(X_train, y_train)

INFO:root:Train datasets shapes: X: (688815, 18), y: (688815,)


<estimator.BaselineModel_mean at 0x11702d190>

In [25]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_bl_mean_time = end - start
print(f"Time elapsed: {model_bl_mean_time}")

End time: 2025-08-27 15:18:27.505456
Time elapsed: 0:00:00.220893


In [26]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_bl_mean.predict(X_test.iloc[0]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 292.3004304172709
Diff: -27.376569582729132


In [27]:
results_dataset = model_bl_mean.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,292.3,-27.38
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,281.54,-530.11
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,659.83,45.13
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,1207.59,96.44
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,203.8,-602.77
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,2219.77,1114.41
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,298.11,7.55
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,840.03,85.24
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,724.09,-752.71
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,750.15,128.12


### Evaluation pipeline

In [28]:
model_bl_mean_eval = model_bl_mean.evaluate(X_test, y_test)
print(f"Baseline Model (Mean) Evaluation: MAE: {model_bl_mean_eval[0]}, MSE: {model_bl_mean_eval[1]}, R2: {model_bl_mean_eval[2]}")

Baseline Model (Mean) Evaluation: MAE: 239.81356923333743, MSE: 157993.39824654086, R2: 0.001959216827968735


## Linear Models

In [29]:
# Train on a small subset to check if the model is working
X_train_smaller = X_train.head(1000)
y_train_smaller = y_train.head(1000)

### Linear Model encoding dummy variables

In [30]:
model_linear_smaller = LinearModel(model_type='linear', encoding='dummy')
model_linear_smaller.fit(X_train_smaller, y_train_smaller)

INFO:root:Train datasets shapes: X: (1000, 19), y: (1000,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (1000, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x12876d220>

In [31]:
y_test_0 = y_train_smaller.loc[0]
y_test_0_pred = model_linear_smaller.predict(X_train_smaller.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 564.308
Predicted delivery time: 781.9825627692044
Diff: 217.67456276920439


#### Train on 3 days of data

In [32]:
# We take the last week of the dataset to test the model
days_for_train = 3
days_for_test = 1
end_train_date = pd.to_datetime(start_date) + pd.Timedelta(days=days_for_train)
end_train_date = end_train_date.strftime("%Y-%m-%d")
print(f'Start date: {start_date} | End test date: {end_train_date} | Test date: {end_date}')

Start date: 2024-09-30 | End test date: 2024-10-03 | Test date: 2024-10-20


In [33]:
data_train_3d = pd.read_parquet("data/parquet/dataframe.parquet/", filters=[('creation_date', '<', end_train_date)])
X_train_3d = data_train_3d[train_columns].copy()
y_train_3d = y_train[X_train_3d.index]
y_train_3d = pd.Series(y_train_3d, name='pickup_to_delivery')
data_test_3d = pd.read_parquet("data/parquet/dataframe.parquet", filters=[('creation_date', '=', end_date)])
X_test_3d = data_test_3d[train_columns].copy()
y_test_3d = y_test[X_test_3d.index]
y_test_3d = pd.Series(y_test_3d, dtype=np.float64, name='pickup_to_delivery')
print("Train datasets 3d shapes: ", X_train_3d.shape, y_train_3d.shape)
print("Test datasets 3d shapes: ", X_test_3d.shape, y_test_3d.shape)

Train datasets 3d shapes:  (132533, 18) (132533,)
Test datasets 3d shapes:  (56857, 18) (56857,)


In [34]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-08-27 15:19:00.110395


In [35]:
model_linear_3d = LinearModel(model_type='linear', encoding='dummy')
model_linear_3d.fit(X_train_3d, y_train_3d)

INFO:root:Train datasets shapes: X: (132533, 18), y: (132533,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (132533, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x126035eb0>

In [36]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_3d_time = end - start
print(f"Time elapsed: {model_linear_3d_time}")

End time: 2025-08-27 15:19:00.397537
Time elapsed: 0:00:00.287142


In [37]:
model_linear_3d_eval = model_linear_3d.evaluate(X_test_3d, y_test_3d)
print(f"Linear Model Evaluation: MAE: {model_linear_3d_eval[0]}, MSE: {model_linear_3d_eval[1]}, R2: {model_linear_3d_eval[2]}")

Linear Model Evaluation: MAE: 4850.462204065004, MSE: 23788905.201602723, R2: -161.6035400083139


#### Train on a week of data

In [38]:
# We take the last week of the dataset to test the model
days_for_train = 7
days_for_test = 3
end_train_date = pd.to_datetime(start_date) + pd.Timedelta(days=days_for_train)
end_train_date = end_train_date.strftime("%Y-%m-%d")
begin_test_date = pd.to_datetime(end_date) - pd.Timedelta(days=days_for_test-1)
begin_test_date = begin_test_date.strftime("%Y-%m-%d")
print(f'Start date: {start_date} | End test date: {end_train_date} | Begin test date: {begin_test_date} | End date: {end_date}')

Start date: 2024-09-30 | End test date: 2024-10-07 | Begin test date: 2024-10-18 | End date: 2024-10-20


In [39]:
data_train_7d = pd.read_parquet("data/parquet/dataframe.parquet/", filters=[('creation_date', '<', end_train_date)])
X_train_7d = data_train_7d[train_columns].copy()
y_train_7d = y_train[X_train_7d.index]
y_train_7d = pd.Series(y_train_7d, name='pickup_to_delivery')
data_test_7d = pd.read_parquet("data/parquet/dataframe.parquet", filters=[('creation_date', '>=', begin_test_date)])
X_test_7d = data_test_7d[train_columns].copy()
y_test_7d = y_test[X_test_7d.index]
y_test_7d = pd.Series(y_test_7d, dtype=np.float64, name='pickup_to_delivery')
print("Train datasets 7d shapes: ", X_train_7d.shape, y_train_7d.shape)
print("Test datasets 7d shapes: ", X_test_7d.shape, y_test_7d.shape)

Train datasets 7d shapes:  (350485, 18) (350485,)
Test datasets 7d shapes:  (171489, 18) (171489,)


In [40]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-08-27 15:19:00.817109


In [41]:
model_linear_7d = LinearModel(model_type='linear', encoding='dummy')
model_linear_7d.fit(X_train_7d, y_train_7d)

INFO:root:Train datasets shapes: X: (350485, 18), y: (350485,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (350485, 26)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x128700710>

In [42]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_7d_time = end - start
print(f"Time elapsed: {model_linear_7d_time}")

End time: 2025-08-27 15:19:01.442913
Time elapsed: 0:00:00.625804


In [43]:
model_linear_7d_eval = model_linear_7d.evaluate(X_test_7d, y_test_7d)
print(f"Linear Model Evaluation: MAE: {model_linear_7d_eval[0]}, MSE: {model_linear_7d_eval[1]}, R2: {model_linear_7d_eval[2]}")

Linear Model Evaluation: MAE: 300.42894669473145, MSE: 198128.49677914605, R2: -0.3521967613699557


#### Train on full data

In [44]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-08-27 15:19:01.617323


In [45]:
model_linear = LinearModel(model_type='linear', encoding='dummy')
model_linear.fit(X_train, y_train)

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 27)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x128701340>

In [46]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_time = end - start
print(f"Time elapsed: {model_linear_time}")

End time: 2025-08-27 15:19:02.833254
Time elapsed: 0:00:01.215931


In [47]:
results_dataset = model_linear.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,425.07,105.39
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,413.37,-398.27
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,731.36,116.66
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,894.0,-217.16
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,399.77,-406.79
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,1540.69,435.33
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,437.36,146.8
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,861.22,106.43
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,628.16,-848.64
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,694.49,72.46


In [48]:
model_linear_eval = model_linear.evaluate(X_test, y_test)
print(f"Linear Model Evaluation: MAE: {model_linear_eval[0]}, MSE: {model_linear_eval[1]}, R2: {model_linear_eval[2]}")

Linear Model Evaluation: MAE: 183.02056888918506, MSE: 114183.02259705764, R2: 0.27870838552450594


### LinearModel cyclical encoding

In [49]:
model_linear_cyclical_smaller = LinearModel(model_type='linear', encoding='cyclical')
model_linear_cyclical_smaller.fit(X_train_smaller, y_train_smaller)

INFO:root:Train datasets shapes: X: (1000, 19), y: (1000,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (1000, 33)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x12876c4a0>

In [50]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_linear_cyclical_smaller.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: -2221.862971879542
Diff: -2541.539971879542


#### Train on full data

In [51]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-08-27 15:19:03.677017


In [52]:
model_linear_cyclical = LinearModel(model_type='linear', encoding='cyclical')
model_linear_cyclical.fit(X_train, y_train)

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 34)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x1287035c0>

In [53]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_cyclical_time = end - start
print(f"Time elapsed: {model_linear_cyclical_time}")

End time: 2025-08-27 15:19:05.932862
Time elapsed: 0:00:02.255845


In [54]:
results_dataset = model_linear_cyclical.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,384.36,64.68
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,370.8,-440.85
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,734.68,119.98
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,899.01,-212.15
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,402.98,-403.58
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,1550.03,444.67
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,437.06,146.5
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,847.54,92.75
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,602.05,-874.75
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,661.98,39.94


In [55]:
model_linear_cyclical_eval = model_linear_cyclical.evaluate(X_test, y_test)
print(f"Linear Model (Cyclical) Evaluation: MAE: {model_linear_cyclical_eval[0]}, MSE: {model_linear_cyclical_eval[1]}, R2: {model_linear_cyclical_eval[2]}")

Linear Model (Cyclical) Evaluation: MAE: 183.2864817461546, MSE: 122189.95765099523, R2: 0.22812875485177675


### Linear Model with normalization

Compare performance of the model with and without normalizatio or standardization.

With the `minmax` option, the scaled data has zero mean and unit variance.

In [56]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-08-27 15:19:07.452373


In [57]:
model_linear_minmaxscaler = LinearModel(model_type='linear', encoding='dummy', standardize='minmax')
model_linear_minmaxscaler.fit(X_train, y_train)

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 27)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x12779b770>

In [58]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_minmaxscaler_time = end - start
print(f"Time elapsed: {model_linear_minmaxscaler_time}")

End time: 2025-08-27 15:19:09.267578
Time elapsed: 0:00:01.815205


In [59]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_linear_minmaxscaler.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 425.0748399533377
Diff: 105.39783995333767


In [60]:
results_dataset = model_linear_minmaxscaler.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,425.08,105.4
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,413.21,-398.44
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,732.18,117.48
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,893.77,-217.39
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,399.71,-406.85
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,1540.53,435.16
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,437.41,146.86
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,861.13,106.34
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,628.16,-848.64
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,694.27,72.24


In [61]:
model_linear_minmaxscaler_eval = model_linear_minmaxscaler.evaluate(X_test, y_test)
print(f"Linear Model (MinMaxScaler) Evaluation: MAE: {model_linear_minmaxscaler_eval[0]}, MSE: {model_linear_minmaxscaler_eval[1]}, R2: {model_linear_minmaxscaler_eval[2]}")

Linear Model (MinMaxScaler) Evaluation: MAE: 182.9974746869479, MSE: 114180.81536516675, R2: 0.2787223285592749


### Linear Model with standardization

With the `stdscaler` option, the scaled data is transformed to have values from 0 and 1.

In [62]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-08-27 15:19:10.759326


In [63]:
model_linear_stdscaler = LinearModel(model_type='linear', encoding='dummy', standardize='stdscaler')
model_linear_stdscaler.fit(X_train, y_train)

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 27)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x128700b90>

In [64]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_stdscaler_time = end - start
print(f"Time elapsed: {model_linear_stdscaler_time}")

End time: 2025-08-27 15:19:12.675171
Time elapsed: 0:00:01.915845


In [65]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_linear_stdscaler.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 423.915316660288
Diff: 104.23831666028798


In [66]:
results_dataset = model_linear_stdscaler.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,423.95,104.27
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,413.23,-398.42
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,725.09,110.39
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,893.88,-217.27
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,401.49,-405.07
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,1530.51,425.14
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,437.29,146.73
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,856.45,101.66
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,627.49,-849.32
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,689.57,67.54


In [67]:
model_linear_stdscaler_eval = model_linear_stdscaler.evaluate(X_test, y_test)
print(f"Linear Model StandardScaler Evaluation: MAE: {model_linear_stdscaler_eval[0]}, MSE: {model_linear_stdscaler_eval[1]}, R2: {model_linear_stdscaler_eval[2]}")

Linear Model StandardScaler Evaluation: MAE: 183.04983268667368, MSE: 114183.72156898087, R2: 0.27870397013441106


### Linear Model SGD encoding dummy variables

#### Train on full data

In [68]:
start = datetime.datetime.now()
print(f"Start time: {start}")

Start time: 2025-08-27 15:19:14.197511


In [69]:
model_linear_SGD_stdscaler = LinearModel(model_type='sgd', encoding='dummy', standardize='stdscaler')
model_linear_SGD_stdscaler.fit(X_train, y_train)

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 27)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel at 0x128700ad0>

In [70]:
end = datetime.datetime.now()
print(f"End time: {end}")
model_linear_SGD_stdscaler_time = end - start
print(f"Time elapsed: {model_linear_SGD_stdscaler_time}")

End time: 2025-08-27 15:19:17.507310
Time elapsed: 0:00:03.309799


In [71]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_linear_SGD_stdscaler.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 439.56070787869356
Diff: 119.88370787869354


In [72]:
results_dataset = model_linear_SGD_stdscaler.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,439.56,119.88
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,421.77,-389.88
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,738.17,123.47
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,882.81,-228.34
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,407.42,-399.14
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,1576.55,471.18
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,462.24,171.69
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,841.21,86.42
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,638.49,-838.31
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,704.24,82.21


In [73]:
model_linear_SGD_stdscaler_eval = model_linear_SGD_stdscaler.evaluate(X_test, y_test)
print(f"Linear Model SGD & StandardScaler Evaluation: MAE: {model_linear_SGD_stdscaler_eval[0]}, MSE: {model_linear_SGD_stdscaler_eval[1]}, R2: {model_linear_SGD_stdscaler_eval[2]}")

Linear Model SGD & StandardScaler Evaluation: MAE: 181.2668879636834, MSE: 115138.46976342073, R2: 0.2726728470223939


## Tree Models

### Decision Tree

In [74]:
start = datetime.datetime.now()
model_decision_tree = RegressionTreeMethod(model_type = 'tree', encoding = 'dummy')
model_decision_tree.fit(X_train, y_train)
end = datetime.datetime.now()
model_decision_tree_time = end - start
model_decision_tree_eval = model_decision_tree.evaluate(X_test, y_test)
print(f"Decision Tree Model Evaluation: MAE: {model_decision_tree_eval[0]}, MSE: {model_decision_tree_eval[1]}, R2: {model_decision_tree_eval[2]}")

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 27)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Decision Tree Model Evaluation: MAE: 14599.823769725963, MSE: 232415338.35032222, R2: -1467.1625237681524


In [None]:
#model_decision_tree.plot_feature_importance(X_test)

In [77]:
start = datetime.datetime.now()
model_decision_tree_cyclical = RegressionTreeMethod(model_type = 'tree', encoding = 'cyclical')
model_decision_tree_cyclical.fit(X_train, y_train)
end = datetime.datetime.now()
model_decision_tree_cyclical_time = end - start
print(f"Time elapsed: {model_decision_tree_cyclical_time}")

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 34)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Time elapsed: 0:00:27.552254


In [78]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_decision_tree_cyclical.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 10366.521
Diff: 10046.844000000001


In [79]:
results_dataset = model_decision_tree_cyclical.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,10366.52,10046.84
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,13369.51,12557.86
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,16472.52,15857.82
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,16472.52,15361.37
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,13369.51,12562.94
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,16472.52,15367.16
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,13369.51,13078.95
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,15136.67,14381.88
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,16408.4,14931.6
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,16408.4,15786.37


In [80]:
model_decision_tree_cyclical_eval = model_decision_tree_cyclical.evaluate(X_test, y_test)
print(f"Decision Tree Model (Cyclical) Evaluation: MAE: {model_decision_tree_cyclical_eval[0]}, MSE: {model_decision_tree_cyclical_eval[1]}, R2: {model_decision_tree_cyclical_eval[2]}")

Decision Tree Model (Cyclical) Evaluation: MAE: 14281.917742657108, MSE: 223442941.65847287, R2: -1410.4840933992461


### Check different tree parameters for a better training

In [81]:
model_decision_tree_5 = RegressionTreeMethod(model_type = 'tree', encoding = 'dummy', min_samples_leaf=5)
model_decision_tree_5.fit(X_train, y_train)
model_decision_tree_5_eval = model_decision_tree_5.evaluate(X_test, y_test)
print(f"Decision Tree Model Evaluation: MAE: {model_decision_tree_5_eval[0]}, MSE: {model_decision_tree_5_eval[1]}, R2: {model_decision_tree_5_eval[2]}")

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 27)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Decision Tree Model Evaluation: MAE: 29460.090400274516, MSE: 1400603815.2670574, R2: -8846.583153579482


In [82]:
model_decision_tree_10 = RegressionTreeMethod(model_type = 'tree', encoding = 'cyclical', max_depth=50, min_samples_leaf=5)
model_decision_tree_10.fit(X_train, y_train)
model_decision_tree_10_eval = model_decision_tree_10.evaluate(X_test, y_test)
print(f"Decision Tree Model Evaluation: MAE: {model_decision_tree_10_eval[0]}, MSE: {model_decision_tree_10_eval[1]}, R2: {model_decision_tree_10_eval[2]}")

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 34)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Decision Tree Model Evaluation: MAE: 29166.32534311258, MSE: 1392138723.502168, R2: -8793.109357152432


In [83]:
model_decision_tree.model.get_depth(), model_decision_tree_5.model.get_depth(), model_decision_tree_10.model.get_depth()

(33, 33, 32)

### Random Forest

In [84]:
start = datetime.datetime.now()
model_random_forest = RegressionTreeMethod(model_type = 'randomforest', encoding = 'dummy', n_estimators = 100)
model_random_forest.fit(X_train, y_train)
end = datetime.datetime.now()
model_random_forest_time = end - start
model_random_forest_eval = model_random_forest.evaluate(X_test, y_test)
print(f"Random Forest Model Evaluation: MAE: {model_random_forest_eval[0]}, MSE: {model_random_forest_eval[1]}, R2: {model_random_forest_eval[2]}")

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 27)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Random Forest Model Evaluation: MAE: 13891.913596862314, MSE: 210197201.70292026, R2: -1326.811048666681


In [85]:
start = datetime.datetime.now()
model_random_forest_cyclical = RegressionTreeMethod(model_type = 'randomforest', encoding = 'cyclical', n_estimators = 100)
model_random_forest_cyclical.fit(X_train, y_train)
end = datetime.datetime.now()
model_random_forest_cyclical_time = end - start
print(f"Time elapsed: {model_random_forest_cyclical_time}")

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 34)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Time elapsed: 0:24:29.332010


In [86]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_random_forest_cyclical.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 11285.011351916592
Diff: 10965.334351916592


In [87]:
#model_random_forest_cyclical.plot_feature_importance(X_test)

In [88]:
results_dataset = model_random_forest_cyclical.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,11285.01,10965.33
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,18950.49,18138.84
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,12418.75,11804.05
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,12236.16,11125.01
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,15475.33,14668.76
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,12354.14,11248.78
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,18901.57,18611.01
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,12483.0,11728.21
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,12240.32,10763.52
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,12591.27,11969.24


In [89]:
model_random_forest_cyclical_eval = model_random_forest_cyclical.evaluate(X_test, y_test)
print(f"Random Forest Model (Cyclical) Evaluation: MAE: {model_random_forest_cyclical_eval[0]}, MSE: {model_random_forest_cyclical_eval[1]}, R2: {model_random_forest_cyclical_eval[2]}")

Random Forest Model (Cyclical) Evaluation: MAE: 13373.132242692653, MSE: 192418974.50981337, R2: -1214.5063828506384


### Gradient Boosting

In [90]:
start = datetime.datetime.now()
model_gradient_boosting = RegressionTreeMethod(model_type = 'gradientboosting', encoding = 'dummy', n_estimators = 100)
model_gradient_boosting.fit(X_train, y_train)
end = datetime.datetime.now()
model_gradient_boosting_time = end - start
model_gradient_boosting_eval = model_gradient_boosting.evaluate(X_test, y_test)
print(f"Gradient Boosting Model Evaluation: MAE: {model_gradient_boosting_eval[0]}, MSE: {model_gradient_boosting_eval[1]}, R2: {model_gradient_boosting_eval[2]}")

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 27)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Gradient Boosting Model Evaluation: MAE: 51448.02134876383, MSE: 2893497531.238212, R2: -18277.159557508923


In [91]:
start = datetime.datetime.now()
model_gradient_boosting_cyclical = RegressionTreeMethod(model_type = 'gradientboosting', encoding = 'cyclical', n_estimators = 100)
model_gradient_boosting_cyclical.fit(X_train, y_train)
end = datetime.datetime.now()
model_gradient_boosting_cyclical_time = end - start
print(f"Time elapsed: {model_gradient_boosting_cyclical_time}")

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 34)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Time elapsed: 0:06:57.153938


In [92]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_gradient_boosting_cyclical.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 46320.38409257429
Diff: 46000.70709257429


In [93]:
results_dataset = model_gradient_boosting_cyclical.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,46320.38,46000.71
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,57989.86,57178.21
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,62610.82,61996.12
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,62872.41,61761.26
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,52906.97,52100.4
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,64422.67,63317.3
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,58376.97,58086.41
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,62677.72,61922.93
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,63248.91,61772.11
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,62401.01,61778.98


In [94]:
model_gradient_boosting_cyclical_eval = model_gradient_boosting_cyclical.evaluate(X_test, y_test)
print(f"Gradient Boosting Model (Cyclical) Evaluation: MAE: {model_gradient_boosting_cyclical_eval[0]}, MSE: {model_gradient_boosting_cyclical_eval[1]}, R2: {model_gradient_boosting_cyclical_eval[2]}")

Gradient Boosting Model (Cyclical) Evaluation: MAE: 51448.02134876384, MSE: 2893497531.238212, R2: -18277.159557508923


### Hist Gradient Boosting

In [95]:
start = datetime.datetime.now()
model_hist_gradient_boosting = RegressionTreeMethod(model_type = 'histgradientboosting', encoding = 'dummy', max_iter = 100)
model_hist_gradient_boosting.fit(X_train, y_train)
end = datetime.datetime.now()
model_hist_gradient_boosting_time = end - start
model_hist_gradient_boosting_eval = model_hist_gradient_boosting.evaluate(X_test, y_test)
print(f"Histogram-based Gradient Boosting Model Evaluation: MAE: {model_hist_gradient_boosting_eval[0]}, MSE: {model_hist_gradient_boosting_eval[1]}, R2: {model_hist_gradient_boosting_eval[2]}")

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 27)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Histogram-based Gradient Boosting Model Evaluation: MAE: 8075.145436018622, MSE: 76700718.84996444, R2: -483.5167352589196


In [96]:
start = datetime.datetime.now()
model_hist_gradient_boosting_cyclical = RegressionTreeMethod(model_type = 'histgradientboosting', encoding = 'cyclical', max_iter = 100)
model_hist_gradient_boosting_cyclical.fit(X_train, y_train)
end = datetime.datetime.now()
model_hist_gradient_boosting_cyclical_time = end - start
print(f"Time elapsed: {model_hist_gradient_boosting_cyclical_time}")

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 34)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Time elapsed: 0:00:05.670781


In [97]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_hist_gradient_boosting_cyclical.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 7158.347379183848
Diff: 6838.670379183848


In [98]:
results_dataset = model_hist_gradient_boosting_cyclical.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,7158.35,6838.67
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,11129.27,10317.62
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,10434.58,9819.88
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,11681.36,10570.21
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,5451.04,4644.48
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,12420.67,11315.31
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,6833.37,6542.81
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,11964.71,11209.92
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,11497.82,10021.02
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,11071.65,10449.62


In [99]:
model_hist_gradient_boosting_cyclical_eval = model_hist_gradient_boosting_cyclical.evaluate(X_test, y_test)
print(f"Histogram-based Gradient Boosting Model (Cyclical) Evaluation: MAE: {model_hist_gradient_boosting_cyclical_eval[0]}, MSE: {model_hist_gradient_boosting_cyclical_eval[1]}, R2: {model_hist_gradient_boosting_cyclical_eval[2]}")

Histogram-based Gradient Boosting Model (Cyclical) Evaluation: MAE: 7980.491271171009, MSE: 74637398.92424639, R2: -470.48278917349705


### XGBoost

In [100]:
start = datetime.datetime.now()
model_xgboost_cyclical = RegressionTreeMethod(model_type = 'xgboost', encoding = 'cyclical', max_iter = 100)
model_xgboost_cyclical.fit(X_train, y_train)
end = datetime.datetime.now()
model_xgboost_cyclical_time = end - start
print(f"Time elapsed: {model_xgboost_cyclical_time}")

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 34)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Time elapsed: 0:00:03.088827


In [101]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_xgboost_cyclical.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 6139.6513671875
Diff: 5819.9743671875


In [102]:
results_dataset = model_xgboost_cyclical.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,6139.65,5819.97
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,11898.71,11087.06
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,10228.78,9614.08
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,11304.26,10193.1
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,5600.12,4793.56
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,10499.91,9394.54
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,5118.2,4827.64
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,9636.61,8881.82
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,10977.43,9500.63
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,11266.28,10644.25


In [103]:
model_xgboost_cyclical_eval = model_xgboost_cyclical.evaluate(X_test, y_test)
print(f"XGBoost Model (Cyclical) Evaluation: MAE: {model_xgboost_cyclical_eval[0]}, MSE: {model_xgboost_cyclical_eval[1]}, R2: {model_xgboost_cyclical_eval[2]}")

XGBoost Model (Cyclical) Evaluation: MAE: 7581.639621732099, MSE: 70300910.51774134, R2: -443.0892883470188


### LightGBM

In [104]:
start = datetime.datetime.now()
model_lightgbm_cyclical = RegressionTreeMethod(model_type = 'lightgbm', encoding = 'cyclical', max_iter = 100)
model_lightgbm_cyclical.fit(X_train, y_train)
end = datetime.datetime.now()
model_lightgbm_cyclical_time = end - start
print(f"Time elapsed: {model_lightgbm_cyclical_time}")

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 34)
INFO:root:Finished to encode variables. Starting to fit the model


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011703 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3862
[LightGBM] [Info] Number of data points in the train set: 688815, number of used features: 32
[LightGBM] [Info] Start training from score 634.533801


INFO:root:Finished training the model


Time elapsed: 0:00:03.009659


In [105]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_lightgbm_cyclical.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 7118.055607183256
Diff: 6798.3786071832565


In [106]:
results_dataset = model_lightgbm_cyclical.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,7118.06,6798.38
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,11692.95,10881.3
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,11065.46,10450.76
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,11775.47,10664.32
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,5671.86,4865.29
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,12131.49,11026.12
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,6586.09,6295.53
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,11729.47,10974.68
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,11234.96,9758.16
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,11518.19,10896.15


In [107]:
model_lightgbm_cyclical_eval = model_lightgbm_cyclical.evaluate(X_test, y_test)
print(f"LightGBM Model (Cyclical) Evaluation: MAE: {model_lightgbm_cyclical_eval[0]}, MSE: {model_lightgbm_cyclical_eval[1]}, R2: {model_lightgbm_cyclical_eval[2]}")

LightGBM Model (Cyclical) Evaluation: MAE: 7933.527759529517, MSE: 74960322.45662566, R2: -472.5226899461709


### CatBoost

In [108]:
start = datetime.datetime.now()
model_catboost_cyclical = RegressionTreeMethod(model_type = 'catboost', encoding = 'cyclical', max_iter = 100)
model_catboost_cyclical.fit(X_train, y_train)
end = datetime.datetime.now()
model_catboost_cyclical_time = end - start
print(f"Time elapsed: {model_catboost_cyclical_time}")

INFO:root:Train datasets shapes: X: (688815, 19), y: (688815,)
INFO:root:Train datasets columns: Index(['country_code', 'city_code', 'order_id', 'courier_id',
       'activation_timestamp', 'transport', 'pickup_latitude',
       'pickup_longitude', 'delivery_latitude', 'delivery_longitude',
       'time_zone', 'pickup_latitude_rad', 'pickup_longitude_rad',
       'delivery_latitude_rad', 'delivery_longitude_rad',
       'pd_distance_haversine_m', 'pd_distance_haversine_m_sk',
       'pd_distance_manhattan_m', 'velocity'],
      dtype='object')
INFO:root:Starting to encode variables
INFO:root:Encoded dataset shape: X: (688815, 34)
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


Time elapsed: 0:00:04.699971


In [109]:
y_test_0 = y_test.loc[0]
y_test_0_pred = model_catboost_cyclical.predict(X_test.iloc[[0]]).item()
print(f"True delivery time: {y_test_0}")
print(f"Predicted delivery time: {y_test_0_pred}")
print(f"Diff: {y_test_0_pred - y_test_0}")

True delivery time: 319.677
Predicted delivery time: 5869.960738126631
Diff: 5550.283738126632


In [110]:
results_dataset = model_catboost_cyclical.test(X_test, y_test)
results_dataset.head(20)

Unnamed: 0,country_code,city_code,order_id,courier_id,activation_timestamp,transport,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,...,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,target,prediction,residual
0,ES,ALC,100933187613,176241347,2024-10-14 23:24:54+00:00,BICYCLE,38.35,-0.48,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,636.15,636.15,893.84,319.68,5869.96,5550.28
1,ES,ALC,100933198445,10191824,2024-10-14 23:37:46+00:00,MOTORBIKE,38.35,-0.48,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,774.11,774.11,1012.03,811.65,8006.46,7194.81
2,ES,ALC,100931749423,169099229,2024-10-14 12:57:38+00:00,CAR,38.36,-0.49,38.34,-0.49,...,0.67,-0.01,0.67,-0.01,2205.9,2205.9,2337.58,614.7,10261.76,9647.06
3,ES,ALC,100931867858,177428955,2024-10-14 13:44:06+00:00,BICYCLE,38.35,-0.47,38.34,-0.5,...,0.67,-0.01,0.67,-0.01,2628.16,2628.16,3677.11,1111.15,11187.16,10076.01
4,ES,ALC,100931917343,3548605,2024-10-14 14:03:20+00:00,CAR,38.35,-0.49,38.35,-0.49,...,0.67,-0.01,0.67,-0.01,681.33,681.33,941.43,806.57,5263.84,4457.27
5,ES,ALC,100931951627,173230053,2024-10-14 14:16:29+00:00,MOTORBIKE,38.36,-0.5,38.37,-0.43,...,0.67,-0.01,0.67,-0.01,6103.51,6103.51,6660.54,1105.37,10240.39,9135.03
6,ES,ALC,100932201108,141248791,2024-10-14 16:08:35+00:00,MOTORBIKE,38.37,-0.51,38.38,-0.5,...,0.67,-0.01,0.67,-0.01,819.69,819.69,1097.23,290.56,5726.51,5435.95
7,ES,ALC,100932473558,39980765,2024-10-14 18:16:45+00:00,CAR,38.35,-0.48,38.35,-0.52,...,0.67,-0.01,0.67,-0.01,2808.36,2808.36,2975.41,754.79,12118.01,11363.21
8,ES,ALC,100932857877,177428955,2024-10-14 20:36:34+00:00,BICYCLE,38.35,-0.47,38.34,-0.48,...,0.67,-0.01,0.67,-0.01,1575.89,1575.89,2230.19,1476.8,11248.8,9772.0
9,ES,ALC,100932926010,171303156,2024-10-14 21:00:05+00:00,MOTORBIKE,38.37,-0.41,38.39,-0.41,...,0.67,-0.01,0.67,-0.01,2062.63,2062.63,2099.82,622.03,11694.47,11072.44


In [111]:
model_catboost_cyclical_eval = model_catboost_cyclical.evaluate(X_test, y_test)
print(f"CatBoost Model (Cyclical) Evaluation: MAE: {model_catboost_cyclical_eval[0]}, MSE: {model_catboost_cyclical_eval[1]}, R2: {model_catboost_cyclical_eval[2]}")

CatBoost Model (Cyclical) Evaluation: MAE: 7304.367461388583, MSE: 65706769.41174268, R2: -414.0682011476737


## Models comparison

In [112]:
models = ['model_bl_sum' , 'model_bl_mean', 'model_linear_3d', 'model_linear_7d', 'model_linear', 'model_linear_cyclical',
          'model_linear_minmaxscaler', 'model_linear_stdscaler', 'model_linear_SGD_stdscaler', 'model_decision_tree',
          'model_decision_tree_cyclical', 'model_random_forest', 'model_random_forest_cyclical', 'model_gradient_boosting',
          'model_gradient_boosting_cyclical', 'model_hist_gradient_boosting', 'model_hist_gradient_boosting_cyclical',
          'model_xgboost_cyclical', 'model_lightgbm_cyclical', 'model_catboost_cyclical']

data = []
for model in models:
    time_value = eval(model + '_time')
    eval_value = eval(model + '_eval')
    data.append([model, time_value, eval_value[0], eval_value[1], eval_value[2]])

models_eval = pd.DataFrame(data, columns=['Model', 'Training time', 'MAE', 'MSE', 'R2'])
models_eval['Training time'] = models_eval['Training time'].apply(
    lambda x: (datetime.datetime.min + x).strftime('%H:%M.%f')
)
models_eval

Unnamed: 0,Model,Training time,MAE,MSE,R2
0,model_bl_sum,00:00.347883,256.39,179628.88,-0.13
1,model_bl_mean,00:00.220893,239.81,157993.4,0.0
2,model_linear_3d,00:00.287142,4850.46,23788905.2,-161.6
3,model_linear_7d,00:00.625804,300.43,198128.5,-0.35
4,model_linear,00:00.215931,183.02,114183.02,0.28
5,model_linear_cyclical,00:00.255845,183.29,122189.96,0.23
6,model_linear_minmaxscaler,00:00.815205,183.0,114180.82,0.28
7,model_linear_stdscaler,00:00.915845,183.05,114183.72,0.28
8,model_linear_SGD_stdscaler,00:00.309799,181.27,115138.47,0.27
9,model_decision_tree,00:00.718739,14599.82,232415338.35,-1467.16
