# Pickup to Delivery Overall

In [1]:
import os
import sys
import time
import shutil
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from haversine import haversine, Unit
from sklearn.metrics.pairwise import haversine_distances, manhattan_distances
from sklearn.model_selection import train_test_split, cross_val_score

sys.path.insert(0, os.path.expanduser('./'))
import query_runner as qr
import utils
from estimator import BaselineModel_sum, BaselineModel_mean, LinearModel_encode_timestamps_dummy_variables, LinearModel_encode_timestamps_cyclical

In [2]:
base_query_path = './queries/'
dwh_config, livedb_config, parameters_config = utils.load_config(config_file='./config.ini')
datalake_connection = qr.create_connection(db='datalake')
#monolith_connection = qr.create_connection(user=livedb_config['monolith_username'], password=livedb_config['monolith_password'], db='livedb')
#dispatching_db_connection = qr.create_connection(user=livedb_config['dispatching_db_username'], password=livedb_config['dispatching_db_password'], db='dispatchingdb')

INFO:trino.auth:keyring module not found. OAuth2 token will not be stored in keyring.


In [3]:
start_date = parameters_config['start_date']
end_date = parameters_config['end_date']
country_code = parameters_config['country_code']
cities = parameters_config['cities']

print(f'Start date: {start_date} | End date: {end_date} | Countries: {country_code} | Cities: {cities}')

Start date: 2024-09-30 | End date: 2024-10-20 | Countries: ES | Cities: 'MAD', 'BCN', 'SEV', 'ALC'


In [4]:
parameters = {
    'start_date': start_date,
    'end_date': end_date,
    'country_code': country_code,
    'cities': cities
}

## Load the dataset

In [5]:
data = pd.read_parquet("data/parquet/dataframe.parquet")

## Hyperparameters

In [6]:
test_set_perc = 0.1
days_for_test = 7
k_cv = 5

## Database split

As we have partitioned the data by city and creation date, we can use this information to split the data. This will help to avoid data leakage, as we will not have data from the future in the training set.
This is much better than just sorting the data by the creation timestamp and taking 10% of the dataset as test set, as we did before.

In [7]:
# We take the last week of the dataset to test the model
begin_test_date = pd.to_datetime(end_date) - pd.Timedelta(days=days_for_test-1)
begin_test_date = begin_test_date.strftime("%Y-%m-%d")
print(f'Start date: {start_date} | Begin test date: {begin_test_date} | End date: {end_date}')

Start date: 2024-09-30 | Begin test date: 2024-10-14 | End date: 2024-10-20


In [8]:
X_train = pd.read_parquet("data/parquet/dataframe.parquet/", filters=[('creation_date', '<', begin_test_date)])
X_train.head()

Unnamed: 0,country_code,order_id,courier_id,creation_timestamp,activation_timestamp,transport,pickup_timestamp,delivery_entering_timestamp,delivery_timestamp,pickup_latitude,...,creation_hour,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,creation_date,city_code
0,ES,100901312508,24750184,2024-09-30 20:48:58+00:00,2024-09-30 20:48:59+00:00,CAR,2024-09-30 21:05:11.892000+00:00,2024-09-30 21:17:34+00:00,2024-09-30 21:20:22+00:00,38.391617,...,20,0.67006,-0.008959,0.670057,-0.009104,724.460401,724.460401,742.866745,2024-09-30,ALC
1,ES,100901502888,114878043,2024-09-30 22:02:24+00:00,2024-09-30 22:02:25+00:00,MOTORBIKE,2024-09-30 22:18:10.583000+00:00,2024-09-30 22:22:58+00:00,2024-09-30 22:23:44+00:00,38.33844,...,22,0.669132,-0.008492,0.669226,-0.008693,1166.853169,1166.853169,1601.094467,2024-09-30,ALC
2,ES,100901552273,169065712,2024-09-30 22:29:26+00:00,2024-09-30 22:29:26+00:00,CAR,2024-09-30 22:42:27.608000+00:00,2024-09-30 22:50:43+00:00,2024-09-30 22:56:36+00:00,38.366512,...,22,0.669622,-0.0082,0.669375,-0.008726,3065.108653,3065.108653,4208.852129,2024-09-30,ALC
3,ES,100901587119,98812119,2024-09-30 22:53:13+00:00,2024-09-30 22:53:13+00:00,CAR,2024-09-30 23:13:58.938000+00:00,2024-09-30 23:16:54+00:00,2024-09-30 23:21:06+00:00,38.359184,...,22,0.669494,-0.008437,0.66946,-0.00837,400.794655,400.794655,555.236086,2024-09-30,ALC
4,ES,100901593554,170866593,2024-09-30 22:58:15+00:00,2024-09-30 22:58:17+00:00,MOTORBIKE,2024-09-30 23:08:24.577000+00:00,2024-09-30 23:14:41+00:00,2024-09-30 23:16:10+00:00,38.345978,...,22,0.669264,-0.008514,0.669439,-0.008415,1225.327574,1225.327574,1618.61027,2024-09-30,ALC


In [9]:
# Check that there are no nulls deriving from a wrong writing of parquet files (appending instead of overwriting)
X_train.isnull().sum().sum()

np.int64(0)

In [10]:
y_train = X_train['delivery_entering_timestamp'] - X_train['pickup_timestamp']
y_train = pd.Series(y_train, name='pickup_to_delivery')
y_train

0        0 days 00:12:22.108000
1        0 days 00:04:47.417000
2        0 days 00:08:15.392000
3        0 days 00:02:55.062000
4        0 days 00:06:16.423000
                  ...          
678013   0 days 00:04:27.832000
678014   0 days 00:05:29.981000
678015   0 days 00:12:19.778000
678016   0 days 00:08:33.237000
678017   0 days 00:09:04.232000
Name: pickup_to_delivery, Length: 678018, dtype: timedelta64[ns]

In [11]:
y_train.isnull().sum().sum()

np.int64(0)

In [12]:
X_test = pd.read_parquet("data/parquet/dataframe.parquet", filters=[('creation_date', '>=', begin_test_date)])
X_test.head()

Unnamed: 0,country_code,order_id,courier_id,creation_timestamp,activation_timestamp,transport,pickup_timestamp,delivery_entering_timestamp,delivery_timestamp,pickup_latitude,...,creation_hour,pickup_latitude_rad,pickup_longitude_rad,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,creation_date,city_code
0,ES,100931299686,172175095,2024-10-14 00:09:32+00:00,2024-10-14 00:09:34+00:00,MOTORBIKE,2024-10-14 00:28:54.317000+00:00,2024-10-14 00:34:47+00:00,2024-10-14 00:36:48+00:00,38.38252,...,0,0.669901,-0.007882,0.6695,-0.008304,3310.852444,3310.852444,4665.700375,2024-10-14,ALC
1,ES,100931323115,170201413,2024-10-14 00:50:44+00:00,2024-10-14 10:12:43+00:00,CAR,2024-10-14 10:36:43.867000+00:00,2024-10-14 10:48:41+00:00,2024-10-14 10:53:34+00:00,38.348476,...,0,0.669307,-0.008489,0.669187,-0.008794,1705.517814,1705.517814,2292.240241,2024-10-14,ALC
2,ES,100931293587,2320936,2024-10-14 00:00:33+00:00,2024-10-14 00:00:34+00:00,CAR,2024-10-14 00:11:05.783000+00:00,2024-10-14 00:16:49+00:00,2024-10-14 00:17:38+00:00,38.425022,...,0,0.670643,-0.007077,0.670889,-0.006727,2345.425073,2345.425073,3315.503069,2024-10-14,ALC
3,ES,100931297084,48156385,2024-10-14 00:05:52+00:00,2024-10-14 00:05:53+00:00,MOTORBIKE,2024-10-14 00:25:18.815000+00:00,2024-10-14 00:33:40+00:00,2024-10-14 00:40:31+00:00,38.348465,...,0,0.669307,-0.008617,0.66943,-0.008205,2201.177479,2201.177479,2843.624975,2024-10-14,ALC
4,ES,100931307238,177355044,2024-10-14 00:20:58+00:00,2024-10-14 00:20:59+00:00,MOTORBIKE,2024-10-14 00:39:33.601000+00:00,2024-10-14 00:51:56+00:00,2024-10-14 00:57:26+00:00,38.33887,...,0,0.66914,-0.009017,0.669468,-0.008548,3138.707723,3138.707723,4436.739653,2024-10-14,ALC


In [13]:
X_test.isnull().sum().sum()

np.int64(0)

In [14]:
y_test = (X_test['delivery_entering_timestamp'] - X_test['pickup_timestamp']).dt.total_seconds()
y_test = pd.Series(y_test, dtype=np.float64, name='pickup_to_delivery')
y_test

0         352.683
1         717.133
2         343.217
3         501.185
4         742.399
           ...   
294448    155.306
294449    371.268
294450    742.575
294451    786.619
294452    194.149
Name: pickup_to_delivery, Length: 294453, dtype: float64

In [15]:
y_test.isnull().sum().sum()

np.int64(0)

In [16]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(678018, 26) (678018,)
(294453, 26) (294453,)


## Baseline Models

### BaselineModel_sum

In [17]:
model_bl_sum = BaselineModel_sum()
model_bl_sum.fit(X_train, y_train)

<estimator.BaselineModel_sum at 0x11e7de480>

In [18]:
X_test_expanded = X_test.copy()
X_test_expanded['y_test_predicted'] = model_bl_sum.predict(X_test)
X_test_expanded['y_test'] = (X_test_expanded['delivery_entering_timestamp'] - X_test_expanded['pickup_timestamp']).dt.total_seconds()
X_test_expanded['diff'] = X_test_expanded['y_test_predicted'] - X_test_expanded['y_test']
X_test_expanded

Unnamed: 0,country_code,order_id,courier_id,creation_timestamp,activation_timestamp,transport,pickup_timestamp,delivery_entering_timestamp,delivery_timestamp,pickup_latitude,...,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,creation_date,city_code,y_test_predicted,y_test,diff
0,ES,100931299686,172175095,2024-10-14 00:09:32+00:00,2024-10-14 00:09:34+00:00,MOTORBIKE,2024-10-14 00:28:54.317000+00:00,2024-10-14 00:34:47+00:00,2024-10-14 00:36:48+00:00,38.382520,...,0.669500,-0.008304,3310.852444,3310.852444,4665.700375,2024-10-14,ALC,1058.713967,352.683,706.030967
1,ES,100931323115,170201413,2024-10-14 00:50:44+00:00,2024-10-14 10:12:43+00:00,CAR,2024-10-14 10:36:43.867000+00:00,2024-10-14 10:48:41+00:00,2024-10-14 10:53:34+00:00,38.348476,...,0.669187,-0.008794,1705.517814,1705.517814,2292.240241,2024-10-14,ALC,481.340514,717.133,-235.792486
2,ES,100931293587,2320936,2024-10-14 00:00:33+00:00,2024-10-14 00:00:34+00:00,CAR,2024-10-14 00:11:05.783000+00:00,2024-10-14 00:16:49+00:00,2024-10-14 00:17:38+00:00,38.425022,...,0.670889,-0.006727,2345.425073,2345.425073,3315.503069,2024-10-14,ALC,661.938621,343.217,318.721621
3,ES,100931297084,48156385,2024-10-14 00:05:52+00:00,2024-10-14 00:05:53+00:00,MOTORBIKE,2024-10-14 00:25:18.815000+00:00,2024-10-14 00:33:40+00:00,2024-10-14 00:40:31+00:00,38.348465,...,0.669430,-0.008205,2201.177479,2201.177479,2843.624975,2024-10-14,ALC,703.872305,501.185,202.687305
4,ES,100931307238,177355044,2024-10-14 00:20:58+00:00,2024-10-14 00:20:59+00:00,MOTORBIKE,2024-10-14 00:39:33.601000+00:00,2024-10-14 00:51:56+00:00,2024-10-14 00:57:26+00:00,38.338870,...,0.669468,-0.008548,3138.707723,3138.707723,4436.739653,2024-10-14,ALC,1003.667110,742.399,261.268110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294448,ES,100944442019,176937248,2024-10-20 01:52:02+00:00,2024-10-20 01:52:03+00:00,BICYCLE,2024-10-20 02:03:27.694000+00:00,2024-10-20 02:06:03+00:00,2024-10-20 02:08:10+00:00,37.381660,...,0.652397,-0.104685,344.207294,344.207294,486.272035,2024-10-20,SEV,129.421560,155.306,-25.884440
294449,ES,100944429020,177365485,2024-10-20 01:22:22+00:00,2024-10-20 01:22:24+00:00,BICYCLE,2024-10-20 01:26:50.732000+00:00,2024-10-20 01:33:02+00:00,2024-10-20 01:33:49+00:00,37.377640,...,0.652338,-0.104664,465.768645,465.768645,595.324438,2024-10-20,SEV,175.128493,371.268,-196.139507
294450,ES,100944376912,175206228,2024-10-20 00:06:02+00:00,2024-10-20 00:06:03+00:00,MOTORBIKE,2024-10-20 00:15:41.425000+00:00,2024-10-20 00:28:04+00:00,2024-10-20 00:30:14+00:00,37.378030,...,0.652579,-0.104711,1338.584915,1338.584915,1474.909884,2024-10-20,SEV,428.040382,742.575,-314.534618
294451,ES,100944400312,174198829,2024-10-20 00:34:03+00:00,2024-10-20 00:34:04+00:00,BICYCLE,2024-10-20 00:37:07.381000+00:00,2024-10-20 00:50:14+00:00,2024-10-20 00:52:55+00:00,37.377640,...,0.652052,-0.104474,2425.338748,2425.338748,3384.627931,2024-10-20,SEV,911.924673,786.619,125.305673


In [19]:
model_bl_sum.predict(X_test.iloc[0])

0    1058.713967
Name: pickup_to_delivery_predicted, dtype: float64

### Evaluation pipeline

In [20]:
model_bl_sum.evaluate(X_test, y_test)

y_hat <class 'pandas.core.series.Series'> float64
y_test <class 'pandas.core.series.Series'> float64


(np.float64(145.70644138823337), np.float64(52878.818095554445))

### BaselineModel_mean

In [21]:
model_bl_mean = BaselineModel_mean()
model_bl_mean.fit(X_train, y_train)

<estimator.BaselineModel_mean at 0x11e9d03b0>

In [22]:
X_test_expanded2 = X_test.copy()
X_test_expanded2['y_test_predicted'] = model_bl_mean.predict(X_test)
X_test_expanded2['y_test'] = (X_test_expanded2['delivery_entering_timestamp'] - X_test_expanded2['pickup_timestamp']).dt.total_seconds()
X_test_expanded2['diff'] = X_test_expanded2['y_test_predicted'] - X_test_expanded2['y_test']
X_test_expanded2

Unnamed: 0,country_code,order_id,courier_id,creation_timestamp,activation_timestamp,transport,pickup_timestamp,delivery_entering_timestamp,delivery_timestamp,pickup_latitude,...,delivery_latitude_rad,delivery_longitude_rad,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,creation_date,city_code,y_test_predicted,y_test,diff
0,ES,100931299686,172175095,2024-10-14 00:09:32+00:00,2024-10-14 00:09:34+00:00,MOTORBIKE,2024-10-14 00:28:54.317000+00:00,2024-10-14 00:34:47+00:00,2024-10-14 00:36:48+00:00,38.382520,...,0.669500,-0.008304,3310.852444,3310.852444,4665.700375,2024-10-14,ALC,744.414334,352.683,391.731334
1,ES,100931323115,170201413,2024-10-14 00:50:44+00:00,2024-10-14 10:12:43+00:00,CAR,2024-10-14 10:36:43.867000+00:00,2024-10-14 10:48:41+00:00,2024-10-14 10:53:34+00:00,38.348476,...,0.669187,-0.008794,1705.517814,1705.517814,2292.240241,2024-10-14,ALC,687.926270,717.133,-29.206730
2,ES,100931293587,2320936,2024-10-14 00:00:33+00:00,2024-10-14 00:00:34+00:00,CAR,2024-10-14 00:11:05.783000+00:00,2024-10-14 00:16:49+00:00,2024-10-14 00:17:38+00:00,38.425022,...,0.670889,-0.006727,2345.425073,2345.425073,3315.503069,2024-10-14,ALC,946.034987,343.217,602.817987
3,ES,100931297084,48156385,2024-10-14 00:05:52+00:00,2024-10-14 00:05:53+00:00,MOTORBIKE,2024-10-14 00:25:18.815000+00:00,2024-10-14 00:33:40+00:00,2024-10-14 00:40:31+00:00,38.348465,...,0.669430,-0.008205,2201.177479,2201.177479,2843.624975,2024-10-14,ALC,494.914254,501.185,-6.270746
4,ES,100931307238,177355044,2024-10-14 00:20:58+00:00,2024-10-14 00:20:59+00:00,MOTORBIKE,2024-10-14 00:39:33.601000+00:00,2024-10-14 00:51:56+00:00,2024-10-14 00:57:26+00:00,38.338870,...,0.669468,-0.008548,3138.707723,3138.707723,4436.739653,2024-10-14,ALC,705.709197,742.399,-36.689803
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294448,ES,100944442019,176937248,2024-10-20 01:52:02+00:00,2024-10-20 01:52:03+00:00,BICYCLE,2024-10-20 02:03:27.694000+00:00,2024-10-20 02:06:03+00:00,2024-10-20 02:08:10+00:00,37.381660,...,0.652397,-0.104685,344.207294,344.207294,486.272035,2024-10-20,SEV,113.126948,155.306,-42.179052
294449,ES,100944429020,177365485,2024-10-20 01:22:22+00:00,2024-10-20 01:22:24+00:00,BICYCLE,2024-10-20 01:26:50.732000+00:00,2024-10-20 01:33:02+00:00,2024-10-20 01:33:49+00:00,37.377640,...,0.652338,-0.104664,465.768645,465.768645,595.324438,2024-10-20,SEV,153.079224,371.268,-218.188776
294450,ES,100944376912,175206228,2024-10-20 00:06:02+00:00,2024-10-20 00:06:03+00:00,MOTORBIKE,2024-10-20 00:15:41.425000+00:00,2024-10-20 00:28:04+00:00,2024-10-20 00:30:14+00:00,37.378030,...,0.652579,-0.104711,1338.584915,1338.584915,1474.909884,2024-10-20,SEV,300.968350,742.575,-441.606650
294451,ES,100944400312,174198829,2024-10-20 00:34:03+00:00,2024-10-20 00:34:04+00:00,BICYCLE,2024-10-20 00:37:07.381000+00:00,2024-10-20 00:50:14+00:00,2024-10-20 00:52:55+00:00,37.377640,...,0.652052,-0.104474,2425.338748,2425.338748,3384.627931,2024-10-20,SEV,797.110276,786.619,10.491276


In [23]:
model_bl_mean.predict(X_test.iloc[0])

0    744.414334
Name: pickup_to_delivery_predicted, dtype: float64

### Evaluation pipeline

In [24]:
model_bl_mean.evaluate(X_test, y_test)

(np.float64(162.8336541959364), np.float64(65601.18015357893))

## Linear Model

In [25]:
# Train on a small subset to check if the model is working
X_train_smaller = X_train.head(1000)
y_train_smaller = y_train.head(1000)

### LinearModel_encodingdummyvariables

In [26]:
model_linear = LinearModel_encode_timestamps_dummy_variables()
model_linear.fit(X_train_smaller, y_train_smaller)
# model_linear.fit(X_train, y_train)

INFO:root:Starting to encode variables
INFO:root:Finished to encode variables. Starting to fit the model
INFO:root:Finished training the model


<estimator.LinearModel_encode_timestamps_dummy_variables at 0x11e9d25d0>

In [27]:
X_test_expanded3 = X_train_smaller.copy()
X_test_expanded3['y_test_predicted'] = model_linear.predict(X_test_expanded3)
X_test_expanded3['y_test'] = (X_test_expanded3['delivery_entering_timestamp'] - X_test_expanded3['pickup_timestamp']).dt.total_seconds()
X_test_expanded3['diff'] = X_test_expanded3['y_test_predicted'] - X_test_expanded3['y_test']
X_test_expanded3

Unnamed: 0,country_code,order_id,courier_id,creation_timestamp,activation_timestamp,transport,pickup_timestamp,delivery_entering_timestamp,delivery_timestamp,pickup_latitude,...,pd_distance_haversine_m,pd_distance_haversine_m_sk,pd_distance_manhattan_m,creation_date,city_code,time,velocity,y_test_predicted,y_test,diff
0,ES,100901312508,24750184,2024-09-30 20:48:58+00:00,2024-09-30 20:48:59+00:00,CAR,2024-09-30 21:05:11.892000+00:00,2024-09-30 21:17:34+00:00,2024-09-30 21:20:22+00:00,38.391617,...,724.460401,724.460401,742.866745,2024-09-30,ALC,742.108,0.976220,7.421080e+11,742.108,7.421080e+11
1,ES,100901502888,114878043,2024-09-30 22:02:24+00:00,2024-09-30 22:02:25+00:00,MOTORBIKE,2024-09-30 22:18:10.583000+00:00,2024-09-30 22:22:58+00:00,2024-09-30 22:23:44+00:00,38.338440,...,1166.853169,1166.853169,1601.094467,2024-09-30,ALC,287.417,4.059792,2.874170e+11,287.417,2.874170e+11
2,ES,100901552273,169065712,2024-09-30 22:29:26+00:00,2024-09-30 22:29:26+00:00,CAR,2024-09-30 22:42:27.608000+00:00,2024-09-30 22:50:43+00:00,2024-09-30 22:56:36+00:00,38.366512,...,3065.108653,3065.108653,4208.852129,2024-09-30,ALC,495.392,6.187239,4.953920e+11,495.392,4.953920e+11
3,ES,100901587119,98812119,2024-09-30 22:53:13+00:00,2024-09-30 22:53:13+00:00,CAR,2024-09-30 23:13:58.938000+00:00,2024-09-30 23:16:54+00:00,2024-09-30 23:21:06+00:00,38.359184,...,400.794655,400.794655,555.236086,2024-09-30,ALC,175.062,2.289444,1.750620e+11,175.062,1.750620e+11
4,ES,100901593554,170866593,2024-09-30 22:58:15+00:00,2024-09-30 22:58:17+00:00,MOTORBIKE,2024-09-30 23:08:24.577000+00:00,2024-09-30 23:14:41+00:00,2024-09-30 23:16:10+00:00,38.345978,...,1225.327574,1225.327574,1618.610270,2024-09-30,ALC,376.423,3.255188,3.764230e+11,376.423,3.764230e+11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,ES,100901081034,3482453,2024-09-30 19:31:20+00:00,2024-09-30 19:31:21+00:00,MOTORBIKE,2024-09-30 19:40:53.687000+00:00,2024-09-30 19:43:41+00:00,2024-09-30 19:45:13+00:00,38.354620,...,756.867908,756.867908,935.259984,2024-09-30,ALC,167.313,4.523665,1.673130e+11,167.313,1.673130e+11
996,ES,100901265266,115266734,2024-09-30 20:33:20+00:00,2024-09-30 20:33:22+00:00,MOTORBIKE,2024-09-30 20:45:25.779000+00:00,2024-09-30 20:49:48+00:00,2024-09-30 20:52:45+00:00,38.355000,...,356.361182,356.361182,425.074985,2024-09-30,ALC,262.221,1.359011,2.622210e+11,262.221,2.622210e+11
997,ES,100901307260,165064916,2024-09-30 20:47:14+00:00,2024-09-30 20:47:15+00:00,CAR,2024-09-30 20:57:33.623000+00:00,2024-09-30 21:08:14+00:00,2024-09-30 21:13:02+00:00,38.369640,...,1396.380228,1396.380228,1940.700487,2024-09-30,ALC,640.377,2.180560,6.403770e+11,640.377,6.403770e+11
998,ES,100901418525,85950440,2024-09-30 21:26:08+00:00,2024-09-30 21:26:09+00:00,CAR,2024-09-30 21:34:30.387000+00:00,2024-09-30 21:41:24+00:00,2024-09-30 21:47:30+00:00,38.362377,...,1582.519960,1582.519960,2235.150403,2024-09-30,ALC,413.613,3.826089,4.136130e+11,413.613,4.136130e+11


In [28]:
model_linear.predict(X_test.iloc[[0]])

array([-219548.26080978])

In [29]:
model_linear.evaluate(X_test, y_test)

(np.float64(195159.0350077124), np.float64(70872413588.1494))

#### Train on full data

In [30]:
start = time.time()
start

1743085245.2417762

In [None]:
model_linear = LinearModel_encode_timestamps_dummy_variables()
model_linear.fit(X_train, y_train)

INFO:root:Starting to encode variables


In [None]:
end = time.time()
end

In [ ]:
end - start

In [ ]:
model_linear.predict(X_test.iloc[[0]])

In [ ]:
model_linear.evaluate(X_test, y_test)

### LinearModel_cyclical_encoding

In [None]:
model_linear = LinearModel_encode_timestamps_cyclical()
# model_linear.fit(X_train_smaller, y_train_smaller)
model_linear.fit(X_train, y_train)

In [None]:
# X_test_expanded3 = X_train_smaller.copy()
X_test_expanded3 = X_train.copy()
X_test_expanded3['y_test_predicted'] = model_linear.predict(X_test_expanded3)
X_test_expanded3['y_test'] = (X_test_expanded3['delivery_entering_timestamp'] - X_test_expanded3['pickup_timestamp']).dt.total_seconds()
X_test_expanded3['diff'] = X_test_expanded3['y_test_predicted'] - X_test_expanded3['y_test']

X_test_expanded3

In [None]:
model_linear.predict(X_test.iloc[[0]])

In [None]:
model_linear.evaluate(X_test, y_test)

### LinearModel

Compare performance of the model with and without standardization

In [None]:
model_linear = LinearModel(standardize=True)
model_linear.fit(X_train, y_train)