# Pickup to Delivery Overall

In [1]:
import os
import sys
import warnings
import pandas as pd
import numpy as np
import csv
import pickle
import matplotlib.pyplot as plt
from haversine import haversine, Unit
from sklearn.metrics.pairwise import haversine_distances, manhattan_distances
from sklearn.model_selection import train_test_split, cross_val_score

# Put './' at the front of the module search path, presumably so that the
# query_runner and utils modules imported below are resolved from this directory.
# Note: os.path.expanduser only expands a leading '~', so it returns './' unchanged here.
sys.path.insert(0, os.path.expanduser('./'))
import query_runner as qr
import utils

In [2]:
# Base path handed to qr.Query further down.
base_query_path = './queries/'
# load_config returns three config objects; parameters_config is indexed by key below
# (e.g. parameters_config['start_date']).
dwh_config, livedb_config, parameters_config = utils.load_config(config_file='./config.ini')
# Connection used to run the order-level features query.
datalake_connection = qr.create_connection(db='datalake')
# The connections to the live databases are currently disabled:
#monolith_connection = qr.create_connection(user=livedb_config['monolith_username'], password=livedb_config['monolith_password'], db='livedb')
#dispatching_db_connection = qr.create_connection(user=livedb_config['dispatching_db_username'], password=livedb_config['dispatching_db_password'], db='dispatchingdb')

In [3]:
# Pull the analysis window and the country filter out of the loaded parameters
start_date, end_date, country_code = (
    parameters_config[key] for key in ('start_date', 'end_date', 'country_code')
)

print(f'Start date: {start_date} | End date: {end_date} | Countries: {country_code}')

Start date: 2024-09-23 | End date: 2024-10-06 | Countries: ES


In [4]:
# Values passed to qr.Query as parameters_dict
parameters = dict(
    start_date=start_date,
    end_date=end_date,
    country_code=country_code,
)

## Load the data

In [5]:
# SQL text of the extraction. Despite its name, query_name holds the statement itself
# rather than a file name (query_from_file=False below).
# The [start_date], [end_date] and [country_code] tokens are presumably replaced with the
# entries of parameters_dict by qr.Query — TODO confirm against query_runner.
# The date filter is half-open: rows with p_creation_date equal to end_date are excluded.
query_name = '''
SELECT
    olf.country_code                                 AS country_code,
    olf.city_code                                    AS city_code,
    olf.order_id                                     AS order_id,
    olf.courier_id                                   AS courier_id,
    olf.order_created_local_datetime                 AS creation_timestamp,
    olf.order_activated_local_datetime               AS activation_timestamp,
    olf.courier_transport                            AS transport,
    olf.order_picked_up_local_datetime               AS pickup_timestamp,
    olf.order_delivered_local_datetime               AS delivery_timestamp,
    olf.order_pickup_latitude                        AS pickup_latitude,
    olf.order_pickup_longitude                       AS pickup_longitude,
    olf.order_delivery_latitude                      AS delivery_latitude,
    olf.order_delivery_longitude                     AS delivery_longitude,
    olf.order_arrival_to_delivery_local_datetime     AS delivery_entering_timestamp,
    olf.order_time_zone                              AS time_zone,
    olf.p_creation_date
FROM delta.courier_routing_courier_ml_features_odp.order_level_features AS olf
WHERE order_final_status = 'DeliveredStatus'
    AND order_number_of_assignments = 1
    AND order_bundle_index IS NULL
    AND p_creation_date >= DATE '[start_date]' AND p_creation_date < DATE '[end_date]'
    AND country_code IN ('[country_code]')
'''

# Build the query object from the inline SQL and the datalake connection.
query = qr.Query(base_query_path, query_name, datalake_connection, parameters_dict=parameters, query_from_file = False)

# Run the query; the result is used as a pandas DataFrame below.
df = query.run()
# Fill missing entries with np.nan, presumably to normalise None values — TODO confirm.
df = df.fillna(value=np.nan)

# Work on a copy so the query result stays available unchanged in df.
data = df.copy()
data.head()

Open the following URL in browser for the external authentication:
https://starburst.g8s-data-platform-prod.glovoint.com/oauth2/token/initiate/7ce2cca5cbbd6b6f7c095a9644bd84988b8d555476b70bf1177ea118d3591b2b


Unnamed: 0,country_code,city_code,order_id,courier_id,creation_timestamp,activation_timestamp,transport,pickup_timestamp,delivery_timestamp,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,delivery_entering_timestamp,time_zone,p_creation_date
0,ES,ZAR,100894902837,175840697,2024-09-28 11:08:38+00:00,2024-09-28 11:08:41+00:00,MOTORBIKE,2024-09-28 11:17:58.096000+00:00,2024-09-28 11:31:17+00:00,41.64898,-0.877411,41.653435,-0.869934,2024-09-28 11:25:47+00:00,Europe/Madrid,2024-09-28
1,ES,CAL,100894903481,168755278,2024-09-28 11:09:02+00:00,2024-09-28 11:38:48+00:00,MOTORBIKE,2024-09-28 12:01:07.283000+00:00,2024-09-28 12:08:46+00:00,41.6754,2.789956,41.67582,2.794698,2024-09-28 12:05:23+00:00,Europe/Madrid,2024-09-28
2,ES,BCN,100894905484,146788592,2024-09-28 11:10:20+00:00,2024-09-28 11:10:21+00:00,MOTORBIKE,2024-09-28 11:14:32.118000+00:00,2024-09-28 11:19:04+00:00,41.357388,2.127895,41.360992,2.128997,2024-09-28 11:16:59+00:00,Europe/Madrid,2024-09-28
3,ES,NOM,100894909339,167720126,2024-09-28 11:12:44+00:00,2024-09-28 11:12:44+00:00,CAR,2024-09-28 11:44:13.154000+00:00,2024-09-28 11:58:59+00:00,40.439137,-3.792211,40.45934,-3.781351,2024-09-28 11:54:08+00:00,Europe/Madrid,2024-09-28
4,ES,NOM,100894914820,176602268,2024-09-28 11:16:03+00:00,2024-09-28 11:16:04+00:00,CAR,2024-09-28 11:31:20.290000+00:00,2024-09-28 11:42:54+00:00,40.436962,-3.796514,40.402916,-3.786705,2024-09-28 11:40:08+00:00,Europe/Madrid,2024-09-28


## Clean the dataset

In [6]:
# Column dtypes and non-null counts of the loaded frame
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1536205 entries, 0 to 36204
Data columns (total 16 columns):
 #   Column                       Non-Null Count    Dtype              
---  ------                       --------------    -----              
 0   country_code                 1536205 non-null  object             
 1   city_code                    1536205 non-null  object             
 2   order_id                     1536205 non-null  int64              
 3   courier_id                   1536205 non-null  int64              
 4   creation_timestamp           1536205 non-null  datetime64[ns, UTC]
 5   activation_timestamp         1536205 non-null  datetime64[ns, UTC]
 6   transport                    1536205 non-null  object             
 7   pickup_timestamp             1535647 non-null  datetime64[ns, UTC]
 8   delivery_timestamp           1536205 non-null  datetime64[ns, UTC]
 9   pickup_latitude              1536205 non-null  float64            
 10  pickup_longitude         

In [7]:
# Summary statistics of the numeric columns
data.describe()

Unnamed: 0,order_id,courier_id,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude
count,1536205.0,1536205.0,1536205.0,1536205.0,1536205.0,1536205.0
mean,100898900000.0,130842400.0,39.79646,-2.437437,39.79665,-2.437684
std,8173440.0,50699620.0,3.270964,4.486772,3.270975,4.486684
min,100884700000.0,11600.0,27.73856,-17.78775,27.73566,-17.7995
25%,100891900000.0,107914400.0,39.46703,-3.810658,39.46503,-3.814944
50%,100898700000.0,154943400.0,40.46509,-2.994206,40.4681,-2.994632
75%,100906200000.0,168701900.0,41.41026,2.081995,41.41481,2.078547
max,100912900000.0,178930200.0,43.57925,4.290346,43.58983,4.297597


In [8]:
# Count the missing values in each column
data.isnull().sum()

country_code                       0
city_code                          0
order_id                           0
courier_id                         0
creation_timestamp                 0
activation_timestamp               0
transport                          0
pickup_timestamp                 558
delivery_timestamp                 0
pickup_latitude                    0
pickup_longitude                   0
delivery_latitude                  0
delivery_longitude                 0
delivery_entering_timestamp    33946
time_zone                          0
p_creation_date                    0
dtype: int64

In [9]:
# Discard every row with at least one missing field: with roughly 1.5 M rows
# available we can afford to lose them
data = data.dropna(axis=0, how='any')

In [10]:
# Re-check the per-column missing-value counts now that incomplete rows were dropped
data.isnull().sum()

country_code                   0
city_code                      0
order_id                       0
courier_id                     0
creation_timestamp             0
activation_timestamp           0
transport                      0
pickup_timestamp               0
delivery_timestamp             0
pickup_latitude                0
pickup_longitude               0
delivery_latitude              0
delivery_longitude             0
delivery_entering_timestamp    0
time_zone                      0
p_creation_date                0
dtype: int64

In [11]:
# Count rows that are exact duplicates of an earlier row (all columns equal)
data.duplicated().sum()

np.int64(0)

## Compute new features

In [12]:
# Make sure every timestamp column is parsed as a pandas datetime
timestamp_columns = [
    'creation_timestamp',
    'activation_timestamp',
    'pickup_timestamp',
    'delivery_timestamp',
    'delivery_entering_timestamp',
]
for column in timestamp_columns:
    data[column] = pd.to_datetime(data[column])

# Derive the creation date, time of day and hour from the order creation timestamp.
# NOTE(review): these are read in the column's own timezone (data.info() reports UTC);
# confirm whether the local hour from time_zone is what is wanted.
created_at = data['creation_timestamp'].dt
data['creation_date'] = created_at.date
data['creation_time'] = created_at.time
data['creation_hour'] = created_at.hour

In [None]:
# Compute the distance between the pickup and delivery points, in meters.
#
# All three columns are plain float columns expressed in meters:
#   * sklearn's haversine_distances expects [lat, lon] in radians and returns the angle on
#     the unit sphere, so feeding it degrees without scaling by the Earth radius does not
#     give meters; manhattan_distances on raw coordinates returns degrees. Both also
#     return a 1x1 array per call, which would leave arrays inside the cells.
#   * The computation is vectorised with numpy instead of three row-wise apply() passes,
#     which matters with ~1.5 M rows.
EARTH_RADIUS_M = 6_371_008.8  # mean Earth radius, in meters


def compute_pd_distances_m(lat1_deg, lon1_deg, lat2_deg, lon2_deg, radius_m=EARTH_RADIUS_M):
    """Return great-circle and Manhattan-style distances between point pairs.

    Parameters
    ----------
    lat1_deg, lon1_deg, lat2_deg, lon2_deg : array-like of float
        Coordinates of the first and second point of every pair, in decimal degrees.
        All four inputs must have the same length (or be broadcastable).
    radius_m : float, optional
        Sphere radius in meters used to scale the angular distances.

    Returns
    -------
    haversine_m : numpy.ndarray
        Great-circle (haversine) distance of every pair, in meters.
    manhattan_m : numpy.ndarray
        North-south distance plus east-west distance of every pair, in meters. The
        east-west leg is measured along the parallel at the mean latitude of the two
        points (equirectangular approximation, adequate for city-scale distances).
    """
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(values, dtype=float))
        for values in (lat1_deg, lon1_deg, lat2_deg, lon2_deg)
    )
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine formula; the clip guards against rounding pushing the term outside [0, 1].
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    haversine_m = 2.0 * radius_m * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))

    # Wrap the longitude difference into [-pi, pi) so the east-west leg takes the short way round.
    dlon_wrapped = (dlon + np.pi) % (2.0 * np.pi) - np.pi
    manhattan_m = radius_m * (
        np.abs(dlat) + np.abs(dlon_wrapped) * np.cos((lat1 + lat2) / 2.0)
    )
    return haversine_m, manhattan_m


pd_haversine_m, pd_manhattan_m = compute_pd_distances_m(
    data['pickup_latitude'],
    data['pickup_longitude'],
    data['delivery_latitude'],
    data['delivery_longitude'],
)

# The arrays are assigned positionally, so the result does not depend on the frame's index.
data['pd_distance_haversine_m'] = pd_haversine_m
# Kept for backward compatibility: sklearn's haversine formula applied to radians and
# scaled by the Earth radius yields the same great-circle distance.
data['pd_distance_haversine_m_sk'] = pd_haversine_m
data['pd_distance_manhattan_m'] = pd_manhattan_m
data.head()

## Save the dataset

In [ ]:
# Persist the enriched frame to disk.
# to_pickle does not create missing parent directories, so make sure "data/" exists first.
os.makedirs("data", exist_ok=True)
data.to_pickle("data/dataframe.pkl")

## Exploratory Data Analysis (EDA)

In [ ]:
# Reload the frame from the pickle file written in the "Save the dataset" section.
# NOTE(review): loading a pickle can execute arbitrary code, so only read files
# produced by this notebook or another trusted source.
data = pd.read_pickle("data/dataframe.pkl")

In [None]:
# Number of rows over time: histogram of the creation timestamps using 1000 bins
fig, ax = plt.subplots(figsize=(15, 8))
ax.hist(data['creation_timestamp'], bins=1000)
plt.show()

In [None]:
# Rows per creation date.
# Counting rows per date gives exactly one bar per day whatever the length of the
# analysis window; a histogram with a hard-coded number of bins only lines up with the
# days when the window happens to contain that many consecutive dates.
orders_per_day = data['creation_date'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(15, 8))
# Dates are converted to strings so that each day gets its own evenly spaced, labelled bar.
ax.bar([str(day) for day in orders_per_day.index], orders_per_day.to_numpy())
ax.set_xlabel('Creation date')
ax.set_ylabel('Number of orders')
ax.set_title('Orders per day')
ax.tick_params(axis='x', rotation=45)
plt.show()

In [None]:
# Rows per creation hour.
# Use one bin per hour (edges at -0.5, 0.5, ..., 23.5 so bars are centred on the hour).
# The default of 10 bins would merge the 24 hours unevenly (some bars covering two
# hours, others three) and distort the hourly profile.
hour_bin_edges = np.arange(25) - 0.5

fig, ax = plt.subplots(figsize=(15, 8))
ax.hist(data['creation_hour'], bins=hour_bin_edges)
ax.set_xticks(np.arange(24))
ax.set_xlabel('Creation hour')
ax.set_ylabel('Number of orders')
ax.set_title('Orders per hour of day')
plt.show()

In [ ]:
# Number of rows for each value of the transport column, most frequent first
data['transport'].value_counts()

In [ ]:
# Distribution of the pickup-to-delivery haversine distance, using 1000 bins
fig, ax = plt.subplots(figsize=(15, 8))
ax.hist(data['pd_distance_haversine_m'], bins=1000)
plt.show()

## Hyperparameters

In [None]:
# Share of the rows held out for testing; passed to train_test_split as test_size,
# so it is a fraction (0.1 = 10 %) rather than a percentage.
test_set_perc = 0.1
# Number of cross-validation folds; only referenced by the commented-out
# cross_val_score call at the end of the notebook.
#k_cv = 5

## Dataset split

In [None]:
# X is a second name for the same DataFrame object as data (no copy is made), so
# in-place operations on X also change data.
# NOTE(review): X still contains delivery_entering_timestamp and pickup_timestamp, the
# columns the target is computed from — presumably they are dropped before modelling; confirm.
X = data
# Target: time elapsed between pickup and arrival at the delivery point
# (difference of two datetime columns, i.e. a timedelta Series).
y = data['delivery_entering_timestamp'] - data['pickup_timestamp']
y

As we are dealing with a time-series dataset (orders are placed at different times), we split the data chronologically by creation timestamp and hold out the most recent 10% of the orders for testing. This helps us understand how the model performs on unseen data: in production the model will be evaluated on orders created the day after (day+1) the end of its training data.

In [None]:
# Chronological hold-out: train on the earliest orders and test on the most recent ones.
#
# Two things matter for correctness here:
#   * train_test_split selects rows by position, so the target has to be in the same
#     row order as the features. The target is therefore rebuilt from the sorted frame;
#     a y computed before sorting would be paired with the wrong rows.
#   * shuffle=False keeps the time order, so the last `test_set_perc` share of the rows
#     becomes the test set. With the default shuffle=True the split would be random and
#     future orders would end up in the training set.
# A stable sort keeps the existing relative order of rows with identical timestamps, and
# rebinding X (instead of sorting in place) makes the cell safe to re-run.
X = X.sort_values('creation_timestamp', kind='stable')
y = X['delivery_entering_timestamp'] - X['pickup_timestamp']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_perc, shuffle=False)

In [None]:
# Display the training features
X_train

In [ ]:
# Display the training target
y_train

In [ ]:
# Report the shapes of the feature/target pair of each split (train first, then test)
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

In [ ]:
# In case we want to test different hyperparameters, we will use cross-validation
#scores = cross_val_score(<estimator>, X, y, cv=k_cv)