# EDA

In [1]:
import os
import sys
import warnings
import pandas as pd
import numpy as np
import csv
from haversine import haversine, haversine_vector, Unit

sys.path.insert(0, os.path.expanduser('./'))
import query_runner as qr
import utils

In [2]:
# Base path handed to qr.Query when the queries are built further down.
base_query_path = './queries/'

# load_config returns three config objects; this notebook reads its run
# parameters from parameters_config, while livedb_config is only referenced
# by the disabled connections at the bottom of this cell.
(
    dwh_config,
    livedb_config,
    parameters_config,
) = utils.load_config(config_file='./config.ini')

# Only the datalake connection is opened here.
datalake_connection = qr.create_connection(db='datalake')

# Live-database connections, currently disabled:
#monolith_connection = qr.create_connection(user=livedb_config['monolith_username'], password=livedb_config['monolith_password'], db='livedb')
#dispatching_db_connection = qr.create_connection(user=livedb_config['dispatching_db_username'], password=livedb_config['dispatching_db_password'], db='dispatchingdb')

In [3]:
# Pull the three run parameters out of parameters_config in one go,
# then echo them so the rendered notebook records which run produced the outputs.
start_date, end_date, country_code = (
    parameters_config[key] for key in ('start_date', 'end_date', 'country_code')
)

print(f'Start date: {start_date} | End date: {end_date} | Countries: {country_code}')

Start date: 2024-09-23 | End date: 2024-10-06 | Countries: ES


In [4]:
# Bundle the run parameters into the dict that is passed to qr.Query as parameters_dict.
parameters = dict(
    start_date=start_date,
    end_date=end_date,
    country_code=country_code,
)

## Load the data

In [5]:
# Build the SQL filter literals from the configured run parameters instead of
# hard-coding them, so editing config.ini really changes the data that is loaded.

# Normalise the window bounds to ISO dates; pd.Timestamp raises on malformed input.
start_date_sql = pd.Timestamp(start_date).date().isoformat()
end_date_sql = pd.Timestamp(end_date).date().isoformat()

# Accept either a single code / comma-separated string or an iterable of codes.
# NOTE(review): the exact format of country_code in config.ini is not visible here — confirm.
if isinstance(country_code, str):
    raw_country_codes = country_code.split(',')
else:
    raw_country_codes = list(country_code)
country_codes = [str(code).strip().strip('\'"') for code in raw_country_codes]
country_codes = [code for code in country_codes if code]
# Only plain alphanumeric codes are allowed into the SQL text.
if not country_codes or not all(code.isalnum() for code in country_codes):
    raise ValueError(f'Invalid country_code configuration: {country_code!r}')
country_list_sql = ', '.join(f"'{code}'" for code in country_codes)

# Despite its name, query_name holds the SQL text itself (query_from_file=False below).
# The date window is half-open: start_date is included, end_date is excluded.
query_name = f'''
SELECT
    olf.order_id                                     AS order_id,
    olf.courier_id                                   AS courier_id,
    olf.country_code                                 AS country_code,
    olf.city_code                                    AS city_code,
    olf.order_activated_local_datetime               AS activation_time,
    olf.courier_transport                            AS transport,
    olf.order_picked_up_local_datetime               AS pickup_time,
    olf.order_delivered_local_datetime               AS delivery_time,
    olf.order_pickup_latitude                        AS pickup_latitude,
    olf.order_pickup_longitude                       AS pickup_longitude,
    olf.order_delivery_latitude                      AS delivery_latitude,
    olf.order_delivery_longitude                     AS delivery_longitude,
    olf.order_arrival_to_delivery_local_datetime     AS delivery_entering_time,
    olf.order_time_zone                              AS time_zone,
    olf.p_creation_date
FROM delta.courier_routing_courier_ml_features_odp.order_level_features AS olf
WHERE order_final_status = 'DeliveredStatus'
    AND order_number_of_assignments = 1
    AND order_bundle_index IS NULL
    AND p_creation_date >= DATE '{start_date_sql}' AND p_creation_date < DATE '{end_date_sql}'
    AND country_code IN ({country_list_sql})
'''

query = qr.Query(base_query_path, query_name, datalake_connection, parameters_dict=parameters, query_from_file = False)

# Run the query and replace missing values with NaN.
df = query.run()
df = df.fillna(value=np.nan)

# Work on a copy so the raw query result stays available in df.
data = df.copy()
data.head()

Open the following URL in browser for the external authentication:
https://starburst.g8s-data-platform-prod.glovoint.com/oauth2/token/initiate/780b47b9b02016d04d97360e9091335dbb2048e67a75241de5e46151ca65d14b


Unnamed: 0,order_id,courier_id,country_code,city_code,activation_time,transport,pickup_time,delivery_time,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,delivery_entering_time,time_zone,p_creation_date
0,100907487116,8590944,ES,BCN,2024-10-03 20:55:04+00:00,BICYCLE,2024-10-03 21:21:44.215000+00:00,2024-10-03 21:32:33+00:00,41.370594,2.113433,41.375412,2.104837,2024-10-03 21:29:37+00:00,Europe/Madrid,2024-10-03
1,100907489425,162535288,ES,MAD,2024-10-03 20:55:44+00:00,BICYCLE,2024-10-03 21:07:40.941000+00:00,2024-10-03 21:19:10+00:00,40.382927,-3.626392,40.38349,-3.640969,2024-10-03 21:16:09+00:00,Europe/Madrid,2024-10-03
2,100907492383,167053666,ES,MGP,2024-10-03 20:56:36+00:00,MOTORBIKE,2024-10-03 21:06:13.285000+00:00,2024-10-03 21:12:51+00:00,39.528732,2.540573,39.53856,2.560975,2024-10-03 21:12:24+00:00,Europe/Madrid,2024-10-03
3,100907493124,146788593,ES,BCN,2024-10-03 20:56:49+00:00,BICYCLE,2024-10-03 21:07:40.560000+00:00,2024-10-03 21:14:30+00:00,41.389874,2.159761,41.39052,2.148718,2024-10-03 21:12:00+00:00,Europe/Madrid,2024-10-03
4,100907498582,45695767,ES,MAD,2024-10-03 20:58:26+00:00,MOTORBIKE,2024-10-03 21:08:59.224000+00:00,2024-10-03 21:14:21+00:00,40.40972,-3.670494,40.41288,-3.67184,2024-10-03 21:11:21+00:00,Europe/Madrid,2024-10-03


In [7]:
# Compute the haversine distance (in metres) between the pickup and delivery points.
# haversine_vector processes all rows in a single NumPy call, which avoids the
# per-row Python overhead of DataFrame.apply(axis=1) on a large order table.
pickup_points = data[['pickup_latitude', 'pickup_longitude']].to_numpy(dtype=float)
delivery_points = data[['delivery_latitude', 'delivery_longitude']].to_numpy(dtype=float)

# Rows are paired one-to-one (pickup i with delivery i), so the result has one value per order.
data['pd_distance_m'] = haversine_vector(pickup_points, delivery_points, unit=Unit.METERS)
data.head()

Unnamed: 0,order_id,courier_id,country_code,city_code,activation_time,transport,pickup_time,delivery_time,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,delivery_entering_time,time_zone,p_creation_date,pd_distance_m
0,100907487116,8590944,ES,BCN,2024-10-03 20:55:04+00:00,BICYCLE,2024-10-03 21:21:44.215000+00:00,2024-10-03 21:32:33+00:00,41.370594,2.113433,41.375412,2.104837,2024-10-03 21:29:37+00:00,Europe/Madrid,2024-10-03,895.267385
1,100907489425,162535288,ES,MAD,2024-10-03 20:55:44+00:00,BICYCLE,2024-10-03 21:07:40.941000+00:00,2024-10-03 21:19:10+00:00,40.382927,-3.626392,40.38349,-3.640969,2024-10-03 21:16:09+00:00,Europe/Madrid,2024-10-03,1236.264251
2,100907492383,167053666,ES,MGP,2024-10-03 20:56:36+00:00,MOTORBIKE,2024-10-03 21:06:13.285000+00:00,2024-10-03 21:12:51+00:00,39.528732,2.540573,39.53856,2.560975,2024-10-03 21:12:24+00:00,Europe/Madrid,2024-10-03,2062.906211
3,100907493124,146788593,ES,BCN,2024-10-03 20:56:49+00:00,BICYCLE,2024-10-03 21:07:40.560000+00:00,2024-10-03 21:14:30+00:00,41.389874,2.159761,41.39052,2.148718,2024-10-03 21:12:00+00:00,Europe/Madrid,2024-10-03,924.017061
4,100907498582,45695767,ES,MAD,2024-10-03 20:58:26+00:00,MOTORBIKE,2024-10-03 21:08:59.224000+00:00,2024-10-03 21:14:21+00:00,40.40972,-3.670494,40.41288,-3.67184,2024-10-03 21:11:21+00:00,Europe/Madrid,2024-10-03,369.394256
