# Libraries

In [1]:
# Data manipulation libraries
import pandas as pd # Dataframes
import numpy as np # Numerical operations

# Visualization libraries
import plotly.graph_objects as go # Interactive visualizations
import plotly.express as px # Simplified plotting with Plotly

from plotly.subplots import make_subplots # Complex subplot layouts

# Statistical libraries
import scipy.stats as stats # Statistical tests and distributions

# Dataset description

In [2]:
# Load the delivery dataset. Order_ID is read as text so that it is handled as
# an identifier and stays out of the numeric-column analysis further down.
id_as_text = {'Order_ID': str}
df = pd.read_csv('data/food_delivery_times.csv', dtype = id_as_text)
df.head()

Unnamed: 0,Order_ID,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs,Delivery_Time_min
0,522,7.93,Windy,Low,Afternoon,Scooter,12,1.0,43
1,738,16.42,Clear,Medium,Evening,Bike,20,2.0,84
2,741,9.52,Foggy,Low,Night,Scooter,28,1.0,59
3,661,7.44,Rainy,Medium,Afternoon,Scooter,5,1.0,37
4,412,19.03,Clear,Low,Morning,Bike,16,5.0,68


In [3]:
# Show the dtype pandas assigned to each column of the loaded frame.
df.dtypes

Order_ID                   object
Distance_km               float64
Weather                    object
Traffic_Level              object
Time_of_Day                object
Vehicle_Type               object
Preparation_Time_min        int64
Courier_Experience_yrs    float64
Delivery_Time_min           int64
dtype: object

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Distance_km,Preparation_Time_min,Courier_Experience_yrs,Delivery_Time_min
count,1000.0,1000.0,970.0,1000.0
mean,10.05997,16.982,4.579381,56.732
std,5.696656,7.204553,2.914394,22.070915
min,0.59,5.0,0.0,8.0
25%,5.105,11.0,2.0,41.0
50%,10.19,17.0,5.0,55.5
75%,15.0175,23.0,7.0,71.0
max,19.99,29.0,9.0,153.0


# Numeric variables

In [5]:
# Numeric-only view of the dataset; every column whose dtype is a NumPy number is kept.
numeric_kinds = [np.number]
num_df = df.select_dtypes(include = numeric_kinds)
num_df.head()

Unnamed: 0,Distance_km,Preparation_Time_min,Courier_Experience_yrs,Delivery_Time_min
0,7.93,12,1.0,43
1,16.42,20,2.0,84
2,9.52,28,1.0,59
3,7.44,5,1.0,37
4,19.03,16,5.0,68


In [6]:
# Pairwise correlation matrix of the numeric features (as computed by DataFrame.corr()),
# drawn as an annotated heatmap with two-decimal cell labels.
num_corr = num_df.corr()

fig_heatmap = px.imshow(
    num_corr,
    text_auto = '.2f',
    color_continuous_scale = 'Agsunset',
    template = 'plotly_dark',
    width = 600,
    height = 500,
    title = 'Correlation Heatmap of Numerical Features',
)
fig_heatmap.show()

In [7]:
# Scatter-plot matrix: every numeric feature plotted against every other one.
fig_pairplot = px.scatter_matrix(
    num_df,
    color_discrete_sequence = px.colors.qualitative.Safe,
    template = 'plotly_dark',
    height = 900,
    title = 'Scatter Matrix of Numerical Features',
)
fig_pairplot.show()

In [8]:
# Histogram grid: one subplot per numeric feature, two subplots per row.
n_dist_cols = 2
# Ceiling division so that an odd number of features still gets its last
# (half-filled) row; make_subplots needs at least one row.
n_dist_rows = max(1, -(-num_df.shape[1] // n_dist_cols))
dist_palette = px.colors.qualitative.Safe

fig_dist = make_subplots(
    rows = n_dist_rows,
    cols = n_dist_cols,
    subplot_titles = list(num_df.columns),
).update_layout(
    title_text = "Distribution of Numerical Features",
    showlegend = False,
    template = 'plotly_dark',
    # 300 px per row of subplots (600 px for the two-row layout).
    height = 300 * n_dist_rows,
)

for i in range(num_df.shape[1]):
    fig_dist.add_trace(
        go.Histogram(
            x = num_df.iloc[:, i],
            name = num_df.columns[i],
            # Wrap around the palette so having more features than colours
            # cannot raise an IndexError.
            marker_color = dist_palette[i % len(dist_palette)],
        ),
        # Fill the grid left-to-right, top-to-bottom (plotly rows/cols are 1-based).
        row = (i // n_dist_cols) + 1,
        col = (i % n_dist_cols) + 1,
    )

fig_dist.show()

In [9]:
def kolmogorov_smirnov_test(data, dist_name, alpha = 0.05, verbose = False):
    """Fit a ``scipy.stats`` distribution to ``data`` and KS-test the data against it.

    The distribution named ``dist_name`` is looked up on ``scipy.stats``, its
    parameters are estimated from the sample with ``fit``, and a one-sample
    Kolmogorov-Smirnov test is run against the fitted distribution.

    Parameters
    ----------
    data : array-like of numbers
        Observed sample. Missing values (NaN) are dropped before fitting.
    dist_name : str
        Name of a distribution available on ``scipy.stats`` (e.g. ``'norm'``).
    alpha : float, default 0.05
        Significance level used only for the printed verdict.
    verbose : bool, default False
        When true, print the p-value and the reject / not-reject verdict.

    Returns
    -------
    float
        The p-value of the KS test.

    Raises
    ------
    ValueError
        If ``data`` has no non-missing values.
    AttributeError
        If ``dist_name`` is not an attribute of ``scipy.stats``.

    Notes
    -----
    NOTE(review): the parameters are estimated from the same sample that is
    then tested, so the p-value is presumably optimistic; the "follows"
    message below should be read as "not rejected" - confirm before relying
    on it for inference.
    """
    # Work on a float array and drop missing values so callers do not have to
    # remember to call .dropna() themselves.
    sample = np.asarray(data, dtype = float)
    sample = sample[~np.isnan(sample)]
    if sample.size == 0:
        raise ValueError('data must contain at least one non-missing value')

    dist = getattr(stats, dist_name)
    param = dist.fit(sample)

    # Only the p-value is needed; the KS statistic itself is discarded.
    _, p = stats.kstest(sample, dist_name, args = param)
    if verbose:
        print(f'p-value for {dist_name} distribution = ' + str(p))
        if p < alpha:
            print(f'Reject null hypothesis at alpha = {alpha}. Data does not follow {dist_name} distribution.')
        else:
            print(f'Data follows {dist_name} distribution.')
    return p

In [10]:
# For every numeric feature, print the KS verdict for each candidate distribution.
candidate_dists = ['norm', 'uniform', 'gamma']

for col_idx, col_name in enumerate(num_df.columns):
    print(f'Feature: {col_name}')

    # Missing values are removed once per feature before testing.
    observed = num_df.iloc[:, col_idx].dropna()
    for dist_name in candidate_dists:
        kolmogorov_smirnov_test(observed, dist_name, verbose = True)
    print('---')

Feature: Distance_km
p-value for norm distribution = 0.0002072010235332123
Reject null hypothesis at alpha = 0.05. Data does not follow norm distribution.
p-value for uniform distribution = 0.20717998989270037
Data follows uniform distribution.
p-value for gamma distribution = 0.00020081517047715872
Reject null hypothesis at alpha = 0.05. Data does not follow gamma distribution.
---
Feature: Preparation_Time_min
p-value for norm distribution = 2.480592229228519e-07
Reject null hypothesis at alpha = 0.05. Data does not follow norm distribution.
p-value for uniform distribution = 0.05706939112023113
Data follows uniform distribution.
p-value for gamma distribution = 2.994200185671417e-07
Reject null hypothesis at alpha = 0.05. Data does not follow gamma distribution.
---
Feature: Courier_Experience_yrs
p-value for norm distribution = 2.8353579015889568e-12
Reject null hypothesis at alpha = 0.05. Data does not follow norm distribution.
p-value for uniform distribution = 6.274185660160615e

In [11]:
# One box plot per numeric feature, side by side in a single row.
box_palette = px.colors.qualitative.Safe

fig_box = make_subplots(rows = 1, cols = num_df.shape[1])
fig_box.update_layout(
    template = 'plotly_dark',
    showlegend = False,
    title_text = "Box plots of Numerical Features",
)

# Subplot columns are 1-based, hence enumerate(..., start = 1).
for position, feature in enumerate(num_df.columns, start = 1):
    box_trace = go.Box(
        y = num_df.iloc[:, position - 1],
        name = feature,
        marker_color = box_palette[position - 1],
    )
    fig_box.add_trace(box_trace, row = 1, col = position)

fig_box.show()

# Categorical variables

In [12]:
# Non-numeric columns without the Order_ID identifier, with the delivery time
# appended as the last column so it can be analysed per category.
categ_df = (
    df.select_dtypes(exclude = np.number)
      .drop(columns = ['Order_ID'])
      .assign(Delivery_Time_min = df['Delivery_Time_min'])
)

categ_df.head()

Unnamed: 0,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Delivery_Time_min
0,Windy,Low,Afternoon,Scooter,43
1,Clear,Medium,Evening,Bike,84
2,Foggy,Low,Night,Scooter,59
3,Rainy,Medium,Afternoon,Scooter,37
4,Clear,Low,Morning,Bike,68


In [13]:
# Count histogram per categorical feature, each bar labelled with the rounded
# mean delivery time of its category. The last column of categ_df is the
# delivery time itself, so it is excluded from the features.
categ_features = categ_df.columns[:-1]
categ_palette = px.colors.qualitative.Safe

fig_dist_categ = make_subplots(
    rows = 1,
    cols = len(categ_features),
    subplot_titles = list(categ_features),
).update_layout(
    title_text = "Distribution of Categorical Features and Avg Delivery Time by Category",
    showlegend = False,
    template = 'plotly_dark'
).update_xaxes(
    # Sort categories alphabetically on EVERY subplot axis. The text labels
    # below come from groupby().mean(), which is ordered by sorted category,
    # so the bars must use the same order for the labels to line up.
    categoryorder = 'category ascending'
)

for i in range(len(categ_features)):
    fig_dist_categ.add_trace(
        go.Histogram(
            x = categ_df.iloc[:, i],
            name = categ_features[i],
            # Rounded mean delivery time per category, in sorted category order.
            text = round(categ_df['Delivery_Time_min'].groupby(categ_df.iloc[:, i]).mean()),
            # Wrap around the palette so extra features cannot raise IndexError.
            marker_color = categ_palette[i % len(categ_palette)]
        ),
        row = 1,
        col = i + 1,
    )

fig_dist_categ.show()

In [14]:
# Box plot of delivery time split by each categorical feature, two subplots per row.
# The last column of categ_df is the delivery time itself, so it is excluded.
time_by_features = categ_df.columns[:-1]
n_box_cols = 2
# Size the grid from the categorical features being plotted, using ceiling
# division so an odd number of features still gets its last row.
n_box_rows = max(1, -(-len(time_by_features) // n_box_cols))
time_by_palette = px.colors.qualitative.Safe

fig_time_by_categ = make_subplots(
    rows = n_box_rows,
    cols = n_box_cols,
    subplot_titles = [f'Delivery Time by {col}' for col in time_by_features]
).update_layout(
    title_text = "Delivery Time by Categorical Features",
    showlegend = False,
    # 400 px per row of subplots (800 px for the two-row layout).
    height = 400 * n_box_rows,
    template = 'plotly_dark'
)

for i in range(len(time_by_features)):
    fig_time_by_categ.add_trace(
        go.Box(
            x = categ_df.iloc[:, i],
            y = categ_df['Delivery_Time_min'],
            name = time_by_features[i],
            # Wrap around the palette so extra features cannot raise IndexError.
            marker_color = time_by_palette[i % len(time_by_palette)]
        ),
        # Fill the grid left-to-right, top-to-bottom (plotly rows/cols are 1-based).
        row = (i // n_box_cols) + 1,
        col = (i % n_box_cols) + 1,
    )
fig_time_by_categ.show()

In [15]:
# Chi-squared test of independence for every pair of categorical features.
# The last column of categ_df (delivery time) is not a categorical feature.
pair_features = list(categ_df.columns[:-1])

corr_categ = []
for first in pair_features:
    for second in pair_features:
        # Test each unordered pair only once (name ordering picks one of the two).
        if not first < second:
            continue
        contingency = pd.crosstab(categ_df[first], categ_df[second])
        p_value = stats.chi2_contingency(contingency)[1]
        # Store both orientations so the pivoted matrix is symmetric.
        corr_categ.extend([
            [first, second, p_value],
            [second, first, p_value],
        ])

# Long -> square matrix. A feature is never paired with itself, so the
# diagonal is missing after the pivot and is filled with 0.
pairs_long = pd.DataFrame(
    corr_categ,
    columns = ['Feature 1', 'Feature 2', 'p-value'],
)
df_corr_categ = pairs_long.pivot(
    index = 'Feature 1',
    columns = 'Feature 2',
    values = 'p-value',
).fillna(0)

fig_heatmap_categ = px.imshow(
    df_corr_categ,
    text_auto = '.2f',
    color_continuous_scale = 'Agsunset',
    template = 'plotly_dark',
    width = 600,
    height = 500,
    title = 'Categorical Feature Correlation (Chi-squared test p-value)',
)

fig_heatmap_categ.show()

# Export figures

In [16]:
# Save the numeric-feature figures as PNG files, rendered at 2x scale.
numeric_exports = [
    ('figures/heatmap_num.png', fig_heatmap),
    ('figures/pairplot_num.png', fig_pairplot),
    ('figures/distribution_num.png', fig_dist),
    ('figures/boxplots_num.png', fig_box),
]
for png_path, figure in numeric_exports:
    figure.write_image(png_path, scale = 2)

In [17]:
# Save the categorical-feature figures as PNG files, rendered at 2x scale.
categorical_exports = [
    ('figures/distribution_categ.png', fig_dist_categ),
    ('figures/boxplots_categ.png', fig_time_by_categ),
    ('figures/heatmap_categ.png', fig_heatmap_categ),
]
for png_path, figure in categorical_exports:
    figure.write_image(png_path, scale = 2)