# Libraries

In [34]:
# Data manipulation libraries
import pandas as pd # Dataframes

# Visualization libraries
import plotly.graph_objects as go # Interactive visualizations
import plotly.express as px # Simplified plotting with Plotly
from plotly.subplots import make_subplots # Complex subplot layouts

# Statistical libraries
from sklearn.model_selection import train_test_split # Split dataset for validation
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error # Error metrics

# Model libraries
import pickle
from sklearn.linear_model import LinearRegression

# Train and Save Model

In [35]:
def train_model(
    path: str,
    target_col: str = 'Delivery_Time_min',
    test_size: float = 0.3,
    random_state: int = 42,
) -> tuple[LinearRegression, dict[str,float]]:
    """Fit a linear regression on a CSV file and score it on a hold-out split.

    The CSV is read, the 'Order_ID' column is removed, and every row that
    contains a missing value is dropped. All remaining columns except
    ``target_col`` are used as features.

    Parameters
    ----------
    path : str
        Location of the CSV file to train on.
    target_col : str, default 'Delivery_Time_min'
        Name of the column to predict.
    test_size : float, default 0.3
        Value forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple[LinearRegression, dict[str, float]]
        The fitted model and a dict with the keys 'MAPE' and 'MAE', both
        computed on the hold-out split. 'MAPE' is the raw value returned by
        ``mean_absolute_percentage_error`` (a fraction, not multiplied by 100).

    Raises
    ------
    KeyError
        If the file has no 'Order_ID' column or no ``target_col`` column.
    """
    df = (
        pd.read_csv(path)
          .drop(columns = ['Order_ID'])
          .dropna()
    )

    X = df.drop(columns = target_col)
    y = df[target_col]

    # Hold-out split; the seed keeps it identical across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size = test_size,
        random_state = random_state
    )

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score on rows the model has not seen during fitting.
    y_pred = model.predict(X_test)

    error_metrics = {
        'MAPE': mean_absolute_percentage_error(y_test, y_pred),
        'MAE': mean_absolute_error(y_test, y_pred)
    }

    return (model, error_metrics)

In [36]:
# Fit one model per dataset; only the metrics of the first are reported below.
model, error_metrics = train_model('data/clean_data_2.csv')
model_scaled, error_metrics_scaled = train_model('data/clean_data_3.csv')

# The f-strings use double quotes outside so the single-quoted dict keys
# inside the braces also parse on Python versions older than 3.12.
print('Linear Regression Model error metrics')
print(f"MAPE: {error_metrics['MAPE']}")
print(f"MAE: {error_metrics['MAE']}")

Linear Regression Model error metrics
MAPE: 0.09806818588185184
MAE: 5.408311746169921


In [37]:
# Persist the model fitted on 'data/clean_data_2.csv' as a pickle file.
# NOTE(review): pickle files should only ever be loaded from trusted sources.
filename = 'linear_regression_model.pkl'

with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

print(f"Model saved to {filename}")

Model saved to linear_regression_model.pkl


# Insights

In [38]:
# Reload the dataset; `df` is reused by the error analysis further down.
df = pd.read_csv('data/clean_data_2.csv').drop(columns='Order_ID')

# Label the coefficients with the feature columns exactly as train_model
# builds them (every column except the target), rather than relying on the
# target being the last column of the CSV.
feature_names = df.drop(columns='Delivery_Time_min').columns

# One row per feature, one column per model.
# NOTE(review): assumes 'data/clean_data_3.csv' has the same feature columns
# in the same order as 'data/clean_data_2.csv' — confirm.
pd.DataFrame(
    {
        'lr_coef': model.coef_,
        'lr_scaled_coef': model_scaled.coef_,
    },
    index = feature_names,
)

Unnamed: 0,lr_coef,lr_scaled_coef
Distance_km,2.971801,29.405975
Traffic_Level,6.285202,6.285202
Time_of_Day,0.200792,0.200792
Preparation_Time_min,0.931797,11.181563
Courier_Experience_yrs,-0.607066,-3.035332
Weather_Clear,-4.89594,-4.89594
Weather_Foggy,3.247332,3.247332
Weather_Rainy,-0.308344,-0.308344
Weather_Snowy,4.013738,4.013738
Weather_Windy,-2.056786,-2.056786


## Error

In [39]:
target_col = 'Delivery_Time_min'

# train_model drops rows with missing values before splitting. Do the same
# here, otherwise the split below could select different rows than the
# hold-out set the model was actually evaluated on.
df_eval = df.dropna()
X = df_eval.drop(columns = target_col)
y = df_eval[target_col]

# Same test_size and seed as train_model, so the same hold-out rows result.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.3,
    random_state = 42
)

# Predict on the pure feature frame first, then attach the results with
# .assign so the frame returned by train_test_split is not modified in place.
predictions = model.predict(X_test)
X_test = X_test.assign(
    Predicted_Delivery_Time_min = predictions,
    Delivery_Time_min = y_test,
    Error = (predictions - y_test).abs(),  # absolute error per order
)

X_test.head()

Unnamed: 0,Distance_km,Traffic_Level,Time_of_Day,Preparation_Time_min,Courier_Experience_yrs,Weather_Clear,Weather_Foggy,Weather_Rainy,Weather_Snowy,Weather_Windy,Vehicle_Type_Bike,Vehicle_Type_Car,Vehicle_Type_Scooter,Predicted_Delivery_Time_min,Delivery_Time_min,Error
44,2.09,0.0,3.0,23,2.0,0,0,1,0,0,1,0,0,36.973872,36,0.973872
806,0.59,0.0,1.0,21,0.0,1,0,0,0,0,1,0,0,26.877528,27,0.122472
720,4.53,1.0,0.0,18,4.0,0,0,0,1,0,0,0,1,47.633595,49,1.366405
660,18.72,1.0,1.0,17,0.0,0,0,1,0,0,1,0,0,87.901899,89,1.098101
878,8.5,2.0,2.0,13,3.0,1,0,0,0,0,0,1,0,54.966687,54,0.966687


In [40]:
# Scatter of the observed delivery time (x) against the model prediction (y)
# for the hold-out rows.
scatter_options = dict(
    x = 'Delivery_Time_min',
    y = 'Predicted_Delivery_Time_min',
    title = 'True vs Predicted Delivery Time',
    template = 'plotly_dark',
    color_discrete_sequence = px.colors.qualitative.Safe,
)

fig_error_dist = px.scatter(X_test, **scatter_options)
fig_error_dist.show()

In [41]:
# Features plotted as scatter plots against the prediction error.
numerical_features = [
    'Distance_km',
    'Preparation_Time_min',
    'Courier_Experience_yrs',
]

# Features plotted as bar charts of the mean error per level.
# Presumably ordinal codes (Traffic_Level, Time_of_Day) and one-hot flags
# (Weather_*, Vehicle_Type_*) — confirm against the data preparation step.
categorical_features = [
    'Traffic_Level',
    'Time_of_Day',
    'Weather_Clear',
    'Weather_Foggy',
    'Weather_Rainy',
    'Weather_Snowy',
    'Weather_Windy',
    'Vehicle_Type_Bike',
    'Vehicle_Type_Car',
    'Vehicle_Type_Scooter',
]

In [42]:
# One scatter panel per numerical feature, all in a single row:
# feature value on x, absolute prediction error on y.
fig_error_num = make_subplots(
    rows = 1,
    cols = len(numerical_features),
    subplot_titles = numerical_features
)
fig_error_num.update_layout(
    title_text = 'Error scatter for Numerical Features',
    showlegend = False,
    template = 'plotly_dark'
)

for position, feature in enumerate(numerical_features):
    feature_trace = go.Scatter(
        x = X_test[feature],
        y = X_test['Error'],
        name = feature,
        marker_color = px.colors.qualitative.Safe[position],
        mode = 'markers'
    )
    # Subplot columns are 1-based.
    fig_error_num.add_trace(feature_trace, row = 1, col = position + 1)

fig_error_num.show()

In [45]:
# Grid of bar charts: mean absolute error per level of each categorical
# feature, laid out two panels per row.
n_cols = 2
# Ceiling division so an odd number of features still gets its last row.
n_rows = -(-len(categorical_features) // n_cols)

fig_error_categ = make_subplots(
    rows = n_rows,
    cols = n_cols,
    subplot_titles = categorical_features
).update_layout(
    title_text = 'Distribution of Error per Categorical Feature',
    showlegend = False,
    template = 'plotly_dark',
    width = 1100,
    height = 200 * n_rows,
)

# Sort the categories on every subplot's x-axis; a layout-level `xaxis=...`
# setting would only reach the first subplot.
fig_error_categ.update_xaxes(categoryorder = 'category ascending')

for i, col_name in enumerate(categorical_features):

    # 1-based grid position of this feature's panel.
    row = (i // n_cols) + 1
    col = (i % n_cols) + 1

    # Mean error per level (pivot_table aggregates with the mean by default).
    pivot = pd.pivot_table(
        X_test,
        index = col_name,
        values = 'Error',
        observed = False
    ).reset_index()

    # Cast the levels to strings so they are drawn as discrete categories.
    pivot[col_name] = pivot[col_name].astype(str)

    fig_error_categ.add_trace(
        go.Bar(
            x = pivot.loc[:, col_name],
            y = pivot.loc[:, 'Error'],
            name = col_name,
            marker_color = px.colors.qualitative.Safe[i % 4]
        ),
        row = row,
        col = col,
    )

    # Set the y-range on THIS panel. `update_layout(yaxis_range=...)` always
    # addresses the first subplot's axis, so it would be overwritten on every
    # iteration and leave the other panels untouched.
    fig_error_categ.update_yaxes(
        range = [0, pivot['Error'].max() + 0.1],
        row = row,
        col = col,
    )

fig_error_categ.show()

## Export figures

In [None]:
# Export the true-vs-predicted scatter as a PNG.
# NOTE(review): presumably requires the 'figures/' directory to exist and a
# static-image backend for plotly to be installed — confirm.
fig_error_dist.write_image(
    'figures/error_dist.png',
    scale = 2,
)

In [46]:
# Export both per-feature error figures as PNGs with the same scale factor.
for figure, png_path in (
    (fig_error_num, 'figures/error_dist_by_feature_num.png'),
    (fig_error_categ, 'figures/error_dist_by_feature_categ.png'),
):
    figure.write_image(png_path, scale = 2)