In [None]:
# import necessary libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)

In [None]:
# Read the cleaned dataset from disk; categorical encoding is applied in a
# later cell, so this frame still holds the columns exactly as stored in the CSV.
model_path = "DataCoSupplyChainDataset_cleaned.csv"
df_model = pd.read_csv(model_path)

# Quick sanity check: (rows, columns) followed by a preview of the first rows.
frame_shape = df_model.shape
print(frame_shape)
df_model.head(5)

In [None]:
# One-hot encode the categorical columns. drop_first=True drops the first
# level of each column so the remaining dummies are not perfectly collinear.
cat_cols = ["Shipping Mode"]

# get_dummies replaces the raw columns with "<col>_<level>" dummies, so a
# second run of this cell against the already-encoded frame would raise
# KeyError. Only encode the columns that are still present in raw form.
raw_cat_cols = [col for col in cat_cols if col in df_model.columns]

# A column that is neither raw nor already expanded into prefixed dummies is
# genuinely missing from the data: keep failing loudly in that case.
absent_cols = [
    col
    for col in cat_cols
    if col not in raw_cat_cols
    and not any(str(name).startswith(f"{col}_") for name in df_model.columns)
]
if absent_cols:
    raise KeyError(f"Categorical columns not found in df_model: {absent_cols}")

if raw_cat_cols:
    df_model = pd.get_dummies(
        df_model,
        columns=raw_cat_cols,
        drop_first=True
    )

In [None]:
# Separate the label column from the feature matrix.

# Name of the column the model will learn to predict.
target_col = "Late_delivery_risk"

# Columns removed from the feature matrix in addition to the target itself.
excluded_cols = ["Delivery Status", "Order Country", "Order Region"]

# Target vector
y = df_model[target_col]

# Feature matrix: everything that is neither the target nor explicitly excluded.
x = df_model.drop(columns=[target_col, *excluded_cols])

(x.shape, y.shape)

In [None]:
# Hold out 20% of the rows for testing. The split is seeded so it can be
# reproduced, and stratified on y so both partitions keep the same class mix.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,
}

X_train, X_test, y_train, y_test = train_test_split(x, y, **split_options)

(X_train.shape, X_test.shape)

In [None]:
# Standardize the features. The scaler learns its statistics from the
# training partition only and the same fitted transform is then applied to
# both partitions, so nothing from the test set influences the scaling.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit a logistic regression classifier on the standardized training data.
# max_iter is raised to 1000 to give the solver more iterations to converge;
# n_jobs=-1 asks scikit-learn to use all available cores where it can.
log_reg_options = {"max_iter": 1000, "n_jobs": -1}
log_reg = LogisticRegression(**log_reg_options)

log_reg.fit(X_train_scaled, y_train)

In [None]:
# Score the fitted model on the held-out (standardized) test partition.
y_pred = log_reg.predict(X_test_scaled)

# Overall share of correct predictions, plus the per-class report.
acc = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print(f"Accuracy: {acc:.3f}\n")
print("Classification report:\n")
print(report_text)

In [None]:
# Plot the confusion matrix of the test-set predictions.
# Rows are the actual classes, columns are the predicted classes.

# Pin the row/column order to the classifier's own class ordering so the
# tick labels below are guaranteed to line up with the matrix cells.
class_labels = log_reg.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(4, 3))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,  # show real class values, not positional indices
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
# The separator is an en dash, written as an escape: the previous literal had
# been mis-decoded into mojibake and rendered as garbage in the figure title.
ax.set_title("Confusion Matrix \u2013 Logistic Regression")
fig.tight_layout()
plt.show()

In [None]:
# Coefficient table: pair every feature column with its fitted weight.
# coef_[0] is the first (here: only inspected) row of the coefficient matrix.
feature_names = x.columns
feature_weights = log_reg.coef_[0]

coefs = pd.DataFrame({
    "feature": feature_names,
    "coefficient": feature_weights,
})

# Largest coefficients first; preview the ten at the top.
coefs_sorted = coefs.sort_values(by="coefficient", ascending=False)
coefs_sorted.head(10)

In [None]:
def plot_top_coefficients(frame, title, figsize=(8, 5)):
    """Draw a horizontal bar chart of model coefficients and show it.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to plot; must provide "feature" and "coefficient" columns.
        Bars are drawn top to bottom in the row order given.
    title : str
        Figure title.
    figsize : tuple of (float, float), default (8, 5)
        Figure size in inches.

    Returns
    -------
    None
        Nothing is returned so a trailing call does not echo an Axes repr.
        If ``frame`` is empty a short message is printed instead of an
        empty chart.
    """
    if frame.empty:
        print(f"No coefficients to plot for: {title}")
        return

    fig, ax = plt.subplots(figsize=figsize)
    ax.barh(frame["feature"], frame["coefficient"])
    ax.set_title(title)
    ax.set_xlabel("Coefficient")
    ax.invert_yaxis()  # first row of `frame` ends up at the top of the chart
    fig.tight_layout()
    plt.show()


# coefs_sorted is ordered from the largest to the smallest coefficient.
# Filter on sign before slicing: a plain head(10)/tail(10) would put zero or
# wrong-signed coefficients under the "Positive"/"Negative" titles whenever
# fewer than ten coefficients have that sign.
coefficient_values = coefs_sorted["coefficient"]
top_pos = coefs_sorted[coefficient_values > 0].head(10)
top_neg = coefs_sorted[coefficient_values < 0].tail(10)

plot_top_coefficients(
    top_pos,
    "Top Positive Coefficients (Increase Late Delivery Risk)",
)

# top_neg ends with the most negative coefficient; reverse it for plotting so
# the strongest effect sits at the top, matching the positive chart.
plot_top_coefficients(
    top_neg.iloc[::-1],
    "Top Negative Coefficients (Decrease Late Delivery Risk)",
)