In [1]:
import pandas as pd
import numpy as np
import os

# Paths
PROCESSED_PATH = "../data/processed/tabular/"
OUTPUT_FILE = os.path.join(PROCESSED_PATH, "supply_chain_features.csv")


# Load integrated dataset
df = pd.read_csv(os.path.join(PROCESSED_PATH, "integrated_supply_chain.csv"))
print("✅ Loaded integrated dataset:", df.shape)
df.head()

✅ Loaded integrated dataset: (35065, 9)


Unnamed: 0,shipment_id,origin,destination,dispatch_date,delivery_date,delay_days,disruption_type,risk_score,source
0,O1000,B33,S23,2023-10-27 00:00:00,2023-10-28,0.0,,0.0,resilience
1,O1001,B1,S20,2023-07-08 00:00:00,2023-07-09,0.0,,0.0,resilience
2,O1002,B2,S10,2023-12-29 00:00:00,2024-01-07,7.0,Shortage,1.0,resilience
3,O1003,B6,S10,2023-01-17 00:00:00,2023-01-20,0.0,,0.0,resilience
4,O1004,B5,S4,2023-01-14 00:00:00,2023-01-16,0.0,,0.0,resilience


Lead time

In [2]:
#datetime conversion
df["dispatch_date"] = pd.to_datetime(df["dispatch_date"], errors = "coerce")
df["delivery_date"] = pd.to_datetime(df["delivery_date"], errors = "coerce")

#Lead time = delivery date - dispatch date
df["lead_time_days"] = df["delivery_date"] - df["dispatch_date"]
df["lead_time_days"] = df["lead_time_days"].dt.days

Delay Severity

In [4]:
def categorize_delay(x):
    if pd.isna(x):
        return "None"
    elif x<=0:
        return "On Time"
    elif x<=2:
        return "Minor"
    elif x<=7:
        return "Moderate"
    else:
        return "Severe"
    
df["delay_severity"] = df["lead_time_days"].apply(categorize_delay)

Temporal Features

In [5]:
df["month"] = df["dispatch_date"].dt.month
df["weekday"] = df["dispatch_date"].dt.weekday
df["quarter"] = df["dispatch_date"].dt.quarter
df["year"] = df["dispatch_date"].dt.year

Route Risk Score

In [6]:
# Frequency of disruptions per (origin, destination)
route_disruption = (
    df.groupby(["origin", "destination"])["disruption_type"].apply(lambda x: (x!="None").mean())
    .reset_index()
    .rename(columns={"disruption_type": "route_risk_score"})
)

df = df.merge(route_disruption, on = ["origin", "destination"], how = "left")

In [8]:
# 🔹 5. Cargo2000 Features (Skipped Merge)
c2k_file = os.path.join(PROCESSED_PATH, "cargo2000_features.csv")

if os.path.exists(c2k_file):
    print("ℹ️ Cargo2000 features exist, but merge skipped (different IDs).")
    print(f"You can still use {c2k_file} separately for event-delay models.")
else:
    print("⚠️ No Cargo2000 features found.")


ℹ️ Cargo2000 features exist, but merge skipped (different IDs).
You can still use ../data/processed/tabular/cargo2000_features.csv separately for event-delay models.


In [9]:
df.to_csv(OUTPUT_FILE, index=False)
print(f"💾 Saved enriched feature dataset → {OUTPUT_FILE}")
df.head()

💾 Saved enriched feature dataset → ../data/processed/tabular/supply_chain_features.csv


Unnamed: 0,shipment_id,origin,destination,dispatch_date,delivery_date,delay_days,disruption_type,risk_score,source,lead_time_days,delay_severity,month,weekday,quarter,year,route_risk_score
0,O1000,B33,S23,2023-10-27,2023-10-28,0.0,,0.0,resilience,1.0,Minor,10,4,4,2023,1.0
1,O1001,B1,S20,2023-07-08,2023-07-09,0.0,,0.0,resilience,1.0,Minor,7,5,3,2023,1.0
2,O1002,B2,S10,2023-12-29,2024-01-07,7.0,Shortage,1.0,resilience,9.0,Severe,12,4,4,2023,1.0
3,O1003,B6,S10,2023-01-17,2023-01-20,0.0,,0.0,resilience,3.0,Moderate,1,1,1,2023,1.0
4,O1004,B5,S4,2023-01-14,2023-01-16,0.0,,0.0,resilience,2.0,Minor,1,5,1,2023,1.0
