In [1]:
import pandas as pd
import os

In [2]:
# Locations: directory holding the processed tables, and the integrated output file
PROCESSED_PATH = "../data/processed/tabular/"
OUTPUT_FILE = os.path.join(PROCESSED_PATH, "integrated_supply_chain.csv")

# Read the four cleaned source tables (same order as the names on the left)
df_res, df_us, df_smart, df_dyn = (
    pd.read_csv(os.path.join(PROCESSED_PATH, csv_name))
    for csv_name in (
        "supply_chain_resilience_clean.csv",
        "us_supply_chain_risk_clean.csv",
        "smart_logistics_clean.csv",
        "dynamic_logistics_clean.csv",
    )
)

1. Standardize Schemas

In [3]:
# Define unified schema columns
schema_cols = [
    "shipment_id", "origin", "destination",
    "dispatch_date", "delivery_date", "delay_days",
    "disruption_type", "risk_score", "source"
]

# Target columns that are parsed with pd.to_datetime when the source column exists
_DATE_COLS = ("dispatch_date", "delivery_date")


def _standardize(df, source, id_col, column_map):
    """Project one cleaned dataset onto the unified schema.

    Parameters
    ----------
    df : pd.DataFrame
        Cleaned source table.
    source : str
        Label written to the ``source`` column of every row.
    id_col : str
        Source column used as ``shipment_id``; when it is absent the row
        index of ``df`` is used instead.
    column_map : dict
        Maps each remaining target column to a ``(source_column, default)``
        pair. ``source_column`` may be ``None`` when the dataset has no
        equivalent field; ``default`` is used whenever the source column is
        not present in ``df``.

    Returns
    -------
    pd.DataFrame
        Frame with exactly the columns of ``schema_cols``, in that order.

    Raises
    ------
    KeyError
        If ``column_map`` does not cover every column of ``schema_cols``.
    """
    missing = []

    def pick(source_col, default):
        # Return the source column when it exists, otherwise the default,
        # remembering which expected columns were not found.
        if source_col is None:
            return default
        if source_col in df.columns:
            return df[source_col]
        missing.append(source_col)
        return default

    data = {"shipment_id": pick(id_col, df.index)}
    for target, (source_col, default) in column_map.items():
        values = pick(source_col, default)
        # Only parse real columns; defaults for date targets are already NaT.
        if target in _DATE_COLS and isinstance(values, pd.Series):
            values = pd.to_datetime(values)
        data[target] = values
    data["source"] = source

    if missing:
        # Surface fallbacks instead of silently filling a column with defaults.
        print(f"⚠️ {source}: source columns not found, defaults used → {sorted(set(missing))}")

    # Selecting by schema_cols fixes the column order and fails loudly on gaps.
    return pd.DataFrame(data)[schema_cols]


# Mapping shared by the two order-based datasets (resilience and US risk).
# NOTE(review): both datasets are read through identical column names —
# confirm the US risk file really uses this layout.
_ORDER_COLUMN_MAP = {
    "origin": ("buyer_id", "Unknown"),
    "destination": ("supplier_id", "Unknown"),
    "dispatch_date": ("dispatch_date", pd.NaT),
    "delivery_date": ("delivery_date", pd.NaT),
    "delay_days": ("delay_days", None),
    "disruption_type": ("disruption_type", "None"),
    "risk_score": ("supply_risk_flag", 0),
}

# --- Resilience ---
df_res_std = _standardize(df_res, "resilience", "order_id", _ORDER_COLUMN_MAP)

# --- US Supply Chain Risk ---
df_us_std = _standardize(df_us, "us_risk", "order_id", _ORDER_COLUMN_MAP)

# --- Smart Logistics ---
# NOTE(review): latitude/longitude are stored as origin/destination and
# demand_forecast as risk_score — mapping kept as-is, confirm it is intended.
df_smart_std = _standardize(df_smart, "smart_logistics", "asset_id", {
    "origin": ("latitude", None),
    "destination": ("longitude", None),
    "dispatch_date": ("timestamp", pd.NaT),
    "delivery_date": (None, pd.NaT),  # not available
    "delay_days": ("logistics_delay", None),
    "disruption_type": ("traffic_status", "None"),
    "risk_score": ("demand_forecast", 0),
})

# --- Dynamic Logistics ---
# NOTE(review): the same gps column feeds both origin and destination —
# mapping kept as-is, confirm it is intended.
df_dyn_std = _standardize(df_dyn, "dynamic_logistics", "shipment_id", {
    "origin": ("gps", None),
    "destination": ("gps", None),
    "dispatch_date": ("timestamp", pd.NaT),
    "delivery_date": (None, pd.NaT),  # not available
    "delay_days": ("delivery_time_deviation", None),
    "disruption_type": ("risk_classification", "None"),
    "risk_score": ("disruption_likelihood_score", 0),
})


In [4]:
# Stack the four standardized tables row-wise, discarding their original indexes
standardized_frames = [df_res_std, df_us_std, df_smart_std, df_dyn_std]
df_integrated = pd.concat(standardized_frames, ignore_index=True)

print(f"✅ Integrated dataset shape = {df_integrated.shape}")
df_integrated.head()


✅ Integrated dataset shape = (35065, 9)


Unnamed: 0,shipment_id,origin,destination,dispatch_date,delivery_date,delay_days,disruption_type,risk_score,source
0,O1000,B33,S23,2023-10-27,2023-10-28,0.0,,0.0,resilience
1,O1001,B1,S20,2023-07-08,2023-07-09,0.0,,0.0,resilience
2,O1002,B2,S10,2023-12-29,2024-01-07,7.0,Shortage,1.0,resilience
3,O1003,B6,S10,2023-01-17,2023-01-20,0.0,,0.0,resilience
4,O1004,B5,S4,2023-01-14,2023-01-16,0.0,,0.0,resilience


In [5]:
# Write the integrated table to OUTPUT_FILE as CSV, without the row index
df_integrated.to_csv(path_or_buf=OUTPUT_FILE, index=False)
print(f"💾 Saved unified dataset → {OUTPUT_FILE}")

💾 Saved unified dataset → ../data/processed/tabular/integrated_supply_chain.csv
