In [2]:
import pandas as pd
import numpy as np
from datetime import timedelta
import random

# -------------------------
# REPRODUCIBILITY
# -------------------------
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# -------------------------
# CONFIG (tune anytime)
# -------------------------
N_ORDERS = 5000
DATE_START = pd.Timestamp("2024-01-01")
DATE_DAYS = 120  # 4 months

SKUS = [f"SKU-{i:03d}" for i in range(1, 201)]  # 200 SKUs
CATEGORIES = ["Electronics", "Home", "Fashion", "Beauty", "Sports"]

WAREHOUSES = ["WH-JHB", "WH-CPT", "WH-DBN"]  # Johannesburg, Cape Town, Durban
CARRIERS = ["FastTrack", "QuickShip", "EconoMove"]

REGIONS = ["Gauteng", "Western Cape", "KwaZulu-Natal", "Limpopo", "Mpumalanga", "Free State"]
CHANNELS = ["Web", "Mobile"]

# Realistic carrier speed profiles (days in transit)
CARRIER_TRANSIT = {
    "FastTrack": (1, 3),
    "QuickShip": (2, 4),
    "EconoMove": (3, 6),
}

# Warehouses have different processing times (order -> ship)
WAREHOUSE_PROCESSING = {
    "WH-JHB": (0, 2),
    "WH-CPT": (1, 3),
    "WH-DBN": (1, 4),
}

# Some lanes are naturally harder (region distance effect)
REGION_DIFFICULTY = {
    "Gauteng": 0,
    "Mpumalanga": 0,
    "Free State": 0,
    "KwaZulu-Natal": 1,
    "Western Cape": 1,
    "Limpopo": 1,
}

# -------------------------
# HELPERS
# -------------------------
def random_date(start, days):
    return start + pd.to_timedelta(np.random.randint(0, days), unit="D")

def clamp_int(x, low=0):
    return int(max(low, x))

# -------------------------
# 1) ORDERS (1 row per order)
# -------------------------
order_ids = [f"O-{100000+i}" for i in range(N_ORDERS)]

orders_rows = []
order_meta = {}  # store per-order chosen warehouse/carrier/lines for consistent generation

for oid in order_ids:
    order_date = random_date(DATE_START, DATE_DAYS)

    region = random.choice(REGIONS)
    channel = random.choice(CHANNELS)
    priority = np.random.choice(["Normal", "Express"], p=[0.75, 0.25])

    # Choose a warehouse (regional bias could be added; keep simple but realistic)
    warehouse = random.choice(WAREHOUSES)

    # Choose carrier
    carrier = np.random.choice(CARRIERS, p=[0.35, 0.40, 0.25])

    # Promised delivery: depends on priority + typical transit + processing
    wh_proc_min, wh_proc_max = WAREHOUSE_PROCESSING[warehouse]
    tr_min, tr_max = CARRIER_TRANSIT[carrier]

    base_proc = np.random.randint(wh_proc_min, wh_proc_max + 1)
    base_transit = np.random.randint(tr_min, tr_max + 1)

    # Express promises tighter
    promise_buffer = 1 if priority == "Express" else 2
    promised_delivery_date = order_date + timedelta(days=base_proc + base_transit + promise_buffer)

    orders_rows.append({
        "order_id": oid,
        "order_date": order_date.date().isoformat(),
        "promised_delivery_date": promised_delivery_date.date().isoformat(),
        "customer_region": region,
        "order_priority": priority,
        "order_channel": channel,
        "order_status": "Completed"  # full clean dataset: no cancelled
    })

    order_meta[oid] = {
        "warehouse": warehouse,
        "carrier": carrier,
        "order_date": order_date,
        "promised_delivery_date": promised_delivery_date,
        "priority": priority,
        "region": region
    }

orders = pd.DataFrame(orders_rows)

# -------------------------
# 2) ORDER LINES (1 row per order_id + sku)
# -------------------------
order_lines_rows = []
order_sku_map = {}

for oid in order_ids:
    n_lines = np.random.randint(1, 5)  # 1–4 SKUs
    skus_for_order = random.sample(SKUS, k=n_lines)

    order_sku_map[oid] = []

    for sku in skus_for_order:
        category = random.choice(CATEGORIES)

        # realistic e-comm quantities: mostly 1–2
        ordered_qty = np.random.choice([1, 2, 3, 4], p=[0.60, 0.25, 0.10, 0.05])

        # price distributions vary by category
        if category == "Electronics":
            unit_price = round(np.random.uniform(300, 5000), 2)
        elif category == "Fashion":
            unit_price = round(np.random.uniform(80, 1500), 2)
        else:
            unit_price = round(np.random.uniform(50, 2500), 2)

        order_lines_rows.append({
            "order_id": oid,
            "sku": sku,
            "product_category": category,
            "ordered_qty": ordered_qty,
            "unit_price": unit_price
        })

        order_sku_map[oid].append((sku, ordered_qty, category, unit_price))

order_lines = pd.DataFrame(order_lines_rows)

# -------------------------
# 3) SHIPMENTS & SHIPMENT LINES (consistent, realistic)
# -------------------------
shipments_rows = []
shipment_lines_rows = []

shipment_counter = 1

for oid in order_ids:
    meta = order_meta[oid]
    warehouse = meta["warehouse"]
    carrier = meta["carrier"]
    order_date = meta["order_date"]
    promised = meta["promised_delivery_date"]
    priority = meta["priority"]
    region = meta["region"]

    # split shipments: 70% single, 30% split into 2
    n_ship = np.random.choice([1, 2], p=[0.70, 0.30])

    # expedite flag: more common for express
    expedite_flag = int(np.random.rand() < (0.25 if priority == "Express" else 0.08))

    # allocate SKUs into shipments
    skus = [x[0] for x in order_sku_map[oid]]
    random.shuffle(skus)
    sku_chunks = np.array_split(skus, n_ship)

    # processing time
    wh_proc_min, wh_proc_max = WAREHOUSE_PROCESSING[warehouse]
    proc_days = np.random.randint(wh_proc_min, wh_proc_max + 1)
    if expedite_flag == 1:
        proc_days = clamp_int(proc_days - 1, 0)

    # carrier transit
    tr_min, tr_max = CARRIER_TRANSIT[carrier]
    transit_days = np.random.randint(tr_min, tr_max + 1)
    if expedite_flag == 1:
        transit_days = clamp_int(transit_days - 1, 1)

    # region difficulty increases lateness risk
    difficulty = REGION_DIFFICULTY.get(region, 0)

    # lateness probability (clean but realistic)
    base_late_prob = 0.12 + 0.06 * difficulty
    late_days = 0
    if np.random.rand() < base_late_prob:
        late_days = np.random.choice([1, 2, 3], p=[0.55, 0.30, 0.15])

    # ship date is order_date + processing
    ship_date = order_date + timedelta(days=proc_days)

    # delivery date base (ship + transit + possible late)
    # if split shipment: different chunk can arrive different day (simulate variability)
    for chunk in sku_chunks:
        shipment_id = f"S-{shipment_counter}"
        shipment_counter += 1

        shipments_rows.append({
            "shipment_id": shipment_id,
            "order_id": oid,
            "warehouse": warehouse,
            "carrier": carrier,
            "ship_date": ship_date.date().isoformat(),
            "expedite_flag": expedite_flag
        })

        # each shipment may vary delivery by +/-1 day
        variability = np.random.choice([-1, 0, 1], p=[0.15, 0.70, 0.15])
        delivery_offset_days = int(transit_days + late_days + variability)
        delivered_date = ship_date + timedelta(days=delivery_offset_days)
        # ensure not before ship_date
        if delivered_date < ship_date:
            delivered_date = ship_date

        for sku in chunk:
            ordered_qty = int([x[1] for x in order_sku_map[oid] if x[0] == sku][0])

            # in-full probability (some partials)
            partial_prob = 0.10 + (0.05 if difficulty == 1 else 0.0)  # tougher regions slightly worse
            if np.random.rand() < partial_prob:
                delivered_qty = clamp_int(ordered_qty - 1, 0)
            else:
                delivered_qty = ordered_qty

            shipped_qty = ordered_qty  # clean: shipped equals ordered (still can deliver partial due to loss/damage)

            damaged_flag = int(np.random.rand() < 0.04)
            returned_flag = int(np.random.rand() < 0.08)

            shipment_lines_rows.append({
                "shipment_id": shipment_id,
                "sku": sku,
                "shipped_qty": shipped_qty,
                "delivered_qty": delivered_qty,
                "delivered_date": delivered_date.date().isoformat(),
                "damaged_flag": damaged_flag,
                "returned_flag": returned_flag
            })

shipments = pd.DataFrame(shipments_rows)
shipment_lines = pd.DataFrame(shipment_lines_rows)

# -------------------------
# SAVE CLEAN RAW FILES
# -------------------------
orders.to_csv("orders.csv", index=False)
order_lines.to_csv("order_lines.csv", index=False)
shipments.to_csv("shipments.csv", index=False)
shipment_lines.to_csv("shipment_lines.csv", index=False)

print("✅ Generated CLEAN raw datasets:")
print(" - orders.csv")
print(" - order_lines.csv")
print(" - shipments.csv")
print(" - shipment_lines.csv")

print("\nQuick counts:")
print("orders:", len(orders))
print("order_lines:", len(order_lines))
print("shipments:", len(shipments))
print("shipment_lines:", len(shipment_lines))


✅ Generated CLEAN raw datasets:
 - orders.csv
 - order_lines.csv
 - shipments.csv
 - shipment_lines.csv

Quick counts:
orders: 5000
order_lines: 12486
shipments: 6569
shipment_lines: 12486
