# In [1]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import timedelta
import random
import uuid

fake = Faker()
num_customers = 100_000
num_orders = 1_000_000

# 1. Generate Customers
def generate_customers(n):
    """Create a table of ``n`` synthetic customers.

    Args:
        n: Number of customer rows to produce.

    Returns:
        pandas.DataFrame with columns ``customer_id`` (``CUST`` followed by
        the zero-padded row index), ``name``, ``email`` and ``phone``; the
        last three are drawn from the module-level Faker instance.
    """
    # Each Faker-backed column is produced in full before the next one
    # starts, so the sequence of Faker calls is name*, email*, phone*.
    makers = {
        "name": fake.name,
        "email": fake.email,
        "phone": fake.phone_number,
    }
    columns = {"customer_id": [f"CUST{idx:06d}" for idx in range(n)]}
    for column, make in makers.items():
        columns[column] = [make() for _ in range(n)]
    return pd.DataFrame(columns)

# Materialise the customer table and write it to customers.csv.
customers_df = generate_customers(n=num_customers)
customers_df.to_csv(path_or_buf="customers.csv", index=False)

# 2. Generate Orders
def generate_orders(n, customer_ids):
    """Create a table of ``n`` synthetic orders.

    Args:
        n: Number of order rows to produce.
        customer_ids: Pool of customer identifiers; each order gets one
            sampled from it with replacement.

    Returns:
        pandas.DataFrame with columns ``order_id`` (``ORD`` followed by the
        zero-padded row index), ``customer_id``, ``order_date`` (a random
        date between one year ago and today) and ``channel``.
    """
    ids = [f"ORD{idx:07d}" for idx in range(n)]
    buyers = np.random.choice(customer_ids, size=n)
    dates = [
        fake.date_between(start_date="-1y", end_date="today")
        for _ in range(n)
    ]
    channels = np.random.choice(["Online", "Direct Sales", "Mobile App"], size=n)
    return pd.DataFrame(
        {
            "order_id": ids,
            "customer_id": buyers,
            "order_date": dates,
            "channel": channels,
        }
    )

# Materialise the orders table (customers sampled from customers_df) and
# write it to orders.csv.
orders_df = generate_orders(
    n=num_orders,
    customer_ids=customers_df["customer_id"].tolist(),
)
orders_df.to_csv(path_or_buf="orders.csv", index=False)

# 3. Order Processing
def generate_order_processing(order_ids):
    """Build the order-processing table, one row per order ID.

    Args:
        order_ids: Sequence of order identifiers; copied unchanged into the
            ``order_id`` column.

    Returns:
        pandas.DataFrame with columns ``order_id``, ``verified`` (True with
        probability 0.98), ``inventory_checked`` (True with probability
        0.99) and ``processed_date`` (a random date within roughly the
        last 11 months).
    """
    count = len(order_ids)
    # Faker's relative-date strings have no "mo" unit: "-11mo" is matched
    # as "-11m" (eleven *minutes*) and the trailing "o" is ignored, which
    # collapses every date onto today. An explicit timedelta states the
    # intended ~11-month window unambiguously.
    window_start = timedelta(days=-11 * 30)
    data = {
        "order_id": order_ids,
        "verified": np.random.choice([True, False], size=count, p=[0.98, 0.02]),
        "inventory_checked": np.random.choice([True, False], size=count, p=[0.99, 0.01]),
        "processed_date": [
            fake.date_between(start_date=window_start, end_date="today")
            for _ in range(count)
        ],
    }
    return pd.DataFrame(data)

# Materialise the processing table for every order and write it to
# order_processing.csv.
order_processing_df = generate_order_processing(
    order_ids=orders_df["order_id"].tolist()
)
order_processing_df.to_csv(path_or_buf="order_processing.csv", index=False)

# 4. Packaging
def generate_packaging(order_ids):
    """Build the packaging table, one row per order ID.

    Args:
        order_ids: Sequence of order identifiers; copied unchanged into the
            ``order_id`` column.

    Returns:
        pandas.DataFrame with columns ``order_id``, ``package_type``
        (``Box``, ``Envelope`` or ``Crate``), ``fragile`` (random boolean)
        and ``packed_date`` (a random date within roughly the last 10
        months).
    """
    count = len(order_ids)
    # Faker's relative-date strings have no "mo" unit: "-10mo" is matched
    # as "-10m" (ten *minutes*) and the trailing "o" is ignored, which
    # collapses every date onto today. An explicit timedelta states the
    # intended ~10-month window unambiguously.
    window_start = timedelta(days=-10 * 30)
    data = {
        "order_id": order_ids,
        "package_type": np.random.choice(["Box", "Envelope", "Crate"], size=count),
        "fragile": np.random.choice([True, False], size=count),
        "packed_date": [
            fake.date_between(start_date=window_start, end_date="today")
            for _ in range(count)
        ],
    }
    return pd.DataFrame(data)

# Materialise the packaging table for every order and write it to
# packaging.csv.
packaging_df = generate_packaging(order_ids=orders_df["order_id"].tolist())
packaging_df.to_csv(path_or_buf="packaging.csv", index=False)

# 5. Labeling
def generate_labeling(order_ids):
    """Build the labeling table, one row per order ID.

    Args:
        order_ids: Sequence of order identifiers; copied unchanged into the
            ``order_id`` column.

    Returns:
        pandas.DataFrame with columns ``order_id``, ``recipient_address``
        (Faker address flattened onto one line), ``tracking_number`` and
        ``special_notes`` (possibly the empty string).

    Note:
        ``tracking_number`` is the first 12 characters of an upper-cased
        UUID4 string, i.e. 8 hex digits, a hyphen, then 3 hex digits.
        Because the UUID is truncated, uniqueness is not guaranteed.
    """
    count = len(order_ids)
    tracking = [str(uuid.uuid4()).upper()[:12] for _ in range(count)]
    # Faker addresses are multi-line; join the lines with ", ".
    addresses = [", ".join(fake.address().split("\n")) for _ in range(count)]
    notes = np.random.choice(
        ["", "Handle with care", "Leave at door", "Fragile"],
        size=count,
    )
    return pd.DataFrame(
        {
            "order_id": order_ids,
            "recipient_address": addresses,
            "tracking_number": tracking,
            "special_notes": notes,
        }
    )

# Materialise the labeling table for every order and write it to
# labeling.csv.
labeling_df = generate_labeling(order_ids=orders_df["order_id"].tolist())
labeling_df.to_csv(path_or_buf="labeling.csv", index=False)

# 6. Carrier Selection
def generate_carriers(order_ids):
    """Build the carrier-selection table, one row per order ID.

    Args:
        order_ids: Sequence of order identifiers; copied unchanged into the
            ``order_id`` column.

    Returns:
        pandas.DataFrame with columns ``order_id``, ``carrier_name`` (one
        of ``UPS``, ``FedEx``, ``DHL``, ``Aramex``), ``shipping_cost``
        (uniform draw from 5 to 50, rounded to 2 decimals) and
        ``estimated_days`` (integer from 1 to 9 inclusive).
    """
    count = len(order_ids)
    # Draw order (carrier, cost, days) is kept fixed so a seeded NumPy
    # global RNG yields the same table.
    carriers = np.random.choice(["UPS", "FedEx", "DHL", "Aramex"], size=count)
    costs = np.random.uniform(5, 50, size=count).round(2)
    days = np.random.randint(1, 10, size=count)
    return pd.DataFrame(
        {
            "order_id": order_ids,
            "carrier_name": carriers,
            "shipping_cost": costs,
            "estimated_days": days,
        }
    )

# Materialise the carrier table for every order and write it to
# carrier_selection.csv.
carrier_df = generate_carriers(order_ids=orders_df["order_id"].tolist())
carrier_df.to_csv(path_or_buf="carrier_selection.csv", index=False)

# 7. Shipment Tracking
def generate_tracking(tracking_numbers):
    """Build the shipment-tracking table, one row per tracking number.

    Args:
        tracking_numbers: Sequence of tracking identifiers; copied
            unchanged into the ``tracking_number`` column.

    Returns:
        pandas.DataFrame with columns ``tracking_number``, ``status`` (one
        of ``Shipped``, ``In Transit``, ``Delivered``, ``Delayed``),
        ``last_updated`` (from Faker's ``date_time_this_year``) and
        ``location`` (a Faker city name).
    """
    count = len(tracking_numbers)
    statuses = np.random.choice(
        ["Shipped", "In Transit", "Delivered", "Delayed"],
        size=count,
    )
    # All timestamps are drawn before any city, one full column at a time.
    updated = [fake.date_time_this_year() for _ in range(count)]
    cities = [fake.city() for _ in range(count)]
    return pd.DataFrame(
        {
            "tracking_number": tracking_numbers,
            "status": statuses,
            "last_updated": updated,
            "location": cities,
        }
    )

# Materialise the tracking table from the labeling tracking numbers and
# write it to shipment_tracking.csv.
tracking_df = generate_tracking(
    tracking_numbers=labeling_df["tracking_number"].tolist()
)
tracking_df.to_csv(path_or_buf="shipment_tracking.csv", index=False)

# 8. Delivery
def generate_delivery(order_ids):
    """Build the delivery table, one row per order ID.

    Args:
        order_ids: Sequence of order identifiers; copied unchanged into the
            ``order_id`` column.

    Returns:
        pandas.DataFrame with columns ``order_id``, ``delivered_date`` (a
        random date within roughly the last 6 months) and
        ``delivery_status`` (one of ``Delivered``, ``Pending``, ``Failed``,
        ``Returned``).
    """
    count = len(order_ids)
    # Faker's relative-date strings have no "mo" unit: "-6mo" is matched
    # as "-6m" (six *minutes*) and the trailing "o" is ignored, which
    # collapses every date onto today. An explicit timedelta states the
    # intended ~6-month window unambiguously.
    window_start = timedelta(days=-6 * 30)
    data = {
        "order_id": order_ids,
        "delivered_date": [
            fake.date_between(start_date=window_start, end_date="today")
            for _ in range(count)
        ],
        "delivery_status": np.random.choice(
            ["Delivered", "Pending", "Failed", "Returned"], size=count
        ),
    }
    return pd.DataFrame(data)

# Materialise the delivery table for every order and write it to
# delivery.csv.
delivery_df = generate_delivery(order_ids=orders_df["order_id"].tolist())
delivery_df.to_csv(path_or_buf="delivery.csv", index=False)

# Final status line once all eight CSV files have been written.
print("✅ Generated 1M+ rows across 8 CSVs successfully.")


# Output: ✅ Generated 1M+ rows across 8 CSVs successfully.
