# Libraries & Connection

In [2]:
import os
import random
from datetime import date, timedelta
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.exc import IntegrityError

In [3]:
# SQLAlchemy connection settings (inside Docker network).
# Each value can be overridden through an environment variable of the same
# name, so credentials do not have to be edited into the notebook. The
# fallbacks are the values this notebook used before, so behaviour is
# unchanged when nothing is set.
DB_USER = os.environ.get("DB_USER", "analytics_user")
DB_PASS = os.environ.get("DB_PASS", "analyticspass123")
DB_HOST = os.environ.get("DB_HOST", "mysql")          # service name from docker-compose
DB_PORT = os.environ.get("DB_PORT", "3306")
DB_NAME = os.environ.get("DB_NAME", "supply_chain_db")

# URL-escape the credentials: characters such as "@", ":" or "/" in an
# overridden user name or password would otherwise corrupt the connection URL.
connection_string = (
    f"mysql+pymysql://{quote_plus(DB_USER)}:{quote_plus(DB_PASS)}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)
engine = create_engine(connection_string)

# Quick test: list tables
with engine.connect() as conn:
    tables = pd.read_sql("SHOW TABLES;", conn)
tables

Unnamed: 0,Tables_in_supply_chain_db
0,inventory
1,orders
2,price_history
3,products
4,sales
5,suppliers
6,warehouses


# Inventory

In [5]:
# Per-table diagnostics for the current schema (DATABASE()): row count,
# data + index size in MB, and storage engine, largest tables first.
# NOTE(review): MySQL documents TABLE_ROWS as an estimate for InnoDB tables;
# use SELECT COUNT(*) on the table when an exact count matters.
query = """
SELECT 
    TABLE_NAME,
    TABLE_ROWS,
    ROUND((DATA_LENGTH + INDEX_LENGTH) / 1024 / 1024, 2) AS size_mb,
    ENGINE
FROM INFORMATION_SCHEMA.TABLES
WHERE TABLE_SCHEMA = DATABASE()
ORDER BY size_mb DESC;
"""

with engine.connect() as conn:
    tables_diag = pd.read_sql(query, conn)

tables_diag


Unnamed: 0,TABLE_NAME,TABLE_ROWS,size_mb,ENGINE
0,inventory,21045,3.89,InnoDB
1,sales,50,0.06,InnoDB
2,orders,20,0.05,InnoDB
3,price_history,37,0.05,InnoDB
4,products,15,0.03,InnoDB
5,suppliers,5,0.02,InnoDB
6,warehouses,8,0.02,InnoDB


In [4]:
def random_date(start_date, end_date):
    """Pick a random date between start_date and end_date, both inclusive.

    Args:
        start_date: earliest date that may be returned.
        end_date: latest date that may be returned.

    Returns:
        start_date shifted forward by a random whole number of days.

    Raises:
        ValueError: from random.randint when end_date is before start_date.
    """
    span_days = (end_date - start_date).days
    offset = timedelta(days=random.randint(0, span_days))
    return start_date + offset

In [39]:

def generate_inventory_row(max_product_id=46, max_warehouse_id=18,
                           start_date=date(2020, 1, 1)):
    """Build one random inventory snapshot row as a parameter dict.

    The defaults reproduce the ranges this notebook used originally; pass
    other bounds when the products / warehouses tables hold a different
    range of ids.

    Args:
        max_product_id: highest product_id to draw (ids are drawn from 1..max).
        max_warehouse_id: highest warehouse_id to draw (ids are drawn from 1..max).
        start_date: earliest snapshot date; the latest is always today.

    Returns:
        dict with keys product_id, warehouse_id, quantity_on_hand,
        quantity_reserved and snapshot_date.
    """
    product_id = random.randint(1, max_product_id)
    warehouse_id = random.randint(1, max_warehouse_id)

    quantity_on_hand = random.randint(10, 999)

    # Reserved stock is drawn between 10% and 90% of the stock on hand
    # (bounds truncated to whole units), so it never exceeds quantity_on_hand.
    min_reserved = int(quantity_on_hand * 0.10)
    max_reserved = int(quantity_on_hand * 0.90)
    quantity_reserved = random.randint(min_reserved, max_reserved)

    snapshot_date = random_date(start_date, date.today())

    return {
        "product_id": product_id,
        "warehouse_id": warehouse_id,
        "quantity_on_hand": quantity_on_hand,
        "quantity_reserved": quantity_reserved,
        "snapshot_date": snapshot_date
    }


In [40]:
# Parameterised INSERT for one inventory snapshot row. The :named placeholders
# match the keys of the dict returned by generate_inventory_row().
# NOTE(review): the name insert_stmt is rebound to the orders INSERT in the
# Orders section, so re-run this cell before re-running the inventory insert.
insert_stmt = text("""
INSERT INTO inventory (
    product_id,
    warehouse_id,
    quantity_on_hand,
    quantity_reserved,
    snapshot_date
)
VALUES (
    :product_id,
    :warehouse_id,
    :quantity_on_hand,
    :quantity_reserved,
    :snapshot_date
)
""")


In [41]:
rows_to_insert = 900
inserted = 0
skipped = 0

# One outer transaction for the whole batch: committed when the block exits
# normally, rolled back if an unexpected exception escapes.
with engine.begin() as conn:
    for _ in range(rows_to_insert):
        row = generate_inventory_row()
        try:
            # Run each INSERT inside its own SAVEPOINT so a rejected row is
            # rolled back on its own and the outer transaction stays usable.
            with conn.begin_nested():
                conn.execute(insert_stmt, row)
        except IntegrityError:
            # Expected cause: duplicate (product_id, warehouse_id, snapshot_date).
            # Any other integrity violation reported by the database is
            # skipped as well, so count it instead of hiding it.
            skipped += 1
        else:
            inserted += 1

print(f"Inserted {inserted} rows successfully.")
if skipped:
    print(f"Skipped {skipped} rows rejected with IntegrityError.")


Inserted 890 rows successfully.


# Orders

In [20]:
def random_date(start_date, end_date):
    """Pick a random date between start_date and end_date, both inclusive.

    NOTE(review): this duplicates the random_date already defined in the
    Inventory section; consider keeping a single definition.

    Raises:
        ValueError: from random.randint when end_date is before start_date.
    """
    span_days = (end_date - start_date).days
    offset = timedelta(days=random.randint(0, span_days))
    return start_date + offset

In [21]:
def get_next_order_id(engine):
    """Return the current MAX(order_id) of the orders table plus one.

    COALESCE turns the NULL produced by an empty table into 0, so the first
    id handed out is 1.

    Args:
        engine: SQLAlchemy engine used to open a short-lived connection.
    """
    max_id_query = text("SELECT COALESCE(MAX(order_id), 0) FROM orders")
    with engine.connect() as conn:
        current_max = conn.execute(max_id_query).scalar()
    return current_max + 1

In [22]:
def generate_order_row(order_id):
    """Build one random orders row as a parameter dict.

    Args:
        order_id: primary key to assign to the generated order.

    Returns:
        dict with keys order_id, order_date, supplier_id, order_quantity,
        order_cost, expected_delivery_date, actual_delivery_date and
        delivery_status. actual_delivery_date is None unless the status is
        "Delivered"; a delivered order never has a delivery date after today.
    """
    # Read the clock once so every date in the row is based on the same day.
    today = date.today()

    # Order date: 2024-01-01 -> today
    order_date = random_date(date(2024, 1, 1), today)

    # Supplier constraint
    # NOTE(review): assumes supplier ids 1-30 exist in suppliers; confirm.
    supplier_id = random.randint(1, 30)

    # Quantity and cost constraints
    order_quantity = random.randint(1, 10_000)
    order_cost = round(random.uniform(2000, 300000), 2)

    # Expected delivery: 30-1000 days after order_date (may be in the future)
    expected_delivery_date = order_date + timedelta(
        days=random.randint(30, 1000)
    )

    # An order can only count as delivered if it arrived on or before today,
    # so cap the delivery offset at the days elapsed since the order date.
    max_delivery_days = min(1000, (today - order_date).days)

    # Decide delivery state; an order placed today cannot have been delivered
    # yet because the earliest delivery is one day after the order date.
    delivered = random.choice([True, False]) and max_delivery_days >= 1

    if delivered:
        actual_delivery_date = order_date + timedelta(
            days=random.randint(1, max_delivery_days)
        )
        delivery_status = "Delivered"
    else:
        actual_delivery_date = None
        delivery_status = random.choice(["Pending", "In Transit"])

    return {
        "order_id": order_id,
        "order_date": order_date,
        "supplier_id": supplier_id,
        "order_quantity": order_quantity,
        "order_cost": order_cost,
        "expected_delivery_date": expected_delivery_date,
        "actual_delivery_date": actual_delivery_date,
        "delivery_status": delivery_status
    }


In [23]:
# INSERT statement
# -----------------------------
# Parameterised INSERT for one orders row; the :named placeholders match the
# keys of the dict returned by generate_order_row().
# NOTE(review): this rebinds insert_stmt, replacing the inventory INSERT
# defined in the Inventory section.
insert_stmt = text("""
INSERT INTO orders (
    order_id,
    order_date,
    supplier_id,
    order_quantity,
    order_cost,
    expected_delivery_date,
    actual_delivery_date,
    delivery_status
)
VALUES (
    :order_id,
    :order_date,
    :supplier_id,
    :order_quantity,
    :order_cost,
    :expected_delivery_date,
    :actual_delivery_date,
    :delivery_status
)
""")


In [24]:
# Generate the order rows in memory, then bulk insert them
# -----------------------------
ROWS_TO_INSERT = 20000

# Ids continue from the current maximum in the orders table.
start_order_id = get_next_order_id(engine)

rows = list(
    map(
        generate_order_row,
        range(start_order_id, start_order_id + ROWS_TO_INSERT),
    )
)

# A single transaction: the whole list is passed to one execute() call and
# committed together when the block exits.
with engine.begin() as conn:
    conn.execute(insert_stmt, rows)

In [25]:
# Verify result
# -----------------------------
# Count the rows in orders per delivery_status and load the result into a
# DataFrame for inspection.
with engine.connect() as conn:
    df = pd.read_sql("""
        SELECT
            delivery_status,
            COUNT(*) AS total_orders
        FROM orders
        GROUP BY delivery_status
    """, conn)