In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import random
from datetime import datetime, timedelta

# Reuse the active Spark session when one exists; otherwise create it.
spark = SparkSession.builder.getOrCreate()

# --- Generate product data ---
# Each row is (product_id, product_name, category).
products = [
    ("P001", "Widget A", "Electronics"),
    ("P002", "Widget B", "Electronics"),
    ("P003", "Chair X", "Furniture"),
    ("P004", "Table Z", "Furniture"),
    ("P005", "Sofa L", "Furniture"),
    ("P006", "Monitor 24\"", "Electronics"),
    ("P007", "Laptop Pro", "Electronics"),
    ("P008", "Desk Comfort", "Furniture"),
    ("P009", "Smartwatch G", "Electronics"),
    ("P010", "Headphones H", "Electronics"),
]

# Column types are inferred by Spark from the Python tuples above.
product_df = spark.createDataFrame(
    products,
    ["product_id", "product_name", "category"],
)

# "overwrite" replaces any output left at this path by a previous run.
product_writer = product_df.write.mode("overwrite")
product_writer.parquet(
    "abfss://demo@srinistracc.dfs.core.windows.net/optdemo/productdata.parquet"
)

# --- Generate order data ---
regions = ["North", "South", "East", "West"]
columns = ["order_id", "customer_id", "product_id", "order_date", "quantity", "total_amount", "region"]


def _random_order(seq):
    """Return one synthetic order row, in ``columns`` order, for number *seq*."""
    # The random draws happen in a fixed sequence so a seeded generator
    # yields the same rows: customer, product, date offset, quantity,
    # unit price, region.
    customer_no = random.randint(1, 30)
    product_id = random.choice(products)[0]
    day_offset = random.randint(0, 480)  # 2023-01-01 through 2024-04-25
    units = random.randint(1, 5)
    unit_price = random.choice([20.0, 25.0, 30.0, 35.0])
    region_name = random.choice(regions)

    order_day = (datetime(2023, 1, 1) + timedelta(days=day_offset)).date()
    return (
        f"O{seq}",
        f"C{customer_no:03}",
        product_id,
        order_day,
        units,
        round(unit_price * units, 2),
        region_name,
    )


# Order numbers 1001..1055 inclusive -> 55 records.
order_data = [_random_order(seq) for seq in range(1001, 1056)]

orders_df = spark.createDataFrame(order_data, columns)

# "overwrite" replaces any output left at this path by a previous run.
orders_writer = orders_df.write.mode("overwrite")
orders_writer.parquet(
    "abfss://demo@srinistracc.dfs.core.windows.net/optdemo/ordersdata.parquet"
)
