Import Libs

In [None]:
from pyspark.sql import functions as F


BASIC CONFIG

In [None]:
# Work inside a dedicated database: create it only if it is missing, then
# make it the session's current database so unqualified table names land there.
spark.sql("CREATE DATABASE IF NOT EXISTS retail_raw")
spark.sql("USE retail_raw")

# Row-count knobs for the generated datasets -- adjust to scale the data.
num_products = 500
num_customers = 10000
num_stores = 25
num_staff = 300
num_pos_sales = 300_000  # POS rows
num_online = 120_000  # Online rows

# Date-range bounds as ISO (yyyy-mm-dd) strings.
# NOTE(review): not used in this cell -- presumably consumed by later cells.
start_date, end_date = "2022-01-01", "2025-10-31"

1. PRODUCT MASTER (pim_product_raw)

In [None]:
def _pick_one_sql(options):
    """Return a Spark SQL snippet that picks one of *options* at random per row.

    ``rand()`` is scaled to the number of options and shifted by one so that
    the integer cast yields a 1-based index, as ``element_at`` expects.
    """
    quoted = ",".join(f"'{option}'" for option in options)
    return f"element_at(array({quoted}), cast(rand()*{len(options)} + 1 as int))"


# Ordered (column name, expression) pairs. Order matters: "cost_price" reads
# the "regular_price" column that is added just before it.
product_columns = [
    ("product_id", F.col("id").cast("int")),
    # "SKU" followed by the id left-padded with zeros to six characters.
    (
        "product_sku",
        F.concat(F.lit("SKU"), F.lpad(F.col("id").cast("string"), 6, "0")),
    ),
    ("product_name", F.concat(F.lit("Product "), F.col("id"))),
    (
        "category",
        F.expr(
            _pick_one_sql(
                [
                    "AUTOMOTIVE",
                    "FASHION",
                    "HOME_FURNISHING",
                    "ELECTRONICS",
                    "GROCERY",
                    "HEALTHCARE",
                ]
            )
        ),
    ),
    (
        "brand",
        F.expr(
            _pick_one_sql(
                ["BRAND_A", "BRAND_B", "BRAND_C", "BRAND_D", "BRAND_E", "BRAND_F"]
            )
        ),
    ),
    # Random value in [5, 500) before the cast to two decimal places.
    ("regular_price", (F.rand() * (500 - 5) + 5).cast("decimal(10,2)")),
    # Cost is a random 60-80% fraction of the regular price.
    (
        "cost_price",
        (F.col("regular_price") * (F.lit(0.6) + F.rand() * 0.2)).cast(
            "decimal(10,2)"
        ),
    ),
    ("uom", F.expr(_pick_one_sql(["EA", "BOX", "PACK"]))),
    # True whenever the random draw exceeds 0.05 (about 95% of rows).
    ("is_active", F.rand() > 0.05),
    ("created_at_utc", F.current_timestamp()),
]

# One row per product id in 1..num_products; attach each generated column in
# order, then discard the helper "id" column produced by spark.range.
products_df = spark.range(1, num_products + 1)
for column_name, column_expr in product_columns:
    products_df = products_df.withColumn(column_name, column_expr)
products_df = products_df.drop("id")

# Replace any existing pim_product_raw table (in the current database) with
# the freshly generated rows, stored as Delta.
(
    products_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("pim_product_raw")
)