----------------------------------------
Notebook: 01_bronze_ingestion  
Layer   : Bronze
Purpose : Ingest raw customer, product, and order data and write it to Delta with minimal transformations  
*Only column-name standardization and ingestion metadata are applied  
*No business logic or filtering is done at this stage


----------------------------------------


In [0]:
# Print the schema of each raw source table before ingestion, in the same
# order they are processed below: customer, products, orders.
for source_table in ("customer", "products", "orders"):
    spark.table(source_table).printSchema()

In [0]:
# Make sure the target schema exists before any Bronze table is written.
# IF NOT EXISTS keeps this cell safe to re-run.
create_bronze_schema_sql = "CREATE SCHEMA IF NOT EXISTS bronze"
spark.sql(create_bronze_schema_sql)

Customers Bronze  
*Standardize column names and add ingestion metadata  
*No null handling or deduplication is performed in Bronze

In [0]:
from pyspark.sql.functions import current_date

# Source column header -> standardized snake_case name for the Bronze table.
# Columns not listed here are carried through unchanged.
customer_column_map = {
    "Customer ID": "customer_id",
    "Customer Name": "customer_name",
    "Postal Code": "postal_code",
    "Country": "country",
    "City": "city",
    "State": "state",
    "Region": "region",
}

# Apply the renames one at a time, in the order declared above.
customers_renamed = spark.table("customer")
for source_name, bronze_name in customer_column_map.items():
    customers_renamed = customers_renamed.withColumnRenamed(source_name, bronze_name)

# Stamp every row with the date this ingestion run executed.
customers_clean = customers_renamed.withColumn("ingestion_date", current_date())

# Replace the contents of bronze.customers with this run's snapshot.
(
    customers_clean.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("bronze.customers")
)


Products Bronze  
*Preserve raw product attributes; only the column-naming standard is applied

In [0]:
# Source column header -> standardized snake_case name for the Bronze table.
# Columns not listed here are carried through unchanged.
product_column_map = {
    "Product ID": "product_id",
    "Product Name": "product_name",
    "Sub-Category": "sub_category",
    "Price per product": "price_per_product",
}

# Apply the renames one at a time, in the order declared above.
products_renamed = spark.table("products")
for source_name, bronze_name in product_column_map.items():
    products_renamed = products_renamed.withColumnRenamed(source_name, bronze_name)

# Stamp every row with the date this ingestion run executed.
# current_date is imported in the customers cell earlier in the notebook.
products_clean = products_renamed.withColumn("ingestion_date", current_date())

# Replace the contents of bronze.products with this run's snapshot.
(
    products_clean.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("bronze.products")
)


Orders Bronze  
*Orders are treated as immutable events; no filtering or deduplication is done here

In [0]:
# Source column header -> standardized snake_case name for the Bronze table.
# Columns not listed here are carried through unchanged.
order_column_map = {
    "Order ID": "order_id",
    "Customer ID": "customer_id",
    "Product ID": "product_id",
    "Order Date": "order_date",
    "Ship Date": "ship_date",
    "Ship Mode": "ship_mode",
    "Row ID": "row_id",
}

# Apply the renames one at a time, in the order declared above.
orders_renamed = spark.table("orders")
for source_name, bronze_name in order_column_map.items():
    orders_renamed = orders_renamed.withColumnRenamed(source_name, bronze_name)

# Stamp every row with the date this ingestion run executed.
# current_date is imported in the customers cell earlier in the notebook.
orders_clean = orders_renamed.withColumn("ingestion_date", current_date())

# Replace the contents of bronze.orders with this run's snapshot.
(
    orders_clean.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("bronze.orders")
)

In [0]:
# Validate the Bronze schemas: print each written table's schema so the
# standardized column names and the ingestion_date column can be inspected.
for bronze_table in ("bronze.customers", "bronze.products", "bronze.orders"):
    spark.table(bronze_table).printSchema()

In [0]:
# Row-count reconciliation: each Bronze table must hold exactly as many rows
# as the source table it was loaded from.
# Pairs are (source table, Bronze table name inside the `bronze` schema).
source_to_bronze = (
    ("customer", "customers"),
    ("products", "products"),
    ("orders", "orders"),
)

for source_name, bronze_name in source_to_bronze:
    assert (
        spark.table(source_name).count()
        == spark.table(f"bronze.{bronze_name}").count()
    ), f"Mismatch between source {source_name} rows and bronze {bronze_name} table"
