In [0]:
# Silver Transformations: Clean and Enrich
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, to_timestamp, lit, coalesce
from delta.tables import DeltaTable

# Reuse the active Spark session if one exists; otherwise start one named
# "SilverTransform".
spark = (
    SparkSession.builder
    .appName("SilverTransform")
    .getOrCreate()
)

# Bronze inputs, read by their fully qualified catalog table names.
# Each read is lazy: nothing is scanned until an action runs downstream.
stores_bronze = spark.read.table("workspace.bronze.stores")
products_bronze = spark.read.table("workspace.bronze.products")
promotions_bronze = spark.read.table("workspace.bronze.promotions")
customers_bronze = spark.read.table("workspace.bronze.customers")
pos_sales_bronze = spark.read.table("workspace.bronze.pos_sales")




In [0]:
# Silver Customers: keep the identity/contact columns, fill missing loyalty
# status, parse the signup date, and keep one row per customer_id.

# A missing loyalty status becomes the literal string "None" (not a SQL NULL).
loyalty_status_filled = coalesce(col("loyalty_status"), lit("None")).alias("loyalty_status")

# signup_date is parsed into a timestamp under its original column name.
signup_date_parsed = to_timestamp(col("signup_date")).alias("signup_date")

customers_silver = (
    customers_bronze
    .select(
        col("customer_id"),
        col("first_name"),
        col("last_name"),
        col("email"),
        loyalty_status_filled,
        signup_date_parsed,
    )
    # Dedup by ID; which duplicate row survives is not specified here.
    .dropDuplicates(["customer_id"])
)



In [0]:
# Silver POS Sales: parse the transaction timestamp, validate quantity,
# normalise unit_price to double, fill missing promotions, derive total_amount,
# and drop rows whose unit_price is not positive.

# Quantities that are not > 0 (including NULL, for which the comparison is not
# true) are replaced by 1.
quantity_validated = (
    when(col("quantity") > 0, col("quantity"))
    .otherwise(1)
    .alias("quantity")
)

# A missing promotion becomes the literal string "None" (not a SQL NULL).
promotion_id_filled = coalesce(col("promotion_id"), lit("None")).alias("promotion_id")

pos_sales_silver = (
    pos_sales_bronze
    .select(
        col("transaction_id"),
        to_timestamp(col("transaction_date")).alias("transaction_date"),
        col("product_id"),
        col("customer_id"),
        col("store_id"),
        quantity_validated,
        # The steps below refer to this cast column as "unit_price".
        col("unit_price").cast("double"),
        promotion_id_filled,
    )
    # Derived column: computed from the validated quantity and the double price.
    .withColumn("total_amount", col("quantity") * col("unit_price"))
    # Validation: rows with a NULL or non-positive unit_price are removed.
    .filter(col("unit_price") > 0)
)



In [0]:
# Remaining dimensions (simplified): parse date columns and deduplicate.

# Products: one row per product_id.
products_silver = products_bronze.dropDuplicates(["product_id"])

# Promotions: parse both ends of the validity window to timestamps.
# No deduplication is applied to this table.
promotions_silver = (
    promotions_bronze
    .withColumn("start_date", to_timestamp(col("start_date")))
    .withColumn("end_date", to_timestamp(col("end_date")))
)

# Stores: parse opened_date to a timestamp, then keep one row per store_id.
stores_silver = (
    stores_bronze
    .withColumn("opened_date", to_timestamp(col("opened_date")))
    .dropDuplicates(["store_id"])
)



In [0]:
# Write to Silver as managed Delta tables (sales partitioned by calendar date).
# Every write uses mode("overwrite"), so re-running this cell replaces the
# previous contents of each table.
spark.sql("CREATE DATABASE IF NOT EXISTS silver")

# transaction_date is a timestamp (see the POS sales cell). Partitioning on it
# directly creates one partition per distinct timestamp value, potentially one
# per transaction. Derive a calendar-date column and partition on that
# instead; transaction_date itself is kept unchanged in the table.
SALES_PARTITION_COLUMN = "transaction_day"
pos_sales_partitioned = pos_sales_silver.withColumn(
    SALES_PARTITION_COLUMN, col("transaction_date").cast("date")
)

customers_silver.write.format("delta").mode("overwrite").saveAsTable("silver.customers")
(
    pos_sales_partitioned.write.format("delta")
    .mode("overwrite")
    # Lets the overwrite replace the schema/partitioning of a silver.pos_sales
    # table that was previously written with a different layout.
    .option("overwriteSchema", "true")
    .partitionBy(SALES_PARTITION_COLUMN)
    .saveAsTable("silver.pos_sales")
)
products_silver.write.format("delta").mode("overwrite").saveAsTable("silver.products")
promotions_silver.write.format("delta").mode("overwrite").saveAsTable("silver.promotions")
stores_silver.write.format("delta").mode("overwrite").saveAsTable("silver.stores")

# Optimize (using table names): compact the files of the sales table.
DeltaTable.forName(spark, "silver.pos_sales").optimize().executeCompaction()

print("Silver tables created successfully.")