In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window


In [0]:
# Source DataFrames for this notebook: one per raw table in the
# retailer_sales.bronze schema. Every later cell reads from these names.
df_customer_bronze = spark.read.table("retailer_sales.bronze.customer_data_raw")
df_product_bronze = spark.read.table("retailer_sales.bronze.product_data_raw")
df_sales_bronze = spark.read.table("retailer_sales.bronze.sales_returns_raw")
df_payment_bronze = spark.read.table("retailer_sales.bronze.card_payment_refund_raw")


In [0]:
# Customer dimension.
# Rows are ranked per customer_id with the newest ingestion_ts first and only
# rank 1 is kept, so each customer_id ends up with a single row.
# NOTE: row_number() breaks ties on ingestion_ts arbitrarily.
window_spec = Window.partitionBy("customer_id").orderBy(desc("ingestion_ts"))

df_dim_customer = (
    df_customer_bronze
    .withColumn("rn", row_number().over(window_spec))
    .where(col("rn") == 1)
    .select(
        "customer_id", "customer_name", "email", "phone", "gender", "age",
        # region is trimmed and upper-cased; city is trimmed and title-cased.
        upper(trim(col("region"))).alias("region"),
        initcap(trim(col("city"))).alias("city"),
        "signup_date",
        # Every row written by this notebook is flagged active.
        lit(True).alias("is_active"),
        current_timestamp().alias("record_created_ts"),
    )
)

# Replace the contents of the silver Delta table with the freshly built frame.
(
    df_dim_customer.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("retailer_sales.silver.dim_customer")
)


In [0]:
# Product dimension: the descriptive product columns plus a derived
# profit_per_unit (selling_price minus cost_price) and a load timestamp.
df_dim_product = df_product_bronze.select(
    "product_id", "product_name", "sku", "category", "supplier",
    "cost_price", "selling_price",
    (col("selling_price") - col("cost_price")).alias("profit_per_unit"),
    current_timestamp().alias("record_created_ts"),
)

# Replace the contents of the silver Delta table with the freshly built frame.
(
    df_dim_product.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("retailer_sales.silver.dim_product")
)


In [0]:
# Date dimension: one row per distinct order_date found in the sales feed,
# with year, quarter label ("Q1".."Q4"), month number and month name.
#
# Rows whose order_date is NULL are dropped before de-duplication. Otherwise
# the dimension would contain a row with a NULL key and all-NULL attributes,
# which no equi-join from a fact table can ever match.
df_dim_date = (
    df_sales_bronze
    .select(col("order_date").alias("date"))
    .filter(col("date").isNotNull())
    .distinct()
    .withColumn("year", year("date"))
    # quarter() yields an integer; cast explicitly rather than relying on
    # concat's implicit int -> string coercion.
    .withColumn("quarter", concat(lit("Q"), quarter("date").cast("string")))
    .withColumn("month", month("date"))
    .withColumn("month_name", date_format("date", "MMMM"))
)

# Replace the contents of the silver Delta table with the freshly built frame.
df_dim_date.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("retailer_sales.silver.dim_date")


In [0]:
# Sales fact: only orders whose status is COMPLETED and that are not flagged
# as returned (return_flag == "N"), projected to the order-level columns.
df_fact_sales = (
    df_sales_bronze
    .where(col("order_status") == "COMPLETED")
    .where(col("return_flag") == "N")
    .select(
        "order_id", "order_date", "customer_id", "product_id",
        "quantity", "unit_price", "total_amount",
        "region", "payment_mode",
    )
)

# Replace the contents of the silver Delta table with the freshly built frame.
(
    df_fact_sales.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("retailer_sales.silver.fact_sales")
)


In [0]:
# Returns fact: sales rows flagged as returned (return_flag == "Y"),
# left-joined on order_id to the REFUND transactions from the payment feed.
# Because the join is a left join, a returned order with no matching REFUND
# row is still kept, with NULL return_date and refund_amount.
df_fact_returns = (
    df_sales_bronze
    .where(col("return_flag") == "Y")
    .join(
        df_payment_bronze.where(col("transaction_type") == "REFUND"),
        "order_id",
        "left",
    )
    .select(
        "order_id",
        col("transaction_date").alias("return_date"),
        "customer_id",
        "product_id",
        col("amount").alias("refund_amount"),
        "region",
    )
)

# Replace the contents of the silver Delta table with the freshly built frame.
(
    df_fact_returns.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("retailer_sales.silver.fact_returns")
)


In [0]:
# Payments fact: a straight projection of the payment feed (no filtering),
# so every transaction type present in bronze is carried through.
df_fact_payments = df_payment_bronze.select(
    "transaction_id", "order_id", "transaction_type",
    "amount", "transaction_date", "payment_mode",
)

# Replace the contents of the silver Delta table with the freshly built frame.
(
    df_fact_payments.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("retailer_sales.silver.fact_payments")
)


In [0]:
# Post-load sanity check: show the row count of every silver table written by
# this notebook. dim_date is included (it is written above but was previously
# not counted), and each result is labelled with its table name so the
# outputs can be told apart.
for _silver_table in (
    "dim_customer",
    "dim_product",
    "dim_date",
    "fact_sales",
    "fact_returns",
    "fact_payments",
):
    spark.sql(
        f"SELECT '{_silver_table}' AS table_name, COUNT(*) AS row_count "
        f"FROM retailer_sales.silver.{_silver_table}"
    ).show()
