### Importing the modules

In [0]:
from pyspark.sql.functions import *
from delta.tables import DeltaTable

### Reading return data from the silver layer

In [0]:
# Load the silver transactions table and keep only return transactions.
# The comparison is done on the upper-cased value, so "return", "Return"
# and "RETURN" are all accepted.
is_return = upper(col("transaction_type")) == "RETURN"
returns_df = spark.read.table("retail_analytics.silver.transactions").where(is_return)

In [0]:
# Sanity check: number of return rows read from the silver layer.
# This is a Spark action, so it triggers a scan of the filtered table.
returns_df.count()

338829

### Creating date_sk from the date column

In [0]:
# Derive the integer date key in yyyyMMdd form (e.g. 20241228) from the
# transaction date. withColumn replaces any existing "date_sk" column, so
# re-running this cell yields the same result.
date_sk_expr = date_format(to_date(col("date")), "yyyyMMdd").cast("int")
returns_df = returns_df.withColumn("date_sk", date_sk_expr)

### Joining dimension tables

In [0]:
# Dimensions to attach to the returns, as (table name, alias, natural key).
# Each one is LEFT-joined on its natural key so that a return row is kept even
# when no matching dimension row exists (its columns are then NULL).
dimension_joins = [
    ("retail_analytics.gold.dim_customers", "c", "customer_id"),
    ("retail_analytics.gold.dim_products", "p", "product_id"),
    ("retail_analytics.gold.dim_stores", "s", "store_id"),
    ("retail_analytics.gold.dim_employees", "e", "employee_id"),
]

# Start from the returns (alias "r") and fold the dimensions in, in list order.
fact_joined_df = returns_df.alias("r")
for table_name, dim_alias, key in dimension_joins:
    fact_joined_df = fact_joined_df.join(
        spark.table(table_name).alias(dim_alias),
        col(f"r.{key}") == col(f"{dim_alias}.{key}"),
        "left",
    )

### Selecting the needed columns

In [0]:
# Final projection for the fact table, in target-column order.
# Prefixes refer to the join aliases: r = returns, c/p/s/e = dimensions.
fact_columns = [
    # Keys: date key from the return row, surrogate keys from the dimensions.
    # The dimension keys come from LEFT joins and are NULL when unmatched.
    col("r.date_sk"),
    col("c.customer_sk"),
    col("p.product_sk"),
    col("s.store_sk"),
    col("e.employee_sk"),
    # Degenerate dimensions / line identifiers.
    col("r.invoice_id"),
    col("r.line"),
    col("r.size"),
    # Measures, renamed to return-specific names.
    col("r.quantity").alias("quantity_returned"),
    col("r.line_total").alias("refund_amount"),
    col("r.currency"),
    col("r.date").alias("return_date"),
    # Audit column: load timestamp of this run.
    current_timestamp().alias("_created_at"),
]

fact_returns_df = fact_joined_df.select(*fact_columns)

### Creating the fact_returns table with a surrogate key (return_sk)

In [0]:
# DDL for the gold fact table.
# - IF NOT EXISTS makes this cell safe to re-run: an existing table is left as is.
# - return_sk is GENERATED ALWAYS AS IDENTITY, so it must not be supplied on
#   insert. The recorded sample output shows values 1, 3, 5, ..., i.e. the
#   generated keys are not consecutive.
fact_returns_ddl = """
CREATE TABLE IF NOT EXISTS retail_analytics.gold.fact_returns (
    return_sk BIGINT GENERATED ALWAYS AS IDENTITY,
    date_sk INT,
    customer_sk BIGINT,
    product_sk BIGINT,
    store_sk BIGINT,
    employee_sk BIGINT,
    invoice_id STRING,
    line INT,
    size STRING,
    quantity_returned INT,
    refund_amount DOUBLE,
    currency STRING,
    return_date TIMESTAMP,
    _created_at TIMESTAMP
)
USING DELTA
"""

spark.sql(fact_returns_ddl)

DataFrame[]

### Merge process (SCD-1) — insert-only: rows already in the target are not updated

In [0]:
# Insert-only load of fact_returns.
#
# Grain of the fact table: one row per (invoice_id, line, return_date).
# There is no whenMatched clause, so rows already present in the target are
# left untouched and only new grain values are inserted. The goal is that
# re-running the notebook never produces duplicate facts.
merge_keys = ["invoice_id", "line", "return_date"]

# Columns written on insert. return_sk is deliberately absent: the table
# declares it GENERATED ALWAYS AS IDENTITY, so Delta generates it.
insert_columns = [
    "date_sk",
    "customer_sk",
    "product_sk",
    "store_sk",
    "employee_sk",
    "invoice_id",
    "line",
    "size",
    "quantity_returned",
    "refund_amount",
    "currency",
    "return_date",
    "_created_at",
]

fact_returns_tbl = DeltaTable.forName(
    spark, "retail_analytics.gold.fact_returns"
)

# An insert-only MERGE inserts *every* unmatched source row, so duplicates
# inside the source batch (e.g. caused by a fan-out in a dimension join) would
# all be written. Collapse the source to one row per grain before merging.
# Which duplicate survives is arbitrary.
source_df = fact_returns_df.dropDuplicates(merge_keys)

# Use null-safe equality (<=>): with plain "=", a NULL in any key column makes
# the condition NULL, the row never matches, and it would be inserted again on
# every run.
merge_condition = " AND ".join(f"tgt.{key} <=> src.{key}" for key in merge_keys)

(
    fact_returns_tbl.alias("tgt")
    .merge(source_df.alias("src"), merge_condition)
    .whenNotMatchedInsert(
        values={name: col(f"src.{name}") for name in insert_columns}
    )
    .execute()
)

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
# Peek at a few loaded rows. There is no ordering, so which five rows are
# shown is not deterministic.
fact_returns_preview = spark.table("retail_analytics.gold.fact_returns").limit(5)
fact_returns_preview.display()

return_sk,date_sk,customer_sk,product_sk,store_sk,employee_sk,invoice_id,line,size,quantity_returned,refund_amount,currency,return_date,_created_at
1,20241228,357384,8788,33,61,RET-US-005-04350283,1,S,1,-21.0,USD,2024-12-28T00:00:00.000Z,2026-01-20T10:37:09.853Z
3,20250126,1191820,2929,33,13,RET-US-005-04355995,3,M,1,-81.0,USD,2025-01-26T00:00:00.000Z,2026-01-20T10:37:09.853Z
5,20230119,5884,3730,15,73,RET-CN-006-02868410,1,M,1,-125.0,CNY,2023-01-19T00:00:00.000Z,2026-01-20T10:37:09.853Z
7,20230326,249322,15617,15,271,RET-CN-006-02882457,5,M,1,-163.0,CNY,2023-03-26T00:00:00.000Z,2026-01-20T10:37:09.853Z
9,20230916,1226113,4247,15,254,RET-CN-006-02920008,2,XL,1,-250.0,CNY,2023-09-16T00:00:00.000Z,2026-01-20T10:37:09.853Z


In [0]:
# Row count of the loaded fact table. In the recorded run this equals the
# number of return rows read from silver (338829), i.e. no rows were lost or
# duplicated by the joins and the merge.
spark.table("retail_analytics.gold.fact_returns").count()

338829