# Gold Dimension: Product (SCD Type 2)
This notebook builds the Product dimension table in the Gold layer using SCD Type 2 logic.

In [None]:
from pyspark.sql.functions import col, lit, coalesce, current_date, sha2, concat_ws
from pyspark.sql.types import StringType, DateType
from delta.tables import DeltaTable

In [None]:
# Configuration: fully qualified names of the source and target tables.
SCHEMA = "default"
SILVER_MASTER_PRODUCTS = f"{SCHEMA}.silver_master_products"
GOLD_DIM_PRODUCT = f"{SCHEMA}.gold_dim_product"

# Announce the job together with the tables it reads from and writes to.
print(
    "Processing Gold Dimension: Product (SCD Type 2)",
    f"Source: {SILVER_MASTER_PRODUCTS}",
    f"Target: {GOLD_DIM_PRODUCT}",
    sep="\n",
)

In [None]:
# Read Silver and shape it into candidate dimension rows.

df_silver = spark.read.table(SILVER_MASTER_PRODUCTS)

# Business key plus descriptive attributes. A missing product name falls back
# to the product code; missing brand/segment/category fall back to "UNKNOWN".
product_code_col = col("product_code").cast(StringType()).alias("product_code")
product_name_col = coalesce(col("product_name"), col("product_code")).alias("product_name")
unknown_defaulted_cols = [
    coalesce(col(attr), lit("UNKNOWN")).alias(attr)
    for attr in ("brand", "segment", "category")
]

df_source = df_silver.select(
    product_code_col,
    product_name_col,
    *unknown_defaulted_cols,
).distinct()

# Deterministic surrogate key (version-based): the first 16 hex characters of
# the SHA-256 over the business key and every tracked attribute, so the key
# is derived from the attribute values of the version.
hashed_columns = ("product_code", "product_name", "brand", "segment", "category")
version_hash = sha2(concat_ws("||", *[col(name) for name in hashed_columns]), 256)

# Every incoming row is staged as an open-ended, current version starting today.
df_source = (
    df_source
    .withColumn("product_sk", version_hash.substr(1, 16))
    .withColumn("valid_from", current_date())
    .withColumn("valid_to", lit("9999-12-31").cast(DateType()))
    .withColumn("is_current", lit(True))
)

In [None]:
# Initial load or Incremental SCD Type 2
#
# First run: the target table does not exist yet, so every staged row is
# written as the current version.
# Later runs: a single MERGE expires the current row of each changed product,
# inserts its new version, and inserts products not seen before.
if not spark.catalog.tableExists(GOLD_DIM_PRODUCT):
    (
        df_source
        .write
        .format("delta")
        .mode("overwrite")
        .saveAsTable(GOLD_DIM_PRODUCT)
    )
    print("Initial load completed")
else:
    delta_table = DeltaTable.forName(spark, GOLD_DIM_PRODUCT)

    # True when at least one tracked attribute differs (null-safe comparison).
    change_condition = """
        NOT (
            target.product_name <=> source.product_name AND
            target.brand        <=> source.brand        AND
            target.segment      <=> source.segment      AND
            target.category     <=> source.category
        )
    """

    # A source row that matches the target can only trigger the UPDATE branch,
    # so a changed product would get its current row expired without ever
    # receiving a new current row. To do both in one atomic MERGE, each changed
    # product is staged twice:
    #   * merge_key = NULL          -> never matches -> inserted as new version
    #   * merge_key = product_code  -> matches       -> expires the old version
    df_new_versions = (
        df_source.alias("source")
        .join(
            delta_table.toDF().alias("target"),
            (col("source.product_code") == col("target.product_code"))
            & col("target.is_current"),
        )
        .where(change_condition)
        .select("source.*")
        .withColumn("merge_key", lit(None).cast(StringType()))
    )
    df_staged = df_new_versions.unionByName(
        df_source.withColumn("merge_key", col("product_code"))
    )

    (
        delta_table.alias("target")
        .merge(
            df_staged.alias("source"),
            "target.product_code = source.merge_key AND target.is_current = true"
        )
        # Close out the current version of a product whose attributes changed.
        .whenMatchedUpdate(
            condition=change_condition,
            set={
                "valid_to": current_date(),
                "is_current": lit(False)
            }
        )
        # Insert brand-new products and the new versions of changed products.
        # The explicit column map keeps the helper merge_key out of the target.
        .whenNotMatchedInsert(
            values={
                "product_sk": "source.product_sk",
                "product_code": "source.product_code",
                "product_name": "source.product_name",
                "brand": "source.brand",
                "segment": "source.segment",
                "category": "source.category",
                "valid_from": "source.valid_from",
                "valid_to": "source.valid_to",
                "is_current": "source.is_current"
            }
        )
        .execute()
    )
    print("Incremental SCD2 merge completed")

In [None]:
# Validation: total number of rows in the dimension versus the number of rows
# flagged as the current version.
validation_query = f"""
SELECT
    COUNT(*) AS total_rows,
    SUM(CASE WHEN is_current THEN 1 ELSE 0 END) AS current_rows
FROM {GOLD_DIM_PRODUCT}
"""
spark.sql(validation_query).show()