# Gold Dimension: PDV (SCD Type 2)
This notebook builds the PDV (Point of Sale) dimension table in the Gold layer using SCD Type 2 logic.

- Business key: code_eleader
- PO code: code_po (informational)
- SCD2 tracking columns: pdv_sk (surrogate key), valid_from, valid_to, is_current
- All attribute columns are taken from the Silver master PDV table


In [None]:
from pyspark.sql.functions import coalesce, col, lit, current_date, sha2, concat_ws
from pyspark.sql.types import StringType, DateType
from delta.tables import DeltaTable

In [None]:
# Configuration
# Catalog/schema coordinates of the Silver source and the Gold target table.
CATALOG = "workspace"
SCHEMA = "default"
SILVER_SCHEMA = "default"
SILVER_MASTER_PDV = f"{SILVER_SCHEMA}.silver_master_pdv"
GOLD_DIM_PDV = f"{CATALOG}.{SCHEMA}.gold_dim_pdv"

# Point the session at the target catalog/schema, creating the schema if needed.
# Order matters: the catalog must be selected before the schema statements run.
for session_statement in (
    f"USE CATALOG {CATALOG}",
    f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}",
    f"USE SCHEMA {SCHEMA}",
):
    spark.sql(session_statement)

# Echo the run parameters so the notebook output shows what was processed.
print("Processing Gold Dimension: PDV (SCD Type 2)")
print(f"Source: {SILVER_MASTER_PDV}")
print(f"Target: {GOLD_DIM_PDV}")

In [None]:
# Read Silver
df_silver = spark.read.table(SILVER_MASTER_PDV)

# Attributes tracked by SCD2, in their final output order. This list is the
# single source of truth: it drives the Silver projection, the surrogate-key
# hash, and the final column order, so the three can never drift apart.
scd2_cols = [
    "code_eleader", "code_po", "store_name", "channel", "sub_channel", "chain",
    "neighborhood", "city", "parish", "country", "type_of_service", "status",
    "supervisor_code", "supervisor_name", "merchandiser_code", "merchandiser_name",
    "aditional_exhibitions", "commercial_activities", "planograms", "store_sap_code",
    "sales_rep", "latitude", "longitude"
]

# The two code columns are normalised to strings; every other attribute keeps
# the type it has in Silver.
string_code_cols = {"code_eleader", "code_po"}

# Select and standardize columns for SCD2
df_source = (
    df_silver
    .select(*[
        col(c).cast(StringType()).alias(c) if c in string_code_cols else col(c)
        for c in scd2_cols
    ])
    .distinct()
)

# Deterministic surrogate key (SCD2 hash)
# concat_ws() silently skips NULL inputs, so without a placeholder two rows
# that differ only in *which* attribute is NULL (e.g. city vs. parish) would
# concatenate to the same string and share a key. Replacing NULL with a fixed
# token keeps every attribute in its own position. Rows without NULLs hash
# exactly as before.
null_token = "__NULL__"
hash_inputs = [
    coalesce(col(c).cast(StringType()), lit(null_token)) for c in scd2_cols
]

# NOTE(review): the key is derived from attribute values only (no valid_from),
# so a PDV that changes and later reverts to an earlier state gets the same
# pdv_sk for both versions - confirm whether downstream joins rely on
# pdv_sk being unique per version.
df_source = (
    df_source
    # First 16 hex characters of the SHA-256 digest.
    .withColumn("pdv_sk", sha2(concat_ws("||", *hash_inputs), 256).substr(1, 16))
    .withColumn("valid_from", current_date())
    # Open-ended validity for the current version.
    .withColumn("valid_to", lit("9999-12-31").cast(DateType()))
    .withColumn("is_current", lit(True))
)

# Reorder columns
final_cols = ["pdv_sk"] + scd2_cols + ["valid_from", "valid_to", "is_current"]
df_source = df_source.select(final_cols)

In [None]:
# SCD2 Merge Logic
if not spark.catalog.tableExists(GOLD_DIM_PDV):
    # First run: the target does not exist yet, so every source row becomes
    # the current version.
    (
        df_source
        .write
        .format("delta")
        .mode("overwrite")
        .saveAsTable(GOLD_DIM_PDV)
    )
    print("Initial load completed")
else:
    delta_table = DeltaTable.forName(spark, GOLD_DIM_PDV)

    # Null-safe change detection. A plain `<>` evaluates to NULL (treated as
    # "no change") whenever either side is NULL, so an attribute going from a
    # value to NULL or back would never be versioned. `<=>` treats two NULLs
    # as equal and NULL vs. value as different. The business key is excluded
    # because it is the join key and therefore always equal on matched rows.
    tracked_cols = [c for c in scd2_cols if c != "code_eleader"]
    change_condition = " OR ".join(
        f"NOT (target.{c} <=> source.{c})" for c in tracked_cols
    )

    # Source rows whose business key already has a current version in the
    # target but whose attributes differ. These need a *new* version row.
    current_target = delta_table.toDF().where("is_current = true")
    changed_rows = (
        df_source.alias("source")
        .join(
            current_target.alias("target"),
            col("source.code_eleader") == col("target.code_eleader"),
            "inner",
        )
        .where(change_condition)
        .select("source.*")
    )

    # A single MERGE can either update or insert for a given source row, not
    # both. To expire the old version AND insert the new one in the same
    # atomic operation, each changed row is staged twice:
    #   - merge_key = NULL         -> can never match -> inserted as new version
    #   - merge_key = code_eleader -> matches the current row -> expires it
    # Unchanged and brand-new keys appear once (with merge_key = code_eleader)
    # and are respectively left alone or inserted.
    # NOTE(review): a source row with a NULL code_eleader never satisfies the
    # merge equality, so it is inserted again on every run - confirm that
    # Silver guarantees a non-null business key.
    staged_source = (
        changed_rows
        .withColumn("merge_key", lit(None).cast(StringType()))
        .unionByName(df_source.withColumn("merge_key", col("code_eleader")))
    )

    (
        delta_table.alias("target")
        .merge(
            staged_source.alias("source"),
            "target.code_eleader = source.merge_key AND target.is_current = true"
        )
        # Close the currently valid version when any tracked attribute changed.
        .whenMatchedUpdate(
            condition=change_condition,
            set={
                "valid_to": current_date(),
                "is_current": lit(False)
            }
        )
        # New business keys and new versions of changed keys; merge_key is a
        # staging helper and is intentionally not written to the target.
        .whenNotMatchedInsert(
            values={c: f"source.{c}" for c in final_cols}
        )
        .execute()
    )
    print("Incremental SCD2 merge completed")

In [None]:
# Validation
# Report how many rows the Gold dimension holds in total and how many of them
# are flagged as the current version.
validation_query = f"""
SELECT
    COUNT(*) AS total_rows,
    SUM(CASE WHEN is_current THEN 1 ELSE 0 END) AS current_rows
FROM {GOLD_DIM_PDV}
"""
validation_result = spark.sql(validation_query)
validation_result.show()