In [0]:
import dlt
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import col, when, lit, current_timestamp

In [0]:
@dlt.view(
    name="business_cdf_stage_v",
    comment="Stage view to feed SCD2 Business dimension using CDF from gold_inspection."
)
def business_cdf_stage_v():
    """Stage change-data-feed rows from gold_inspection for the SCD2 Business dimension.

    Reads the Delta change data feed of ``gold_inspection`` as a stream,
    projects the business attributes, derives ``Business_Natural_Key`` and
    exposes ``_change_type`` / ``start_dt`` (the commit timestamp) so that
    ``apply_changes`` can sequence the events and recognise deletes.

    Returns:
        A streaming DataFrame with the columns Business_Name,
        AKA_Business_Name, License_Number, Facility_Type, Dataset_City,
        _change_type, start_dt, Business_Natural_Key and load_dt.
    """
    changes = (
        spark.readStream
             .format("delta")
             .option("readChangeFeed", "true")
             .table("workspace.`damg-midterm`.gold_inspection")
             # The change feed emits an `update_preimage` row (old values) next to
             # every `update_postimage` row, both carrying the same commit
             # timestamp. Forwarding both hands apply_changes two conflicting
             # events for one key at one sequence value, so the stale pre-image
             # could win. Only the post-image describes the new state.
             .where(F.col("_change_type") != F.lit("update_preimage"))
    )

    # A usable license is present and is not the placeholder value "0".
    has_license = (
        F.col("License_Number").isNotNull() & (F.col("License_Number") != "0")
    )

    # Fallback key: hash(name + city) reduced to <= 7 digits.
    # Kept exactly as before so previously generated keys do not change.
    hashed_key = (
        F.abs(
            F.hash(
                F.upper(F.trim(F.col("Business_Name"))),
                F.upper(F.trim(F.col("Dataset_City")))
            )
        ) % 10000000
    ).cast("int")

    staged = (
        changes.select(
            F.col("business_name").alias("Business_Name"),
            F.col("aka_business_name").alias("AKA_Business_Name"),
            F.col("license_number").cast("string").alias("License_Number"),
            F.col("facility_type").alias("Facility_Type"),
            # dataset city: CHICAGO / DALLAS from gold
            F.col("city").alias("Dataset_City"),
            F.col("_change_type"),
            F.col("_commit_timestamp").alias("start_dt")
        )
        # A business without a name cannot be keyed or described.
        .where(F.col("Business_Name").isNotNull())
        .withColumn(
            "Business_Natural_Key",
            # Chicago -> use the license directly; Dallas -> hashed fallback.
            F.when(has_license, F.col("License_Number").cast("bigint"))
             .otherwise(hashed_key)
        )
        # De-dupe so we don't send multiple rows per business per commit.
        # NOTE(review): on a streaming source, dropDuplicates without a
        # watermark keeps state for every distinct key it has seen - confirm
        # that the state growth is acceptable for this pipeline.
        .dropDuplicates(["Business_Natural_Key", "start_dt", "_change_type"])
        .withColumn("load_dt", F.current_timestamp())
    )

    return staged

In [0]:
# Streaming target that holds the SCD Type 2 history of businesses.
# It carries no surrogate key yet.
dlt.create_streaming_table("dim_business_scd2")

# Fold the staged change events into the history table.
dlt.apply_changes(
    target="dim_business_scd2",
    source="business_cdf_stage_v",
    # One history chain per natural key.
    keys=["Business_Natural_Key"],
    # Events are ordered by the commit timestamp exposed as start_dt.
    sequence_by=F.col("start_dt"),
    # Change-feed rows flagged as deletes are applied as deletes.
    apply_as_deletes=F.expr("_change_type = 'delete'"),
    ignore_null_updates=True,
    stored_as_scd_type=2,
    # Bookkeeping columns are excluded from the target table.
    except_column_list=["_change_type", "start_dt", "load_dt"],
)


In [0]:
@dlt.table(
    name="dim_business",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_business():
    """Build the Business dimension from the SCD2 history table.

    Reads ``dim_business_scd2``, numbers every history row to produce the
    surrogate key ``Business_Key`` and renames the SCD2 bookkeeping columns
    to ``Effective_Start`` / ``Effective_End``. ``Is_Active`` is 1 for rows
    whose ``__END_AT`` is null and 0 otherwise. License_Number and
    Dataset_City are not selected into this table.

    Returns:
        A DataFrame with the columns Business_Key, Business_Natural_Key,
        Business_Name, AKA_Business_Name, Facility_Type, Effective_Start,
        Effective_End, Is_Active and Source_System.
    """
    history = dlt.read("dim_business_scd2")

    # Global ordering over all rows: natural key first, then version start.
    # NOTE(review): the window has no partitioning and numbers are assigned
    # from scratch on each computation, so Business_Key values can shift when
    # rows are added - confirm that downstream consumers tolerate this.
    global_order = Window.orderBy("Business_Natural_Key", "__START_AT")

    # Surrogate key: 1, 2, 3, ... following the ordering above.
    surrogate_key = (
        F.row_number().over(global_order).cast("bigint").alias("Business_Key")
    )

    # Rows with a closed validity interval are inactive; open-ended rows are active.
    is_active = (
        F.when(F.col("__END_AT").isNotNull(), F.lit(0))
         .otherwise(F.lit(1))
         .alias("Is_Active")
    )

    projection = [
        surrogate_key,
        # Natural key (numeric)
        F.col("Business_Natural_Key"),
        # Descriptive attributes
        F.col("Business_Name"),
        F.col("AKA_Business_Name"),
        F.col("Facility_Type"),
        # SCD2 metadata
        F.col("__START_AT").alias("Effective_Start"),
        F.col("__END_AT").alias("Effective_End"),
        is_active,
        F.lit("FOOD_INSPECTIONS").alias("Source_System"),
    ]

    return history.select(*projection)
