In [0]:
import dlt
from pyspark.sql import functions as F
from pyspark.sql.functions import col, lit


@dlt.view(
    name="business_cdf_stage_v",
    comment="Stage view to feed SCD2 Business dimension using CDF from gold_inspection."
)
def business_cdf_stage_v():
    """Stage change-feed rows from gold_inspection for the business dimension.

    Streams the Delta change data feed of ``gold_inspection``, renames the
    business attributes to their dimension column names, and derives
    ``Business_Natural_Key``:

    * ``LIC|<License_Number>`` when the license number is present and not "0";
    * otherwise ``BUS|<name>|<city>|<street address>|<zip>`` with nulls
      rendered as empty strings.

    Rows without a business name are dropped, and so are ``update_preimage``
    change rows (see the inline comment for why).

    Returns:
        A streaming DataFrame with the business attributes, ``_change_type``,
        ``start_dt`` (the change's ``_commit_timestamp``),
        ``Business_Natural_Key`` and ``load_dt`` (processing timestamp).
    """
    change_feed = (
        spark.readStream
             .format("delta")
             .option("readChangeFeed", "true")
             .table("workspace.`damg-midterm`.gold_inspection")
    )

    staged = (
        change_feed.select(
            F.col("business_name").alias("Business_Name"),
            F.col("aka_business_name").alias("AKA_Business_Name"),
            F.col("license_number").cast("string").alias("License_Number"),
            F.col("facility_type").alias("Facility_Type"),
            F.col("src_city").alias("City"),
            F.col("state").alias("State"),
            F.col("zip_code").alias("Zip_Code"),
            F.col("location_address").alias("Street_Address"),
            F.col("city").alias("Dataset_City"),
            F.col("_change_type"),
            F.col("_commit_timestamp").alias("start_dt")
        )
        # The change feed emits an update as a pair of rows: the old values
        # ("update_preimage") and the new values ("update_postimage"), both
        # with the same commit timestamp. Only the post-image describes the
        # new state; letting the pre-image through would hand downstream
        # consumers stale values tied on start_dt with the real update.
        .where(F.col("_change_type") != F.lit("update_preimage"))
        .where(F.col("Business_Name").isNotNull())
    )

    # Fallback key used when no usable license number exists. The layout
    # "BUS|name|city|address|zip" must stay stable: it identifies rows that
    # were already written under this key.
    fallback_parts = [F.lit("BUS|"), F.coalesce(F.col("Business_Name"), F.lit(""))]
    for part_name in ("City", "Street_Address", "Zip_Code"):
        fallback_parts.append(F.lit("|"))
        fallback_parts.append(F.coalesce(F.col(part_name), F.lit("")))

    has_license = F.col("License_Number").isNotNull() & (F.col("License_Number") != "0")

    # NOTE(review): several source rows sharing one natural key within a single
    # commit still carry identical start_dt values; confirm whether the
    # consumer's sequencing needs a finer-grained tie-breaker.
    return (
        staged
        .withColumn(
            "Business_Natural_Key",
            F.when(
                has_license,
                F.concat(F.lit("LIC|"), F.col("License_Number"))
            ).otherwise(F.concat(*fallback_parts))
        )
        .withColumn("load_dt", F.current_timestamp())
    )



# Declare the streaming table that the change-application step below maintains.
dlt.create_streaming_table("dim_business")

# Apply the staged change rows to dim_business, stored as SCD type 2.
dlt.apply_changes(
    target="dim_business",
    source="business_cdf_stage_v",
    keys=["Business_Natural_Key"],
    # Changes for a key are ordered by the staged start_dt column.
    sequence_by=F.col("start_dt"),
    # Rows whose _change_type is "delete" are applied as deletes.
    apply_as_deletes=F.col("_change_type") == F.lit("delete"),
    ignore_null_updates=True,
    stored_as_scd_type=2,
    # Staging/bookkeeping columns are excluded from the target table.
    except_column_list=["_change_type", "start_dt", "load_dt"],
)


@dlt.table(
    name="dim_business_presentation",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_business_presentation():
    """Build the presentation view of the business dimension.

    Reads ``dim_business`` and returns one row per stored version with a
    hashed ``Business_Key``, the descriptive business attributes, the
    version's effective start/end, an ``Is_Current`` flag (1 when
    ``__END_AT`` is null, else 0) and a constant ``Source_System``.
    ``Business_Natural_Key`` is intentionally not part of the output.
    """
    versions = dlt.read("dim_business")

    # Surrogate key per SCD2 version: hash of natural key + version start.
    # NOTE(review): F.hash yields a 32-bit value, so collisions become likely
    # as the row count grows - confirm this is acceptable for consumers.
    business_key = (
        F.abs(F.hash("Business_Natural_Key", "__START_AT"))
        .cast("bigint")
        .alias("Business_Key")
    )

    # Attributes carried through unchanged, in output order.
    attribute_columns = [
        "Business_Name",
        "AKA_Business_Name",
        "License_Number",
        "Facility_Type",
        "City",
        "State",
        "Zip_Code",
        "Street_Address",
        "Dataset_City",
    ]

    effective_start = F.col("__START_AT").alias("Effective_Start")
    effective_end = F.col("__END_AT").alias("Effective_End")

    # A version with no end timestamp is flagged as the current one.
    is_current = (
        F.when(F.col("__END_AT").isNull(), F.lit(1))
        .otherwise(F.lit(0))
        .alias("Is_Current")
    )

    source_system = F.lit("FOOD_INSPECTIONS").alias("Source_System")

    return versions.select(
        business_key,
        *attribute_columns,
        effective_start,
        effective_end,
        is_current,
        source_system,
    )
