In [0]:
import dlt
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
# -------- 1) Chicago canonical map (tiny table) --------
@dlt.table
def gold_violation_map():
    """Build a violation-code -> description lookup from the Chicago silver stream.

    Rows with a null ``violation_code`` are discarded. For each remaining
    code, one row is produced whose ``chicago_violation_desc`` is the
    upper-cased, trimmed description taken from the greatest
    ``(load_dt, description)`` pair seen for that code.

    Returns:
        A streaming DataFrame with columns ``violation_code`` and
        ``chicago_violation_desc``.
    """
    source = dlt.read_stream("workspace.`damg-midterm`.chicago_silver_exploded")

    # Column expressions, named up front so the projection reads as a list.
    code_col = F.col("violation_code").cast("string").alias("violation_code")
    desc_col = F.upper(F.trim(F.col("violation_description"))).alias("violation_desc_norm")
    load_col = F.col("load_dt").cast("timestamp").alias("load_dt")

    projected = source.select(code_col, desc_col, load_col)
    with_code = projected.where(F.col("violation_code").isNotNull())

    # 30-day watermark on load_dt; the original author marks this as required
    # for the streaming aggregation below.
    watermarked = with_code.withWatermark("load_dt", "30 days")

    # max() over a struct compares load_dt first, so the winning struct
    # carries the description that arrived with the most recent load_dt.
    newest_pair = F.max(F.struct("load_dt", "violation_desc_norm")).alias("s")
    per_code = watermarked.groupBy("violation_code").agg(newest_pair)

    return per_code.select(
        "violation_code",
        F.col("s.violation_desc_norm").alias("chicago_violation_desc"),
    )

In [0]:

# -------- 2) Union the two silvers into one base (VIEW) --------
@dlt.view
def gold_inspection_base():
    """Union the Chicago and Dallas silver streams into one shared column layout.

    Each city's stream is projected onto the same set of output column names
    so the two can be combined with ``unionByName``. Two city columns are
    emitted:

    * ``city``     - dataset-level lineage label (``"CHICAGO"`` / ``"DALLAS"``).
    * ``src_city`` - address-level city. For Chicago this is read from the
      source row's own ``city`` column when that column exists; for Dallas the
      source has no such column, so the literal ``"DALLAS"`` is used.

    Returns:
        A streaming DataFrame containing the rows of both cities.
    """
    # ---- CHICAGO: columns per chicago_silver_exploded ----
    chi_src = dlt.read_stream("workspace.`damg-midterm`.chicago_silver_exploded")

    # The lineage label must not overwrite the source's own ``city`` column,
    # otherwise ``src_city`` would always be the literal "CHICAGO" instead of
    # the address-level value. Fall back to the literal only when the source
    # has no ``city`` column at all.
    if "city" in chi_src.columns:
        chi_src_city = F.col("city").cast("string")
    else:
        chi_src_city = F.lit("CHICAGO")

    chi_base = chi_src.select(
        F.lit("CHICAGO").alias("city"),  # lineage city (dataset)
        F.col("inspection_id").cast("string").alias("inspection_id_src"),
        F.col("business_name"),
        F.col("aka_name").alias("aka_business_name"),
        # Explicit string cast so this branch agrees with the Dallas branch's
        # string-typed null without relying on implicit union coercion.
        F.col("license_number").cast("string").alias("license_number"),
        F.col("facility_type"),
        F.col("risk_desc").alias("risk_category"),
        F.col("risk_level").cast("int").alias("risk_level_num"),
        F.col("address").alias("location_address"),
        chi_src_city.alias("src_city"),  # address-level city from source row
        F.col("state"),
        F.col("zip").cast("string").alias("zip_code"),
        F.col("inspection_date"),
        F.col("inspection_type"),
        F.col("results").alias("result_desc"),
        F.col("violation_code").cast("string").alias("violation_code"),
        F.upper(F.trim(F.col("violation_description"))).alias("violation_desc_raw"),
        F.col("violation_comments"),
        F.col("latitude"),
        F.col("longitude"),
        F.col("location"),
        F.col("score").cast("int").alias("score"),
        F.lit("FOOD_INSPECTIONS").alias("source_system"),
        F.col("load_dt"),
        F.col("source_file_path"),
        F.col("source_file_name"),
    )

    # ---- DALLAS: different field names -> map to the same output columns ----
    dal_src = dlt.read_stream("workspace.`damg-midterm`.dallas_silver_exploded")
    dal_base = dal_src.select(
        F.lit("DALLAS").alias("city"),  # lineage city (dataset)
        F.col("inspection_id").cast("string").alias("inspection_id_src"),
        F.col("business_name"),
        F.lit(None).cast("string").alias("aka_business_name"),  # Dallas has no aka_name
        F.lit(None).cast("string").alias("license_number"),     # not selected from the Dallas source
        F.lit(None).cast("string").alias("facility_type"),
        F.col("risk_desc").alias("risk_category"),
        F.col("risk_level").cast("int").alias("risk_level_num"),
        F.col("street_address").alias("location_address"),
        F.lit("DALLAS").alias("src_city"),  # address-level city not present -> set to a constant
        F.lit("TX").alias("state"),         # constant; use F.lit(None) if a null is preferred
        F.col("zip_code").cast("string").alias("zip_code"),
        F.col("inspection_date"),
        F.col("inspection_type"),
        F.col("inspection_result").alias("result_desc"),
        F.col("violation_code").cast("string").alias("violation_code"),
        # Prefer description, else detail, else memo.
        F.upper(
            F.trim(
                F.coalesce(
                    F.col("violation_description"),
                    F.col("violation_detail"),
                    F.col("violation_memo"),
                )
            )
        ).alias("violation_desc_raw"),
        F.col("violation_memo").alias("violation_comments"),
        F.col("latitude"),
        F.col("longitude"),
        F.lit(None).cast("string").alias("location"),
        F.col("inspection_score").cast("int").alias("score"),
        F.lit("FOOD_INSPECTIONS").alias("source_system"),
        F.col("load_dt"),
        F.col("source_file_path"),
        F.col("source_file_name"),
    )

    # Both projections expose the same column names, so a by-name union is safe.
    return chi_base.unionByName(dal_base)


In [0]:
# -------- 3) SINGLE materialized Gold table (TABLE) --------
@dlt.table(
    name="gold_inspection",
    table_properties={"delta.columnMapping.mode": "name"},
)
def gold_inspection():
    """Materialize the unified inspection rows with a resolved violation description.

    The streaming base view is left-joined to the violation map on
    ``violation_code``. ``violation_desc`` takes the mapped description when
    one exists and otherwise falls back to the row's own
    ``violation_desc_raw``. The helper column ``chicago_violation_desc`` is
    dropped from the result.

    Returns:
        A streaming DataFrame of inspection rows with an added
        ``violation_desc`` column.
    """
    inspections = dlt.read_stream("gold_inspection_base")
    code_map = dlt.read("gold_violation_map")  # read as a static (non-streaming) lookup

    # NOTE(review): the map is built from the Chicago source only, yet it is
    # joined against every base row, including city == "DALLAS" -- confirm
    # that sharing descriptions across cities by code is intended.
    joined = inspections.join(code_map, on="violation_code", how="left")

    resolved_desc = F.coalesce(
        F.col("chicago_violation_desc"),
        F.col("violation_desc_raw"),
    )
    with_desc = joined.withColumn("violation_desc", resolved_desc)
    return with_desc.drop("chicago_violation_desc")