Bronze_to_Silver_DallasTables_with_Expectations

In [0]:
@dlt.table(
    name="silver.dallas_inspections_silver",
    comment="Validated Dallas inspection data with all quality rules applied - Streaming",
    table_properties={
        "quality": "silver",
        "delta.enableChangeDataFeed": "true"
    }
)
@dlt.expect_all_or_drop({
    "valid_restaurant_name": "restaurant_name IS NOT NULL AND TRIM(restaurant_name) != ''",
    "valid_inspection_date": "inspection_date IS NOT NULL",
    "valid_inspection_type": "inspection_type IS NOT NULL AND TRIM(inspection_type) != ''",
    "valid_zip_format": "zip_code IS NOT NULL AND zip_code RLIKE '^\\\\d{5}$'",
    "valid_score_range": "inspection_score IS NOT NULL AND inspection_score >= 0 AND inspection_score <= 100",
    "min_violation_count": "violation_count >= 1",
    "high_score_violation_limit": "inspection_score < 90 OR violation_count <= 3",
    "no_pass_with_critical": "NOT (inspection_score >= 70 AND has_critical_violation = true)"
})
def dallas_silver():
    """Build the Dallas silver inspections table from the bronze stream.

    Reads ``bronze.dallas_inspections_bronze`` as a stream, removes the
    change-data-feed metadata columns, and appends four derived columns
    (in this order):

    * ``violation_count`` - how many of the 25 numbered violation slots have
      a non-null ``violation_description_<i>`` or ``violation_points_<i>``.
    * ``has_critical_violation`` - True when any ``violation_memo_<i>``
      contains "Urgent", "Critical", "URGENT" or "CRITICAL" (case-sensitive
      substring match), otherwise False.
    * ``source_city`` - the constant ``"DAL"``.
    * ``derived_score`` - a copy of ``inspection_score``.

    ``violation_count`` and ``has_critical_violation`` are referenced by the
    expectations declared in the decorator above, so they must be present on
    the returned DataFrame.

    Returns:
        The streaming DataFrame with the derived columns appended.
    """
    inspections = dlt.read_stream("bronze.dallas_inspections_bronze")

    # DataFrame.drop is a no-op for names that are not in the schema, so the
    # CDC metadata columns can be removed in one call without checking first.
    inspections = inspections.drop(
        "_change_type", "_commit_version", "_commit_timestamp"
    )

    slots = range(1, 26)  # violation fields are numbered 1..25

    # Accumulate the count as ONE column expression instead of adding 25
    # temporary columns with withColumn in a loop: every withColumn call adds
    # a projection to the plan, and the temporaries had to be dropped again.
    violation_count = lit(0)
    for i in slots:
        slot_used = (
            col(f"violation_description_{i}").isNotNull()
            | col(f"violation_points_{i}").isNotNull()
        )
        violation_count = violation_count + when(slot_used, 1).otherwise(0)

    # All memo texts joined into one string (nulls replaced by ""), kept as an
    # expression only so no helper column ever appears in the output.
    all_memos = concat_ws(
        " ", *[coalesce(col(f"violation_memo_{i}"), lit("")) for i in slots]
    )
    is_critical = (
        all_memos.contains("Urgent")
        | all_memos.contains("Critical")
        | all_memos.contains("URGENT")
        | all_memos.contains("CRITICAL")
    )

    return (
        inspections
        .withColumn("violation_count", violation_count)
        # otherwise(False) guarantees a non-null boolean for the expectation.
        .withColumn("has_critical_violation", when(is_critical, True).otherwise(False))
        .withColumn("source_city", lit("DAL"))
        .withColumn("derived_score", col("inspection_score"))
    )