Bronze_to_Silver_chicagoTables_with_Expectations

In [0]:
import dlt

 from pyspark.sql.functions import *

 from pyspark.sql.window import Window

 from pyspark.sql.types import *

 from functools import reduce

 from operator import add

 import re
 
@dlt.table(
    name="silver.chicago_inspections_silver",
    comment="Validated Chicago inspection data with quality rules applied - Streaming",
    table_properties={
        "quality": "silver",
        "delta.enableChangeDataFeed": "true",
    },
)
# A row failing any one of these expectations is dropped from the output.
# "min_violation_count" checks the violation_count column derived below.
@dlt.expect_all_or_drop({
    "valid_dba_name": "dba_name IS NOT NULL",
    "valid_inspection_date": "inspection_date IS NOT NULL",
    "valid_inspection_type": "inspection_type IS NOT NULL",
    "valid_zip": "zip IS NOT NULL AND LENGTH(zip) = 5",
    "valid_results": "results IS NOT NULL",
    "valid_violations": "violations IS NOT NULL AND LENGTH(TRIM(violations)) > 0",
    "min_violation_count": "violation_count >= 1",
})
def chicago_silver():
    """Build the silver Chicago inspections table from the bronze stream.

    Reads ``bronze.chicago_inspections_bronze`` as a stream and then:

    * removes the change-data-feed columns (``_change_type``,
      ``_commit_version``, ``_commit_timestamp``) when they are present;
    * strips a trailing ``.0`` from ``zip``;
    * adds ``violation_count``: the number of ``|``-separated segments in
      ``violations``;
    * adds ``violation_score``: 90 / 80 / 70 / 0 for the ``results`` values
      ``Pass`` / ``Pass w/ Conditions`` / ``Fail`` / ``No Entry``, and NULL
      for any other value;
    * adds ``source_city`` with the constant value ``"CHI"``.

    Returns:
        The transformed streaming DataFrame.
    """
    bronze_stream = dlt.read_stream("bronze.chicago_inspections_bronze")

    # Only drop the CDC metadata columns that the incoming schema really has.
    cdc_metadata = ("_change_type", "_commit_version", "_commit_timestamp")
    present_cdc = [name for name in cdc_metadata if name in bronze_stream.columns]
    if present_cdc:
        bronze_stream = bronze_stream.drop(*present_cdc)

    # Zip values such as "60614.0" become "60614"; NULL stays NULL.
    # NOTE(review): the ".0" suffix presumably comes from an upstream numeric
    # cast -- confirm against the bronze loader.
    raw_zip = col("zip")
    cleaned_zip = when(
        raw_zip.isNotNull(), regexp_replace(raw_zip, r"\.0$", "")
    ).otherwise(None)

    # One violation per pipe-delimited segment of the free-text column.
    pipe_segments = split(col("violations"), r"\|")
    violation_total = size(pipe_segments)

    # Fixed score per inspection outcome; unrecognised outcomes score NULL.
    outcome = col("results")
    outcome_score = (
        when(outcome == "Pass", 90)
        .when(outcome == "Pass w/ Conditions", 80)
        .when(outcome == "Fail", 70)
        .when(outcome == "No Entry", 0)
        .otherwise(None)
    )

    normalized = bronze_stream.withColumn("zip", cleaned_zip)
    with_count = normalized.withColumn("violation_count", violation_total)
    with_score = with_count.withColumn("violation_score", outcome_score)
    return with_score.withColumn("source_city", lit("CHI"))