In [0]:
import dlt
from pyspark.sql.functions import current_timestamp, col

@dlt.table(table_properties={"delta.columnMapping.mode": "name"})
def chicago_bronze():
    """Stream the raw Chicago food-inspection CSV files into a bronze table.

    Files are read with Auto Loader (``cloudFiles``) from the Chicago volume
    directory. Every source column is passed through untouched and three
    audit columns are appended:

    * ``load_dt`` - ``current_timestamp()`` evaluated when the row is processed
    * ``source_file_path`` - ``_metadata.file_path`` of the originating file
    * ``source_file_name`` - ``_metadata.file_name`` of the originating file

    Returns:
        A streaming DataFrame that DLT materialises as ``chicago_bronze``.
    """
    source_dir = "/Volumes/workspace/damg-midterm/datastore/Food_Inspections/Chicago"

    # NOTE(review): Auto Loader documents ``cloudFiles.inferColumnTypes`` as
    # its type-inference switch; confirm whether ``inferSchema`` has any
    # effect here or whether every column simply arrives as a string.
    reader_options = {
        "cloudFiles.format": "csv",
        "header": "true",
        "inferSchema": "true",
    }

    reader = spark.readStream.format("cloudFiles")
    for option_name, option_value in reader_options.items():
        reader = reader.option(option_name, option_value)

    # Lineage columns appended after the pass-through of all source columns.
    audit_columns = [
        current_timestamp().alias("load_dt"),
        col("_metadata.file_path").alias("source_file_path"),
        col("_metadata.file_name").alias("source_file_name"),
    ]

    return reader.load(source_dir).select("*", *audit_columns)

In [0]:
import dlt
from pyspark.sql.functions import col, explode, split, trim, to_date, regexp_extract, regexp_replace
from pyspark.sql.types import IntegerType, DoubleType

# Data-quality expectations for the Chicago silver table.
# Each entry maps an expectation name to the SQL boolean expression a row
# must satisfy; the column names are the ones produced by the silver select.
chicago_rules = {
    # Mandatory descriptive fields.
    "valid_business_name": "business_name IS NOT NULL",
    "valid_inspection_date": "inspection_date IS NOT NULL",
    "valid_inspection_type": "inspection_type IS NOT NULL",
    # Zip must be present and exactly five characters once rendered as text.
    "valid_zip": "zip IS NOT NULL AND LENGTH(CAST(zip AS STRING)) = 5",
    "valid_results": "results IS NOT NULL",
    # A row must carry a parsed violation code.
    "valid_violation": "violation_code IS NOT NULL",
    # Reject rows whose comments mention "urgent" or "critical" while the
    # inspection result is exactly PASS (both compared case-insensitively).
    "valid_urgent_critical": (
        "NOT ((LOWER(violation_comments) LIKE '%urgent%' "
        "OR LOWER(violation_comments) LIKE '%critical%') "
        "AND UPPER(results) = 'PASS')"
    ),
}

@dlt.table(
    table_properties={
        "delta.columnMapping.mode": "name"
    }
)
@dlt.expect_all_or_drop(chicago_rules)
def chicago_silver_exploded():
    """Build the silver table with one row per inspection violation.

    Reads the ``chicago_bronze`` stream, splits the pipe-delimited
    ``Violations`` text into individual fragments, and parses each fragment
    into ``violation_code``, ``violation_description`` and
    ``violation_comments``. Source columns are trimmed, cast and renamed to
    snake_case. Rows failing any expectation in ``chicago_rules`` are dropped
    by the ``expect_all_or_drop`` decorator.

    Returns:
        A streaming DataFrame with the cleaned, exploded inspection rows.
    """
    # The numeric code must open the fragment. Anchoring keeps this in step
    # with the description pattern and stops a fragment that does not start
    # with a code from borrowing "digits." found later in its text; such a
    # fragment yields a NULL code and is dropped by ``valid_violation``.
    # ``\s*`` tolerates leading tabs/newlines, which ``trim`` leaves in place.
    code_pattern = r"^\s*(\d+)\."
    # The description runs from after the code to the " - Comments:" marker,
    # or to the end of the fragment when no comments section is present.
    description_pattern = r"^\s*\d+\.\s*(.+?)(?:\s+-\s+Comments:|$)"
    # Everything after "Comments:"; empty string when the marker is absent.
    comments_pattern = r"Comments:\s*(.*)"

    return (
        dlt.read_stream("chicago_bronze")
            .withColumn("violation_array", split(col("Violations"), "\\|"))
            .withColumn("violation", explode(col("violation_array")))
            .withColumn("violation", trim(col("violation")))
            # Drop empty fragments before parsing them.
            .filter(col("violation") != "")
            .withColumn(
                "violation_code",
                regexp_extract(col("violation"), code_pattern, 1).cast(IntegerType()),
            )
            .withColumn(
                "violation_description",
                trim(regexp_extract(col("violation"), description_pattern, 1)),
            )
            .withColumn(
                "violation_comments",
                regexp_extract(col("violation"), comments_pattern, 1),
            )
            .select(
                col("Inspection ID").cast(IntegerType()).alias("inspection_id"),
                # Strips "#<digits>" tokens from the name before trimming.
                # NOTE(review): the other source columns use space-separated
                # headers; confirm "Business_Name" matches the bronze schema.
                trim(regexp_replace(col("Business_Name"), r"#\d+", "")).alias("business_name"),
                trim(col("AKA Name")).alias("aka_name"),
                col("License #").cast(IntegerType()).alias("license_number"),
                trim(col("Facility Type")).alias("facility_type"),
                # "Risk" is parsed as "Risk <n> (<description>)".
                regexp_extract(col("Risk"), r"Risk (\d+)", 1).cast(IntegerType()).alias("risk_level"),
                regexp_extract(col("Risk"), r"\((.*?)\)", 1).alias("risk_desc"),
                trim(col("Address")).alias("address"),
                trim(col("City")).alias("city"),
                trim(col("State")).alias("state"),
                trim(col("Zip")).cast(IntegerType()).alias("zip"),
                to_date(col("Inspection Date"), "M/d/yyyy").alias("inspection_date"),
                trim(col("Inspection Type")).alias("inspection_type"),
                trim(col("Results")).alias("results"),
                col("violation_code"),
                col("violation_description"),
                col("violation_comments"),
                col("violation"),
                col("Latitude").cast(DoubleType()).alias("latitude"),
                col("Longitude").cast(DoubleType()).alias("longitude"),
                trim(col("Location")).alias("location"),
                col("Score").cast(IntegerType()).alias("score"),
                col("load_dt"),
                col("source_file_path"),
                col("source_file_name")
            )
    )