In [0]:
import dlt
from pyspark.sql.functions import current_timestamp, col

@dlt.table(table_properties={"delta.columnMapping.mode": "name"})
def dallas_bronze():
    """Stream the raw Dallas food-inspection CSV files into the bronze table.

    Every source column is kept unchanged and three audit columns are
    appended: ``load_dt`` (timestamp taken when the row is processed) plus
    ``source_file_path`` and ``source_file_name`` (read from the ``_metadata``
    column of the file source).

    The table is created with Delta column mapping in ``name`` mode --
    presumably so CSV headers containing spaces can be stored as-is; confirm.

    Returns:
        A streaming DataFrame read through the ``cloudFiles`` source.
    """
    # NOTE(review): the Auto Loader docs name `cloudFiles.inferColumnTypes` as
    # the switch for type inference; confirm whether `inferSchema` has any
    # effect on this reader before relying on typed bronze columns.
    csv_reader = (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "csv")
        .option("header", "true")
        .option("inferSchema", "true")
    )

    # To ingest only the single Dallas export, add
    # .option("pathGlobFilter", "Dallas_Bronze_Layer.csv") before loading.
    raw_inspections = csv_reader.load(
        "/Volumes/workspace/damg7370/datastore/Food_Inspections/Dallas"
    )

    # Lineage columns recording when and from which file each row arrived.
    audit_columns = [
        current_timestamp().alias("load_dt"),
        col("_metadata.file_path").alias("source_file_path"),
        col("_metadata.file_name").alias("source_file_name"),
    ]

    return raw_inspections.select("*", *audit_columns)

In [0]:
import dlt
from pyspark.sql.functions import col, trim, to_date, regexp_extract, regexp_replace, when, array, explode, struct, concat, lit
from pyspark.sql.types import IntegerType, DoubleType

# Data quality expectations for the Dallas silver table.
# Each pair is (expectation name, SQL condition); the conditions refer to the
# output column names of the silver table, and the declaration order below is
# the order of the resulting mapping.
_DALLAS_EXPECTATIONS = (
    # Mandatory descriptive fields.
    ("valid_business_name", "business_name IS NOT NULL"),
    ("valid_inspection_date", "inspection_date IS NOT NULL"),
    ("valid_inspection_type", "inspection_type IS NOT NULL"),
    # Zip code must be present and exactly five characters long as a string.
    ("valid_zip", "zip_code IS NOT NULL AND LENGTH(CAST(zip_code AS STRING)) = 5"),
    # Scores are capped at 100; no lower bound is enforced here.
    ("valid_inspection_score", "inspection_score <= 100"),
    # A row must carry a violation code.
    ("valid_violation", "violation_code IS NOT NULL"),
)

dallas_rules = dict(_DALLAS_EXPECTATIONS)

@dlt.table(table_properties={"delta.columnMapping.mode": "name"})
@dlt.expect_all_or_drop(dallas_rules)
def dallas_silver_exploded():
    """Reshape bronze inspections into one silver row per recorded violation.

    Each bronze row carries 25 numbered groups of violation columns
    ("Violation Description - N", "Violation Points - N",
    "Violation Detail - N" and, when present, "Violation Memo - N"). The
    groups are packed into an array of structs and exploded, so every
    inspection yields one row per group. Groups whose description is NULL or
    empty become NULL entries; those rows end up with a NULL
    ``violation_code`` and are removed by the final filter.

    The output also gets a derived ``inspection_result`` label, a
    ``risk_level`` (the violation points cast to integer) with its
    ``risk_desc`` label, cleaned and typed inspection attributes, and the
    audit columns carried over from bronze. Rows failing any expectation in
    ``dallas_rules`` are dropped by the decorator.

    Returns:
        A streaming DataFrame read from ``dallas_bronze``.
    """
    source_df = dlt.read_stream("dallas_bronze")
    # Memo columns are optional, so remember which columns bronze really has.
    bronze_columns = source_df.columns

    def violation_slot(slot):
        """Return the struct for violation group *slot*, NULL if it has no description."""
        description = col(f"Violation Description - {slot}")
        memo_name = f"Violation Memo - {slot}"
        memo = col(memo_name) if memo_name in bronze_columns else lit(None)
        return when(
            description.isNotNull() & (description != ""),
            struct(
                # The code is the run of digits following a literal '*'.
                regexp_extract(description, r"\*(\d+)", 1).alias("violation_code"),
                # The description is the same text with the '*<digits>' marker removed.
                regexp_replace(description, r"\*\d+\s*", "").alias("violation_description"),
                col(f"Violation Points - {slot}").alias("violation_points"),
                col(f"Violation Detail - {slot}").alias("violation_detail"),
                memo.alias("violation_memo"),
            ),
        )

    violation_slots = [violation_slot(slot) for slot in range(1, 26)]

    # NOTE(review): non-zero scores below 70 (and NULL scores) match no branch
    # and get a NULL result -- confirm that this gap is intended.
    score = col("Inspection Score")
    result_label = (
        when(score >= 90, "Pass")
        .when((score >= 80) & (score < 90), "Pass w/ Conditions")
        .when((score >= 70) & (score < 80), "Fail")
        .when(score == 0, "No Entry")
        .otherwise(None)
    )

    # Risk label derived from the integer risk level; other values stay NULL.
    risk_level = col("risk_level")
    risk_label = (
        when(risk_level == 1, "High")
        .when(risk_level == 2, "Medium")
        .when(risk_level == 3, "Low")
        .otherwise(None)
    )

    # One row per array element, including the NULL placeholders.
    per_violation = (
        source_df
        .select("*", array(*violation_slots).alias("violations"))
        .select("*", explode("violations").alias("violation"))
    )

    enriched = (
        per_violation
        .withColumn("inspection_result", result_label)
        .withColumn("risk_level", col("violation.violation_points").cast(IntegerType()))
        .withColumn("risk_desc", risk_label)
    )

    silver_columns = [
        # NOTE(review): this id is just zip code + score concatenated, so
        # different inspections can share it -- confirm this is acceptable.
        concat(col("Zip Code"), score).cast(IntegerType()).alias("inspection_id"),
        # Remove '#<digits>' markers from the business name.
        trim(regexp_replace(col("Business_Name"), r"#\d+", "")).alias("business_name"),
        trim(col("Inspection Type")).alias("inspection_type"),
        to_date(col("Inspection Date"), "yyyy-MM-dd").alias("inspection_date"),
        score.cast(IntegerType()).alias("inspection_score"),
        col("inspection_result"),
        col("risk_level"),  # violation points exposed under a new name
        col("risk_desc"),  # High / Medium / Low label for risk_level
        trim(col("Street Address")).alias("street_address"),
        col("Zip Code").cast(IntegerType()).alias("zip_code"),
        col("Lat").cast(DoubleType()).alias("latitude"),
        col("Long").cast(DoubleType()).alias("longitude"),
        col("violation.violation_code").cast(IntegerType()).alias("violation_code"),
        trim(col("violation.violation_description")).alias("violation_description"),
        trim(col("violation.violation_detail")).alias("violation_detail"),
        trim(col("violation.violation_memo")).alias("violation_memo"),
        col("load_dt"),
        col("source_file_path"),
        col("source_file_name"),
    ]

    # Rows without a usable integer violation code are discarded.
    return enriched.select(*silver_columns).filter(col("violation_code").isNotNull())