Raw Tables to Bronze Pipeline


In [0]:
import dlt
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.types import *
import re

@dlt.table(
    name="chicago_inspections_bronze",
    comment="Chicago food inspection data from raw table with deduplication - Streaming Table",
    table_properties={
        "quality": "bronze",
        "delta.enableChangeDataFeed": "true"
    }
)
def chicago_bronze():
    """Stream the raw Chicago inspections table into bronze, de-duplicated.

    Reads the change data feed of ``food_inspection.raw.chicago_inspections``,
    keeps only the row images that represent current data, stamps ingestion
    metadata, and drops rows whose non-metadata column content has already
    been seen.

    Returns:
        A streaming DataFrame holding the source columns plus
        ``_ingestion_timestamp`` and ``_source_file``.
    """
    change_feed = (
        spark.readStream
        .option("readChangeFeed", "true")
        .table("food_inspection.raw.chicago_inspections")
    )

    # The change feed emits "delete" and "update_preimage" rows alongside
    # inserts and post-update images. Those describe data that no longer
    # exists, so they must be filtered out *before* `_change_type` is dropped;
    # otherwise they are indistinguishable from live records.
    live_rows = change_feed.filter(
        col("_change_type").isin("insert", "update_postimage")
    )

    # Remove the change-feed metadata columns so they are not carried into the
    # bronze table. DataFrame.drop silently ignores names that are absent, so
    # no per-column existence check is needed.
    source_rows = live_rows.drop(
        "_change_type", "_commit_version", "_commit_timestamp"
    )

    # Add ingestion metadata.
    stamped = (
        source_rows
        .withColumn("_ingestion_timestamp", current_timestamp())
        .withColumn("_source_file", lit("raw_table"))
    )

    # Hash every non-metadata column (names not starting with "_"). Each
    # column is cast to string explicitly: coalescing a non-string column
    # directly with the "NULL" literal relies on implicit type coercion, which
    # is rejected for boolean/binary/complex columns and can fail at runtime
    # under ANSI mode.
    business_columns = [c for c in stamped.columns if not c.startswith("_")]
    row_hash = sha2(
        concat_ws(
            "||",
            *[coalesce(col(c).cast("string"), lit("NULL")) for c in business_columns],
        ),
        256,
    )

    # NOTE(review): `_ingestion_timestamp` is not part of the dropDuplicates
    # key, so this watermark likely does not bound the deduplication state --
    # confirm whether time-bounded dedup (dropDuplicatesWithinWatermark) is
    # what is actually wanted here.
    return (
        stamped
        .withColumn("row_hash", row_hash)
        .withWatermark("_ingestion_timestamp", "1 hour")
        .dropDuplicates(["row_hash"])
        .drop("row_hash")
    )

@dlt.table(
    name="dallas_inspections_bronze",
    comment="Dallas food inspection data from raw table with deduplication - Streaming Table",
    table_properties={
        "quality": "bronze",
        "delta.enableChangeDataFeed": "true"
    }
)
def dallas_bronze():
    """Stream the raw Dallas inspections table into bronze, de-duplicated.

    Reads the change data feed of ``food_inspection.raw.dallas_inspections``,
    keeps only the row images that represent current data, stamps ingestion
    metadata, and drops rows whose non-metadata column content has already
    been seen.

    Returns:
        A streaming DataFrame holding the source columns plus
        ``_ingestion_timestamp`` and ``_source_file``.
    """
    change_feed = (
        spark.readStream
        .option("readChangeFeed", "true")
        .table("food_inspection.raw.dallas_inspections")
    )

    # The change feed emits "delete" and "update_preimage" rows alongside
    # inserts and post-update images. Those describe data that no longer
    # exists, so they must be filtered out *before* `_change_type` is dropped;
    # otherwise they are indistinguishable from live records.
    live_rows = change_feed.filter(
        col("_change_type").isin("insert", "update_postimage")
    )

    # Remove the change-feed metadata columns so they are not carried into the
    # bronze table. DataFrame.drop silently ignores names that are absent, so
    # no per-column existence check is needed.
    source_rows = live_rows.drop(
        "_change_type", "_commit_version", "_commit_timestamp"
    )

    # Add ingestion metadata.
    stamped = (
        source_rows
        .withColumn("_ingestion_timestamp", current_timestamp())
        .withColumn("_source_file", lit("raw_table"))
    )

    # Hash every non-metadata column (names not starting with "_"). Each
    # column is cast to string explicitly: coalescing a non-string column
    # directly with the "NULL" literal relies on implicit type coercion, which
    # is rejected for boolean/binary/complex columns and can fail at runtime
    # under ANSI mode.
    business_columns = [c for c in stamped.columns if not c.startswith("_")]
    row_hash = sha2(
        concat_ws(
            "||",
            *[coalesce(col(c).cast("string"), lit("NULL")) for c in business_columns],
        ),
        256,
    )

    # NOTE(review): `_ingestion_timestamp` is not part of the dropDuplicates
    # key, so this watermark likely does not bound the deduplication state --
    # confirm whether time-bounded dedup (dropDuplicatesWithinWatermark) is
    # what is actually wanted here.
    return (
        stamped
        .withColumn("row_hash", row_hash)
        .withWatermark("_ingestion_timestamp", "1 hour")
        .dropDuplicates(["row_hash"])
        .drop("row_hash")
    )