Dim_Date

In [0]:
import dlt
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from functools import reduce
from operator import add
 
# =====================================================
# DIMENSION TABLES - ALL STREAMING WITH NATURAL KEYS
# =====================================================
 
@dlt.table(
    name="gold.dim_date",
    comment="Date dimension table with calendar attributes - Streaming with Natural Keys",
    table_properties={
        "quality": "gold",
        "delta.enableChangeDataFeed": "true"
    }
)
def dim_date():
    """Build the streaming date dimension keyed by a yyyyMMdd natural key.

    Reads ``inspection_date`` from the Chicago and Dallas silver inspection
    streams, converts it to a date, discards rows whose date is NULL,
    de-duplicates on the date and derives the calendar attributes.

    Returns:
        A streaming DataFrame with these columns, in order:
        ``date_id_nk_pk`` (int, yyyyMMdd), ``date_num`` (date), ``day_name``,
        ``day_num``, ``day_abbr``, ``is_weekend``, ``month_name``,
        ``month_abbr``, ``month_num``, ``dt`` (day of week), ``year_num``,
        ``data_source_id``, ``data_workflow_name``, ``dw_job_id`` and
        ``dw_load_dt``.
    """
    # Read only the date column from each silver streaming table.
    chicago_dates = dlt.read_stream("silver.chicago_inspections_silver").select("inspection_date")
    dallas_dates = dlt.read_stream("silver.dallas_inspections_silver").select("inspection_date")

    # to_date() yields NULL when the source value is NULL (or cannot be
    # converted). Such a row would become a dimension member whose primary key
    # and every derived attribute are NULL, so it is filtered out before the
    # de-duplication step.
    all_dates = (
        chicago_dates.union(dallas_dates)
        .select(to_date(col("inspection_date")).alias("date_col"))
        .where(col("date_col").isNotNull())
        .dropDuplicates(["date_col"])
    )

    day = col("date_col")
    # Compact yyyyMMdd rendering; reused for the key and the audit columns.
    compact_date = date_format(day, "yyyyMMdd")

    date_dim = all_dates.select(
        compact_date.cast("int").alias("date_id_nk_pk"),  # Natural key
        day.alias("date_num"),
        date_format(day, "EEEE").alias("day_name"),
        dayofmonth(day).alias("day_num"),
        date_format(day, "E").alias("day_abbr"),
        # Spark's dayofweek() numbers Sunday as 1 and Saturday as 7.
        when(dayofweek(day).isin(1, 7), True).otherwise(False).alias("is_weekend"),
        date_format(day, "MMMM").alias("month_name"),
        date_format(day, "MMM").alias("month_abbr"),
        month(day).alias("month_num"),
        dayofweek(day).alias("dt"),
        year(day).alias("year_num"),
        compact_date.cast("varchar(50)").alias("data_source_id"),
        date_format(day, "yyyy-MM-dd").alias("data_workflow_name"),
        compact_date.cast("varchar(50)").alias("dw_job_id"),
        current_timestamp().alias("dw_load_dt")
    )

    return date_dim

Dim_Location 

In [0]:
@dlt.table(
    name="gold.dim_location",
    comment="Location dimension with address and geographic details - Streaming with Natural Keys",
    table_properties={
        "quality": "gold",
        "delta.enableChangeDataFeed": "true"
    }
)
def dim_location():
    """Build the streaming location dimension keyed by ``location_id_nk_pk``.

    The natural key is the source prefix (``CHI`` / ``DAL``), the street
    address and the zip code joined with underscores. Each city's stream is
    de-duplicated on that key, the two streams are unioned by column name,
    and the result is cast to the final column types.

    Returns:
        A streaming DataFrame with the key, the address columns, latitude and
        longitude as floats, three copies of the key (``location``,
        ``data_source_id``, ``dw_job_id``), the source prefix as
        ``data_workflow_name`` and the load timestamp ``dw_load_dt``.
    """
    # --- Chicago: columns already carry city/state; address column is "address".
    chicago_silver = dlt.read_stream("silver.chicago_inspections_silver")
    chicago_projection = chicago_silver.select(
        concat_ws("_", lit("CHI"), col("address"), col("zip")).alias("location_id_nk_pk"),  # Natural key
        col("address").alias("street_address"),
        col("city"),
        col("state"),
        col("zip"),
        col("latitude").cast("double"),
        col("longitude").cast("double"),
        lit("CHI").alias("source_city")
    )
    chicago_locations = chicago_projection.dropDuplicates(["location_id_nk_pk"])

    # --- Dallas: city/state are constants and coordinates are named lat/long.
    dallas_silver = dlt.read_stream("silver.dallas_inspections_silver")
    dallas_projection = dallas_silver.select(
        concat_ws("_", lit("DAL"), col("street_address"), col("zip_code")).alias("location_id_nk_pk"),  # Natural key
        col("street_address"),
        lit("Dallas").alias("city"),
        lit("TX").alias("state"),
        col("zip_code").alias("zip"),
        col("lat").cast("double").alias("latitude"),
        col("long").cast("double").alias("longitude"),
        lit("DAL").alias("source_city")
    )
    dallas_locations = dallas_projection.dropDuplicates(["location_id_nk_pk"])

    # Column order differs between the two projections only by expression,
    # not by name, so a by-name union lines them up.
    combined = chicago_locations.unionByName(dallas_locations)

    # The key, cast once and reused for the key column and its three copies.
    key_as_varchar = col("location_id_nk_pk").cast("varchar(200)")

    output_columns = [
        key_as_varchar,  # Natural key as primary key
        col("street_address").cast("varchar(150)"),
        col("city").cast("varchar(50)"),
        col("state").cast("varchar(50)"),
        col("zip").cast("varchar(50)"),
        col("latitude").cast("float"),
        col("longitude").cast("float"),
        key_as_varchar.alias("location"),
        key_as_varchar.alias("data_source_id"),
        col("source_city").cast("varchar(50)").alias("data_workflow_name"),
        key_as_varchar.alias("dw_job_id"),
        current_timestamp().alias("dw_load_dt"),
    ]

    return combined.select(*output_columns)