In [0]:
import dlt
import pyspark.sql.functions as F
from pyspark.sql.window import Window

In [0]:
def silver_df():
    """Load the silver crime table through the ambient Spark session.

    Returns:
        The DataFrame produced by ``spark.read.table`` for the
        ``crime_silver`` table in the ``damg7370-la-crime`` schema.
    """
    silver_table = "workspace.`damg7370-la-crime`.crime_silver"
    return spark.read.table(silver_table)

In [0]:
@dlt.table(
    name="dim_date",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_date():
    """Build the date dimension spanning the DATE_OCC range of ``crime_silver``.

    One row is generated for every day between the earliest and the latest
    DATE_OCC value (inclusive), so days without any record still get a row.
    If ``crime_silver`` is empty the result is an empty table.

    Returns:
        DataFrame with the columns:
        Date_Key     - int built from the date formatted as yyyyMMdd
        Full_Date    - the calendar day (same type as DATE_OCC)
        Day_of_week  - Spark ``dayofweek`` value (1 = Sunday ... 7 = Saturday)
        Day_Name     - full weekday name
        Month_number - month as a number
        Month_Name   - full month name
        Quarter      - quarter of the year
        Year         - calendar year
        is_weekend   - True when Day_of_week is 1 or 7
        is_holiday   - constant False (placeholder, no holiday calendar applied)
    """
    df = dlt.read("crime_silver")

    # Keep the min/max bounds as a one-row DataFrame rather than pulling them
    # to the driver with .first(): a dataset definition should stay lazy and
    # not trigger Spark actions while the function is being evaluated.
    bounds = df.agg(
        F.min("DATE_OCC").alias("min_date"),
        F.max("DATE_OCC").alias("max_date")
    )

    # Generate the full calendar. With an empty source both bounds are NULL,
    # sequence() yields NULL and explode() emits no rows, instead of failing
    # on untyped NULL literals.
    calendar = bounds.select(
        F.explode(
            F.sequence(
                F.col("min_date"),
                F.col("max_date"),
                F.expr("INTERVAL 1 DAY")
            )
        ).alias("Full_Date")
    )

    final = (
        calendar
        .withColumn("Date_Key", F.date_format("Full_Date", "yyyyMMdd").cast("int"))
        .withColumn("Day_of_week", F.dayofweek("Full_Date"))
        .withColumn("Day_Name", F.date_format("Full_Date", "EEEE"))
        .withColumn("Month_number", F.month("Full_Date"))
        .withColumn("Month_Name", F.date_format("Full_Date", "MMMM"))
        .withColumn("Quarter", F.quarter("Full_Date"))
        .withColumn("Year", F.year("Full_Date"))
        # dayofweek: 1 = Sunday, 7 = Saturday
        .withColumn("is_weekend", F.when(F.dayofweek("Full_Date").isin(1, 7), True).otherwise(False))
        # No holiday calendar is available here; every day is flagged False.
        .withColumn("is_holiday", F.lit(False))
    )

    return final.select(
        "Date_Key", "Full_Date", "Day_of_week", "Day_Name",
        "Month_number", "Month_Name", "Quarter", "Year",
        "is_weekend", "is_holiday"
    ).distinct()

In [0]:
@dlt.table(
    name="dim_time",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_time():
    """Derive the time dimension from the TIME_OCC values found in ``crime_silver``.

    Only times that occur in the source are emitted (one row per distinct
    combination of the output columns).

    Returns:
        DataFrame with the columns:
        Time_Key    - int, hour * 100 + minute
        hour_24     - hour part of TIME_OCC
        minute      - minute part of TIME_OCC
        time_bucket - hour_24 divided by 4, truncated to int
        time_of_day - "Morning" (5-11), "Afternoon" (12-16),
                      "Evening" (17-20), otherwise "Night"
        Full_time   - hour and minute, each left-padded to 2 chars, joined by ":"
    """
    # TIME_OCC is treated as an integer in HHMM form; everything below is
    # integer arithmetic rather than string slicing (original author's note:
    # PHOTON-SAFE).
    hhmm = F.col("TIME_OCC").cast("int")

    # HHMM / 100 truncated to int -> hour; HHMM % 100 -> minute
    hour = (hhmm / 100).cast("int")
    minute = (hhmm % 100).cast("int")

    # Key is rebuilt from the two parts: hour * 100 + minute
    time_key = (hour * 100 + minute).cast("int")

    # "HH:mm" text, built from padded parts (no format_string())
    full_time = F.concat_ws(
        ":",
        F.lpad(hour.cast("string"), 2, "0"),
        F.lpad(minute.cast("string"), 2, "0")
    )

    # The day is split into 4-hour buckets
    bucket = (hour / 4).cast("int")

    # Named part of the day; anything outside 5..20 falls through to "Night"
    part_of_day = (
        F.when(hour.between(5, 11), "Morning")
         .when(hour.between(12, 16), "Afternoon")
         .when(hour.between(17, 20), "Evening")
         .otherwise("Night")
    )

    return (
        dlt.read("crime_silver")
        .select(
            time_key.alias("Time_Key"),
            hour.alias("hour_24"),
            minute.alias("minute"),
            bucket.alias("time_bucket"),
            part_of_day.alias("time_of_day"),
            full_time.alias("Full_time")
        )
        .dropDuplicates()
    )

In [0]:
from pyspark.sql.types import (
    StructType, StructField, LongType, StringType,
    IntegerType, DoubleType
)

@dlt.table(
    name="dim_location",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_location():
    """Build the location dimension from ``crime_silver``.

    Each distinct (LOCATION, CROSS_STREET, AREA, AREA_NAME, LAT, LON)
    combination that has both coordinates gets a surrogate key computed with
    ``xxhash64`` over those six columns. Locations with a missing LAT or LON
    are represented by one explicit "unknown" member with Location_Key = 0.

    Returns:
        DataFrame with the columns Location_Key, LOCATION, CROSS_STREET, AREA,
        AREA_NAME, LAT, LON and Source_System (constant "LA_CRIME"), with one
        row per Location_Key.
    """
    # Read from Silver layer
    src = dlt.read("crime_silver")

    location_cols = ["LOCATION", "CROSS_STREET", "AREA", "AREA_NAME", "LAT", "LON"]

    # Rows without coordinates all map to key 0. They are left out here so
    # that the explicit unknown row below is the only key-0 row; otherwise
    # dropDuplicates(["Location_Key"]) would keep an arbitrary one of them and
    # the content of the unknown member could be a random source address.
    has_coordinates = F.col("LAT").isNotNull() & F.col("LON").isNotNull()

    base = (
        src.select(*location_cols)
           .filter(has_coordinates)
           .dropDuplicates()
           # Surrogate key: hash of the natural-key columns
           .withColumn("Location_Key", F.xxhash64(*location_cols).cast("long"))
           .withColumn("Source_System", F.lit("LA_CRIME"))
           .select("Location_Key", *location_cols, "Source_System")
    )

    # ---------- UNKNOWN LOCATION ROW (EXPLICIT SCHEMA) ----------
    generic_schema = StructType([
        StructField("Location_Key", LongType(), False),
        StructField("LOCATION", StringType(), True),
        StructField("CROSS_STREET", StringType(), True),
        StructField("AREA", IntegerType(), True),
        StructField("AREA_NAME", StringType(), True),
        StructField("LAT", DoubleType(), True),
        StructField("LON", DoubleType(), True),
        StructField("Source_System", StringType(), True)
    ])

    generic = spark.createDataFrame(
        [
            (
                0,
                "City of Los Angeles (Unknown location)",
                None,
                None,
                "City of Los Angeles",
                None,
                None,
                "LA_CRIME"
            )
        ],
        schema=generic_schema
    )

    # Combine known locations with the unknown member. The final
    # dropDuplicates stays as a guard so Location_Key remains unique even if
    # two different locations hash to the same value.
    df = base.unionByName(generic, allowMissingColumns=True).dropDuplicates(["Location_Key"])

    return df