In [0]:
import dlt
from pyspark.sql import functions as F
from pyspark.sql.window import Window


In [0]:
def silver_df():
    """Return a batch DataFrame read from the crime_silver table.

    Every dimension builder in this notebook calls this helper so the
    source table name is spelled out in exactly one place.
    """
    silver_table = "workspace.`damg7370-la-crime`.crime_silver"
    return spark.read.table(silver_table)


In [0]:
@dlt.table(
    name="dim_premise",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_premise():
    """Build the premise dimension with exactly one row per non-null PREMIS_CD.

    Returns:
        DataFrame with columns premise_key, premise_desc, premise_type and
        Source_System.
    """
    # Aggregate per code rather than de-duplicating (code, description) pairs:
    # a code that occurs with more than one description, or with a null
    # description on some rows, would otherwise produce several rows sharing
    # the same premise_key. max() ignores nulls and is deterministic.
    premises = (
        silver_df()
        .where(F.col("PREMIS_CD").isNotNull())
        .groupBy("PREMIS_CD")
        .agg(F.max("PREMIS_DESC").alias("premise_desc"))
        .withColumnRenamed("PREMIS_CD", "premise_key")
    )

    desc = F.col("premise_desc")

    # First matching rule wins; a null description falls through to "Other".
    # NOTE(review): the patterns are unanchored substring matches, so e.g.
    # "PARK" also matches "PARKING" and "BAR" also matches "BARBER" - confirm
    # that this is the intended classification.
    premise_type = (
        F.when(desc.rlike("(?i)RESIDENCE|APARTMENT|HOME|HOUSE"), "Residential")
         .when(desc.rlike("(?i)SCHOOL|COLLEGE|UNIVERSITY"), "Educational")
         .when(desc.rlike("(?i)PARK|PLAYGROUND|RECREATION"), "Park/Outdoor")
         .when(desc.rlike("(?i)BAR|RESTAURANT|CAFE|CLUB"), "Food & Entertainment")
         .otherwise("Other")
    )

    return (
        premises
        .withColumn("premise_type", premise_type)
        .withColumn("Source_System", F.lit("LA_CRIME"))
        .select(
            "premise_key",
            "premise_desc",
            "premise_type",
            "Source_System"
        )
    )


In [0]:
@dlt.table(
    name="dim_weapon",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_weapon():
    """Build the weapon dimension with exactly one row per non-null WEAPON_USED_CD.

    Returns:
        DataFrame with columns weapon_key, weapon_desc, weapon_category and
        Source_System.
    """
    # Aggregate per code rather than de-duplicating (code, description) pairs,
    # so a code seen with several descriptions (or a null description) cannot
    # yield multiple rows with the same weapon_key. max() ignores nulls and
    # is deterministic.
    weapons = (
        silver_df()
        .where(F.col("WEAPON_USED_CD").isNotNull())
        .groupBy("WEAPON_USED_CD")
        .agg(F.max("WEAPON_DESC").alias("weapon_desc"))
        .withColumnRenamed("WEAPON_USED_CD", "weapon_key")
    )

    desc = F.col("weapon_desc")

    # First matching rule wins; a null description falls through to "Other".
    weapon_category = (
        F.when(desc.rlike("(?i)GUN|FIREARM|HANDGUN|RIFLE"), "Firearm")
         .when(desc.rlike("(?i)KNIFE|STAB|CUTTING"), "Knife")
         .when(desc.rlike("(?i)BLUNT|CLUB|BAT|HAMMER"), "Blunt Object")
         .when(desc.rlike("(?i)FIST|HANDS|FEET|PERSONAL"), "Personal Weapon")
         .otherwise("Other")
    )

    return (
        weapons
        .withColumn("weapon_category", weapon_category)
        .withColumn("Source_System", F.lit("LA_CRIME"))
        .select(
            "weapon_key",
            "weapon_desc",
            "weapon_category",
            "Source_System"
        )
    )


In [0]:
@dlt.table(
    name="dim_status",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_status():
    """Build the case-status dimension with exactly one row per non-null STATUS.

    Returns:
        DataFrame with columns status_key, status_desc, arrest_category and
        Source_System.
    """
    # Aggregate per code rather than de-duplicating (code, description) pairs,
    # so a status seen with several descriptions (or a null description)
    # cannot yield multiple rows with the same status_key. max() ignores
    # nulls and is deterministic.
    statuses = (
        silver_df()
        .where(F.col("STATUS").isNotNull())
        .groupBy("STATUS")
        .agg(F.max("STATUS_DESC").alias("status_desc"))
        .withColumnRenamed("STATUS", "status_key")
    )

    key = F.col("status_key")

    # Classification is driven by the first character of the status code.
    # NOTE(review): every code starting with "A" / "J" is labelled an arrest;
    # confirm no such code denotes a non-arrest outcome.
    arrest_category = (
        F.when(key.rlike("^A"), "Adult Arrest")
         .when(key.rlike("^J"), "Juvenile Arrest")
         .otherwise("No Arrest")
    )

    return (
        statuses
        .withColumn("arrest_category", arrest_category)
        .withColumn("Source_System", F.lit("LA_CRIME"))
        .select(
            "status_key",
            "status_desc",
            "arrest_category",
            "Source_System"
        )
    )


In [0]:
@dlt.table(
    name="dim_victim",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_victim():
    """Build the victim dimension from distinct (age, sex, descent) combinations.

    The source has no natural key for victim attributes, so a surrogate
    victim_key is assigned with row_number() over the three attributes.

    Returns:
        DataFrame with columns victim_key, age, age_group, gender,
        descent_code and Source_System.
    """
    attribute_cols = ["VICT_AGE_CLEAN", "VICT_SEX", "VICT_DESCENT"]
    age = F.col("VICT_AGE_CLEAN")

    distinct_victims = silver_df().select(*attribute_cols).dropDuplicates()

    # The window has no partitionBy, so the numbering is global; ordering by
    # every attribute of the de-duplicated set makes the numbering total.
    key_order = Window.orderBy(*attribute_cols)

    # Buckets are evaluated top-down, so each upper bound implicitly excludes
    # the ranges already matched above it.
    age_bucket = (
        F.when(age.isNull(), "Unknown")
         .when(age < 18, "0-17")
         .when(age <= 24, "18-24")
         .when(age <= 34, "25-34")
         .when(age <= 44, "35-44")
         .when(age <= 54, "45-54")
         .when(age <= 64, "55-64")
         .otherwise("65+")
    )

    return distinct_victims.select(
        F.row_number().over(key_order).alias("victim_key"),
        age.alias("age"),
        age_bucket.alias("age_group"),
        F.col("VICT_SEX").alias("gender"),
        F.col("VICT_DESCENT").alias("descent_code"),
        F.lit("LA_CRIME").alias("Source_System"),
    )


In [0]:
@dlt.table(
    name="dim_crime_code",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_crime_code():
    """Build the crime-code dimension with exactly one row per non-null CRM_CD.

    Returns:
        DataFrame with columns crime_code_key, crime_desc, crime_category,
        is_violent_crime, is_property_crime and Source_System.
    """
    # Aggregate per code rather than de-duplicating (code, description) pairs,
    # so a code seen with several descriptions (or a null description) cannot
    # yield multiple rows with the same crime_code_key. max() ignores nulls
    # and is deterministic.
    crime_codes = (
        silver_df()
        .where(F.col("CRM_CD").isNotNull())
        .groupBy("CRM_CD")
        .agg(F.max("CRM_CD_DESC").alias("crime_desc"))
        .withColumnRenamed("CRM_CD", "crime_code_key")
    )

    desc = F.col("crime_desc")

    # Simple keyword classification - refine the patterns as needed.
    # First matching rule wins; a null description falls through to "Other".
    crime_category = (
        F.when(desc.rlike("(?i)HOMICIDE|ASSAULT|RAPE|ROBBERY"), "Violent")
         .when(desc.rlike("(?i)BURGLARY|THEFT|LARCENY|VANDALISM"), "Property")
         .otherwise("Other")
    )

    return (
        crime_codes
        .withColumn("crime_category", crime_category)
        # Boolean flags derived from the category for easy filtering.
        .withColumn(
            "is_violent_crime",
            F.col("crime_category") == F.lit("Violent")
        )
        .withColumn(
            "is_property_crime",
            F.col("crime_category") == F.lit("Property")
        )
        .withColumn("Source_System", F.lit("LA_CRIME"))
        .select(
            "crime_code_key",
            "crime_desc",
            "crime_category",
            "is_violent_crime",
            "is_property_crime",
            "Source_System"
        )
    )
