In [0]:
import dlt
from pyspark.sql import functions as F

def gold_inspection_df():
    """Read the gold inspection table used as the source for the dimensions.

    The read goes through the ``spark`` object, which this file does not
    define (presumably injected by the notebook runtime).

    Returns:
        The result of ``spark.read.table`` for
        ``workspace.`damg-midterm`.gold_inspection``.
    """
    source_table = "workspace.`damg-midterm`.gold_inspection"
    return spark.read.table(source_table)


In [0]:
@dlt.table(
    name="dim_date",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_date():
    """Build the date dimension from the distinct inspection dates.

    One row is produced per distinct non-null ``inspection_date`` found in
    the gold inspection table. ``Date_key`` is the date formatted as
    ``yyyyMMdd`` and cast to int, and ``Fiscal_Year`` is set to the calendar
    year of the date.

    Returns:
        DataFrame with columns Date_key, Full_Date, Year, Quarter, Month,
        Day, Day_of_Week, Is_Weekend, Fiscal_Year, Source_System.
    """
    full_date = F.col("Full_Date")
    weekday = F.dayofweek(full_date)

    inspection_dates = (
        gold_inspection_df()
        .select(F.col("inspection_date").alias("Full_Date"))
        .filter(full_date.isNotNull())
        .dropDuplicates()
    )

    # Every attribute is derived from Full_Date alone, so the whole dimension
    # can be projected in a single select.
    return inspection_dates.select(
        F.date_format(full_date, "yyyyMMdd").cast("int").alias("Date_key"),
        full_date,
        F.year(full_date).alias("Year"),
        F.quarter(full_date).alias("Quarter"),
        F.month(full_date).alias("Month"),
        F.dayofmonth(full_date).alias("Day"),
        weekday.alias("Day_of_Week"),
        # Spark's dayofweek numbers Sunday as 1 and Saturday as 7.
        weekday.isin(1, 7).cast("int").alias("Is_Weekend"),
        F.year(full_date).alias("Fiscal_Year"),
        F.lit("FOOD_INSPECTIONS").alias("Source_System"),
    )


In [0]:
@dlt.table(
    name="dim_location",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_location():
    """Build the location dimension from the distinct address attributes.

    One row is produced per distinct combination of street address, city,
    state, zip code, latitude and longitude in the gold inspection table.
    No null filter is applied, so combinations containing nulls are kept.

    ``Location_Key`` is a 64-bit hash (bigint, may be negative) of the
    attribute values. It replaces ``monotonically_increasing_id()``, whose
    values depend on partition IDs and therefore can change every time the
    table is recomputed, which would leave previously stored keys pointing at
    the wrong location. Hashing the attributes gives the same location the
    same key on every run. Hash collisions are theoretically possible but
    negligible at dimension-table sizes.

    Returns:
        DataFrame with columns Location_Key, Street_Address, City, State,
        Zip_Code, Latitude, Longitude, Source_System.
    """
    natural_cols = [
        "Street_Address",
        "City",
        "State",
        "Zip_Code",
        "Latitude",
        "Longitude",
    ]

    src = gold_inspection_df()

    df = (
        src
        .select(
            F.col("location_address").alias("Street_Address"),
            F.col("src_city").alias("City"),
            F.col("state").alias("State"),
            F.col("zip_code").alias("Zip_Code"),
            F.col("latitude").alias("Latitude"),
            F.col("longitude").alias("Longitude")
        )
        .distinct()
        .withColumn(
            "Location_Key",
            F.xxhash64(
                *[F.col(c) for c in natural_cols],
                # A null input leaves the running hash unchanged, so the null
                # flags are hashed too; otherwise two rows whose nulls sit in
                # different columns could end up with the same key.
                *[F.col(c).isNull() for c in natural_cols],
            ),
        )
        .withColumn("Source_System", F.lit("FOOD_INSPECTIONS"))
    )

    return df.select(
        "Location_Key",
        "Street_Address",
        "City",
        "State",
        "Zip_Code",
        "Latitude",
        "Longitude",
        "Source_System"
    )


In [0]:
@dlt.table(
    name="dim_inspection_type",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_inspection_type():
    """Build the inspection-type dimension from the distinct inspection types.

    One row is produced per distinct non-null ``inspection_type`` in the gold
    inspection table. ``Type`` currently mirrors ``Inspection_Description``.

    ``Inspection_Type_Key`` is a 64-bit hash (bigint, may be negative) of the
    description. It replaces ``monotonically_increasing_id()``, whose values
    depend on partition IDs and therefore can change every time the table is
    recomputed. Hashing the description gives the same inspection type the
    same key on every run. Hash collisions are theoretically possible but
    negligible at dimension-table sizes.

    Returns:
        DataFrame with columns Inspection_Type_Key, Inspection_Description,
        Type, Source_System.
    """
    src = gold_inspection_df()

    df = (
        src
        .select(F.col("inspection_type").alias("Inspection_Description"))
        .where(F.col("Inspection_Description").isNotNull())
        .distinct()
        # The description is non-null here, so it can be hashed directly.
        .withColumn(
            "Inspection_Type_Key",
            F.xxhash64(F.col("Inspection_Description")),
        )
        # If you want a separate "Type" classification later, adjust here
        .withColumn("Type", F.col("Inspection_Description"))
        .withColumn("Source_System", F.lit("FOOD_INSPECTIONS"))
    )

    return df.select(
        "Inspection_Type_Key",
        "Inspection_Description",
        "Type",
        "Source_System"
    )


In [0]:
@dlt.table(
    name="dim_inspection_result",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_inspection_result():
    """Build the inspection-result dimension.

    One row is produced per distinct combination of result description,
    violation comments and score in the gold inspection table, keeping only
    rows whose result description is not null.

    ``Result_Key`` is a 64-bit hash (bigint, may be negative) of the three
    attributes. It replaces ``monotonically_increasing_id()``, whose values
    depend on partition IDs and therefore can change every time the table is
    recomputed. Hashing the attributes gives the same combination the same
    key on every run. Hash collisions are theoretically possible but
    negligible at dimension-table sizes.

    Returns:
        DataFrame with columns Result_Key, Result_Desc, Inspector_comments,
        Score, Source_System.
    """
    natural_cols = ["Result_Desc", "Inspector_comments", "Score"]

    src = gold_inspection_df()

    df = (
        src
        .select(
            F.col("result_desc").alias("Result_Desc"),
            F.col("violation_comments").alias("Inspector_comments"),
            F.col("score").alias("Score")
        )
        .where(F.col("Result_Desc").isNotNull())
        .distinct()
        .withColumn(
            "Result_Key",
            F.xxhash64(
                *[F.col(c) for c in natural_cols],
                # A null input leaves the running hash unchanged, so the null
                # flags are hashed too to keep rows that differ only in which
                # column is null apart.
                *[F.col(c).isNull() for c in natural_cols],
            ),
        )
        .withColumn("Source_System", F.lit("FOOD_INSPECTIONS"))
    )

    return df.select(
        "Result_Key",
        "Result_Desc",
        "Inspector_comments",
        "Score",
        "Source_System"
    )


In [0]:
@dlt.table(
    name="dim_risk_category",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_risk_category():
    """Build the risk-category dimension.

    One row is produced per distinct (risk category, risk level number) pair
    in the gold inspection table, keeping only rows whose risk category is
    not null.

    ``Risk_Category_Key`` is a 64-bit hash (bigint, may be negative) of the
    two attributes. It replaces ``monotonically_increasing_id()``, whose
    values depend on partition IDs and therefore can change every time the
    table is recomputed. Hashing the attributes gives the same category the
    same key on every run. Hash collisions are theoretically possible but
    negligible at dimension-table sizes.

    Returns:
        DataFrame with columns Risk_Category_Key, Risk_Desc, Risk_Level_Num,
        Source_System.
    """
    natural_cols = ["Risk_Desc", "Risk_Level_Num"]

    src = gold_inspection_df()

    df = (
        src
        .select(
            F.col("risk_category").alias("Risk_Desc"),
            F.col("risk_level_num").alias("Risk_Level_Num")
        )
        .where(F.col("Risk_Desc").isNotNull())
        .distinct()
        .withColumn(
            "Risk_Category_Key",
            F.xxhash64(
                *[F.col(c) for c in natural_cols],
                # A null input leaves the running hash unchanged, so the null
                # flags are hashed as well to make a missing level explicit.
                *[F.col(c).isNull() for c in natural_cols],
            ),
        )
        .withColumn("Source_System", F.lit("FOOD_INSPECTIONS"))
    )

    return df.select(
        "Risk_Category_Key",
        "Risk_Desc",
        "Risk_Level_Num",
        "Source_System"
    )


In [0]:
@dlt.table(
    name="dim_violation",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_violation():
    """Build the violation dimension.

    One row is produced per distinct (violation code, violation description)
    pair in the gold inspection table. The code is cast to int and rows whose
    cast code is null are dropped. The description is ``violation_desc`` when
    present, otherwise ``violation_desc_raw``.

    ``Violation_Key`` is a 64-bit hash (bigint, may be negative) of the code
    and description. It replaces ``monotonically_increasing_id()``, whose
    values depend on partition IDs and therefore can change every time the
    table is recomputed. Hashing the attributes gives the same violation the
    same key on every run. Hash collisions are theoretically possible but
    negligible at dimension-table sizes.

    Returns:
        DataFrame with columns Violation_Key, Violation_Code, Violation_Desc,
        Source_System.
    """
    natural_cols = ["Violation_Code", "Violation_Desc"]

    src = gold_inspection_df()

    df = (
        src
        .select(
            F.col("violation_code").cast("int").alias("Violation_Code"),
            # Use standardized description if present, else raw
            F.coalesce(F.col("violation_desc"), F.col("violation_desc_raw")).alias("Violation_Desc")
        )
        .where(F.col("Violation_Code").isNotNull())
        .distinct()
        .withColumn(
            "Violation_Key",
            F.xxhash64(
                *[F.col(c) for c in natural_cols],
                # A null input leaves the running hash unchanged, so the null
                # flags are hashed as well to make a missing description
                # explicit.
                *[F.col(c).isNull() for c in natural_cols],
            ),
        )
        .withColumn("Source_System", F.lit("FOOD_INSPECTIONS"))
    )

    return df.select(
        "Violation_Key",
        "Violation_Code",
        "Violation_Desc",
        "Source_System"
    )
