In [0]:
import dlt
from pyspark.sql import functions as F
from pyspark.sql.window import Window


def gold_inspection_df():
    """Return the gold inspection table as a batch DataFrame.

    The table is read through the ambient ``spark`` session; every dimension
    builder in this notebook derives its rows from this single source.
    """
    source_table = "workspace.`damg-midterm`.gold_inspection"
    return spark.read.table(source_table)


In [0]:
import dlt
from pyspark.sql import functions as F

@dlt.table(
    name="dim_date",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_date():
    """Build the calendar dimension: one row per day from 2021-01-01 to today.

    Returns a DataFrame with the columns Date_key (yyyyMMdd as int),
    Full_Date, Year, Quarter, Month, Day, Day_of_Week, Is_Weekend (1/0),
    Fiscal_Year and Source_System.
    """
    # Fixed lower bound; the upper bound is evaluated at run time.
    first_day = F.to_date(F.lit("2021-01-01"))
    last_day = F.current_date()

    # A single seed row is expanded into one row per calendar day.
    calendar = (
        spark.createDataFrame([(1,)], ["dummy"])
        .select(F.explode(F.sequence(first_day, last_day)).alias("Full_Date"))
    )

    full_date = F.col("Full_Date")
    # F.dayofweek numbers days 1 (Sunday) through 7 (Saturday).
    day_of_week = F.dayofweek(full_date)

    return calendar.select(
        F.date_format(full_date, "yyyyMMdd").cast("int").alias("Date_key"),
        full_date,
        F.year(full_date).alias("Year"),
        F.quarter(full_date).alias("Quarter"),
        F.month(full_date).alias("Month"),
        F.dayofmonth(full_date).alias("Day"),
        day_of_week.alias("Day_of_Week"),
        day_of_week.isin(1, 7).cast("int").alias("Is_Weekend"),
        # Fiscal year mirrors the calendar year; change if your FY is different.
        F.year(full_date).alias("Fiscal_Year"),
        F.lit("FOOD_INSPECTIONS").alias("Source_System"),
    )


In [0]:
@dlt.table(
    name="dim_location",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_location():
    """Build the location dimension from the gold inspection table.

    Each distinct combination of address, city, state, zip code, latitude and
    longitude becomes one row and receives a ``Location_Key`` starting at 1.

    Returns a DataFrame with the columns Location_Key, Street_Number, City,
    State, Zip_Code, Latitude, Longitude and Source_System.
    """
    src = gold_inspection_df()

    base = (
        src.select(
            F.col("location_address").alias("Street_Number"),
            F.col("src_city").alias("City"),
            F.col("state").alias("State"),
            F.col("zip_code").alias("Zip_Code"),
            F.col("latitude").alias("Latitude"),
            F.col("longitude").alias("Longitude")
        )
        .distinct()
    )

    # Order by EVERY column that takes part in the distinct() above. Sorting
    # on the address fields alone leaves ties between rows that differ only in
    # Latitude/Longitude, and row_number() breaks ties arbitrarily, so keys
    # could change between pipeline runs.
    # NOTE(review): a window without partitionBy is evaluated on a single
    # partition; presumably acceptable for a dimension of this size - confirm.
    w = Window.orderBy(
        F.col("City"),
        F.col("State"),
        F.col("Zip_Code"),
        F.col("Street_Number"),
        F.col("Latitude"),
        F.col("Longitude")
    )

    df = (
        base
        .withColumn("Location_Key", F.row_number().over(w))  # starts at 1
        .withColumn("Source_System", F.lit("FOOD_INSPECTIONS"))
    )

    return df.select(
        "Location_Key",
        "Street_Number",
        "City",
        "State",
        "Zip_Code",
        "Latitude",
        "Longitude",
        "Source_System"
    )


In [0]:
from pyspark.sql.window import Window
import dlt
from pyspark.sql import functions as F

@dlt.table(
    name="dim_inspection_type",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_inspection_type():
    """Build the inspection-type dimension from the gold inspection table.

    Keeps the distinct non-null inspection types and numbers them from 1 in
    ascending order of the type value.

    Returns a DataFrame with the columns Inspection_Type_Key, Inspection_Type
    and Source_System.
    """
    inspection_types = (
        gold_inspection_df()
        .where(F.col("inspection_type").isNotNull())
        .select(F.col("inspection_type").alias("Inspection_Type"))
        .distinct()
    )

    # Values are unique after distinct(), so this ordering has no ties.
    key_order = Window.orderBy("Inspection_Type")

    return inspection_types.select(
        F.row_number().over(key_order).alias("Inspection_Type_Key"),
        F.col("Inspection_Type"),
        F.lit("FOOD_INSPECTIONS").alias("Source_System"),
    )


In [0]:
from pyspark.sql.window import Window
import dlt
from pyspark.sql import functions as F

@dlt.table(
    name="dim_inspection_result",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_inspection_result():
    """Build the inspection-result dimension from the gold inspection table.

    Keeps the distinct non-null result descriptions and assigns surrogate keys
    from 1 in ascending order of the description.

    Returns a DataFrame with the columns Result_Key, Result_Desc and
    Source_System.
    """
    gold = gold_inspection_df()

    non_null_results = gold.filter(F.col("result_desc").isNotNull())
    result_values = (
        non_null_results
        .select(F.col("result_desc").alias("Result_Desc"))
        .distinct()
    )

    # One row per description, so ordering by it yields a stable numbering.
    by_description = Window.orderBy(F.col("Result_Desc"))
    result_key = F.row_number().over(by_description)
    source_system = F.lit("FOOD_INSPECTIONS")

    return result_values.select(
        result_key.alias("Result_Key"),
        F.col("Result_Desc"),
        source_system.alias("Source_System"),
    )


In [0]:
from pyspark.sql.window import Window
import dlt
from pyspark.sql import functions as F

@dlt.table(
    name="dim_risk_category",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_risk_category():
    """Build the risk-category dimension from the gold inspection table.

    Keeps the distinct (risk description, risk level) pairs whose description
    is not null and numbers them from 1, ordered by description then level.

    Returns a DataFrame with the columns Risk_Category_Key, Risk_Desc,
    Risk_Level_Num and Source_System.
    """
    risk_pairs = (
        gold_inspection_df()
        .where(F.col("risk_category").isNotNull())
        .select(
            F.col("risk_category").alias("Risk_Desc"),
            F.col("risk_level_num").alias("Risk_Level_Num"),
        )
        .distinct()
    )

    # Both distinct columns take part in the ordering, so no two rows tie.
    key_order = Window.orderBy("Risk_Desc", "Risk_Level_Num")

    return risk_pairs.select(
        F.row_number().over(key_order).alias("Risk_Category_Key"),
        F.col("Risk_Desc"),
        F.col("Risk_Level_Num"),
        F.lit("FOOD_INSPECTIONS").alias("Source_System"),
    )


In [0]:
from pyspark.sql.window import Window
import dlt
from pyspark.sql import functions as F

@dlt.table(
    name="dim_violation",
    table_properties={"delta.columnMapping.mode": "name"}
)
def dim_violation():
    """Build the violation dimension from the gold inspection table.

    The violation code is cast to int; rows whose cast code or description is
    null are dropped. Each remaining distinct (code, description) pair gets a
    ``Violation_Key`` starting at 1.

    Returns a DataFrame with the columns Violation_Key, Violation_Code,
    Violation_Desc and Source_System.
    """
    src = gold_inspection_df()

    base = (
        src.select(
            F.col("violation_code").cast("int").alias("Violation_Code"),
            F.col("violation_desc").alias("Violation_Desc")
        )
        # remove the null / bad violation rows
        .where(F.col("Violation_Code").isNotNull())
        .where(F.col("Violation_Desc").isNotNull())
        .distinct()
    )

    # Keys start from 1. Violation_Desc is a tie-breaker: distinct() works on
    # the (code, description) pair, so one code may appear with several
    # descriptions. Ordering by code alone would let row_number() number those
    # rows in an arbitrary order that can differ between pipeline runs.
    w = Window.orderBy(F.col("Violation_Code"), F.col("Violation_Desc"))

    df = (
        base
        .withColumn("Violation_Key", F.row_number().over(w))  # 1,2,3,...
        .withColumn("Source_System", F.lit("FOOD_INSPECTIONS"))
    )

    return df.select(
        "Violation_Key",
        "Violation_Code",
        "Violation_Desc",
        "Source_System"
    )
