In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import dlt

def gold_inspection_df():
    """Read the gold_inspection table from the workspace.`damg-midterm` schema.

    Returns:
        The DataFrame produced by ``spark.read.table`` for that table. ``spark``
        is not defined in this file; it is expected to be provided by the
        notebook runtime.
    """
    source_table = "workspace.`damg-midterm`.gold_inspection"
    return spark.read.table(source_table)

def dim(table_name):
    """Read one table from the workspace.`damg-midterm` schema by name.

    Args:
        table_name: Unqualified table name; it is interpolated as-is after the
            catalog and schema prefix.

    Returns:
        The DataFrame produced by ``spark.read.table`` for the qualified name.
    """
    qualified_name = f"workspace.`damg-midterm`.{table_name}"
    return spark.read.table(qualified_name)


In [0]:
@dlt.table(
    name="fact_inspection",
    table_properties={"delta.columnMapping.mode": "name"}
)
def fact_inspection():
    """Build the fact_inspection table from gold_inspection and its dimensions.

    Each gold_inspection row is left-joined to every dimension table to pick up
    that dimension's surrogate key. Because all joins are left joins, a row with
    no match in a dimension is kept and simply carries a NULL key for it.

    Returns:
        A DataFrame with the columns Inspection_Key, Business_Key, Location_Key,
        Date_Key, Violation_Key, Inspection_Type_Key, Inspection_Result_Key,
        Risk_Category_Key, Inspection_Score, Violation_Count,
        Inspector_Comments and Source_System.
    """
    # Violation_Count = number of gold rows sharing one inspection_id_src.
    # It is added before aliasing so that every column of the aliased frame,
    # including this one, can be addressed unambiguously through "g.".
    # NOTE(review): count(lit(1)) counts every row of the inspection, including
    # any row whose violation_code is NULL - confirm that is the intent.
    per_inspection = Window.partitionBy("inspection_id_src")
    g = (
        gold_inspection_df()
        .withColumn("Violation_Count", F.count(F.lit(1)).over(per_inspection))
        .alias("g")
    )

    dim_date     = dim("dim_date").alias("dd")
    dim_loc      = dim("dim_location").alias("dl")
    dim_risk     = dim("dim_risk_category").alias("dr")
    dim_it       = dim("dim_inspection_type").alias("dit")
    dim_result   = dim("dim_inspection_result").alias("dir")
    dim_viol     = dim("dim_violation").alias("dv")
    dim_business = dim("dim_business").alias("db")

    # Business: case-insensitive name match, facility type compared with NULL
    # treated as "", and only rows flagged Is_Active = 1. The activity filter
    # lives in the join condition (not a WHERE) so unmatched gold rows survive.
    business_match = (
        (
            F.upper(F.col("g.business_name")) ==
            F.upper(F.col("db.Business_Name"))
        )
        & (
            F.coalesce(F.col("g.facility_type"), F.lit("")) ==
            F.coalesce(F.col("db.Facility_Type"), F.lit(""))
        )
        & (F.col("db.Is_Active") == F.lit(1))
    )
    date_match = F.col("g.inspection_date") == F.col("dd.Full_Date")
    location_match = (
        (F.col("g.location_address") == F.col("dl.Street_Number")) &
        (F.col("g.src_city")         == F.col("dl.City")) &
        (F.col("g.state")            == F.col("dl.State")) &
        (F.col("g.zip_code")         == F.col("dl.Zip_Code"))
    )
    risk_match = (
        (F.col("g.risk_category")  == F.col("dr.Risk_Desc")) &
        (F.col("g.risk_level_num") == F.col("dr.Risk_Level_Num"))
    )
    inspection_type_match = (
        F.col("g.inspection_type") == F.col("dit.Inspection_Type")
    )
    result_match = F.col("g.result_desc") == F.col("dir.Result_Desc")
    violation_match = (
        F.col("g.violation_code").cast("int") == F.col("dv.Violation_Code")
    )

    fact = (
        g.join(dim_business, business_match, "left")
        .join(dim_date, date_match, "left")
        .join(dim_loc, location_match, "left")
        .join(dim_risk, risk_match, "left")
        .join(dim_it, inspection_type_match, "left")
        .join(dim_result, result_match, "left")
        .join(dim_viol, violation_match, "left")
    )

    # Surrogate key: a dense 1..N row number over the whole result.
    # The primary ordering (inspection_id_src, Violation_Key) is not unique -
    # e.g. several rows of one inspection with a NULL Violation_Key - and
    # row_number() assigns tied rows in arbitrary order, which would make the
    # key change between refreshes. The remaining output columns are appended
    # as tie-breakers so that rows differing in any output column always get
    # the same key.
    # NOTE(review): a window without partitionBy moves all rows to a single
    # partition; acceptable only while the fact table stays small.
    key_order = Window.orderBy(
        F.col("g.inspection_id_src"),
        F.col("dv.Violation_Key"),
        F.col("db.Business_Key"),
        F.col("dl.Location_Key"),
        F.col("dd.Date_key"),
        F.col("dit.Inspection_Type_Key"),
        F.col("dir.Result_Key"),
        F.col("dr.Risk_Category_Key"),
        F.col("g.score"),
        F.col("g.violation_comments"),
    )
    fact = fact.withColumn("Inspection_Key", F.row_number().over(key_order))

    return fact.select(
        F.col("Inspection_Key"),
        F.col("db.Business_Key").alias("Business_Key"),
        F.col("dl.Location_Key").alias("Location_Key"),
        F.col("dd.Date_key").alias("Date_Key"),
        F.col("dv.Violation_Key").alias("Violation_Key"),
        F.col("dit.Inspection_Type_Key").alias("Inspection_Type_Key"),
        F.col("dir.Result_Key").alias("Inspection_Result_Key"),
        F.col("dr.Risk_Category_Key").alias("Risk_Category_Key"),
        F.col("g.score").cast("int").alias("Inspection_Score"),
        F.col("g.Violation_Count").cast("int").alias("Violation_Count"),
        F.col("g.violation_comments").alias("Inspector_Comments"),
        F.lit("FOOD_INSPECTIONS").alias("Source_System")
    )
