In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import dlt

In [0]:
def gold_inspection_df():
    """Return the gold inspection table as a DataFrame."""
    gold_table = "workspace.`damg-midterm`.gold_inspection"
    return spark.read.table(gold_table)

def dim(table_name):
    """Return the table named *table_name* under workspace.damg-midterm."""
    qualified_name = f"workspace.`damg-midterm`.{table_name}"
    reader = spark.read
    return reader.table(qualified_name)


In [0]:
@dlt.table(
    name="fact_inspection",
    table_properties={"delta.columnMapping.mode": "name"}
)
def fact_inspection():
    """Build the inspection fact table at inspection x violation grain.

    Reads the gold inspection table, derives ``Business_Natural_Key``, a
    normalised result description and a per-inspection row count, left-joins
    the seven dimension tables to pick up their surrogate keys, and numbers
    every resulting row to produce ``Inspection_Key``.

    Returns:
        DataFrame with the columns Inspection_Key, Inspection_Id,
        Business_Key, Location_Key, Date_Key, Violation_Key,
        Inspection_Type_Key, Inspection_Result_Key, Risk_Category_Key,
        Inspection_Score, Violation_Count, Inspector_Comments and
        Source_System. Dimension keys are null where a left join found no
        match.
    """
    # 1) Start from gold
    g = gold_inspection_df()

    # 2) Add Business_Natural_Key (same logic as Business dim)
    # NOTE(review): F.hash yields a 32-bit int, so F.abs can overflow for the
    # minimum int value. Left unchanged here because the key must keep
    # matching dim_business -- fix both places together if confirmed.
    g = (
        g.withColumn("License_Number", F.col("license_number").cast("string"))
         .withColumn("Dataset_City", F.col("city"))
         .withColumn(
             "Business_Natural_Key",
             F.when(
                 (F.col("License_Number").isNotNull()) & (F.col("License_Number") != "0"),
                 # Chicago → license directly
                 F.col("License_Number").cast("bigint")
             ).otherwise(
                 # Dallas → 7-digit numeric hash of name + dataset city
                 (
                     F.abs(
                         F.hash(
                             F.upper(F.trim(F.col("business_name"))),
                             F.upper(F.trim(F.col("Dataset_City")))
                         )
                     ) % 10000000
                 ).cast("int")
             )
         )
         # Null or blank result descriptions become the literal "No Entry",
         # which is the value later matched against dim_inspection_result.
         .withColumn(
            "result_desc_norm",
            F.when(
                F.col("result_desc").isNull() | (F.trim(F.col("result_desc")) == ""),
                F.lit("No Entry")
            ).otherwise(F.col("result_desc"))
         )
    )

    # 3) Violation count per inspection
    # Counts gold rows sharing an inspection_id_src, computed before the
    # dimension joins.
    # NOTE(review): a row with a null violation_code is still counted --
    # confirm that is the intended meaning of Violation_Count.
    w_viols = Window.partitionBy("inspection_id_src")
    g = g.withColumn("Violation_Count", F.count(F.lit(1)).over(w_viols))

    # Re-alias AFTER all withColumn calls
    g = g.alias("g")

    # 4) Load dims
    dim_date     = dim("dim_date").alias("dd")
    dim_loc      = dim("dim_location").alias("dl")
    dim_risk     = dim("dim_risk_category").alias("dr")
    dim_it       = dim("dim_inspection_type").alias("dit")
    dim_result   = dim("dim_inspection_result").alias("dir")
    dim_viol     = dim("dim_violation").alias("dv")
    dim_business = dim("dim_business").alias("db")

    # 5) Join dims (all left joins, so unmatched gold rows are kept with
    #    null dimension keys)

    # Business: join on Business_Natural_Key + current version only
    fact = (
        g.join(
            dim_business,
            (F.col("g.Business_Natural_Key") == F.col("db.Business_Natural_Key")) &
            (F.col("db.Is_Active") == F.lit(1)),
            "left"
        )
        .join(
            dim_date,
            F.col("g.inspection_date") == F.col("dd.Full_Date"),
            "left"
        )
        .join(
            dim_loc,
            (F.col("g.location_address") == F.col("dl.Street_Number")) &
            (F.col("g.src_city")         == F.col("dl.City")) &
            (F.col("g.state")            == F.col("dl.State")) &
            (F.col("g.zip_code")         == F.col("dl.Zip_Code")),
            "left"
        )
        .join(
            dim_risk,
            (F.col("g.risk_category")  == F.col("dr.Risk_Desc")) &
            (F.col("g.risk_level_num") == F.col("dr.Risk_Level_Num")),
            "left"
        )
        .join(
            dim_it,
            F.col("g.inspection_type") == F.col("dit.Inspection_Type"),
            "left"
        )
        .join(
            dim_result,
            F.upper(F.trim(F.col("g.result_desc_norm"))) ==
            F.upper(F.trim(F.col("dir.Result_Desc"))),
            "left"
        )
        .join(
            dim_viol,
            F.col("g.violation_code").cast("int") == F.col("dv.Violation_Key") if False else
            F.col("g.violation_code").cast("int") == F.col("dv.Violation_Code"),
            "left"
        )
    )

    # 6) Inspection surrogate key for the fact (inspection × violation grain)
    # (inspection_id_src, Violation_Key) alone is not unique when several
    # rows of one inspection miss the violation dimension (null key) or repeat
    # a code, and row_number() orders tied rows arbitrarily. The extra
    # tie-breakers make the numbering repeatable across runs without changing
    # the keys of rows that were already uniquely ordered.
    # NOTE(review): the window has no partitionBy, so the whole dataset is
    # ordered as one partition -- watch this as the fact table grows.
    w_key = Window.orderBy(
        F.col("g.inspection_id_src"),
        F.col("dv.Violation_Key"),
        F.col("g.violation_code"),
        F.col("g.violation_comments"),
    )
    fact = fact.withColumn("Inspection_Key", F.row_number().over(w_key))

    # 7) Final projection
    return fact.select(
        F.col("Inspection_Key"),
        F.col("g.inspection_id_src").alias("Inspection_Id"),
        F.col("db.Business_Key").alias("Business_Key"),
        F.col("dl.Location_Key").alias("Location_Key"),
        F.col("dd.Date_Key").alias("Date_Key"),
        F.col("dv.Violation_Key").alias("Violation_Key"),
        F.col("dit.Inspection_Type_Key").alias("Inspection_Type_Key"),
        F.col("dir.Result_Key").alias("Inspection_Result_Key"),
        F.col("dr.Risk_Category_Key").alias("Risk_Category_Key"),
        F.col("g.score").cast("int").alias("Inspection_Score"),
        F.col("Violation_Count").cast("int").alias("Violation_Count"),
        F.col("g.violation_comments").alias("Inspector_Comments"),
        F.lit("FOOD_INSPECTIONS").alias("Source_System")
    )