In [1]:
# ABFSS (OneLake) location of the Silver lakehouse "Tables" folder.
lakehouse_silverTable = "abfss://Fabric_E2E@onelake.dfs.fabric.microsoft.com/Lakehouse_Silver_.Lakehouse/Tables"

# Store the path in the Spark session config under a fixed key, then read it
# back from that same key so the value used below is the one the session holds.
silver_conf_key = "spark.executorEnv.lakehouse_silverTable"
spark.conf.set(silver_conf_key, lakehouse_silverTable)
lakehouse_silver_table = spark.conf.get(silver_conf_key)

# Echo the resolved path so it is visible in the cell output.
print(lakehouse_silver_table)


StatementMeta(, 27a03adf-c6cf-4424-a992-7879b19beb3f, 3, Finished, Available, Finished)

abfss://Fabric_E2E@onelake.dfs.fabric.microsoft.com/Lakehouse_Silver_.Lakehouse/Tables


In [2]:
import traceback

# Build the FactProcedures table: read the raw procedures CSV, derive timestamp,
# date-key, duration and surrogate-key columns, then overwrite the Delta table
# under the Silver lakehouse path. Any failure is reported and swallowed so the
# surrounding pipeline can continue with its next item.
try:
    from pyspark.sql.functions import to_timestamp, date_format, unix_timestamp, row_number, col
    from pyspark.sql.window import Window

    # Load procedures.csv. No schema is supplied or inferred here, so the
    # columns (including BASE_COST) are carried through as read from the file.
    proc_df = spark.read.option("header", "true").csv("Files/raw/procedures.csv")

    # Drop rows missing critical data
    proc_df = proc_df.dropna(subset=['PATIENT', 'ENCOUNTER', 'START', 'STOP', 'BASE_COST'])

    # Convert timestamps and derive the yyyyMMdd date key from the start time.
    # NOTE(review): this runs after the dropna above, so a START/STOP value that
    # is present but does not match the pattern is not filtered out here --
    # confirm whether a post-parse null check is wanted.
    proc_df = proc_df \
        .withColumn("START_TS", to_timestamp("START", "yyyy-MM-dd'T'HH:mm:ssX")) \
        .withColumn("STOP_TS", to_timestamp("STOP", "yyyy-MM-dd'T'HH:mm:ssX")) \
        .withColumn("DIM_DateId", date_format("START_TS", "yyyyMMdd"))

    # Compute procedure duration in minutes (difference of epoch seconds / 60)
    proc_df = proc_df.withColumn(
        "duration_minutes",
        (unix_timestamp("STOP_TS") - unix_timestamp("START_TS")) / 60
    )

    # Add surrogate key using row_number.
    # STOP_TS and CODE are appended as tiebreakers: rows sharing the same
    # PATIENT / ENCOUNTER / START_TS would otherwise be numbered in an arbitrary
    # order that can differ between runs. Rows that already differ on the first
    # three keys keep the same relative order as before.
    # NOTE(review): the window has no partitionBy, so the numbering is computed
    # over the whole dataset at once -- acceptable only while the input is small.
    windowSpec = Window.orderBy("PATIENT", "ENCOUNTER", "START_TS", "STOP_TS", "CODE")
    proc_df = proc_df.withColumn("Fact_ProcedureId", row_number().over(windowSpec))

    # Rename and prepare final columns
    fact_procedures = proc_df.select(
        "Fact_ProcedureId",
        col("ENCOUNTER").alias("DIM_EncounterId"),
        col("PATIENT").alias("DIM_patientId"),
        col("CODE").alias("DIM_ProcedureCode"),
        col("DESCRIPTION").alias("outcome"),
        "DIM_DateId",
        col("BASE_COST").alias("procedure_cost"),
        "duration_minutes"
    )

    # Write to Silver layer (replaces any existing FactProcedures data)
    fact_procedures.write \
        .mode("overwrite") \
        .format("delta") \
        .save(f"{lakehouse_silver_table}/FactProcedures")

except Exception as e:
    # Deliberate best-effort: report the failure and let the pipeline move on.
    print(f"❌ Notebook 'Fact_Procedures' failed: {str(e)} — Skipping to next item in pipeline.")
    # Also emit the full stack trace so the failing step can be diagnosed from
    # the cell output instead of from the one-line message alone.
    traceback.print_exc()

StatementMeta(, 27a03adf-c6cf-4424-a992-7879b19beb3f, 4, Finished, Available, Finished)