In [1]:
# Session-config key under which the Silver table root is stored.
_SILVER_CONF_KEY = "spark.executorEnv.lakehouse_silverTable"

# ABFSS URI of the `Tables` folder of the Silver lakehouse; later cells append
# a table name to this root when reading or writing Delta tables.
lakehouse_silverTable = "abfss://Fabric_E2E@onelake.dfs.fabric.microsoft.com/Lakehouse_Silver_.Lakehouse/Tables"

# Publish the path to the Spark session configuration, then read it back so
# `lakehouse_silver_table` holds the value the session reports.
spark.conf.set(_SILVER_CONF_KEY, lakehouse_silverTable)
lakehouse_silver_table = spark.conf.get(_SILVER_CONF_KEY)

print(lakehouse_silver_table)


StatementMeta(, be922e7c-bbcd-4fbe-bb48-ee2e09f9ffb3, 3, Finished, Available, Finished)

abfss://Fabric_E2E@onelake.dfs.fabric.microsoft.com/Lakehouse_Silver_.Lakehouse/Tables


In [2]:
try:
    # Both import forms are kept so the same names stay bound in the notebook
    # namespace; the code below uses the `F.` prefix throughout.
    from pyspark.sql.functions import to_timestamp, datediff, col, count, sum as sum_, date_format
    from pyspark.sql import functions as F

    # Timestamp pattern shared by the START and STOP columns.
    ts_pattern = "yyyy-MM-dd'T'HH:mm:ssX"

    # ---- Encounters -------------------------------------------------------
    # Read the raw CSV (first line is the header) and discard rows that are
    # missing any of the key fields.
    enc_df = (
        spark.read
        .option("header", "true")
        .csv("Files/raw/encounters.csv")
        .dropna(subset=['id', 'patient', 'start'])
    )

    # Rename the keys to the fact/dimension naming, parse the timestamps and
    # derive the two computed columns:
    #   length_of_stay - day difference between STOP and START
    #   DIM_DateId     - START formatted as yyyyMMdd
    enc_transformed = (
        enc_df
        .withColumnRenamed('id', 'Fact_EncounterId')
        .withColumnRenamed('patient', 'DIM_patientId')
        .withColumnRenamed('organization', 'DIM_organizationId')
        .withColumn("START", F.to_timestamp("start", ts_pattern))
        .withColumn("STOP", F.to_timestamp("stop", ts_pattern))
        .withColumn("length_of_stay", F.datediff("STOP", "START"))
        .withColumn("DIM_DateId", F.date_format(F.col("START"), "yyyyMMdd"))
    )

    # ---- Procedures -------------------------------------------------------
    # Read the DIM_Procedures Delta table from the Silver layer.
    proc_df = (
        spark.read
        .format("delta")
        .load(f"{lakehouse_silver_table}/DIM_Procedures")
    )

    # One row per DIM_ProcedureId: how many rows it has and their summed base_cost.
    proc_summary = (
        proc_df
        .groupBy("DIM_ProcedureId")
        .agg(
            F.count("*").alias("number_of_procedures"),
            F.sum("base_cost").alias("visit_cost"),
        )
    )

    # ---- Combine ----------------------------------------------------------
    # NOTE(review): encounter ids are matched against DIM_ProcedureId here.
    # Confirm that DIM_ProcedureId really carries the encounter identifier;
    # if it does not, no row will match and every encounter ends up with
    # 0 procedures and 0.0 cost.
    join_on = enc_transformed.Fact_EncounterId == proc_summary.DIM_ProcedureId

    # Left join keeps every encounter; unmatched encounters get explicit zeros
    # instead of nulls for the two measures.
    enc_proc = (
        enc_transformed
        .join(proc_summary, join_on, "left")
        .drop(proc_summary.DIM_ProcedureId)
        .fillna({'number_of_procedures': 0, 'visit_cost': 0.0})
    )

    # ---- Final fact table -------------------------------------------------
    fact_columns = [
        "Fact_EncounterId",
        "DIM_patientId",
        "DIM_organizationId",
        "DIM_DateId",
        "length_of_stay",
        "number_of_procedures",
        "visit_cost",
    ]
    fact_encounter_final = enc_proc.select(*fact_columns)

    # Overwrite the Delta table in the Silver layer with the new result.
    (
        fact_encounter_final.write
        .format("delta")
        .mode("overwrite")
        .save(f"{lakehouse_silver_table}/FactEncounterTest")
    )

except Exception as e:
    # Best-effort cell: the failure is reported on stdout and not re-raised,
    # so the cell itself completes without raising.
    print(f"❌ Notebook 'Fact_Encounter' failed: {str(e)} — Skipping to next item in pipeline.")

StatementMeta(, be922e7c-bbcd-4fbe-bb48-ee2e09f9ffb3, 4, Finished, Available, Finished)