In [2]:
# abfss URI of the Tables folder inside the Lakehouse_Silver_ lakehouse on OneLake.
lakehouse_silverTable = (
    "abfss://Fabric_E2E@onelake.dfs.fabric.microsoft.com/Lakehouse_Silver_.Lakehouse/Tables"
)

# Store the path in the Spark session conf, then read it back into the
# snake_case name that the later cell uses when saving the Delta table.
silver_conf_key = "spark.executorEnv.lakehouse_silverTable"
spark.conf.set(silver_conf_key, lakehouse_silverTable)
lakehouse_silver_table = spark.conf.get(silver_conf_key)

# Echo the resolved path so it is visible in the cell output.
print(lakehouse_silver_table)


StatementMeta(, 8277413f-ff5e-4bc5-9b8a-496359327ab2, 4, Finished, Available, Finished)

abfss://Fabric_E2E@onelake.dfs.fabric.microsoft.com/Lakehouse_Silver_.Lakehouse/Tables


In [3]:
# Build the FactAdmissions table from Files/raw/encounters.csv and save it as a
# Delta table under the Silver lakehouse path held in `lakehouse_silver_table`.
#
# Steps: read CSV -> drop rows missing key fields -> rename keys / parse
# timestamps -> length of stay -> 30-day readmission flag -> surrogate key ->
# select final columns -> overwrite Delta table.
try:
    from pyspark.sql.functions import to_timestamp, date_format, col, datediff, row_number, lag, when
    from pyspark.sql.window import Window

    # Pattern used to parse the START / STOP text columns into timestamps.
    ENCOUNTER_TS_FORMAT = "yyyy-MM-dd'T'HH:mm:ssX"
    # An encounter is flagged as a readmission when it starts 1..N calendar days
    # after the previous encounter's STOP for the same patient.
    READMISSION_WINDOW_DAYS = 30

    # Load encounters.csv (header row supplies column names; all columns are read as strings).
    encounters_raw = spark.read.option("header", "true").csv("Files/raw/encounters.csv")

    # Drop rows that are missing any of the fields the fact table cannot do without.
    encounters_valid = encounters_raw.dropna(
        subset=["Id", "PATIENT", "START", "STOP", "ENCOUNTERCLASS"]
    )

    # Rename the key columns to their dimension-key names, parse the timestamps,
    # and derive the yyyyMMdd date key from the start timestamp.
    admit_df = (
        encounters_valid
        .withColumnRenamed("Id", "DIM_EncounterId")
        .withColumnRenamed("PATIENT", "DIM_patientId")
        .withColumnRenamed("ORGANIZATION", "DIM_providerId")
        .withColumn("START_TS", to_timestamp("START", ENCOUNTER_TS_FORMAT))
        .withColumn("STOP_TS", to_timestamp("STOP", ENCOUNTER_TS_FORMAT))
        .withColumn("DIM_DateId", date_format("START_TS", "yyyyMMdd"))
    )

    # Length of stay in whole calendar days between start and stop.
    admit_df = admit_df.withColumn("length_of_stay", datediff("STOP_TS", "START_TS"))

    # Per-patient window ordered by start time. The encounter id is added as a
    # tie-breaker so that two encounters with the same START_TS are always
    # ordered the same way; without it `lag` could pick either row's STOP_TS and
    # the readmission flag could change from one run to the next.
    readmit_window = (
        Window.partitionBy("DIM_patientId").orderBy("START_TS", "DIM_EncounterId")
    )

    # STOP_TS of the same patient's immediately preceding encounter
    # (null for the patient's first encounter).
    admit_df = admit_df.withColumn("prev_discharge", lag("STOP_TS").over(readmit_window))

    # readmitted_flag = 1 when this encounter starts 1..30 days after the previous
    # STOP, else 0. A null difference (no previous encounter) falls to otherwise(0).
    # NOTE(review): a return on the same calendar day as the previous STOP
    # (difference 0) is flagged 0, exactly as before — confirm that is intended.
    days_since_prev_discharge = datediff(col("START_TS"), col("prev_discharge"))
    admit_df = admit_df.withColumn(
        "readmitted_flag",
        when(days_since_prev_discharge.between(1, READMISSION_WINDOW_DAYS), 1).otherwise(0),
    )

    # Surrogate key: consecutive row numbers over a global ordering. The encounter
    # id is the final sort key so rows sharing patient and date always receive the
    # same number; otherwise keys could shuffle on every overwrite.
    # The window has no partitionBy, so Spark evaluates it in a single partition.
    surrogate_key_window = Window.orderBy("DIM_patientId", "DIM_DateId", "DIM_EncounterId")
    admit_df = admit_df.withColumn(
        "Fact_AdmissionId", row_number().over(surrogate_key_window)
    )

    # Final column selection; ENCOUNTERCLASS and DESCRIPTION are exposed under
    # their fact-table names.
    fact_admissions = admit_df.select(
        "Fact_AdmissionId",
        "DIM_EncounterId",
        "DIM_patientId",
        "DIM_providerId",
        "DIM_DateId",
        col("ENCOUNTERCLASS").alias("admission_type"),
        "length_of_stay",
        "readmitted_flag",
        col("DESCRIPTION").alias("discharge_disposition"),
    )

    # Save to Silver layer, replacing any existing table and its schema.
    (
        fact_admissions.write
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .format("delta")
        .save(f"{lakehouse_silver_table}/FactAdmissions")
    )

except Exception as e:
    # Best-effort by design: any failure above is reported and the cell still
    # finishes without raising.
    # NOTE(review): a calling pipeline presumably sees this cell as succeeded even
    # when the table was not written — confirm that is the desired behaviour.
    print(f"❌ Notebook 'Fact_Admissions' failed: {str(e)} — Skipping to next item in pipeline.")

StatementMeta(, 8277413f-ff5e-4bc5-9b8a-496359327ab2, 5, Finished, Available, Finished)