In [0]:
from pyspark.sql.functions import sha2, col, current_timestamp, monotonically_increasing_id

In [0]:
%sql
-- Ensure the silver schema exists before anything is written into it.
CREATE SCHEMA IF NOT EXISTS rwd.silver;

In [0]:
%sql
-- Preview the raw bronze rows that feed the silver load.
SELECT *
FROM rwd.bronze.cancer_disease_status_raw;
     

Patient_ID,Cancer_Disease_Status,Last_Update,On_Treatment
P001,Active,2026-01-15,Yes
P002,Remission,2025-12-20,No
P003,Progressive,2026-01-10,Yes
P004,Stable,2026-01-05,Yes
P005,Active,2026-01-18,Yes
P006,Remission,2025-11-30,No
P007,Progressive,2026-01-22,Yes
P008,Stable,2026-01-12,Yes
P009,Active,2026-01-08,Yes
P010,Remission,2025-12-25,No


In [0]:
# Source (bronze) and target (silver) tables, as fully qualified names.
bronze_table = "rwd.bronze.cancer_disease_status_raw"
silver_table = "rwd.silver.cancer_disease_status"

# Checkpoint directory handed to the streaming write of the silver table.
checkpoint_path = (
    "/Volumes/rwd/silver/my_volume/silver/cancer_disease_status/checkpoint/"
)

In [0]:
# Streaming read of the bronze table.
df_bronze = spark.readStream.table(bronze_table)


In [0]:
# Keep one row per Patient_ID and stamp each row with its processing time.
#
# NOTE(review): on a streaming DataFrame, dropDuplicates without a watermark
# keeps its deduplication state across triggers and emits only the first row
# seen for each Patient_ID. Later rows for an already-seen patient would then
# be dropped before they reach the merge -- confirm this is intended.
df_silver_clean = df_bronze.dropDuplicates(["Patient_ID"]).withColumn(
    "load_timestamp", current_timestamp()
)


In [0]:
from delta.tables import DeltaTable

def merge_cancer_disease_status(batch_df, batch_id):
    """Upsert one streaming micro-batch into the silver Delta table.

    Intended to be passed to ``writeStream.foreachBatch``. If the silver table
    does not exist yet, the batch is written out as the initial table;
    otherwise the batch is merged into it on ``Patient_ID`` (matched rows are
    updated, unmatched rows are inserted).

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
            NOTE(review): the merge assumes at most one row per Patient_ID in
            the batch; Delta rejects a merge where several source rows match
            the same target row -- confirm the upstream dedup guarantees this.
        batch_id: Micro-batch identifier supplied by Structured Streaming.
            Not used by this function.

    Returns:
        None.
    """
    # Use the session that owns this micro-batch rather than the notebook-global
    # `spark`; this is the pattern documented for foreachBatch and keeps the
    # callback self-contained when it runs under Spark Connect.
    session = batch_df.sparkSession

    if not session.catalog.tableExists(silver_table):
        # First run: no target to merge into, so the batch becomes the table.
        batch_df.write.format("delta").mode("overwrite").saveAsTable(silver_table)
        return

    # Load the existing Delta table by name and upsert the batch into it.
    target = DeltaTable.forName(session, silver_table)
    (
        target.alias("t")
        .merge(batch_df.alias("s"), "t.Patient_ID = s.Patient_ID")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )



# Run the bronze -> silver load as an incremental batch: availableNow processes
# everything currently available in the source and then stops the query.
silver_query = (
    df_silver_clean.writeStream
        .foreachBatch(merge_cancer_disease_status)
        .outputMode("update")
        .trigger(availableNow=True)
        .option("checkpointLocation", checkpoint_path)
        .start()
)

# start() returns immediately. Block until the run has finished so that later
# cells read the fully loaded silver table, and so that a failure inside the
# stream is raised here instead of passing silently.
silver_query.awaitTermination()

<pyspark.sql.connect.streaming.query.StreamingQuery at 0xff82fda0e0f0>

In [0]:
%sql
-- Inspect the silver table after the load.
SELECT *
FROM rwd.silver.cancer_disease_status;

Patient_ID,Cancer_Disease_Status,Last_Update,On_Treatment
P007,Progressive,2026-01-22,Yes
P003,Progressive,2026-01-10,Yes
P010,Remission,2025-12-25,No
P006,Remission,2025-11-30,No
P002,Remission,2025-12-20,No
P004,Stable,2026-01-05,Yes
P008,Stable,2026-01-12,Yes
P009,Active,2026-01-08,Yes
P005,Active,2026-01-18,Yes
P001,Active,2026-01-15,Yes
