In [0]:
from pyspark.sql.functions import sha2, col, current_timestamp, monotonically_increasing_id

In [0]:
%sql
-- Make sure the silver schema exists in the rwd catalog; a no-op when it is already there.
CREATE SCHEMA IF NOT EXISTS rwd.silver;

In [0]:
%sql
-- Preview the raw bronze observation table.
SELECT *
FROM rwd.bronze.observation_raw;
     

Patient_ID,Observation,Doctor,Observation_Date
P001,Elevated tumor markers,Dr. Menon,2026-01-15
P002,No detectable lesions,Dr. Kapoor,2025-12-20
P003,Increase in lymph node size,Dr. Reddy,2026-01-10
P004,Lesion unchanged over 6 months,Dr. Sharma,2026-01-05
P005,New hepatic metastasis,Dr. Iyer,2026-01-18
P006,Clear PET scan,Dr. Banerjee,2025-11-30
P007,Multiple pulmonary nodules,Dr. Thomas,2026-01-22
P008,Minimal residual disease,Dr. Gupta,2026-01-12
P009,Elevated CEA levels,Dr. Srinivasan,2026-01-08
P010,No recurrence after 2 years,Dr. Joshi,2025-12-25


In [0]:
# Fully qualified table names used by the cells below.
bronze_table = "rwd.bronze.observation_raw"   # table read as a stream
silver_table = "rwd.silver.observation"       # target of the Delta upsert

# Checkpoint directory handed to the streaming writer.
# NOTE(review): assumes the volume `my_volume` already exists under rwd.silver;
# nothing in this notebook creates it — confirm.
checkpoint_path = "/Volumes/rwd/silver/my_volume/silver/observation/checkpoint/"

In [0]:
# Open the bronze table as a streaming source.
df_bronze = spark.readStream.table(bronze_table)


In [0]:
# Keep one row per Patient_ID and stamp each row with the time it was loaded.
# NOTE(review): on a streaming DataFrame, dropDuplicates without a watermark
# retains state for every key it has seen and filters out any later row for an
# already-seen Patient_ID — confirm this is intended, since the downstream merge
# has a "when matched, update" branch.
df_silver_clean = df_bronze.dropDuplicates(["Patient_ID"]).withColumn(
    "load_timestamp", current_timestamp()
)


In [0]:
from delta.tables import DeltaTable

def merge_observation(batch_df, batch_id):
    """Upsert one streaming micro-batch into the silver observation Delta table.

    Designed to be passed to ``DataStreamWriter.foreachBatch``. If the silver
    table does not exist yet, the batch is written out as the initial table
    contents; otherwise the batch is merged into it on ``Patient_ID`` (matched
    rows are updated, unmatched rows are inserted).

    Args:
        batch_df: The micro-batch DataFrame supplied by Structured Streaming.
        batch_id: The micro-batch identifier supplied by Structured Streaming.
            Not used by this function.

    Returns:
        None.

    NOTE(review): without Delta schema evolution enabled, source columns that
    are missing from an existing target table are not added by
    ``whenMatchedUpdateAll`` / ``whenNotMatchedInsertAll`` — confirm whether
    ``load_timestamp`` is expected to appear in the silver table.
    """
    # Use the session that owns this micro-batch instead of the notebook-global
    # `spark`: on Spark Connect / shared-access compute the foreachBatch callback
    # runs outside the notebook REPL, where the global session is not reliably
    # available.
    session = batch_df.sparkSession

    if not session.catalog.tableExists(silver_table):
        # First run: there is nothing to merge into, so create the table from
        # this batch.
        batch_df.write.format("delta").mode("overwrite").saveAsTable(silver_table)
        return

    # Load the Delta table by name and upsert the batch into it.
    target = DeltaTable.forName(session, silver_table)
    (
        target.alias("t")
        .merge(batch_df.alias("s"), "t.Patient_ID = s.Patient_ID")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )



# Run the bronze -> silver upsert as an incremental batch: `availableNow`
# processes everything currently available in the source and then stops.
observation_query = (
    df_silver_clean.writeStream
        .foreachBatch(merge_observation)
        .outputMode("update")
        .trigger(availableNow=True)
        .option("checkpointLocation", checkpoint_path)
        .start()
)

# start() returns immediately. Block until the run has finished so that later
# cells read the fully merged silver table, and so that any exception raised
# inside merge_observation is surfaced in this cell instead of being lost.
observation_query.awaitTermination()

# Keep the query handle as the cell's displayed value.
observation_query

<pyspark.sql.connect.streaming.query.StreamingQuery at 0xff60d843b0e0>

In [0]:
%sql
-- Inspect the silver observation table after the upsert.
SELECT *
FROM rwd.silver.observation;

Patient_ID,Observation,Doctor,Observation_Date
P004,Lesion unchanged over 6 months,Dr. Sharma,2026-01-05
P007,Multiple pulmonary nodules,Dr. Thomas,2026-01-22
P003,Increase in lymph node size,Dr. Reddy,2026-01-10
P010,No recurrence after 2 years,Dr. Joshi,2025-12-25
P008,Minimal residual disease,Dr. Gupta,2026-01-12
P009,Elevated CEA levels,Dr. Srinivasan,2026-01-08
P002,No detectable lesions,Dr. Kapoor,2025-12-20
P001,Elevated tumor markers,Dr. Menon,2026-01-15
P005,New hepatic metastasis,Dr. Iyer,2026-01-18
P006,Clear PET scan,Dr. Banerjee,2025-11-30
