In [0]:
from pyspark.sql.functions import sha2, col, current_timestamp, monotonically_increasing_id

In [0]:
%sql

-- Make sure the silver schema is present; a no-op when it already exists.
CREATE SCHEMA IF NOT EXISTS rwd.silver;

In [0]:
%sql

-- Show the raw bronze rows that feed the silver load.
SELECT *
FROM rwd.bronze.tnm_stage_group_raw;
     

Patient_ID,TNM_Stage_Group,Tumor_Size_cm,Node_Involvement,Metastasis
P001,T2N1M0,3.5,Yes,No
P002,T1N0M0,1.2,No,No
P003,T3N2M0,5.0,Yes,No
P004,T2N0M0,3.0,No,No
P005,T3N1M1,6.2,Yes,Yes
P006,T1N0M0,2.0,No,No
P007,T4N2M1,7.5,Yes,Yes
P008,T2N1M0,3.8,Yes,No
P009,T3N1M0,4.5,Yes,No
P010,T1N0M0,2.1,No,No


In [0]:
# Source table the stream reads from.
bronze_table = "rwd.bronze.tnm_stage_group_raw"
# Target Delta table the micro-batches are upserted into.
silver_table = "rwd.silver.tnm_stage_group"
# Location passed to the streaming writer as its checkpointLocation.
checkpoint_path = "/Volumes/rwd/silver/my_volume/silver/tnm_stage_group/checkpoint/"

In [0]:
# Open the bronze table as a streaming DataFrame.
df_bronze = spark.readStream.table(bronze_table)


In [0]:
# Keep one row per Patient_ID and add a load_timestamp column taken from
# current_timestamp().
# NOTE(review): on a streaming DataFrame, dropDuplicates without a watermark
# presumably remembers every key it has seen, so a later record for an
# already-seen Patient_ID may be dropped before it reaches the sink -- confirm
# this is the intended behavior for an upsert pipeline.
df_silver_clean = df_bronze.dropDuplicates(["Patient_ID"]).withColumn(
    "load_timestamp", current_timestamp()
)


In [0]:
from delta.tables import DeltaTable

def merge_tnm_stage_group(batch_df, batch_id):
    """Upsert one streaming micro-batch into the silver TNM stage group table.

    Intended to be passed to ``writeStream.foreachBatch``. When the table named
    by ``silver_table`` does not exist yet, the batch is saved as that Delta
    table. Otherwise the batch is merged into it on ``Patient_ID``: matched
    rows are updated with all source columns and unmatched rows are inserted.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Micro-batch identifier supplied by Structured Streaming
            (not used by this function).
    """
    # Run everything on the session that owns the micro-batch instead of the
    # notebook-global ``spark``; inside foreachBatch the batch's own session is
    # the one guaranteed to be usable (notably on Spark Connect).
    session = batch_df.sparkSession

    # A Delta MERGE errors out when several source rows match the same target
    # row, so collapse the batch to one row per key before writing. Which of
    # the duplicate rows survives is not controlled here.
    updates_df = batch_df.dropDuplicates(["Patient_ID"])

    # First run: no target yet, so the batch itself becomes the table.
    if not session.catalog.tableExists(silver_table):
        updates_df.write.format("delta").mode("overwrite").saveAsTable(silver_table)
        return

    # Load the Delta table by name and upsert the batch into it.
    target = DeltaTable.forName(session, silver_table)
    (
        target.alias("t")
        .merge(updates_df.alias("s"), "t.Patient_ID = s.Patient_ID")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )



# Start the bronze -> silver load: every micro-batch is handed to
# merge_tnm_stage_group, and trigger(availableNow=True) processes the data
# currently available and then stops the query.
tnm_stage_group_query = (
    df_silver_clean.writeStream
        .foreachBatch(merge_tnm_stage_group)
        .outputMode("update")
        .trigger(availableNow=True)
        .option("checkpointLocation", checkpoint_path)
        .start()
)

# start() returns immediately; block here so that later cells only run once
# the load has finished, and so that a failure in the stream surfaces in this
# cell instead of being silently lost.
tnm_stage_group_query.awaitTermination()

<pyspark.sql.connect.streaming.query.StreamingQuery at 0xff55c44b99a0>

In [0]:
%sql

-- Show the rows currently stored in the silver table.
SELECT *
FROM rwd.silver.tnm_stage_group;

Patient_ID,TNM_Stage_Group,Tumor_Size_cm,Node_Involvement,Metastasis
P005,T3N1M1,6.2,Yes,Yes
P007,T4N2M1,7.5,Yes,Yes
P003,T3N2M0,5.0,Yes,No
P009,T3N1M0,4.5,Yes,No
P001,T2N1M0,3.5,Yes,No
P008,T2N1M0,3.8,Yes,No
P010,T1N0M0,2.1,No,No
P006,T1N0M0,2.0,No,No
P002,T1N0M0,1.2,No,No
P004,T2N0M0,3.0,No,No
