Initial streaming dataframe

In [0]:
# Point the session at the working catalog and schema before any table is referenced.
for use_statement in ("USE CATALOG catalog", "USE schema"):
    spark.sql(use_statement)


from pyspark.sql.functions import col, current_timestamp, window, sum as spark_sum  # import window function
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema for the incoming JSON records; every field is declared nullable.
schema = (StructType()
          .add("Id", IntegerType(), True)
          .add("name", StringType(), True)
          .add("age", IntegerType(), True)
          .add("money", IntegerType(), True)
          .add("sales", IntegerType(), True)
          .add("units", IntegerType(), True)
          )

# Stream JSON files from the volume, then clean and enrich every record.
parsed_df = (spark.readStream
             .schema(schema)
             .option("multiLine", True)
             .json("/Volumes/json/")
             .dropna()                                           # discard rows containing any null
             .withColumn("money", col("money").cast("double"))   # integer amount -> double
             .withColumn("event_time", current_timestamp())      # stamp rows with the processing time
             .where(col("money") > 0)                            # keep strictly positive amounts only
             )

In [0]:
# Render the parsed (non-aggregated) streaming DataFrame
display(parsed_df)

Id,name,age,money,sales,units,event_time
1,Alice,25,1200.0,4500,45,2025-10-13T14:37:07.476Z
2,Bob,32,850.0,3000,30,2025-10-13T14:37:07.476Z
3,Charlie,29,1500.0,6000,60,2025-10-13T14:37:07.476Z
10,Julia,26,1300.0,2800,28,2025-10-13T14:39:00.375Z
11,Kevin,39,1750.0,5500,55,2025-10-13T14:39:00.375Z
12,Laura,31,1450.0,3800,38,2025-10-13T14:39:00.375Z
4,Diana,41,2100.0,7500,75,2025-10-13T14:39:00.375Z
5,Ethan,27,950.0,2500,25,2025-10-13T14:39:00.375Z
6,Fiona,34,1800.0,5000,50,2025-10-13T14:39:00.375Z
10,Julia,26,1300.0,2800,28,2025-10-13T14:39:00.375Z


In [0]:
# windowed dataframe: sum of money per name within 10-second windows on event_time,
# with a 10-second watermark on the same column
windowed_df = (
    parsed_df
    .withWatermark("event_time", "10 seconds")
    .groupBy(window("event_time", "10 seconds"), col("name"))
    .agg(spark_sum(col("money")).alias("total_money"))
)

In [0]:
# Render the windowed aggregation (window, name, total_money)
display(windowed_df)

window,name,total_money
"List(2025-10-13T14:39:00Z, 2025-10-13T14:39:10Z)",Charlie,1500.0
"List(2025-10-13T14:39:00Z, 2025-10-13T14:39:10Z)",Julia,3900.0
"List(2025-10-13T14:39:00Z, 2025-10-13T14:39:10Z)",Ian,4000.0
"List(2025-10-13T14:43:50Z, 2025-10-13T14:44:00Z)",Ethan,950.0
"List(2025-10-13T14:37:00Z, 2025-10-13T14:37:10Z)",Charlie,1500.0
"List(2025-10-13T14:41:50Z, 2025-10-13T14:42:00Z)",Diana,4200.0
"List(2025-10-13T14:39:00Z, 2025-10-13T14:39:10Z)",Ethan,1900.0
"List(2025-10-13T14:41:50Z, 2025-10-13T14:42:00Z)",Fiona,3600.0
"List(2025-10-13T14:37:00Z, 2025-10-13T14:37:10Z)",Alice,1200.0
"List(2025-10-13T14:39:00Z, 2025-10-13T14:39:10Z)",Kevin,5250.0


Write streaming results in append mode (option1)

In [0]:
# windowed query: write the windowed aggregation to a Delta table in append mode
checkpoint_path = "/Volumes/checkpoints_spark_query/window1"

windowed_query = (windowed_df
                    .writeStream
                    .format("delta")
                    .queryName("windowed_query")
                    .option("checkpointLocation", checkpoint_path)
                    .outputMode("append")
                    .trigger(availableNow=True)
                    .toTable("salesspark_window")
                    )

# toTable() starts the query and returns immediately. Block until the
# availableNow run has finished so that the following cells (which read
# salesspark_window) do not race the stream under "Run All".
windowed_query.awaitTermination()

In [0]:
%sql
-- Inspect the rows the append-mode stream wrote to its sink table
SELECT * FROM salesspark_window

window,name,total_money
"List(2025-10-13T14:38:00Z, 2025-10-13T14:38:10Z)",Charlie,1500.0
"List(2025-10-13T14:39:50Z, 2025-10-13T14:40:00Z)",Charlie,1500.0
"List(2025-10-13T14:39:50Z, 2025-10-13T14:40:00Z)",George,2200.0
"List(2025-10-13T14:39:50Z, 2025-10-13T14:40:00Z)",Hannah,3200.0
"List(2025-10-13T14:38:00Z, 2025-10-13T14:38:10Z)",Alice,1200.0
"List(2025-10-13T14:39:50Z, 2025-10-13T14:40:00Z)",Julia,3900.0
"List(2025-10-13T14:39:50Z, 2025-10-13T14:40:00Z)",Alice,1200.0
"List(2025-10-13T14:39:50Z, 2025-10-13T14:40:00Z)",Diana,4200.0
"List(2025-10-13T14:39:50Z, 2025-10-13T14:40:00Z)",Laura,4350.0
"List(2025-10-13T14:39:50Z, 2025-10-13T14:40:00Z)",Fiona,3600.0


In [0]:
%sql
-- Show the Delta history of the append-mode sink table
DESCRIBE HISTORY salesspark_window

Write streaming results in update mode using MERGE upserts (option2)

In [0]:
from pyspark.sql.types import TimestampType, DoubleType

# Schema of the windowed aggregation output: a non-nullable window struct
# (start/end timestamps), the name, and the summed money.
schema = (StructType()
          .add("window",
               StructType()
               .add("start", TimestampType(), True)
               .add("end", TimestampType(), True),
               False)
          .add("name", StringType(), True)
          .add("total_money", DoubleType(), True)
          )

empty_df = spark.createDataFrame([], schema=schema)

# creation of the table sink (an empty Delta table, replaced if it already exists)
# NOTE(review): overwrite empties the sink on every run while the streaming
# checkpoint is kept — confirm that is intended when re-running the notebook.
(empty_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("salesspark_window2"))

In [0]:
# function to upsert micro-batch output to the sink table using MERGE
def upsertToTable(microBatchOutputDF, batchId):
    """Upsert one streaming micro-batch into ``salesspark_window2``.

    Written to be passed to ``writeStream.foreachBatch``. The micro-batch is
    exposed as the temp view ``updates`` and merged into the sink table on
    (window.start, window.end, name): matching rows are overwritten, new rows
    are inserted.

    Args:
        microBatchOutputDF: DataFrame holding the rows of the current micro-batch.
        batchId: Identifier of the micro-batch supplied by foreachBatch (unused).
    """
    microBatchOutputDF.createOrReplaceTempView("updates")
    # The MERGE must run in the same SparkSession that owns the micro-batch
    # DataFrame, because that is where the "updates" temp view is registered.
    # The notebook-global `spark` is not guaranteed to be that session.
    microBatchOutputDF.sparkSession.sql("""
              MERGE INTO salesspark_window2 t
              USING updates s
              ON t.window.start = s.window.start AND t.window.end = s.window.end AND t.name = s.name
              WHEN MATCHED THEN UPDATE SET *
              WHEN NOT MATCHED THEN INSERT *
              """)

In [0]:
%sql
-- Check the freshly created sink table before the stream runs (expected to be empty)
SELECT * FROM salesspark_window2;


window,name,total_money


Write the result table into the Delta table using update mode

In [0]:
checkpoint_path = "/Volumes/checkpoints_spark_query/window2"

# Update-mode query: every micro-batch is handed to upsertToTable, which MERGEs
# it into salesspark_window2. The sink is defined entirely by foreachBatch, so
# no .format(...) is set on the writer.
windowed_query = (windowed_df
                    .writeStream
                    .foreachBatch(upsertToTable)
                    .outputMode("update")
                    .queryName("windowed_query_update")
                    .option("checkpointLocation", checkpoint_path)
                    .trigger(availableNow=True)
                    .start()
                    )

# start() returns immediately. Block until the availableNow run has finished
# so that the following cells read the fully merged table under "Run All".
windowed_query.awaitTermination()

In [0]:
%sql
-- Inspect the sink table after the update-mode stream has merged its results
SELECT * FROM salesspark_window2;

window,name,total_money
"List(2025-10-13T14:38:20Z, 2025-10-13T14:38:30Z)",Charlie,1500.0
"List(2025-10-13T14:40:10Z, 2025-10-13T14:40:20Z)",Charlie,1500.0
"List(2025-10-13T14:40:10Z, 2025-10-13T14:40:20Z)",George,2200.0
"List(2025-10-13T14:40:10Z, 2025-10-13T14:40:20Z)",Hannah,3200.0
"List(2025-10-13T14:38:20Z, 2025-10-13T14:38:30Z)",Alice,1200.0
"List(2025-10-13T14:43:00Z, 2025-10-13T14:43:10Z)",Fiona,3600.0
"List(2025-10-13T14:43:00Z, 2025-10-13T14:43:10Z)",Diana,4200.0
"List(2025-10-13T14:40:10Z, 2025-10-13T14:40:20Z)",Laura,4350.0
"List(2025-10-13T14:40:10Z, 2025-10-13T14:40:20Z)",Julia,3900.0
"List(2025-10-13T14:40:10Z, 2025-10-13T14:40:20Z)",Kevin,5250.0


In [0]:
%sql
-- Show the Delta history of the MERGE-based sink table
DESCRIBE HISTORY salesspark_window2;

In [0]:
# Shut down every streaming query that is still active, printing its name first.
for active_query in spark.streams.active:
    print(active_query.name)
    active_query.stop()

display_query_11
display_query_12
