In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain a SparkSession for the application named "Windowing"; getOrCreate()
# hands back the already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("Windowing")
    .getOrCreate()
)

25/07/11 10:23:25 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Bare expression so the notebook displays the active session object.
spark

In [4]:
# Streaming source: subscribe to the Kafka topic `windowFunctions` on the
# broker at kafka:9092, starting from the latest offsets.
df_kafka = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', 'kafka:9092')
    .option('subscribe', 'windowFunctions')
    .option('startingOffsets', 'latest')
    .load()
)

In [5]:
# Inspect the columns exposed by the Kafka source (key and value are binary).
df_kafka.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [6]:
from pyspark.sql.functions import col, cast, from_json

In [7]:

# DDL-style schema of the JSON document carried in the Kafka `value` column.
s = "event_time STRING, data STRING"

# Decode the binary payload to text, parse it as JSON using schema `s`, and
# promote the fields of the parsed struct to top-level columns.
df = (
    df_kafka
    .select(col('value').cast('STRING').alias('JSON'))
    .select(from_json(col('JSON'), s).alias('values'))
    .select('values.*')
)

In [8]:
# Confirm the parsed payload exposes event_time and data, both still strings.
df.printSchema()

root
 |-- event_time: string (nullable = true)
 |-- data: string (nullable = true)



In [9]:
from pyspark.sql.functions import split, explode

# Turn each message into one row per word, keeping the message's event time.
# - event_time is cast to TIMESTAMP (once per message, before exploding) so it
#   can be used for watermarking and windowing.
# - data is split on runs of whitespace (the split pattern is a regex), so
#   repeated spaces or tabs do not produce extra tokens.
# - empty tokens, which arise from an empty message or from leading/trailing
#   whitespace, are dropped so they are not counted as words downstream.
# Resulting columns: event_time (timestamp), word (string).
df_words = (
    df.select(
        col('event_time').cast('TIMESTAMP').alias('event_time'),
        explode(split(col('data'), r'\s+')).alias('word'),
    )
    .filter(col('word') != '')
)

In [10]:
# Verify event_time is now a timestamp and each row carries a single word.
df_words.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- word: string (nullable = false)



In [11]:
from pyspark.sql.functions import window, lit
import pyspark.sql.functions as F
# Count occurrences of each word per 10-minute tumbling window of event_time,
# with a 10-minute watermark declared on event_time.
# Summing a literal 1 per row yields the row count for each (window, word).
df_agg = (
    df_words
    .withWatermark("event_time", "10 minutes")
    .groupBy(window("event_time", "10 minutes"), 'word')
    .agg(F.sum(lit(1)).alias('cnt'))
)

In [12]:
# The aggregate exposes `window` as a struct with start/end timestamps.
df_agg.printSchema()

root
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- word: string (nullable = false)
 |-- cnt: long (nullable = true)



In [13]:
# Flatten the window struct into explicit start/end columns for display.
df_final = df_agg.select(
    col('window.start').alias('start_window'),
    col('window.end').alias('end_window'),
    'word',
    'cnt',
)

In [14]:
# Final output columns: start_window, end_window, word, cnt.
df_final.printSchema()

root
 |-- start_window: timestamp (nullable = true)
 |-- end_window: timestamp (nullable = true)
 |-- word: string (nullable = false)
 |-- cnt: long (nullable = true)



In [15]:
# Console sink in `complete` output mode, triggered every 20 seconds: each
# batch prints the entire result table.
# NOTE: complete mode has to keep all aggregate state, so the watermark does
# not evict old windows for this query (see the Structured Streaming guide).
df_writer_complete = (
    df_final.writeStream
    .format('console')
    .outputMode('complete')
    .trigger(processingTime='20 seconds')
)

In [16]:
# Console sink in `update` output mode, triggered every 20 seconds: each batch
# prints only the rows whose aggregate changed since the previous trigger.
df_writer_update = (
    df_final.writeStream
    .format('console')
    .outputMode('update')
    .trigger(processingTime='20 seconds')
)

In [17]:
# Launch both streaming queries and keep the handles returned by start() so
# the queries can be stopped later. No checkpointLocation is configured, so
# Spark uses a temporary checkpoint directory for each query (see the WARN
# lines in the output). Both console sinks write to the same cell output, so
# their "Batch: N" sections are interleaved.
query_complete = df_writer_complete.start()
query_update = df_writer_update.start()

25/07/11 10:23:33 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-3f41e405-417d-41f7-9353-b1748fd1138b. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/07/11 10:23:33 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/07/11 10:23:33 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-351404e9-b5e3-4f1b-bb21-19945e04de85. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/07/11 10:23:33 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not support

-------------------------------------------
Batch: 0
-------------------------------------------
+------------+----------+----+---+
|start_window|end_window|word|cnt|
+------------+----------+----+---+
+------------+----------+----+---+



                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------------+----------+----+---+
|start_window|end_window|word|cnt|
+------------+----------+----+---+
+------------+----------+----+---+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------


                                                                                

+-------------------+-------------------+----+---+
|       start_window|         end_window|word|cnt|
+-------------------+-------------------+----+---+
|2024-04-09 12:00:00|2024-04-09 12:10:00| dog|  1|
|2024-04-09 12:00:00|2024-04-09 12:10:00| owl|  2|
+-------------------+-------------------+----+---+

-------------------------------------------
Batch: 1
-------------------------------------------
+-------------------+-------------------+----+---+
|       start_window|         end_window|word|cnt|
+-------------------+-------------------+----+---+
|2024-04-09 12:00:00|2024-04-09 12:10:00| dog|  1|
|2024-04-09 12:00:00|2024-04-09 12:10:00| owl|  2|
+-------------------+-------------------+----+---+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+-------------------+-------------------+----+---+
|       start_window|         end_window|word|cnt|
+-------------------+-------------------+----+---+
|2024-04-09 12:00:00|2024-04-09 12:10:00| dog|  1|
|2024-04-09 12:00:00|2024-04-09 12:10:00| owl|  3|
+-------------------+-------------------+----+---+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+-------------------+-------------------+----+---+
|       start_window|         end_window|word|cnt|
+-------------------+-------------------+----+---+
|2024-04-09 12:00:00|2024-04-09 12:10:00| owl|  3|
+-------------------+-------------------+----+---+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+------------+----------+----+---+
|start_window|end_window|word|cnt|
+------------+----------+----+---+
+------------+----------+----+---+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+-------------------+-------------------+----+---+
|       start_window|         end_window|word|cnt|
+-------------------+-------------------+----+---+
|2024-04-09 12:00:00|2024-04-09 12:10:00| dog|  1|
|2024-04-09 12:00:00|2024-04-09 12:10:00| owl|  4|
+-------------------+-------------------+----+---+



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+-------------------+-------------------+----+---+
|       start_window|         end_window|word|cnt|
+-------------------+-------------------+----+---+
|2024-04-09 12:00:00|2024-04-09 12:10:00| owl|  4|
+-------------------+-------------------+----+---+



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+-------------------+-------------------+----+---+
|       start_window|         end_window|word|cnt|
+-------------------+-------------------+----+---+
|2024-04-09 12:00:00|2024-04-09 12:10:00| dog|  1|
|2024-04-09 12:10:00|2024-04-09 12:20:00| owl|  1|
|2024-04-09 12:10:00|2024-04-09 12:20:00| dog|  1|
|2024-04-09 12:00:00|2024-04-09 12:10:00| owl|  4|
+-------------------+-------------------+----+---+



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+-------------------+-------------------+----+---+
|       start_window|         end_window|word|cnt|
+-------------------+-------------------+----+---+
|2024-04-09 12:10:00|2024-04-09 12:20:00| owl|  1|
|2024-04-09 12:10:00|2024-04-09 12:20:00| dog|  1|
+-------------------+-------------------+----+---+



                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+------------+----------+----+---+
|start_window|end_window|word|cnt|
+------------+----------+----+---+
+------------+----------+----+---+



                                                                                

-------------------------------------------
Batch: 7
-------------------------------------------
+------------+----------+----+---+
|start_window|end_window|word|cnt|
+------------+----------+----+---+
+------------+----------+----+---+



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+-------------------+-------------------+----+---+
|       start_window|         end_window|word|cnt|
+-------------------+-------------------+----+---+
|2024-04-09 12:00:00|2024-04-09 12:10:00| dog|  1|
|2024-04-09 11:00:00|2024-04-09 11:10:00| dog|  1|
|2024-04-09 12:10:00|2024-04-09 12:20:00| owl|  1|
|2024-04-09 12:10:00|2024-04-09 12:20:00| dog|  1|
|2024-04-09 12:00:00|2024-04-09 12:10:00| owl|  4|
+-------------------+-------------------+----+---+



                                                                                

-------------------------------------------
Batch: 8
-------------------------------------------
+-------------------+-------------------+----+---+
|       start_window|         end_window|word|cnt|
+-------------------+-------------------+----+---+
|2024-04-09 12:00:00|2024-04-09 12:10:00| dog|  2|
+-------------------+-------------------+----+---+



                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+-------------------+-------------------+----+---+
|       start_window|         end_window|word|cnt|
+-------------------+-------------------+----+---+
|2024-04-09 12:00:00|2024-04-09 12:10:00| dog|  2|
|2024-04-09 11:00:00|2024-04-09 11:10:00| dog|  1|
|2024-04-09 12:10:00|2024-04-09 12:20:00| owl|  1|
|2024-04-09 12:10:00|2024-04-09 12:20:00| dog|  1|
|2024-04-09 12:00:00|2024-04-09 12:10:00| owl|  4|
+-------------------+-------------------+----+---+



In [18]:
# Shut down both streaming queries once the demonstration is finished.
query_complete.stop()

query_update.stop()