#### Chapter 9. Event-Time Window Operations and Watermarking

In [None]:

// Tumbling windows 

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType,DoubleType,LongType}
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, SparkSession}
import java.io.IOException
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.streaming.{GroupState,GroupStateTimeout,OutputMode}
import org.apache.spark.sql.DataFrame


// Schema applied to the incoming JSON records. Fields added with
// StructType.add(name, dataType) are nullable, which matches the default of
// StructField(name, dataType). "Fecha" is read as a plain string here.
val PatientsSchema = new StructType()
    .add("NSS", StringType)
    .add("Nom", StringType)
    .add("DID", IntegerType)
    .add("DNom", StringType)
    .add("Fecha", StringType)

// Obtain the SparkSession. getOrCreate() hands back an already-running session
// when there is one (the notebook kernel provides one), otherwise it builds a
// new session with the settings below.
val spark: SparkSession =
    SparkSession
        .builder()
        .appName("Hand-On-Spark3_Socket_Data_Source")
        .master("local[10]")
        .getOrCreate()

// Restrict Spark's own logging to ERROR level.
spark.sparkContext.setLogLevel("ERROR")

// Bring the session's implicit conversions and encoders into scope.
import spark.implicits._


import scala.util.control.NonFatal

// Tumbling windows: streaming count per 10-second window.
//
// Reads JSON files from /tmp/window as a stream using PatientsSchema, groups
// the rows into 10-second windows on "Fecha", counts "DNom" per window and
// writes the complete result table to the console after every trigger.
// awaitTermination() blocks until the query stops or fails.
try {
    val PatientDS = spark.readStream
        .schema(PatientsSchema)
        .json("/tmp/window")

    printf("\n Listening and ready... \n")

    // One output row per window: a "window" struct plus the aggregated count.
    val PatientDF = PatientDS
        .groupBy(window(col("Fecha"), "10 seconds"))
        .agg(count("DNom").alias("Suma_x_Dpt"))

    PatientDF.writeStream
        .outputMode("complete")
        .format("console")
        .option("truncate", false)
        .start()
        .awaitTermination()

} catch {
    case _: IOException => println("IOException occurred")
    // Interpolate the throwable into the message: println("...", t) would
    // auto-tuple its two arguments and print "(Error receiving data,<throwable>)".
    // NonFatal lets fatal JVM errors and interrupts propagate instead of being swallowed.
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("In finally block")
}


Intitializing Scala interpreter ...

Spark Web UI available at http://tucan:4040
SparkContext available as 'sc' (version = 3.3.0, master = local[*], app id = local-1679606213124)
SparkSession available as 'spark'


23/03/23 22:17:02 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.

 Listening and ready... 
-------------------------------------------
Batch: 0
-------------------------------------------
+------------------------------------------+----------+
|window                                    |Suma_x_Dpt|
+------------------------------------------+----------+
|{2023-02-23 01:00:00, 2023-02-23 01:00:10}|1         |
+------------------------------------------+----------+

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+----------+
|window                                    |Suma_x_Dpt|
+------------------------------------------+----------+
|{2023-02-23 01:00:00, 2023-02-23 01:00:10}|3         |
+------------------------------------------+----------+

-------------------------------------------
Batch: 2
-------------------------------------------
+

In [None]:
// Tumbling windows II

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType,DoubleType,LongType}
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, SparkSession}
import java.io.IOException
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.streaming.{GroupState,GroupStateTimeout,OutputMode}
import org.apache.spark.sql.DataFrame


// Schema applied to the incoming JSON records. Fields added with
// StructType.add(name, dataType) are nullable, which matches the default of
// StructField(name, dataType). "Fecha" is read as a plain string here.
val PatientsSchema = new StructType()
    .add("NSS", StringType)
    .add("Nom", StringType)
    .add("DID", IntegerType)
    .add("DNom", StringType)
    .add("Fecha", StringType)

// Obtain the SparkSession. getOrCreate() hands back an already-running session
// when there is one, otherwise it builds a new session with the settings below.
val spark: SparkSession =
    SparkSession
        .builder()
        .appName("Hand-On-Spark3_Socket_Data_Source")
        .master("local[10]")
        .getOrCreate()

// Restrict Spark's own logging to ERROR level.
spark.sparkContext.setLogLevel("ERROR")

// Bring the session's implicit conversions and encoders into scope.
import spark.implicits._


import scala.util.control.NonFatal

// Tumbling windows II: same 10-second window count, with flattened columns.
//
// Reads JSON files from /tmp/window as a stream using PatientsSchema, groups
// the rows into 10-second windows on "Fecha", counts "DNom" per window and
// then projects the window struct into separate "start" and "end" columns.
// The complete result table is written to the console after every trigger;
// awaitTermination() blocks until the query stops or fails.
try {
    val PatientDS = spark.readStream
        .schema(PatientsSchema)
        .json("/tmp/window")

    printf("\n Listening and ready... \n")

    // select() unpacks window.start / window.end so the console shows plain
    // timestamp columns instead of a single struct column.
    val PatientDF = PatientDS
        .groupBy(window(col("Fecha"), "10 seconds"))
        .agg(count("DNom").alias("Suma_x_Dpt"))
        .select("window.start", "window.end", "Suma_x_Dpt")

    PatientDF.writeStream
        .outputMode("complete")
        .format("console")
        .option("truncate", false)
        .start()
        .awaitTermination()

} catch {
    case _: IOException => println("IOException occurred")
    // Interpolate the throwable into the message: println("...", t) would
    // auto-tuple its two arguments and print "(Error receiving data,<throwable>)".
    // NonFatal lets fatal JVM errors and interrupts propagate instead of being swallowed.
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("In finally block")
}



 Listening and ready... 
-------------------------------------------
Batch: 0
-------------------------------------------
+-------------------+-------------------+----------+
|start              |end                |Suma_x_Dpt|
+-------------------+-------------------+----------+
|2023-02-23 01:00:00|2023-02-23 01:00:10|1         |
+-------------------+-------------------+----------+

-------------------------------------------
Batch: 1
-------------------------------------------
+-------------------+-------------------+----------+
|start              |end                |Suma_x_Dpt|
+-------------------+-------------------+----------+
|2023-02-23 01:00:00|2023-02-23 01:00:10|3         |
+-------------------+-------------------+----------+

-------------------------------------------
Batch: 2
-------------------------------------------
+-------------------+-------------------+----------+
|start              |end                |Suma_x_Dpt|
+-------------------+-------------------+----

In [None]:

// Sliding Windows

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType,DoubleType,LongType}
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, SparkSession}
import java.io.IOException
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.streaming.{GroupState,GroupStateTimeout,OutputMode}
import org.apache.spark.sql.DataFrame


// Schema applied to the incoming JSON records. Fields added with
// StructType.add(name, dataType) are nullable, which matches the default of
// StructField(name, dataType). "Fecha" is read as a plain string here.
val PatientsSchema = new StructType()
    .add("NSS", StringType)
    .add("Nom", StringType)
    .add("DID", IntegerType)
    .add("DNom", StringType)
    .add("Fecha", StringType)

// Obtain the SparkSession. getOrCreate() hands back an already-running session
// when there is one (the notebook kernel provides one), otherwise it builds a
// new session with the settings below.
val spark: SparkSession =
    SparkSession
        .builder()
        .appName("Hand-On-Spark3_Socket_Data_Source")
        .master("local[10]")
        .getOrCreate()

// Restrict Spark's own logging to ERROR level.
spark.sparkContext.setLogLevel("ERROR")

// Bring the session's implicit conversions and encoders into scope.
import spark.implicits._

import scala.util.control.NonFatal

// Sliding windows: streaming count per 10-second window that slides every 5 seconds.
//
// Reads JSON files from /tmp/window as a stream using PatientsSchema, assigns
// each row to the 10-second windows (slide interval 5 seconds) that contain its
// "Fecha" value, counts "DID" per window and writes the complete result table
// to the console after every trigger. awaitTermination() blocks until the
// query stops or fails.
try {
    val PatientDS = spark.readStream
        .schema(PatientsSchema)
        .json("/tmp/window")

    printf("\n Listening and ready... \n")

    // window(column, windowDuration, slideDuration): consecutive windows overlap
    // because the slide is shorter than the window length.
    val PatientDF = PatientDS
        .groupBy(window(col("Fecha"), "10 seconds", "5 seconds"))
        .agg(count("DID").alias("Suma_x_Dpt"))

    PatientDF.writeStream
        .outputMode("complete")
        .format("console")
        .option("truncate", false)
        .start()
        .awaitTermination()

} catch {
    case _: IOException => println("IOException occurred")
    // Interpolate the throwable into the message: println("...", t) would
    // auto-tuple its two arguments and print "(Error receiving data,<throwable>)".
    // NonFatal lets fatal JVM errors and interrupts propagate instead of being swallowed.
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("In finally block")
}


Intitializing Scala interpreter ...

Spark Web UI available at http://tucan:4040
SparkContext available as 'sc' (version = 3.3.0, master = local[*], app id = local-1679653303380)
SparkSession available as 'spark'


23/03/24 11:21:52 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.

 Listening and ready... 
-------------------------------------------
Batch: 0
-------------------------------------------
+------------------------------------------+----------+
|window                                    |Suma_x_Dpt|
+------------------------------------------+----------+
|{2023-02-23 01:00:25, 2023-02-23 01:00:35}|5         |
|{2023-02-23 01:00:35, 2023-02-23 01:00:45}|4         |
|{2023-02-23 01:00:30, 2023-02-23 01:00:40}|9         |
+------------------------------------------+----------+

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+----------+
|window                                    |Suma_x_Dpt|
+------------------------------------------+----------+
|{2023-02-23 01:00:25, 2023-02-23 01:00:35}|10        |
|{2023-02-23 01:00:20, 2023-02-23 01:00:30}

In [None]:

// Sliding Windows II

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType,DoubleType,LongType}
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, SparkSession}
import java.io.IOException
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.streaming.{GroupState,GroupStateTimeout,OutputMode}
import org.apache.spark.sql.DataFrame


// Schema applied to the incoming JSON records. Fields added with
// StructType.add(name, dataType) are nullable, which matches the default of
// StructField(name, dataType). "Fecha" is read as a plain string here.
val PatientsSchema = new StructType()
    .add("NSS", StringType)
    .add("Nom", StringType)
    .add("DID", IntegerType)
    .add("DNom", StringType)
    .add("Fecha", StringType)
// Obtain the SparkSession. getOrCreate() hands back an already-running session
// when there is one, otherwise it builds a new session with the settings below.
val spark: SparkSession =
    SparkSession
        .builder()
        .appName("Hand-On-Spark3_Socket_Data_Source")
        .master("local[10]")
        .getOrCreate()

// Restrict Spark's own logging to ERROR level.
spark.sparkContext.setLogLevel("ERROR")

// Bring the session's implicit conversions and encoders into scope.
import spark.implicits._

import scala.util.control.NonFatal

// Sliding windows II: same sliding-window count, with flattened columns.
//
// Reads JSON files from /tmp/window as a stream using PatientsSchema, assigns
// each row to the 10-second windows (slide interval 5 seconds) that contain its
// "Fecha" value, counts "DID" per window and projects the window struct into
// separate "start" and "end" columns. The complete result table is written to
// the console after every trigger; awaitTermination() blocks until the query
// stops or fails.
try {
    val PatientDS = spark.readStream
        .schema(PatientsSchema)
        .json("/tmp/window")

    printf("\n Listening and ready... \n")

    // select() unpacks window.start / window.end so the console shows plain
    // timestamp columns instead of a single struct column.
    val PatientDF = PatientDS
        .groupBy(window(col("Fecha"), "10 seconds", "5 seconds"))
        .agg(count("DID").alias("Suma_x_Dpt"))
        .select("window.start", "window.end", "Suma_x_Dpt")

    PatientDF.writeStream
        .outputMode("complete")
        .format("console")
        .option("truncate", false)
        .start()
        .awaitTermination()

} catch {
    case _: IOException => println("IOException occurred")
    // Interpolate the throwable into the message: println("...", t) would
    // auto-tuple its two arguments and print "(Error receiving data,<throwable>)".
    // NonFatal lets fatal JVM errors and interrupts propagate instead of being swallowed.
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("In finally block")
}



 Listening and ready... 
-------------------------------------------
Batch: 0
-------------------------------------------
+-------------------+-------------------+----------+
|start              |end                |Suma_x_Dpt|
+-------------------+-------------------+----------+
|2023-02-23 01:00:25|2023-02-23 01:00:35|5         |
|2023-02-23 01:00:35|2023-02-23 01:00:45|4         |
|2023-02-23 01:00:30|2023-02-23 01:00:40|9         |
+-------------------+-------------------+----------+

-------------------------------------------
Batch: 1
-------------------------------------------
+-------------------+-------------------+----------+
|start              |end                |Suma_x_Dpt|
+-------------------+-------------------+----------+
|2023-02-23 01:00:25|2023-02-23 01:00:35|10        |
|2023-02-23 01:00:20|2023-02-23 01:00:30|8         |
|2023-02-23 01:00:35|2023-02-23 01:00:45|4         |
|2023-02-23 01:00:30|2023-02-23 01:00:40|9         |
|2023-02-23 01:00:15|2023-02-23 01:00

In [None]:

// Session Windows I

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, SparkSession}
import java.io.IOException
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.streaming.{GroupState,GroupStateTimeout,OutputMode}
import org.apache.spark.sql.DataFrame


// Schema applied to the incoming JSON records. Fields added with
// StructType.add(name, dataType) are nullable, which matches the default of
// StructField(name, dataType). "Fecha" is read as a plain string here.
val PatientsSchema = new StructType()
    .add("NSS", StringType)
    .add("Nom", StringType)
    .add("DID", IntegerType)
    .add("DNom", StringType)
    .add("Fecha", StringType)
// Obtain the SparkSession. getOrCreate() hands back an already-running session
// when there is one (the notebook kernel provides one), otherwise it builds a
// new session with the settings below.
val spark: SparkSession =
    SparkSession
        .builder()
        .appName("Hand-On-Spark3_Socket_Data_Source")
        .master("local[10]")
        .getOrCreate()

// Restrict Spark's own logging to ERROR level.
spark.sparkContext.setLogLevel("ERROR")

// Bring the session's implicit conversions and encoders into scope.
import spark.implicits._

import scala.util.control.NonFatal

// Session windows I: streaming row count per session window and "DID".
//
// Reads JSON files from /tmp/window as a stream using PatientsSchema, groups
// the rows by a session window on "Fecha" with a fixed 10-second gap together
// with "DID", and counts the rows in each group. The schemas of the input and
// of the aggregated result are printed, then the complete result table is
// written to the console after every trigger. awaitTermination() blocks until
// the query stops or fails.
try {
    val PatientDS = spark.readStream
        .schema(PatientsSchema)
        .json("/tmp/window")

    PatientDS.printSchema()

    printf("\n Listening and ready... \n")

    // Result columns: session_window (start/end struct), DID, count.
    val PatientDF = PatientDS
        .groupBy(
            session_window(col("Fecha"), "10 seconds"), col("DID")
        ).count()

    PatientDF.printSchema()

    PatientDF.writeStream
        .outputMode("complete")
        .format("console")
        .option("truncate", false)
        .start()
        .awaitTermination()

} catch {
    case _: IOException => println("IOException occurred")
    // Interpolate the throwable into the message: println("...", t) would
    // auto-tuple its two arguments and print "(Error receiving data,<throwable>)".
    // NonFatal lets fatal JVM errors and interrupts propagate instead of being swallowed.
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("In finally block")
}


Intitializing Scala interpreter ...

Spark Web UI available at http://tucan:4040
SparkContext available as 'sc' (version = 3.3.0, master = local[*], app id = local-1679673766384)
SparkSession available as 'spark'


23/03/24 17:02:55 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
root
 |-- NSS: string (nullable = true)
 |-- Nom: string (nullable = true)
 |-- DID: integer (nullable = true)
 |-- DNom: string (nullable = true)
 |-- Fecha: string (nullable = true)


 Listening and ready... 
root
 |-- session_window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- DID: integer (nullable = true)
 |-- count: long (nullable = false)

-------------------------------------------
Batch: 0
-------------------------------------------
+--------------------------------------------------+---+-----+
|session_window                                    |DID|count|
+--------------------------------------------------+---+-----+
|{2023-02-23 01:00:15.002, 2023-02-23 01:00:25.002}|20 |1    |
|{2023-02-23 01:00:18.002, 2023-02-23 01:00:31.002}|10 |4    |
|{2023-02-23 01:00:17.002, 2023-02-23 01:00:

In [None]:
// Session window with dynamic gap duration

// Session Windows II

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, SparkSession}
import java.io.IOException
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.streaming.{GroupState,GroupStateTimeout,OutputMode}
import org.apache.spark.sql.DataFrame


// Schema applied to the incoming JSON records. Fields added with
// StructType.add(name, dataType) are nullable, which matches the default of
// StructField(name, dataType). "Fecha" is read as a plain string here.
val PatientsSchema = new StructType()
    .add("NSS", StringType)
    .add("Nom", StringType)
    .add("DID", IntegerType)
    .add("DNom", StringType)
    .add("Fecha", StringType)
// Obtain the SparkSession. getOrCreate() hands back an already-running session
// when there is one (the notebook kernel provides one), otherwise it builds a
// new session with the settings below.
val spark: SparkSession =
    SparkSession
        .builder()
        .appName("Hand-On-Spark3_Socket_Data_Source")
        .master("local[10]")
        .getOrCreate()

// Restrict Spark's own logging to ERROR level.
spark.sparkContext.setLogLevel("ERROR")

// Bring the session's implicit conversions and encoders into scope.
import spark.implicits._

import scala.util.control.NonFatal

// Session windows II: session window whose gap duration depends on the row.
//
// Reads JSON files from /tmp/window as a stream using PatientsSchema, groups
// the rows by a session window on "Fecha" together with "DID", and counts the
// rows in each group. The session gap is chosen per row from its "NSS" value.
// The schemas of the input and of the aggregated result are printed, then the
// complete result table is written to the console after every trigger.
// awaitTermination() blocks until the query stops or fails.
try {
    val PatientDS = spark.readStream
        .schema(PatientsSchema)
        .json("/tmp/window")

    PatientDS.printSchema()

    printf("\n Listening and ready... \n")

    // Gap per row: NSS "1009" -> 10 s, "2001" -> 30 s, "5000" -> 50 s,
    // any other value -> 60 s.
    val PatientDF = PatientDS
        .groupBy(
            session_window(col("Fecha"),
                           when(col("NSS") === "1009", "10 seconds")
                           .when(col("NSS") === "2001", "30 seconds")
                           .when(col("NSS") === "5000", "50 seconds")
                           .otherwise("60 seconds")),
            col("DID")
        ).count()

    PatientDF.printSchema()

    PatientDF.writeStream
        .outputMode("complete")
        .format("console")
        .option("truncate", false)
        .start()
        .awaitTermination()

} catch {
    case _: IOException => println("IOException occurred")
    // Interpolate the throwable into the message: println("...", t) would
    // auto-tuple its two arguments and print "(Error receiving data,<throwable>)".
    // NonFatal lets fatal JVM errors and interrupts propagate instead of being swallowed.
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("In finally block")
}


Intitializing Scala interpreter ...

Spark Web UI available at http://tucan:4040
SparkContext available as 'sc' (version = 3.3.0, master = local[*], app id = local-1679677806552)
SparkSession available as 'spark'


23/03/24 18:10:15 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
root
 |-- NSS: string (nullable = true)
 |-- Nom: string (nullable = true)
 |-- DID: integer (nullable = true)
 |-- DNom: string (nullable = true)
 |-- Fecha: string (nullable = true)


 Listening and ready... 
root
 |-- session_window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- DID: integer (nullable = true)
 |-- count: long (nullable = false)

-------------------------------------------
Batch: 0
-------------------------------------------
+--------------------------------------------------+---+-----+
|session_window                                    |DID|count|
+--------------------------------------------------+---+-----+
|{2023-02-23 01:00:15.002, 2023-02-23 01:00:25.002}|20 |1    |
|{2023-02-23 01:00:18.002, 2023-02-23 01:01:20.002}|10 |3    |
|{2023-02-23 01:00:17.002, 2023-02-23 01:01:

In [None]:

// Watermarking in Spark Structured Streaming

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, SparkSession}
import java.io.IOException
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.streaming.{GroupState,GroupStateTimeout,OutputMode}
import org.apache.spark.sql.DataFrame


// Schema applied to the incoming JSON records. Fields added with
// StructType.add(name, dataType) are nullable, which matches the default of
// StructField(name, dataType). "Fecha" is read as a plain string here and
// converted to a timestamp after loading.
val PatientsSchema = new StructType()
    .add("NSS", StringType)
    .add("Nom", StringType)
    .add("DID", IntegerType)
    .add("DNom", StringType)
    .add("Fecha", StringType)
// Obtain the SparkSession. getOrCreate() hands back an already-running session
// when there is one, otherwise it builds a new session with the settings below.
val spark: SparkSession =
    SparkSession
        .builder()
        .appName("Hand-On-Spark3_Socket_Data_Source")
        .master("local[10]")
        .getOrCreate()

// Restrict Spark's own logging to ERROR level.
spark.sparkContext.setLogLevel("ERROR")

// Bring the session's implicit conversions and encoders into scope.
import spark.implicits._

import scala.util.control.NonFatal

// Watermarking: session-window count with a 30-second watermark on "Fecha".
//
// Reads JSON files from /tmp/window as a stream using PatientsSchema and
// replaces the string "Fecha" column with a timestamp parsed using the pattern
// yyyy-MM-dd'T'HH:mm:ss.SSSX. It then declares a 30-second watermark on that
// column, groups the rows by a 10-second-gap session window together with
// "DID", and counts the rows in each group. Both schemas are printed and the
// complete result table is written to the console after every trigger.
// awaitTermination() blocks until the query stops or fails.
try {
    // withWatermark below is applied to the converted timestamp column.
    val PatientDS = spark.readStream
        .schema(PatientsSchema)
        .json("/tmp/window")
        .withColumn("Fecha", to_timestamp(col("Fecha"), "yyyy-MM-dd'T'HH:mm:ss.SSSX"))

    PatientDS.printSchema()

    printf("\n Listening and ready... \n")

    val PatientDF = PatientDS
        .withWatermark("Fecha", "30 seconds")
        .groupBy(
            session_window(col("Fecha"), "10 seconds"), col("DID")
        ).count()

    PatientDF.printSchema()

    // NOTE(review): the Structured Streaming guide states that Complete mode
    // keeps all aggregate state and therefore does not use the watermark to
    // drop old state. Confirm whether "append" was intended for this
    // watermarking example; the mode is left unchanged here.
    PatientDF.writeStream
        .outputMode("complete")
        .format("console")
        .option("truncate", false)
        .start()
        .awaitTermination()

} catch {
    case _: IOException => println("IOException occurred")
    // Interpolate the throwable into the message: println("...", t) would
    // auto-tuple its two arguments and print "(Error receiving data,<throwable>)".
    // NonFatal lets fatal JVM errors and interrupts propagate instead of being swallowed.
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("In finally block")
}


root
 |-- NSS: string (nullable = true)
 |-- Nom: string (nullable = true)
 |-- DID: integer (nullable = true)
 |-- DNom: string (nullable = true)
 |-- Fecha: timestamp (nullable = true)


 Listening and ready... 
root
 |-- session_window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- DID: integer (nullable = true)
 |-- count: long (nullable = false)

-------------------------------------------
Batch: 0
-------------------------------------------
+--------------------------------------------------+---+-----+
|session_window                                    |DID|count|
+--------------------------------------------------+---+-----+
|{2023-02-23 01:01:34.002, 2023-02-23 01:01:44.002}|20 |1    |
|{2023-02-23 01:02:00.002, 2023-02-23 01:02:10.002}|20 |1    |
|{2023-02-23 01:02:05.002, 2023-02-23 01:02:15.002}|10 |1    |
|{2023-02-23 01:02:20.002, 2023-02-23 01:02:30.002}|50 |1    |
|{2023-02-23 01:02:38.002, 2023-02-