**Chapter 6. Spark Streaming.**


**1. Basic Sources.**

1.1.  TCP/IP Sockets

In [None]:


import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import java.io.IOException

val host = "localhost"
val port = 9999

// Needed by the catch clause below, which only handles non-fatal errors.
import scala.util.control.NonFatal

// Echo every line received from the TCP source at host:port, one micro-batch at a time.
try {
    val spark = SparkSession
        .builder()
        .master("local[*]")
        .appName("Hands-On_Spark3_socketTextStream")
        .getOrCreate()

    val sc = spark.sparkContext

    // Streaming context with a 5-second batch interval: the lines received from the
    // socket are grouped into one batch every 5 seconds.
    val ssc = new StreamingContext(sc, Seconds(5))

    // Text source read from host:port; a process (e.g. netcat) is expected to be
    // serving newline-terminated text there.
    val lines = ssc.socketTextStream(host, port)

    // Report the configured port rather than a hard-coded literal.
    print(s"\n Spark is listening on port $port and ready...\n")

    lines.print()
    ssc.start()
    ssc.awaitTermination()
} catch {
    case _: java.net.ConnectException => println("Error establishing connection to " + host + ":" + port)
    case _: IOException => println("IOException occurred")
    // println takes a single argument: interpolate the exception instead of relying on
    // argument auto-tupling, which printed "(Error receiving data,<exception>)".
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("Finally block")
}

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.0.14:4040
SparkContext available as 'sc' (version = 3.3.0, master = local[*], app id = local-1675025622426)
SparkSession available as 'spark'


23/01/29 21:53:45 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.

 Spark is listening on port 9999 and ready...
23/01/29 21:53:47 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/01/29 21:53:47 WARN BlockManager: Block input-0-1675025626800 replicated to only 0 peer(s) instead of 1 peers
23/01/29 21:53:47 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/01/29 21:53:47 WARN BlockManager: Block input-0-1675025627000 replicated to only 0 peer(s) instead of 1 peers
-------------------------------------------
Time: 1675025630000 ms
-------------------------------------------
1009,Julia,20,DNeuro,01-09-2022
1010,Javier,30,DEndo,01-09-2022
1011,Laura,50,DGineco,01-09-2022
1012,Nuria,10,DCardio,01-09-2022
1013,Helena,10,DCardio,01-09-2022
1014,Nati,10,DCardio,01-09-2022

-------------------------------------------
Time: 1675025635000 ms
-------------------------------------------


**Chapter 6. Improving our Data Analytics with Spark Streaming Transformations**

In [None]:


import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import java.io.IOException

val host = "localhost"
val port = 9999

// Needed by the catch clause below, which only handles non-fatal errors.
import scala.util.control.NonFatal

// Split every received line on commas and print the resulting values, one per line.
try {
    val spark = SparkSession
        .builder()
        .master("local[*]")
        .appName("Hands-On_Spark3_socketTextStream")
        .getOrCreate()

    val sc = spark.sparkContext

    // 5-second batch interval.
    val ssc = new StreamingContext(sc, Seconds(5))

    val lines = ssc.socketTextStream(host, port)

    // Report the configured port rather than a hard-coded literal.
    print(s"\n Spark is listening on port $port and ready...\n")

    // flatMap flattens the per-line arrays, so each comma-separated value becomes
    // its own element of the stream.
    val values = lines.flatMap(_.split(","))
    values.print()

    ssc.start()
    ssc.awaitTermination()
} catch {
    case _: java.net.ConnectException => println("Error establishing connection to " + host + ":" + port)
    case _: IOException => println("IOException occurred")
    // println takes a single argument: interpolate the exception instead of relying on
    // argument auto-tupling, which printed "(Error receiving data,<exception>)".
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("Finally block")
}

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.0.14:4041
SparkContext available as 'sc' (version = 3.3.0, master = local[*], app id = local-1675203472376)
SparkSession available as 'spark'


23/01/31 23:17:55 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.

 Spark is listening on port 9999 and ready...
23/01/31 23:17:56 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/01/31 23:17:56 WARN BlockManager: Block input-0-1675203476600 replicated to only 0 peer(s) instead of 1 peers
23/01/31 23:17:57 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/01/31 23:17:57 WARN BlockManager: Block input-0-1675203477600 replicated to only 0 peer(s) instead of 1 peers
-------------------------------------------
Time: 1675203480000 ms
-------------------------------------------
1009
Julia
20
DNeuro
01-09-2022
1010
Javier
30
DEndo
01-09-2022
...

-------------------------------------------
Time: 1675203485000 ms
-------------------------------------------

-------------------------------------------
Time: 1675203490000 ms
-------------------------------------------

---------------

In [None]:


import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import java.io.IOException

val host = "localhost"
val port = 9999

// Needed by the catch clause below, which only handles non-fatal errors.
import scala.util.control.NonFatal

// Print, for every batch, how many comma-separated values were received.
try {
    val spark = SparkSession
        .builder()
        .master("local[*]")
        .appName("Hands-On_Spark3_socketTextStream")
        .getOrCreate()

    val sc = spark.sparkContext

    // 5-second batch interval.
    val ssc = new StreamingContext(sc, Seconds(5))

    val lines = ssc.socketTextStream(host, port)

    // Report the configured port rather than a hard-coded literal.
    print(s"\n Spark is listening on port $port and ready...\n")

    // count() yields a single number per batch (0 for a batch with no input).
    val valuesPerBatch = lines.flatMap(_.split(",")).count()
    valuesPerBatch.print()

    ssc.start()
    ssc.awaitTermination()
} catch {
    case _: java.net.ConnectException => println("Error establishing connection to " + host + ":" + port)
    case _: IOException => println("IOException occurred")
    // println takes a single argument: interpolate the exception instead of relying on
    // argument auto-tupling, which printed "(Error receiving data,<exception>)".
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("Finally block")
}


 Spark is listening on port 9999 and ready...
-------------------------------------------
Time: 1675204460000 ms
-------------------------------------------
0

23/01/31 23:34:21 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/01/31 23:34:21 WARN BlockManager: Block input-0-1675204461200 replicated to only 0 peer(s) instead of 1 peers
23/01/31 23:34:21 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/01/31 23:34:21 WARN BlockManager: Block input-0-1675204461600 replicated to only 0 peer(s) instead of 1 peers
-------------------------------------------
Time: 1675204465000 ms
-------------------------------------------
75

-------------------------------------------
Time: 1675204470000 ms
-------------------------------------------
0

23/01/31 23:34:31 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/01/31 23:34:31 WARN BlockManager: Block input-0-1675204471000 replicated to only 0 peer(s) instead 

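* countByValue() counts how many times each distinct element occurs in a batch: whole lines in the first cell below, and individual comma-separated values in the second.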

In [None]:


import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import java.io.IOException

val host = "localhost"
val port = 9999

// Needed by the catch clause below, which only handles non-fatal errors.
import scala.util.control.NonFatal

// Print, for every batch, each distinct received line together with its number of occurrences.
try {
    val spark = SparkSession
        .builder()
        .master("local[2]")
        .appName("Hands-On_Spark3_socketTextStream")
        .getOrCreate()

    val sc = spark.sparkContext

    // 5-second batch interval.
    val ssc = new StreamingContext(sc, Seconds(5))

    val lines = ssc.socketTextStream(host, port)

    // Report the configured port rather than a hard-coded literal.
    print(s"\n Spark is listening on port $port and ready...\n")

    // countByValue is applied to whole lines here, so the keys are complete records.
    val lineCounts = lines.countByValue()
    lineCounts.print()

    ssc.start()
    ssc.awaitTermination()
} catch {
    case _: java.net.ConnectException => println("Error establishing connection to " + host + ":" + port)
    case _: IOException => println("IOException occurred")
    // println takes a single argument: interpolate the exception instead of relying on
    // argument auto-tupling, which printed "(Error receiving data,<exception>)".
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("Finally block")
}

In [None]:


import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import java.io.IOException

val host = "localhost"
val port = 9999

// Needed by the catch clause below, which only handles non-fatal errors.
import scala.util.control.NonFatal

// Print, for every batch, each distinct comma-separated value and how often it occurred.
try {
    val spark = SparkSession
        .builder()
        .master("local[*]")
        .appName("Hands-On_Spark3_socketTextStream")
        .getOrCreate()

    val sc = spark.sparkContext

    // 5-second batch interval.
    val ssc = new StreamingContext(sc, Seconds(5))

    val lines = ssc.socketTextStream(host, port)

    // Report the configured port rather than a hard-coded literal.
    print(s"\n Spark is listening on port $port and ready...\n")

    // Split first so that countByValue sees individual values instead of whole lines.
    val valueCounts = lines.flatMap(_.split(",")).countByValue()
    valueCounts.print()

    ssc.start()
    ssc.awaitTermination()
} catch {
    case _: java.net.ConnectException => println("Error establishing connection to " + host + ":" + port)
    case _: IOException => println("IOException occurred")
    // println takes a single argument: interpolate the exception instead of relying on
    // argument auto-tupling, which printed "(Error receiving data,<exception>)".
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("Finally block")
}

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.0.14:4041
SparkContext available as 'sc' (version = 3.3.0, master = local[*], app id = local-1675236815329)
SparkSession available as 'spark'


23/02/01 08:33:38 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.

 Spark is listening on port 9999 and ready...
-------------------------------------------
Time: 1675236820000 ms
-------------------------------------------

23/02/01 08:33:40 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/02/01 08:33:40 WARN BlockManager: Block input-0-1675236820400 replicated to only 0 peer(s) instead of 1 peers
23/02/01 08:33:41 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/02/01 08:33:41 WARN BlockManager: Block input-0-1675236821000 replicated to only 0 peer(s) instead of 1 peers
-------------------------------------------
Time: 1675236825000 ms
-------------------------------------------
(01-09-202,1)
(01-09-2022,20)
(1007,5)
(1008,5)
(50,4)
(DCardio,16)
(Lorena,4)
(Tomás,1)
(Marina,5)
(Ester,5)
...

23/02/01 08:33:48 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with on

In [None]:


import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import java.io.IOException

val host = "localhost"
val port = 9999

// Needed by the catch clause below, which only handles non-fatal errors.
import scala.util.control.NonFatal

// Per-batch word count built by hand: pair every value with 1 and sum the pairs by key.
try {
    val spark = SparkSession
        .builder()
        .master("local[*]")
        .appName("Hands-On_Spark3_socketTextStream")
        .getOrCreate()

    val sc = spark.sparkContext

    // 5-second batch interval.
    val ssc = new StreamingContext(sc, Seconds(5))

    val lines = ssc.socketTextStream(host, port)

    // Report the configured port rather than a hard-coded literal.
    print(s"\n Spark is listening on port $port and ready...\n")

    val words = lines
        .flatMap(_.split(","))
        .map(word => (word, 1))
        .reduceByKey(_ + _)
    words.print()

    ssc.start()
    ssc.awaitTermination()
} catch {
    case _: java.net.ConnectException => println("Error establishing connection to " + host + ":" + port)
    case _: IOException => println("IOException occurred")
    // println takes a single argument: interpolate the exception instead of relying on
    // argument auto-tupling, which printed "(Error receiving data,<exception>)".
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("Finally block")
}

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.10.45:4041
SparkContext available as 'sc' (version = 3.3.0, master = local[*], app id = local-1675241765921)
SparkSession available as 'spark'


23/02/01 09:56:08 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.

 Spark is listening on port 9999 and ready...
-------------------------------------------
Time: 1675241770000 ms
-------------------------------------------

23/02/01 09:56:14 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/02/01 09:56:14 WARN BlockManager: Block input-0-1675241774400 replicated to only 0 peer(s) instead of 1 peers
23/02/01 09:56:14 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/02/01 09:56:14 WARN BlockManager: Block input-0-1675241774600 replicated to only 0 peer(s) instead of 1 peers
-------------------------------------------
Time: 1675241775000 ms
-------------------------------------------
(01-09-202,2)
(01-09-2022,22)
(1007,6)
(1008,5)
(50,5)
(DCardio,19)
(Lorena,5)
(Marina,5)
(Ester,6)
(DGineco,5)
...

23/02/01 09:56:20 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with 

In [None]:


import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import java.io.IOException

val host = "localhost"
val port = 9999

// Needed by the catch clause below, which only handles non-fatal errors.
import scala.util.control.NonFatal

// Count, per batch, the records grouped by their fourth comma-separated field
// (values such as DCardio or DEndo in the sample data).
try {
    val spark = SparkSession
        .builder()
        .master("local[*]")
        .appName("Hands-On_Spark3_socketTextStream")
        .getOrCreate()

    val sc = spark.sparkContext

    // 5-second batch interval.
    val ssc = new StreamingContext(sc, Seconds(5))

    val lines = ssc.socketTextStream(host, port)

    // Report the configured port rather than a hard-coded literal.
    print(s"\n Spark is listening on port $port and ready...\n")

    // Drop header-like lines, i.e. non-empty lines that contain no digit at all.
    val filterHeaders = lines.filter(!_.matches("[^0-9]+"))

    // Keep only records that really have a fourth field: an empty or truncated line
    // passes the header filter and would otherwise fail the batch with an
    // ArrayIndexOutOfBoundsException.
    val selectedRecords = filterHeaders
        .map(_.split(","))
        .filter(_.length > 3)
        .map(fields => fields(3))

    selectedRecords.map(x => (x, 1)).reduceByKey(_ + _).print()
    ssc.start()
    ssc.awaitTermination()
} catch {
    case _: java.net.ConnectException => println("Error establishing connection to " + host + ":" + port)
    case _: IOException => println("IOException occurred")
    // println takes a single argument: interpolate the exception instead of relying on
    // argument auto-tupling, which printed "(Error receiving data,<exception>)".
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("Finally block")
}

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.0.14:4041
SparkContext available as 'sc' (version = 3.3.0, master = local[*], app id = local-1675284919607)
SparkSession available as 'spark'


23/02/01 21:55:22 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.

 Spark is listening on port 9999 and ready...
23/02/01 21:55:23 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/02/01 21:55:23 WARN BlockManager: Block input-0-1675284923000 replicated to only 0 peer(s) instead of 1 peers
-------------------------------------------
Time: 1675284925000 ms
-------------------------------------------
(DCardio,3)
(DGineco,1)
(DEndo,1)

-------------------------------------------
Time: 1675284930000 ms
-------------------------------------------

-------------------------------------------
Time: 1675284935000 ms
-------------------------------------------

23/02/01 21:55:36 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/02/01 21:55:36 WARN BlockManager: Block input-0-1675284936200 replicated to only 0 peer(s) instead of 1 peers
23/02/01 21:55:37 WARN RandomBlockReplicationPoli

**Chapter 6. Spark File Streaming**

In [None]:


import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import java.io.IOException

val folder = "/tmp/patient_streaming"

// Needed by the catch clause below, which only handles non-fatal errors.
import scala.util.control.NonFatal

// Monitor a folder for text files and count, per batch, the records grouped by their
// fourth comma-separated field (values such as DCardio or DNeuro in the sample data).
try {
    val spark = SparkSession
        .builder()
        .master("local[1]")
        .appName("Hand-On-Spark3_textFileStream")
        .getOrCreate()

    val sc = spark.sparkContext

    // 5-second batch interval.
    val ssc = new StreamingContext(sc, Seconds(5))

    val lines = ssc.textFileStream(folder)

    // Plain interpolation: the original passed an already-interpolated string to printf
    // as a format string, so a '%' in the folder name would be parsed as a format specifier.
    print(s"\n Spark is monitoring folder $folder and ready... \n")

    // Drop header-like lines, i.e. non-empty lines that contain no digit at all.
    val filterHeaders = lines.filter(!_.matches("[^0-9]+"))

    // Keep only records that really have a fourth field: an empty or truncated line
    // passes the header filter and would otherwise fail the batch with an
    // ArrayIndexOutOfBoundsException.
    val selectedRecords = filterHeaders
        .map(_.split(","))
        .filter(_.length > 3)
        .map(fields => fields(3))

    selectedRecords.map(x => (x, 1)).reduceByKey(_ + _).print()
    ssc.start()
    ssc.awaitTermination()
} catch {
    case _: IOException => println("IOException occurred")
    // println takes a single argument: interpolate the exception instead of relying on
    // argument auto-tupling, which printed "(Error receiving data,<exception>)".
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("Finally block")
}

23/02/03 18:57:39 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.

 Spark is monitoring folder /tmp/patient_streaming and ready... 
-------------------------------------------
Time: 1675447065000 ms
-------------------------------------------
(DCardio,1)

-------------------------------------------
Time: 1675447070000 ms
-------------------------------------------
(DEndo,1)
(DNeuro,1)

-------------------------------------------
Time: 1675447075000 ms
-------------------------------------------
(DGastro,1)
(DCardio,3)
(DGineco,1)
(DNeuro,2)

-------------------------------------------
Time: 1675447080000 ms
-------------------------------------------

-------------------------------------------
Time: 1675447085000 ms
-------------------------------------------

-------------------------------------------
Time: 1675447090000 ms
-------------------------------------------



**Chapter 6. Spark Streaming Graceful Shutdown**

In [1]:

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.sql.SparkSession
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

// Socket word-count job that shuts itself down gracefully once a marker folder
// (altFolder) appears on the file system.
val spark = SparkSession
    .builder()
    .master("local[3]")
    .appName("streamingGracefulShutdown")
    .config("spark.streaming.stopGracefullyOnShutdown", true)
    .getOrCreate()

import spark.implicits._

val sc = spark.sparkContext

// 5-second batch interval.
val ssc = new StreamingContext(sc, Seconds(5))

val host = "localhost"
val port = 9999

// Creating this folder is the signal that asks the job to stop.
val altFolder = "/tmp/alt_folder"
var stopFlag:Boolean = false

val lines = ssc.socketTextStream(host, port)

// Fourth comma-separated field of every record. Records with fewer than four fields
// (e.g. an empty line) are skipped: indexing them directly would fail the batch with
// an ArrayIndexOutOfBoundsException.
val groupedRecords = lines
    .map(_.split(","))
    .filter(_.length > 3)
    .map(fields => fields(3))
groupedRecords.countByValue().print()

// Count of every individual comma-separated value per batch.
val words = lines.flatMap(_.split(","))
val wordCounts = words.map(x => (x, 1)).reduceByKey(_+_)
wordCounts.print()

ssc.start()

// Polling period of the shutdown loop, in milliseconds.
val timeout = 10000
var wasStopped = false

while (! wasStopped) {
    printf("\n Listening and ready... \n")

    // Waits up to `timeout` ms; the result is true once the context has terminated.
    wasStopped = ssc.awaitTerminationOrTimeout(timeout)

    if (wasStopped)
        println("Streaming process is no longer active...")
    else
        println("Streaming is in progress...")

    // Check the existence of altFolder, /tmp/alt_folder.
    // Once seen, the flag stays set and the file system is not queried again.
    if (!stopFlag) {
      val fs = FileSystem.get(new Configuration())
      stopFlag = fs.exists(new Path(altFolder))
    }

    // Graceful stop requested; the next loop iteration then observes the terminated
    // context and exits.
    if (!wasStopped && stopFlag) {
        println("Stopping ssc context")
        ssc.stop(stopSparkContext = true, stopGracefully = true)
        println("ssc context has been stopped!")
    }
}

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.0.14:4040
SparkContext available as 'sc' (version = 3.3.0, master = local[*], app id = local-1675631790116)
SparkSession available as 'spark'


23/02/05 22:16:33 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.

 Listening and ready... 
23/02/05 22:16:34 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/02/05 22:16:34 WARN BlockManager: Block input-0-1675631794400 replicated to only 0 peer(s) instead of 1 peers
23/02/05 22:16:34 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/02/05 22:16:34 WARN BlockManager: Block input-0-1675631794600 replicated to only 0 peer(s) instead of 1 peers
-------------------------------------------
Time: 1675631795000 ms
-------------------------------------------
(DCardio,12)
(DGineco,4)
(DEndo,4)
(DNeuro,2)

-------------------------------------------
Time: 1675631795000 ms
-------------------------------------------
(01-09-2022,22)
(1007,2)
(1008,2)
(Laura,2)
(Julia,2)
(50,4)
(Nuria,2)
(1009,2)
(DCardio,12)
(Javier,2)
...

-------------------------------------------
Time: 167563180000

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.sql.SparkSession
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@5ff5786f
import spark.implicits._
sc: org.apache.spark.SparkContext = org.apache.spark.SparkContext@42a949e7
ssc: org.apache.spark.streaming.StreamingContext = org.apache.spark.streaming.StreamingContext@500effb8
host: String = localhost
port: Int = 9999
altFolder: String = /tmp/alt_folder
stopFlag: Boolean = true
lines: org.apache.spark.streaming.dstream.ReceiverInputDStream[String] = org.apache.spark.streaming.dstream.SocketInputDStream@7cf7...


In [None]:

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.sql.SparkSession
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

// Variant of the graceful-shutdown job: a socket word count that polls for a marker
// folder (altFolder) and stops the streaming context gracefully once it exists.
val spark = SparkSession
    .builder()
    .master("local[*]")
    .appName("streaming")
    .config("spark.streaming.stopGracefullyOnShutdown", true)
    .getOrCreate()

import spark.implicits._

val sc = spark.sparkContext

// 5-second batch interval.
val ssc = new StreamingContext(sc, Seconds(5))

val host = "localhost"
val port = 9999

// Creating this folder is the signal that asks the job to stop.
val altFolder = "/tmp/alt_folder"
var stopFlag:Boolean = false

val lines = ssc.socketTextStream(host, port)

// Fourth comma-separated field of every record. Records with fewer than four fields
// (e.g. an empty line) are skipped: indexing them directly would fail the batch with
// an ArrayIndexOutOfBoundsException.
val groupedRecords = lines
    .map(_.split(","))
    .filter(_.length > 3)
    .map(fields => fields(3))
groupedRecords.countByValue().print()

// Count of every individual comma-separated value per batch.
val words = lines.flatMap(_.split(","))
val wordCounts = words.map(x => (x, 1)).reduceByKey(_+_)
wordCounts.print()

ssc.start()

// Polling period of the shutdown loop, in milliseconds.
val timeout = 10000
var isStopped = false

while (! isStopped) {
    printf("\n Listening and ready... \n")

    // Waits up to `timeout` ms; the result is true once the context has terminated.
    isStopped = ssc.awaitTerminationOrTimeout(timeout)

    if (isStopped)
        println("Stopped streaming. Leaving the application...")
    else
        println("Streaming App is still running. Timeout...")

    // Check the existence of altFolder, /tmp/alt_folder.
    // Once seen, the flag stays set and the file system is not queried again.
    if (!stopFlag) {
      val fs = FileSystem.get(new Configuration())
      stopFlag = fs.exists(new Path(altFolder))
    }

    // Graceful stop requested; the next loop iteration then observes the terminated
    // context and exits.
    if (!isStopped && stopFlag) {
        println("stopping ssc right now")
        ssc.stop(stopSparkContext = true, stopGracefully = true)
        println("ssc is stopped!!!!!!!")
    }
}