#### Reading and splitting strings

In [None]:
// Reading and splitting strings

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType,DoubleType,DateType}
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, SparkSession}
import java.io.IOException

import org.apache.spark.sql.streaming.{GroupState,GroupStateTimeout,OutputMode}
import scala.concurrent.duration._
import org.apache.spark.sql.streaming._

// Create (or reuse) the local SparkSession for this cell and quiet its logging.
val spark: SparkSession = SparkSession.builder()
    .master("local[*]")
    .appName("Hand-On-Spark3_Socket_Data_Source")
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

import spark.implicits._
import scala.util.control.NonFatal

// Socket endpoint the stream reads text lines from.
val host = "localhost"
val port = 9999

try {
    val PatientDS = spark
        .readStream
        .format("socket")
        .option("host", host)
        .option("port", port)
        .load()

    print("\n Listening and ready... \n")

    // Split each comma-separated line into positional fields, then keep only
    // DID, DNom and Fecha (Fecha parsed as a timestamp).
    val fields = col("fields")
    val selectDF = PatientDS
        .select(col("value").cast("string"))
        .withColumn("fields", split(col("value"), ","))
        .withColumn("NSS", fields.getItem(0).cast("string"))
        .withColumn("Nom", fields.getItem(1).cast("string"))
        .withColumn("DID", fields.getItem(2).cast("int"))
        .withColumn("DNom", fields.getItem(3).cast("string"))
        .withColumn("Fecha", to_timestamp(fields.getItem(4)))
        .select("DID", "DNom", "Fecha")

    selectDF.printSchema()

    // Count rows per 10-second window of the Fecha timestamp.
    // NOTE(review): no watermark is defined on Fecha; confirm that keeping
    // every window's state for the life of the query is acceptable here.
    val counts = selectDF
        .groupBy(window(col("Fecha"), "10 seconds"))
        .count()

    // Print changed window counts to the console and block until the query ends.
    counts
        .writeStream
        .outputMode("update")
        .option("truncate", false)
        .option("numRows", 10)
        .format("console")
        .start()
        .awaitTermination()

} catch {
    case _: java.net.ConnectException => println(s"Error establishing connection to $host:$port")
    case _: IOException => println("IOException occurred")
    // NonFatal lets fatal JVM errors and interrupts propagate instead of being swallowed.
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("In final block")
}


#### Stateful operations single column. Reading and splitting Input Strings

In [None]:
// Stateful operations single column. Reading and splitting Input Strings

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType,DoubleType,DateType}
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, SparkSession}
import java.io.IOException

import org.apache.spark.sql.streaming.{GroupState,GroupStateTimeout,OutputMode}
import scala.concurrent.duration._
import org.apache.spark.sql.streaming._

// Create (or reuse) the local SparkSession for this cell and quiet its logging.
val spark: SparkSession = SparkSession.builder()
    .master("local[*]")
    .appName("Hand-On-Spark3_Socket_Data_Source")
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

import spark.implicits._
import scala.util.control.NonFatal

// Socket endpoint the stream reads text lines from.
val host = "localhost"
val port = 9999

try {
    val PatientDS = spark
        .readStream
        .format("socket")
        .option("host", host)
        .option("port", port)
        .load()

    print("\n Listening and ready... \n")

    // Split each comma-separated line into positional fields, then keep only
    // DID, DNom and Fecha (Fecha parsed as a timestamp).
    val fields = col("fields")
    val selectDF = PatientDS
        .select(col("value").cast("string"))
        .withColumn("fields", split(col("value"), ","))
        .withColumn("NSS", fields.getItem(0).cast("string"))
        .withColumn("Nom", fields.getItem(1).cast("string"))
        .withColumn("DID", fields.getItem(2).cast("int"))
        .withColumn("DNom", fields.getItem(3).cast("string"))
        .withColumn("Fecha", to_timestamp(fields.getItem(4)))
        .select("DID", "DNom", "Fecha")

    selectDF.printSchema()

    // Running count of rows per DID value.
    val counts = selectDF
        .groupBy(col("DID"))
        .count()

    counts.printSchema()

    // Print changed counts to the console and block until the query ends.
    counts
        .writeStream
        .outputMode("update")
        .option("truncate", false)
        .option("numRows", 10)
        .format("console")
        .start()
        .awaitTermination()

} catch {
    case _: java.net.ConnectException => println(s"Error establishing connection to $host:$port")
    case _: IOException => println("IOException occurred")
    // NonFatal lets fatal JVM errors and interrupts propagate instead of being swallowed.
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("In final block")
}


#### Stateful operations multiple columns. With JSON Format and User Defined Schema

In [None]:
// Stateful operations multiple columns

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType,DoubleType,LongType}
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, SparkSession}
import java.io.IOException

import org.apache.spark.sql.streaming.{GroupState,GroupStateTimeout,OutputMode}


// Layout of one incoming JSON record; passed to from_json to parse the socket lines.
// Fields added through `add` are nullable, the same as a default StructField.
val PatientsSchema = new StructType()
    .add("NSS", StringType)
    .add("Nom", StringType)
    .add("DID", IntegerType)
    .add("DNom", StringType)
    .add("Fecha", StringType)

// Typed row that the parsed stream is converted to with `.as[Patient]`.
// NOTE(review): DID is Option[Long] here but IntegerType in PatientsSchema;
// presumably the int-to-long widening is intended -- confirm.
case class Patient(
    NSS: String,
    Nom: String,
    DID: Option[Long],
    DNom: String,
    Fecha: String
)

// Create (or reuse) the local SparkSession for this cell and quiet its logging.
val spark: SparkSession = SparkSession.builder()
    .master("local[*]")
    .appName("Hand-On-Spark3_Socket_Data_Source")
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

import spark.implicits._
import scala.util.control.NonFatal

// Socket endpoint the stream reads JSON lines from.
val host = "localhost"
val port = 9999

try {
    // Parse each socket line as JSON with PatientsSchema and expose the
    // parsed struct's fields as top-level columns of a Dataset[Patient].
    val PatientDS = spark.readStream
        .format("socket")
        .option("host", host)
        .option("port", port)
        .load()
        .select(from_json(col("value"), PatientsSchema).as("patient"))
        .selectExpr("patient.*") // same case as the alias above
        .as[Patient]

    print("\n Listening and ready... \n")

    //val selectDF = PatientDS.select("DNom")

    // Running count of rows per (DID, DNom) pair.
    val counts = PatientDS
        .groupBy(col("DID"), col("DNom"))
        .count()

    // Print the full result table to the console after every trigger.
    // "numRows" is the console sink's row-limit option.
    counts.writeStream
        .format("console")
        //.option("checkpointLocation", "/tmp/stateful")
        .outputMode("complete")
        .option("truncate", false)
        .option("numRows", 30)
        .start()
        .awaitTermination()

} catch {
    case _: java.net.ConnectException => println(s"Error establishing connection to $host:$port")
    case _: IOException => println("IOException occurred")
    // NonFatal lets fatal JVM errors and interrupts propagate instead of being swallowed.
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("In finally block")
}


#### Aggregations

In [None]:
// Aggregations

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType,DoubleType,LongType}
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, SparkSession}
import java.io.IOException

import org.apache.spark.sql.streaming.{GroupState,GroupStateTimeout,OutputMode}


// Layout of one incoming JSON record; passed to from_json to parse the socket lines.
// Fields added through `add` are nullable, the same as a default StructField.
val PatientsSchema = new StructType()
    .add("NSS", StringType)
    .add("Nom", StringType)
    .add("DID", IntegerType)
    .add("DNom", StringType)
    .add("Fecha", StringType)

// Typed row that the parsed stream is converted to with `.as[Patient]`.
// NOTE(review): DID is Option[Long] here but IntegerType in PatientsSchema;
// presumably the int-to-long widening is intended -- confirm.
case class Patient(
    NSS: String,
    Nom: String,
    DID: Option[Long],
    DNom: String,
    Fecha: String
)

// Create (or reuse) the local SparkSession for this cell and quiet its logging.
val spark: SparkSession = SparkSession.builder()
    .master("local[*]")
    .appName("Hand-On-Spark3_Socket_Data_Source")
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

import spark.implicits._
import scala.util.control.NonFatal

// Socket endpoint the stream reads JSON lines from.
val host = "localhost"
val port = 9999

try {
    // Parse each socket line as JSON with PatientsSchema and expose the
    // parsed struct's fields as top-level columns of a Dataset[Patient].
    val PatientDS = spark.readStream
        .format("socket")
        .option("host", host)
        .option("port", port)
        .load()
        .select(from_json(col("value"), PatientsSchema).as("patient"))
        .selectExpr("patient.*") // same case as the alias above
        .as[Patient]

    print("\n Listening and ready... \n")

    // Several aggregates over DID, computed per (DID, DNom) group.
    val counts = PatientDS
        .groupBy(col("DID"), col("DNom"))
        .agg(
            count("*").alias("countDID"),
            sum("DID").alias("sumDID"),
            mean("DID").alias("meanDID"),
            stddev("DID").alias("stddevDID"),
            approx_count_distinct("DID").alias("distinctDID"),
            //collect_set("DID").alias("collect_setDID"),
            collect_list("DID").alias("collect_listDID")
        )

    // Print the full result table to the console after every trigger.
    // "numRows" is the console sink's row-limit option.
    counts.writeStream
        .format("console")
        .outputMode("complete")
        .option("truncate", false)
        .option("numRows", 30)
        .start()
        .awaitTermination()

} catch {
    case _: java.net.ConnectException => println(s"Error establishing connection to $host:$port")
    case _: IOException => println("IOException occurred")
    // NonFatal lets fatal JVM errors and interrupts propagate instead of being swallowed.
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("In finally block")
}


#### Structured Streaming with Checkpointing

In [None]:
// Structured Streaming with Checkpointing

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType,DoubleType,LongType}
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, SparkSession}
import java.io.IOException
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.streaming.{GroupState,GroupStateTimeout,OutputMode}


// Layout of one incoming JSON record; passed to from_json to parse the socket lines.
// Fields added through `add` are nullable, the same as a default StructField.
val PatientsSchema = new StructType()
    .add("NSS", StringType)
    .add("Nom", StringType)
    .add("DID", IntegerType)
    .add("DNom", StringType)
    .add("Fecha", StringType)

// Typed row that the parsed stream is converted to with `.as[Patient]`.
// NOTE(review): DID is Option[Long] here but IntegerType in PatientsSchema;
// presumably the int-to-long widening is intended -- confirm.
case class Patient(
    NSS: String,
    Nom: String,
    DID: Option[Long],
    DNom: String,
    Fecha: String
)

// Create (or reuse) the local SparkSession for this cell and quiet its logging.
val spark: SparkSession = SparkSession.builder()
    .master("local[*]")
    .appName("Hand-On-Spark3_Socket_Data_Source")
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

import spark.implicits._
import scala.util.control.NonFatal

// Socket endpoint the stream reads JSON lines from.
val host = "localhost"
val port = 9999
// Checkpoint directory dedicated to this query; a checkpoint location must
// not be shared with the other streaming queries in this notebook.
val checkpointDir = "/tmp/streaming_checkpoint_console_counts"

try {
    // Parse each socket line as JSON with PatientsSchema and expose the
    // parsed struct's fields as top-level columns of a Dataset[Patient].
    val PatientDS = spark.readStream
        .format("socket")
        .option("host", host)
        .option("port", port)
        .load()
        .select(from_json(col("value"), PatientsSchema).as("patient"))
        .selectExpr("patient.*") // same case as the alias above
        .as[Patient]

    print("\n Listening and ready... \n")

    // Running count of rows per (DID, DNom) pair.
    val counts = PatientDS
        .groupBy(col("DID"), col("DNom"))
        .count()

    // Every 5 seconds print the full result table to the console, recording
    // progress under checkpointDir. "numRows" is the console sink's row limit.
    counts.writeStream
        .format("console")
        .trigger(Trigger.ProcessingTime("5 seconds"))
        .option("checkpointLocation", checkpointDir)
        .outputMode("complete")
        .option("truncate", false)
        .option("numRows", 30)
        .start()
        .awaitTermination()

} catch {
    case _: java.net.ConnectException => println(s"Error establishing connection to $host:$port")
    case _: IOException => println("IOException occurred")
    // NonFatal lets fatal JVM errors and interrupts propagate instead of being swallowed.
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("In finally block")
}


#### Spark Streaming Sinks

In [None]:
// File Sink to CSV

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType,DoubleType,LongType}
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, SparkSession}
import java.io.IOException
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.streaming.{GroupState,GroupStateTimeout,OutputMode}


// Layout of one incoming JSON record; passed to from_json to parse the socket lines.
// Fields added through `add` are nullable, the same as a default StructField.
val PatientsSchema = new StructType()
    .add("NSS", StringType)
    .add("Nom", StringType)
    .add("DID", IntegerType)
    .add("DNom", StringType)
    .add("Fecha", StringType)

// Typed row that the parsed stream is converted to with `.as[Patient]`.
// NOTE(review): DID is Option[Long] here but IntegerType in PatientsSchema;
// presumably the int-to-long widening is intended -- confirm.
case class Patient(
    NSS: String,
    Nom: String,
    DID: Option[Long],
    DNom: String,
    Fecha: String
)

// Create (or reuse) the local SparkSession for this cell and quiet its logging.
val spark: SparkSession = SparkSession.builder()
    .master("local[*]")
    .appName("Hand-On-Spark3_Socket_Data_Source")
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

import spark.implicits._
import scala.util.control.NonFatal

// Socket endpoint the stream reads JSON lines from.
val host = "localhost"
val port = 9999
// Checkpoint directory dedicated to this query; a checkpoint location must
// not be shared with the other streaming queries in this notebook.
val checkpointDir = "/tmp/streaming_checkpoint_csv"

try {
    // Parse each socket line as JSON with PatientsSchema and expose the
    // parsed struct's fields as top-level columns of a Dataset[Patient].
    val PatientDS = spark.readStream
        .format("socket")
        .option("host", host)
        .option("port", port)
        .load()
        .select(from_json(col("value"), PatientsSchema).as("patient"))
        .selectExpr("patient.*") // same case as the alias above
        .as[Patient]

    print("\n Listening and ready... \n")

    val PatientDF = PatientDS.toDF()

    // Every 5 seconds append the newly parsed rows as CSV files under the
    // output path. The console-only display options (truncate / row limit)
    // are not set here because this is a file sink.
    PatientDF.writeStream
        .format("csv")
        .option("path", "/tmp/streaming_output/csv")
        .trigger(Trigger.ProcessingTime("5 seconds"))
        .option("checkpointLocation", checkpointDir)
        .outputMode("append")
        .start()
        .awaitTermination()

} catch {
    case _: java.net.ConnectException => println(s"Error establishing connection to $host:$port")
    case _: IOException => println("IOException occurred")
    // NonFatal lets fatal JVM errors and interrupts propagate instead of being swallowed.
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("In finally block")
}


In [None]:
// File Sink to Parquet

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType,DoubleType,LongType}
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, SparkSession}
import java.io.IOException
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.streaming.{GroupState,GroupStateTimeout,OutputMode}


// Layout of one incoming JSON record; passed to from_json to parse the socket lines.
// Fields added through `add` are nullable, the same as a default StructField.
val PatientsSchema = new StructType()
    .add("NSS", StringType)
    .add("Nom", StringType)
    .add("DID", IntegerType)
    .add("DNom", StringType)
    .add("Fecha", StringType)

// Typed row that the parsed stream is converted to with `.as[Patient]`.
// NOTE(review): DID is Option[Long] here but IntegerType in PatientsSchema;
// presumably the int-to-long widening is intended -- confirm.
case class Patient(
    NSS: String,
    Nom: String,
    DID: Option[Long],
    DNom: String,
    Fecha: String
)

// Create (or reuse) the local SparkSession for this cell and quiet its logging.
val spark: SparkSession = SparkSession.builder()
    .master("local[*]")
    .appName("Hand-On-Spark3_Socket_Data_Source")
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

import spark.implicits._
import scala.util.control.NonFatal

// Socket endpoint the stream reads JSON lines from.
val host = "localhost"
val port = 9999
// Checkpoint directory dedicated to this query; a checkpoint location must
// not be shared with the other streaming queries in this notebook.
val checkpointDir = "/tmp/streaming_checkpoint_parquet"

try {
    // Parse each socket line as JSON with PatientsSchema and expose the
    // parsed struct's fields as top-level columns of a Dataset[Patient].
    val PatientDS = spark.readStream
        .format("socket")
        .option("host", host)
        .option("port", port)
        .load()
        .select(from_json(col("value"), PatientsSchema).as("patient"))
        .selectExpr("patient.*") // same case as the alias above
        .as[Patient]

    print("\n Listening and ready... \n")

    val PatientDF = PatientDS.toDF()

    // Every 5 seconds append the newly parsed rows as Parquet files under the
    // output path. The console-only display options (truncate / row limit)
    // are not set here because this is a file sink.
    PatientDF.writeStream
        .format("parquet")
        .option("path", "/tmp/streaming_output/parquet")
        .trigger(Trigger.ProcessingTime("5 seconds"))
        .option("checkpointLocation", checkpointDir)
        .outputMode("append")
        .start()
        .awaitTermination()

} catch {
    case _: java.net.ConnectException => println(s"Error establishing connection to $host:$port")
    case _: IOException => println("IOException occurred")
    // NonFatal lets fatal JVM errors and interrupts propagate instead of being swallowed.
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("In finally block")
}


#### foreachBatch File Sink to CSV

In [None]:
// foreachBatch File Sink to CSV

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType,DoubleType,LongType}
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, SparkSession}
import java.io.IOException
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.streaming.{GroupState,GroupStateTimeout,OutputMode}
import org.apache.spark.sql.DataFrame


// Layout of one incoming JSON record; passed to from_json to parse the socket lines.
// Fields added through `add` are nullable, the same as a default StructField.
val PatientsSchema = new StructType()
    .add("NSS", StringType)
    .add("Nom", StringType)
    .add("DID", IntegerType)
    .add("DNom", StringType)
    .add("Fecha", StringType)

// Typed row that the parsed stream is converted to with `.as[Patient]`.
// NOTE(review): DID is Option[Long] here but IntegerType in PatientsSchema;
// presumably the int-to-long widening is intended -- confirm.
case class Patient(
    NSS: String,
    Nom: String,
    DID: Option[Long],
    DNom: String,
    Fecha: String
)

// Batch writer handed to foreachBatch: tags every row of the batch with the
// current date (formatted yyyyMMdd, in a column named "timeStamp") and appends
// the result as CSV under /tmp/streaming_output/foreachBatch.
// The Long argument is whatever foreachBatch supplies per batch (the batch id
// according to the Spark API); it is not used here.
def saveToCSV: (DataFrame, Long) => Unit = (batch, batchId) => {
    val stamped = batch.withColumn("timeStamp", date_format(current_date(), "yyyyMMdd"))
    stamped.write
        .format("csv")
        .mode("append")
        .option("path", "/tmp/streaming_output/foreachBatch")
        .save()
}

// Create (or reuse) the local SparkSession for this cell and quiet its logging.
val spark: SparkSession = SparkSession.builder()
    .master("local[*]")
    .appName("Hand-On-Spark3_Socket_Data_Source")
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

import spark.implicits._
import scala.util.control.NonFatal

// Socket endpoint the stream reads JSON lines from.
val host = "localhost"
val port = 9999
// Checkpoint directory dedicated to this query; a checkpoint location must
// not be shared with the other streaming queries in this notebook.
val checkpointDir = "/tmp/streaming_checkpoint_foreach_batch"

try {
    // Parse each socket line as JSON with PatientsSchema and expose the
    // parsed struct's fields as top-level columns of a Dataset[Patient].
    val PatientDS = spark.readStream
        .format("socket")
        .option("host", host)
        .option("port", port)
        .load()
        .select(from_json(col("value"), PatientsSchema).as("patient"))
        .selectExpr("patient.*") // same case as the alias above
        .as[Patient]

    print("\n Listening and ready... \n")

    val PatientDF = PatientDS.toDF()

    // Every 5 seconds hand the newly parsed rows to saveToCSV as one batch.
    PatientDF.writeStream
        .trigger(Trigger.ProcessingTime("5 seconds"))
        .option("checkpointLocation", checkpointDir)
        .outputMode("append")
        .foreachBatch(saveToCSV)
        .start()
        .awaitTermination()

} catch {
    case _: java.net.ConnectException => println(s"Error establishing connection to $host:$port")
    case _: IOException => println("IOException occurred")
    // NonFatal lets fatal JVM errors and interrupts propagate instead of being swallowed.
    case NonFatal(t) => println(s"Error receiving data: $t")
} finally {
    println("In finally block")
}
