In [1]:
val kafkaDf = spark.readStream
                    .format("kafka")
                    .option("kafka.bootstrap.servers", "hadoop-vm:9092")
                    .option("subscribe", "stock-ticks")
                    .option("group.id", "stock-ticks-group23")
                    .option("header",true)
                    .option("inferschema",true)
                    .option("delimitter", ",")
                    .load()

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.80.128:4040
SparkContext available as 'sc' (version = 3.1.3, master = local[*], app id = local-1648322907302)
SparkSession available as 'spark'


kafkaDf: org.apache.spark.sql.DataFrame = [key: binary, value: binary ... 5 more fields]


In [2]:

kafkaDf.printSchema()


root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [3]:

val ticksDf = kafkaDf.selectExpr("CAST(value AS STRING)", "timestamp")
ticksDf.printSchema()

root
 |-- value: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)



ticksDf: org.apache.spark.sql.DataFrame = [value: string, timestamp: timestamp]


In [7]:
import org.apache.spark.sql.types.{StructField, StructType, DoubleType, StringType, LongType, TimestampType}

val stockTickschema = StructType( List(
    StructField("symbol", StringType, true),
    StructField("price", DoubleType, true),
    StructField("volume", LongType, true),
    StructField("timestamp", LongType,  true)
))

import org.apache.spark.sql.types.{StructField, StructType, DoubleType, StringType, LongType, TimestampType}
stockTickschema: org.apache.spark.sql.types.StructType = StructType(StructField(symbol,StringType,true), StructField(price,DoubleType,true), StructField(volume,LongType,true), StructField(timestamp,LongType,true))


In [10]:

val jsonDf = ticksDf.withColumn("value", from_json($"value", stockTickschema))
jsonDf.printSchema()

root
 |-- value: struct (nullable = true)
 |    |-- symbol: string (nullable = true)
 |    |-- price: double (nullable = true)
 |    |-- volume: long (nullable = true)
 |    |-- timestamp: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)



jsonDf: org.apache.spark.sql.DataFrame = [value: struct<symbol: string, price: double ... 2 more fields>, timestamp: timestamp]


In [11]:

val stockTickDf = jsonDf.select (col("value.*")) 
stockTickDf.printSchema()

root
 |-- symbol: string (nullable = true)
 |-- price: double (nullable = true)
 |-- volume: long (nullable = true)
 |-- timestamp: long (nullable = true)



stockTickDf: org.apache.spark.sql.DataFrame = [symbol: string, price: double ... 2 more fields]


In [13]:

val stockTickDf1 = stockTickDf
                .withColumn("timestamp", (col("timestamp") / 1000).cast("timestamp"))
                

stockTickDf.printSchema()


root
 |-- symbol: string (nullable = true)
 |-- price: double (nullable = true)
 |-- volume: long (nullable = true)
 |-- timestamp: long (nullable = true)



stockTickDf1: org.apache.spark.sql.DataFrame = [symbol: string, price: double ... 2 more fields]


In [31]:
import org.apache.spark.sql.functions.{col}
val stockTickDf1Min = stockTickDf1
                            .selectExpr("timestamp", "symbol",  "struct(*) as value")
                            .withWatermark("timestamp", "1 minutes")
                            .groupBy(col("symbol"),window(col("timestamp"), "60 seconds"))
                          .agg(collect_list($"value").alias("ticks"))

stockTickDf1Min.printSchema()

root
 |-- symbol: string (nullable = true)
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- ticks: array (nullable = false)
 |    |-- element: struct (containsNull = false)
 |    |    |-- symbol: string (nullable = true)
 |    |    |-- price: double (nullable = true)
 |    |    |-- volume: long (nullable = true)
 |    |    |-- timestamp: timestamp (nullable = true)



import org.apache.spark.sql.functions.col
stockTickDf1Min: org.apache.spark.sql.DataFrame = [symbol: string, window: struct<start: timestamp, end: timestamp> ... 1 more field]


In [40]:
import org.apache.spark.sql.functions.{explode,col}
import org.apache.spark.sql._
def processBatchData(candleBatchDf:DataFrame, batch_id:Long)={
  

    print ("process batch called", batch_id, "writing ", candleBatchDf.count())
   
    var candleTickBatchDf = (candleBatchDf.select(col("window"), explode(column("ticks")) )
                                      .select(col("window.*"), col("col.*"))
                        )
}   
    

     
    
stockTickDf1Min.writeStream.foreachBatch(processBatchData _).outputMode("append").start()
    


import org.apache.spark.sql.functions.{explode, col}
import org.apache.spark.sql._
processBatchData: (candleBatchDf: org.apache.spark.sql.DataFrame, batch_id: Long)Unit
res29: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@5738f5f


In [42]:


val stockTickDf1MinKafka = stockTickDf1Min
                            .selectExpr("to_json(struct(*)) AS value")

stockTickDf1MinKafka
            .writeStream
            .format("kafka")
            .outputMode("update")
            .option("kafka.bootstrap.servers", "localhost:9092")
            .option("topic", "candles-1min")
            .option("checkpointLocation", "file:///tmp/spark3")
            .start()

stockTickDf1MinKafka: org.apache.spark.sql.DataFrame = [value: string]
res30: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@7327ae4a
