In [None]:
// Streaming source: subscribe to the "stock-ticks" Kafka topic via the broker
// at hadoop-vm:9092.
//
// Only options the Kafka source understands are set here. The previous
// "header" / "inferSchema" / "delimitter" options belong to the CSV reader and
// "group-id" is not a recognised Kafka source option, so none of them had any
// effect. "groupIdPrefix" is the documented way to label the consumer groups
// that Structured Streaming generates for a query.
val stockTicksDf = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "hadoop-vm:9092")
  .option("subscribe", "stock-ticks")
  .option("groupIdPrefix", "stock-ticks-groupAk-hdfs")
  .load()

// Print the source schema, limited to two nesting levels.
stockTicksDf.printSchema(2)

In [None]:

// Keep only the message payload (decoded from bytes to text) and the
// record timestamp supplied by the source.
val tickProjection = Seq("CAST(value AS STRING)", "timestamp")
val ticksDf = stockTicksDf.selectExpr(tickProjection: _*)
ticksDf.printSchema()

In [None]:
import org.apache.spark.sql.types.{StringType, StructType, DoubleType, 
                                   IntegerType, LongType, StructField }

// Schema used to parse one tick message; every field is nullable.
val stockTicksschema = new StructType()
  .add("symbol", StringType, true)
  .add("price", DoubleType, true)
  .add("volume", LongType, true)
  .add("timestamp", LongType, true)

In [None]:

// Replace the raw JSON text in `value` with a struct parsed according to
// stockTicksschema.
val parsedTick = from_json(col("value"), stockTicksschema)
val jsonDf = ticksDf.withColumn("value", parsedTick)
jsonDf.printSchema()

In [None]:

// Flatten the parsed struct: every field of `value` becomes a top-level column.
val stockTickDf = jsonDf.select("value.*")
stockTickDf.printSchema()

In [None]:

// The numeric `timestamp` is divided by 1000 before the cast, i.e. it is
// treated as epoch milliseconds; the result is then truncated to the minute.
val tickInstant = (col("timestamp") / 1000).cast("timestamp")
val tickMinute = date_trunc("minute", col("timestampTemp"))

// Swap the numeric `timestamp` column for the minute-truncated one
// (the helper columns are removed again before the rename).
val stockTickDf1 = stockTickDf
  .withColumn("timestampTemp", tickInstant)
  .withColumn("trade_time", tickMinute)
  .drop("timestamp", "timestampTemp")
  .withColumnRenamed("trade_time", "timestamp")
              

In [None]:
import org.apache.spark.sql.functions.{col, collect_list, window}

// Bundle the ticks of each symbol into 60-second event-time windows.
//
// This cell was written in PySpark syntax (backslash continuations, `F.`
// prefixes) and did not compile as Scala. It also put the watermark on the
// numeric `timestamp` column, whereas event time must be a timestamp-typed
// column; the value is therefore converted first, using the same
// divide-by-1000 convention as the stockTickDf1 cell.
val stockTickDf1Min = stockTickDf
  .withColumn("timestamp", (col("timestamp") / 1000).cast("timestamp"))
  .selectExpr("timestamp", "symbol", "struct(*) as value")
  .withWatermark("timestamp", "1 minutes")
  .groupBy(col("symbol"), window(col("timestamp"), "60 seconds"))
  .agg(collect_list("value").alias("ticks"))

stockTickDf1Min.printSchema()

In [None]:

// Later cells read `stockTickDf2` and partition their output by the
// Year / Month / Day / Hours / Minutes columns, so this definition has to be
// active rather than commented out.
// The hour pattern is "HH" (hour of day, 0-23); the previous "hh" is the
// 12-hour clock and would put AM and PM ticks into the same partition.
import org.apache.spark.sql.functions.{col, date_format}

val stockTickDf2 = stockTickDf1
  .withColumn("Year", date_format(col("timestamp"), "yyyy"))
  .withColumn("Month", date_format(col("timestamp"), "MM"))
  .withColumn("Day", date_format(col("timestamp"), "dd"))
  .withColumn("Hours", date_format(col("timestamp"), "HH"))
  .withColumn("Minutes", date_format(col("timestamp"), "mm"))


In [None]:
import org.apache.spark.sql._
/**
 * foreachBatch handler: appends one micro-batch to HDFS as CSV files,
 * partitioned by Year / Month / Day / Hours / Minutes / symbol.
 *
 * The batch must already contain those six columns, because `partitionBy`
 * refers to them by name.
 *
 * @param candleBatchDf rows of the current micro-batch
 * @param batch_id      id of the micro-batch as passed in by `foreachBatch`
 */
def processBatchData(candleBatchDf: DataFrame, batch_id: Long): Unit = {
  // Two actions run on the batch (count, then write). Persisting it keeps
  // Spark from evaluating the micro-batch twice.
  candleBatchDf.persist()
  try {
    // Single interpolated message; the previous multi-argument `print` call
    // was auto-tupled and printed a raw tuple without a line break.
    println(s"process batch called, batch_id=$batch_id, writing ${candleBatchDf.count()} rows")

    candleBatchDf
      .coalesce(1) // one task, so each partition directory gets a single file per batch
      .write
      .partitionBy("Year", "Month", "Day", "Hours", "Minutes", "symbol")
      .mode("append")
      .format("csv")
      .option("header", true)
      .save("hdfs://localhost:9000/stock-ticks-scala12")
  } finally {
    candleBatchDf.unpersist()
  }
}
       
// Hand every micro-batch of stockTickDf2 to processBatchData (append mode)
// and start the streaming query.
val tickBatchWriter = stockTickDf2.writeStream
  .foreachBatch(processBatchData _)
  .outputMode("append")
tickBatchWriter.start()
    


In [None]:
//import pyspark.sql.functions as F
import org.apache.spark.sql.streaming.Trigger


    import org.apache.spark.sql.functions.{col, date_format}

    // Continuous CSV dump of the ticks, partitioned by year/month/day/hour and
    // by a copy of the symbol (`_symbol`), so the original `symbol` column
    // stays inside the written rows.
    //
    // The console-only "truncate" option was dropped: it has no meaning for
    // the csv file sink. The query handle is kept in a val so the stream can
    // be inspected or stopped later.
    val ticksCsvQuery = stockTickDf2
      .withColumn("year", date_format(col("timestamp"), "yyyy"))
      .withColumn("month", date_format(col("timestamp"), "MM"))
      .withColumn("day", date_format(col("timestamp"), "dd"))
      .withColumn("hour", date_format(col("timestamp"), "HH"))
      .withColumn("_symbol", col("symbol"))
      .writeStream
      .trigger(Trigger.ProcessingTime("2 seconds"))
      .queryName("Write Ticks to CSV trigger by 1 min hour")
      .format("csv")
      .option("path", "hdfs://localhost:9000/dump-csv-trigger-hourly-1min/scala")
      .option("header", true)
      .option("checkpointLocation", "hdfs://localhost:9000/checkpoint/tickscsvtohdfs6/scala")
      .partitionBy("year", "month", "day", "hour", "_symbol")
      .start()
