In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, concat, col, lit, from_json, window
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, TimestampType, MapType
from time import sleep

# Spark settings for this job: master URL, application name and the
# driver/executor resource limits. Each setter returns the same SparkConf,
# so the calls are chained into a single expression.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Batch_process")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# Get (or create) the SparkSession for this configuration; it is the handle
# used for all DataFrame and streaming operations.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
# Read the trade events from Kafka as a stream (the message schema is declared first)

# Schema applied to each trade message: every field is declared nullable.
# Built incrementally with StructType.add, which returns the same StructType.
schema = (
    StructType()
    .add('type', StringType(), True)
    .add('symbol_id', StringType(), True)
    .add('sequence', StringType(), True)
    .add('time_exchange', TimestampType(), True)
    .add('time_coinapi', TimestampType(), True)
    .add('uuid', StringType(), True)
    .add('price', DoubleType(), True)
    .add('size', DoubleType(), True)
    .add('taker_side', StringType(), True)
)

# Streaming Kafka source subscribed to the "trade" topic, starting from the
# earliest offsets, with failOnDataLoss turned off.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka1:9093")
    .option("subscribe", "trade")
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", "false")
    .load()
)
df.printSchema()

# Cast the Kafka `value` column to a string, parse it as JSON against
# `schema`, and flatten the parsed struct into top-level columns.
trades = (
    df.select(col('value').cast('string'))
    .select(from_json(col('value'), schema).alias('data'))
    .select('data.*')
)
trades.printSchema()


# Count trades per sliding event-time window (15 minutes long, advancing
# every 10 seconds) keyed on `time_exchange`, and publish each result as the
# text "<window> <count>" to the "trade_echo" Kafka topic.
#
# Output mode is 'update' rather than 'complete'. According to the Structured
# Streaming guide, a watermark can only be used to drop old aggregation state
# in append/update mode; in complete mode all aggregate state is retained, so
# the 30-second watermark had no effect, state grew without bound, and the
# whole result table was rewritten to Kafka on every trigger. In update mode
# only the windows whose count changed in a trigger are emitted.
#
# NOTE(review): a checkpoint written by the previous 'complete'-mode query may
# not be reusable after this change - confirm, or start from a fresh
# checkpoint directory.
query = (
    trades.withWatermark('time_exchange', '30 seconds')
    .groupBy(window(col('time_exchange'), '15 minutes', '10 seconds'))
    .count()
    .select(
        concat(
            col('window').cast('string'),
            lit(' '),
            col('count').cast('string'),
        ).alias('value')
    )
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka1:9093")
    .option("checkpointLocation", "/home/jovyan/checkpoint")
    .option("topic", "trade_echo")
    .outputMode('update')
    .start()
)

# Block this cell until the streaming query stops or fails.
query.awaitTermination()

In [None]:
# Stop the SparkSession created above.
spark.stop()