In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, concat, col, lit, from_json, window, sum, to_json, struct
from pyspark.sql.functions import slice as F_slice
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, TimestampType, MapType, IntegerType, ArrayType
from time import sleep

# Cluster address, application name and resource settings for this job,
# assembled as one chained expression (every setter returns the SparkConf).
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Batch_streams")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine; an already
# running session is reused instead of creating a second one.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Hadoop filesystem configuration for the gs:// scheme.
# This has to be set whenever the job needs to use Google Cloud Storage.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# Cloud Storage bucket the BigQuery connector uses for its temporary export data.
bucket = "temp_de_jads"
spark.conf.set("temporaryGcsBucket", bucket)

# Schema used to parse the JSON payload of each Kafka message value.
# Every field is nullable; `time_frame` is a nested struct holding the start
# and end timestamps. Field order is significant for the flattened output.
schema = (
    StructType()
    .add('size', StringType(), True)
    .add('symbol_id', StringType(), True)
    .add(
        'time_frame',
        StructType()
        .add('start', TimestampType(), True)
        .add('end', TimestampType(), True),
        True,
    )
    .add('count', IntegerType(), True)
    .add('taker_side', StringType(), True)
    .add('broker', StringType(), True)
    .add('symbol_type', StringType(), True)
    .add('asset', StringType(), True)
    .add('asset_quote', StringType(), True)
)
# Streaming source: subscribe to the `trades_aggregated` Kafka topic.
# Reading starts at the latest offsets, and the query is configured not to
# fail when the source reports possible data loss.
df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "kafka1:9093",
        "subscribe": "trades_aggregated",
        "startingOffsets": "latest",
        "failOnDataLoss": "false",
    })
    .load()
)

def write_to_bigquery(df, batch_id):
    """Append one micro-batch to the BigQuery trades table.

    Used as the ``foreachBatch`` callback of the streaming query, so it is
    invoked once per trigger with that trigger's output rows.

    Args:
        df: DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch; not used by this function.
    """
    bigquery_writer = df.write.format('bigquery')
    bigquery_writer = bigquery_writer.option('table', 'glass-sylph-325109.ass2.trades')
    bigquery_writer.mode("append").save()

# Decode each Kafka record's value as a JSON string and flatten the parsed
# struct into the top-level columns described by `schema`.
trades_per_minute = (
    df.select(col('value').cast('string'))
    .select(from_json(col('value'), schema).alias('data'))
    .select('data.*')
)

# Re-aggregate the incoming rows into 15-minute event-time windows per taker
# side and broker.
#
# The sink appends every micro-batch to BigQuery, so each window must be
# emitted exactly once. 'complete' output mode re-emits the whole result table
# on every trigger, which duplicates all earlier windows in the target table
# and keeps aggregation state forever. A watermark combined with 'append'
# output mode emits a window only once it is final and lets Spark discard the
# state of closed windows.
#
# The watermark needs a top-level column, hence the `event_time` copy of
# `time_frame.end`.
# NOTE(review): the 5-minute lateness tolerance is an assumption; tune it to
# how late upstream records can actually arrive.
# `sum` is pyspark.sql.functions.sum (imported at the top), not the builtin.
trades_per_15_minutes = (
    trades_per_minute
    .withColumn('event_time', col('time_frame.end'))
    .withWatermark('event_time', '5 minutes')
    .groupBy(window(col('event_time'), '15 minutes'), 'taker_side', 'broker')
    .agg(sum('size').alias('size'), sum('count').alias('count'))
    .select(
        col('size'),
        col('broker'),
        col('window').alias('time_frame'),
        col('count'),
        col('taker_side'),
    )
)

# Start the streaming query: every 15 minutes the newly finalised windows are
# handed to `write_to_bigquery`.
query = (
    trades_per_15_minutes.writeStream
    .outputMode('append')
    .trigger(processingTime='15 minutes')
    .foreachBatch(write_to_bigquery)
    .start()
)

# Block until the query is stopped or fails.
query.awaitTermination()

In [None]:
# Stop the SparkSession held in `spark`.
spark.stop()