In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import *
from pyspark.sql.types import *
from time import sleep
import json

# Spark configuration for this lab: standalone master URL, application name,
# and the driver/executor resource settings, built as one fluent chain.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("SparkStreamLab1")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine; an existing
# session is reused if there is one, otherwise a new one is created.
spark = (
    SparkSession.builder
    .config(conf=sparkConf)
    .getOrCreate()
)

# Schema of one JSON record carried in the Kafka message value. Sample record:
# {"year":2008,"month":4,"day":1,"order":3,"country":29,"session ID":140,"page 1 (main category)":4,
# "page 2 (clothing model)":"P8","colour":2,"location":3,"model photography":1,"price":28,"price 2":2,"page":1}
# Every field is an integer except "page 2 (clothing model)", which is a string.
schema = StructType([
    StructField(field_name, field_type())
    for field_name, field_type in (
        ("year", IntegerType),
        ("month", IntegerType),
        ("day", IntegerType),
        ("order", IntegerType),
        ("country", IntegerType),
        ("session ID", IntegerType),
        ("page 1 (main category)", IntegerType),
        ("page 2 (clothing model)", StringType),
        ("colour", IntegerType),
        ("location", IntegerType),
        ("model photography", IntegerType),
        ("price", IntegerType),
        ("price 2", IntegerType),
        ("page", IntegerType),
    )
])


# Open the Kafka topic "sales" as a *streaming* source (readStream, not a
# batch read), with startingOffsets set to "earliest" and failOnDataLoss
# disabled.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "34.79.25.102:9092")
    .option("failOnDataLoss", "false")
    .option("subscribe", "sales")
    .option("startingOffsets", "earliest")
    .load()
)

# Print the schema of the raw Kafka source DataFrame.
df.printSchema()

# Cast the Kafka `value` column to a string so it can be parsed as JSON, and
# keep the record `timestamp` column alongside it.
order_df = df.selectExpr("CAST(value AS STRING)", "timestamp")

# Parse the JSON text into a single struct column named "orders" using the
# schema declared earlier in this file.
order_df2 = order_df.select(
    from_json(col("value"), schema).alias("orders"),
    "timestamp",
)

# Flatten the struct: every JSON field becomes a top-level column.
all_orders = order_df2.select("orders.*", "timestamp")

# Average and total of "order" per (month, country).
# Both aggregates are computed in ONE agg() call on the grouped data. Chaining
# .avg("order").agg(sum("order")) does not work: .avg() already returns a
# DataFrame that only has month, country and avg(order), so "order" can no
# longer be resolved. The resulting columns are named "avg(order)" and
# "sum(order)" (Spark's default names for unaliased aggregates).
order_wrangle = all_orders.groupBy("month", "country").agg(
    avg("order"),
    sum("order"),
)

print("Printing Schema of order_wrangle: ")
order_wrangle.printSchema()

# NOTE: no order_wrangle.show() here. show() is a batch action and is rejected
# on a streaming DataFrame; the rows are displayed by the console sink started
# with writeStream below.

# Start the streaming query: every 5 seconds, write the rows of order_wrangle
# to the console sink in "update" output mode, without truncating values.
query = (
    order_wrangle.writeStream
    .format("console")
    .outputMode("update")
    .option("truncate", "false")
    .trigger(processingTime='5 seconds')
    .start()
)

# Disabled alternative sink (BigQuery), kept inside a bare string literal so
# it is never executed.
# NOTE(review): not runnable as written -- the line ending in
# .option("temporaryGcsBucket", gcs_bucket) has no trailing backslash, and
# .mode("append") / .save() look like batch-writer calls rather than the
# .start() used by the live query above -- confirm before enabling.
"""
gcs_bucket = "GCSBUCK"
query = order_wrangle \
        .writeStream \
        .format("bigquery") \
        .trigger(processingTime='5 seconds') \
        .outputMode("update") \
        .option("truncate", "false")\
        .option("temporaryGcsBucket", gcs_bucket)
        .option("checkpointLocation", "notebooks") \
        .mode("append") \
        .save()
"""

def _stop_streaming():
    """Stop the streaming query and then the Spark session.

    spark.stop() runs in a ``finally`` so the session is still shut down
    when query.stop() itself raises.
    """
    try:
        query.stop()
    finally:
        # Stop the spark context
        spark.stop()
    print("Stoped the streaming query and the spark context")


try:
    # Block this cell until the streaming query terminates or fails.
    query.awaitTermination()
except KeyboardInterrupt:
    # Interrupting the cell is the normal way to end the stream.
    _stop_streaming()
except Exception as exc:
    # Report the failure *before* cleaning up, so the cause is visible even
    # if shutdown raises. `except Exception` (rather than a bare `except:`)
    # lets SystemExit and similar propagate untouched.
    print("Unexpected error")
    print(repr(exc))
    _stop_streaming()

In [38]:
# Stop the Spark session (`spark`) created at the top of the notebook.
# Separate cell so the session can be stopped manually.
spark.stop()