In [3]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType
from time import sleep

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("Lab7_8")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
# Read the whole dataset as a batch
static = spark.read.json("/home/jovyan/data/activity-data")
dataSchema = static.schema
# Read the dataset as a stream
sdf = spark.readStream.schema(dataSchema).option("maxFilesPerTrigger", 1) \
        .json("/home/jovyan/data/activity-data")
    
historicalAgg = static.groupBy("gt", "model").avg()
# Combine the histotical dataset and the streaming data set
deviceModelStats = sdf.drop("Arrival_Time", "Creation_Time", "Index") \
        .cube("gt", "model").avg() \
        .join(historicalAgg, ["gt", "model"]) \
        .writeStream.queryName("device_counts").format("memory") \
        .outputMode("complete") \
        .start()

for x in range(10):
    spark.sql("SELECT * FROM device_counts").show()
    sleep(10)

+---+-----+------+------+------+-----------------+------------------+----------+------+------+------+
| gt|model|avg(x)|avg(y)|avg(z)|avg(Arrival_Time)|avg(Creation_Time)|avg(Index)|avg(x)|avg(y)|avg(z)|
+---+-----+------+------+------+-----------------+------------------+----------+------+------+------+
+---+-----+------+------+------+-----------------+------------------+----------+------+------+------+

+---+-----+------+------+------+-----------------+------------------+----------+------+------+------+
| gt|model|avg(x)|avg(y)|avg(z)|avg(Arrival_Time)|avg(Creation_Time)|avg(Index)|avg(x)|avg(y)|avg(z)|
+---+-----+------+------+------+-----------------+------------------+----------+------+------+------+
+---+-----+------+------+------+-----------------+------------------+----------+------+------+------+

+---+-----+------+------+------+-----------------+------------------+----------+------+------+------+
| gt|model|avg(x)|avg(y)|avg(z)|avg(Arrival_Time)|avg(Creation_Time)|avg(Index)|

In [4]:
# Stop the spark context
spark.stop()