In [7]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType
from time import sleep

# Cluster connection and resource settings for this lab application.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Lab7_6")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Explicit schema for the JSON records read by the stream; every column is
# declared nullable. A fresh type instance is created for each field.
dataSchema = StructType([
    StructField(fieldName, fieldType(), True)
    for fieldName, fieldType in (
        ("Arrival_Time", LongType),
        ("Creation_Time", LongType),
        ("Device", StringType),
        ("Index", LongType),
        ("Model", StringType),
        ("User", StringType),
        ("gt", StringType),
        ("x", DoubleType),
        ("y", DoubleType),
        ("z", DoubleType),
    )
])

# Streaming source: JSON files under the activity-data directory, parsed with
# the explicit schema and limited to one file per trigger.
sdf = (
    spark.readStream
    .schema(dataSchema)
    .option("maxFilesPerTrigger", 1)
    .json("/home/jovyan/data/activity-data")
)

# Keep every source column and add `event_time`: Creation_Time divided by 1e9
# and cast to a timestamp (presumably Creation_Time is nanoseconds since the
# epoch -- confirm against the data set).
withEventTimedf = sdf.selectExpr(
    "*", "cast(cast(Creation_Time as double)/1000000000 as timestamp) as event_time"
)

# Print the resulting schema so the new column can be checked.
withEventTimedf.printSchema()

# Count events per sliding event-time window (10-minute windows that advance
# every 5 minutes) and publish the full result table to an in-memory sink
# queryable as `activity_events_per_window`.
# The StreamingQuery handle is kept so the query can be stopped explicitly;
# otherwise it keeps running in the background and re-running this cell fails
# because the query name is still in use.
eventsPerWindowQuery = (
    withEventTimedf
    .groupBy(window(col("event_time"), "10 minutes", "5 minutes"))
    .count()
    .writeStream
    .queryName("activity_events_per_window")
    .format("memory")
    .outputMode("complete")
    .start()
)

try:
    # Poll the in-memory table 10 times, 10 seconds apart, to watch the
    # counts grow as more input files are processed.
    for _ in range(10):
        spark.sql("SELECT * FROM activity_events_per_window").show()
        sleep(10)
finally:
    # Always release the query (also on interrupt or error) so the cell is
    # safe to re-run on the same session.
    eventsPerWindowQuery.stop()

root
 |-- Arrival_Time: long (nullable = true)
 |-- Creation_Time: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Index: long (nullable = true)
 |-- Model: string (nullable = true)
 |-- User: string (nullable = true)
 |-- gt: string (nullable = true)
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)
 |-- event_time: timestamp (nullable = true)

+------+-----+
|window|count|
+------+-----+
+------+-----+

+------+-----+
|window|count|
+------+-----+
+------+-----+

+------+-----+
|window|count|
+------+-----+
+------+-----+

+--------------------+-----+
|              window|count|
+--------------------+-----+
|{2015-02-23 12:55...| 1447|
|{2015-02-23 10:10...|  111|
|{2015-02-24 12:55...| 2506|
|{2015-02-23 14:05...| 1309|
|{2015-02-24 13:35...| 2182|
|{2015-02-24 13:00...| 1727|
|{2015-02-24 14:05...| 1582|
|{2015-02-23 11:10...| 1138|
|{2015-02-23 13:55...| 2488|
|{2015-02-24 12:50...| 2666|
|{2015-02-24 13:10...| 12

In [6]:
# Stop the Spark session created above once the lab is finished.
spark.stop()