In [5]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import window, col
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, ShortType, FloatType, ByteType, IntegerType
from time import sleep
# https://spark.apache.org/docs/1.4.0/api/java/org/apache/spark/sql/types/DataTypes.html
# Cluster and driver settings for this lab, declared as one fluent chain
# (each SparkConf setter returns the conf object, so the calls compose).
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Lab7_1")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine; getOrCreate()
# returns an already-running session when one exists.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Explicit schema for the incoming CSV records: three float measurements,
# the destination as a string and a long Timestamp. Every field is nullable.
dataSchema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("ArrDelay", FloatType()),
        ("ArrTime", FloatType()),
        ("DepDelay", FloatType()),
        ("Dest", StringType()),
        ("Timestamp", LongType()),
    )
])

# Streaming CSV source: apply the explicit schema and admit at most one new
# file per trigger.
sdf = (
    spark.readStream
    .schema(dataSchema)
    .option("maxFilesPerTrigger", 1)
    .csv("/home/jovyan/data/airline/2008_stream_till_8_2e.csv")
)

# Keep every source column and derive event_time by casting the scaled
# Timestamp value to a Spark timestamp.
# NOTE(review): whether 10.0 is the right divisor depends on the unit of
# Timestamp, which is not visible here. The alternative that was considered is
# "cast(cast(Timestamp as double) / 1000 as timestamp) as event_time" - confirm.
newsdf = sdf.selectExpr("*", "cast(Timestamp / 10.0 as timestamp) as event_time")

# Print the resulting schema so the new event_time column can be checked.
newsdf.printSchema()


# Average ArrDelay and DepDelay for each (10-second event-time window,
# ArrTime, Dest) group, and publish the complete result table to the
# in-memory sink under a name that spark.sql() can query.
# NOTE(review): the query name says "per dest", yet ArrTime is also a grouping
# key - confirm that grouping by ArrTime is intended.
result = (
    newsdf
    .groupBy(window(col("event_time"), "10 seconds"), "ArrTime", "Dest")
    .avg("ArrDelay", "DepDelay")
    .writeStream
    .outputMode("complete")
    .format("memory")
    .queryName("avg_arr_dep_delay_per_dest")
    .start()
)



# Poll the in-memory result table ten times, ten seconds apart, showing the
# current aggregation state on each pass.
try:
    for poll_round in range(10):
        spark.sql("SELECT * FROM avg_arr_dep_delay_per_dest").show()
        sleep(10)
except KeyboardInterrupt:
    # A manual kernel interrupt only ends the polling early; cleanup happens
    # in the finally clause below.
    pass
finally:
    # Always stop the streaming query - after the last poll, on interrupt, or
    # when show()/sql() raises - so it does not keep running in the
    # background until the whole session is shut down.
    result.stop()
    print("Stopped the datastream")
    

+---+-----+
| gt|count|
+---+-----+
+---+-----+

+---+-----+
| gt|count|
+---+-----+
+---+-----+

+---+-----+
| gt|count|
+---+-----+
+---+-----+

+---+-----+
| gt|count|
+---+-----+
+---+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|  stairsup|10452|
|       sit|12310|
|     stand|11385|
|      walk|13256|
|      bike|10797|
|stairsdown| 9365|
|      null|10447|
+----------+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|  stairsup|20905|
|       sit|24620|
|     stand|22770|
|      walk|26512|
|      bike|21594|
|stairsdown|18729|
|      null|20894|
+----------+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|  stairsup|31357|
|       sit|36929|
|     stand|34154|
|      walk|39768|
|      bike|32390|
|stairsdown|28094|
|      null|31343|
+----------+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|  stairsup|31357|
|       sit|36929|
|     stand|34154|
|      walk|39768|
|      bike|32390|
|stairsdown|28094|
|  

In [6]:
# Shut down the SparkSession created in the first cell (this also stops its
# underlying SparkContext). Re-run the setup cell before using `spark` again.
spark.stop()