In [5]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import window, col, avg
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, ShortType, FloatType, ByteType, IntegerType
from time import sleep

# Cluster endpoint, application name and resource settings for this job.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Assignment2_Stream_memory")
    .setAll([
        ("spark.driver.memory", "2g"),
        ("spark.executor.cores", "1"),
        ("spark.driver.cores", "1"),
    ])
)

# The SparkSession is the entry point to the Spark SQL engine;
# getOrCreate() hands back an existing session when one is already active.
spark = (
    SparkSession.builder
    .config(conf=sparkConf)
    .getOrCreate()
)

# Explicit schema for the incoming CSV records; every field is declared nullable.
dataSchema = (
    StructType()
    .add("ArrDelay", FloatType(), True)
    .add("ArrTime", FloatType(), True)
    .add("DepDelay", FloatType(), True)
    .add("Dest", StringType(), True)
    .add("Timestamp", LongType(), True)
)

# Streaming source: CSV files from the airline directory, parsed with the
# explicit schema and limited to at most one new file per trigger.
sdf = (
    spark.readStream
    .schema(dataSchema)
    .option("maxFilesPerTrigger", 1)
    .csv("/home/jovyan/data/airline/")
)

# Keep every input column and add event_time, the Timestamp field cast to a
# timestamp type.
newsdf = sdf.selectExpr("*", "cast(Timestamp as timestamp) as event_time")

newsdf.printSchema()

# Mean arrival and departure delay for each combination of 10-second
# event-time window, ArrTime and Dest.
result = (
    newsdf
    .groupBy(window(col("event_time"), "10 seconds"), "ArrTime", "Dest")
    .agg(
        avg("ArrDelay").alias("avg_ArrDelay"),
        avg("DepDelay").alias("avg_DepDelay"),
    )
)


# Reference for naming aggregate columns after a groupBy:
# https://sparkbyexamples.com/pyspark/pyspark-column-alias-after-groupby/

# Map the gs:// scheme to the GCS connector classes in the Hadoop configuration.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for _gs_key, _gs_impl in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(_gs_key, _gs_impl)

# Session-level `temporaryGcsBucket` setting (presumably the staging bucket
# used for the BigQuery writes -- TODO confirm against the connector docs).
bucket = "airplane_chris_ass2"
spark.conf.set("temporaryGcsBucket", bucket)

# Disabled alternative (incomplete snippet, kept for reference): writing the
# stream straight to a BigQuery sink.
#   result.writeStream.format('bigquery')
#       .option('table', 'prefab-clover-330908.airline_2.stream_chris_ass')
#       .outputMode("complete")


def my_foreach_batch_function(df, batch_id):
    """Save one micro-batch to BigQuery using the batch writer API.

    The batch is written with mode "overwrite" to the table
    'prefab-clover-330908.airline_2.streaming_avg'.

    Args:
        df: The data for the current micro-batch; only its ``write``
            attribute is used.
        batch_id: Identifier of the micro-batch. Not used by this function.
    """
    target_table = 'prefab-clover-330908.airline_2.streaming_avg'

    # Build the writer one step at a time, then trigger the save.
    writer = df.write.format('bigquery')
    writer = writer.option('table', target_table)
    writer = writer.mode("overwrite")
    writer.save()

# Start the streaming query: complete output mode, a 2-second processing-time
# trigger, and each micro-batch handed to my_foreach_batch_function.
airlineQuery = (
    result.writeStream
    .outputMode("complete")
    .trigger(processingTime="2 seconds")
    .foreachBatch(my_foreach_batch_function)
    .start()
)

try:
    # Block here until the query terminates.
    airlineQuery.awaitTermination()
except KeyboardInterrupt:
    # On a manual interrupt, stop the query first and then Spark itself.
    airlineQuery.stop()
    spark.stop()
    print("Stopped the streaming query and the spark context")

+---+-----+
| gt|count|
+---+-----+
+---+-----+

+---+-----+
| gt|count|
+---+-----+
+---+-----+

+---+-----+
| gt|count|
+---+-----+
+---+-----+

+---+-----+
| gt|count|
+---+-----+
+---+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|  stairsup|10452|
|       sit|12310|
|     stand|11385|
|      walk|13256|
|      bike|10797|
|stairsdown| 9365|
|      null|10447|
+----------+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|  stairsup|20905|
|       sit|24620|
|     stand|22770|
|      walk|26512|
|      bike|21594|
|stairsdown|18729|
|      null|20894|
+----------+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|  stairsup|31357|
|       sit|36929|
|     stand|34154|
|      walk|39768|
|      bike|32390|
|stairsdown|28094|
|      null|31343|
+----------+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|  stairsup|31357|
|       sit|36929|
|     stand|34154|
|      walk|39768|
|      bike|32390|
|stairsdown|28094|
|  

In [6]:
# Stop the Spark session created above; per the PySpark API this also stops
# its underlying SparkContext.
spark.stop()