In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import window, col, avg
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, ShortType, FloatType, ByteType, IntegerType
from time import sleep

# Connection and resource settings for this application, assembled as a single
# fluent chain of SparkConf setters.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Lab7_1")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine; getOrCreate()
# hands back an already-active session when one exists.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Explicit schema for the streamed CSV files, declared as (column name, Spark
# type) pairs. Every column is nullable.
_airline_columns = [
    ("ArrDelay", FloatType()),
    ("ArrTime", FloatType()),
    ("DepDelay", FloatType()),
    ("Dest", StringType()),
    ("Timestamp", LongType()),
]

dataSchema = StructType(
    [StructField(column_name, column_type, True)
     for column_name, column_type in _airline_columns]
)

# Streaming source: CSV files from the airline data directory, parsed with the
# explicit schema and consumed at most one new file per trigger.
csv_stream_reader = spark.readStream.schema(dataSchema).option("maxFilesPerTrigger", 1)
sdf = csv_stream_reader.csv("/home/jovyan/data/airline/")

# Add the event-time column by casting the numeric Timestamp column to a Spark
# timestamp; "*" keeps every original column.
# Alternative expression kept for reference (scales the value by 60 before the
# cast -- presumably for minute-resolution input; confirm against the data):
#    "cast(cast(Timestamp as double) * 60 as timestamp) as event_time"
event_time_expr = "cast(Timestamp as timestamp) as event_time"
newsdf = sdf.selectExpr("*", event_time_expr)

newsdf.printSchema()


# result = newsdf.groupBy(window(col("event_time"), "10 seconds"),"ArrTime", "Dest").avg("ArrDelay", "DepDelay") \
#         .writeStream \
#         .queryName("avg_arr_dep_delay_per_dest") \
#         .format("memory") \
#         .outputMode("complete") \
#         .start()

# Average arrival and departure delay for every combination of 10-second
# event-time window, ArrTime and Dest.
#
# The aggregates are aliased explicitly. The default column names produced by
# GroupedData.avg() are "avg(ArrDelay)" and "avg(DepDelay)"; the parentheses
# are not valid in BigQuery column names (and are rejected by older Spark
# Parquet writers), which breaks the BigQuery sink this result is written to.
result = (
    newsdf
    .groupBy(window(col("event_time"), "10 seconds"), "ArrTime", "Dest")
    .agg(
        avg("ArrDelay").alias("avg_ArrDelay"),
        avg("DepDelay").alias("avg_DepDelay"),
    )
)


# Register the Google Cloud Storage connector classes in the Hadoop
# configuration so that the gs:// scheme can be resolved.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for fs_key, fs_impl in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(fs_key, fs_impl)

# Bucket registered under the session-level 'temporaryGcsBucket' setting.
bucket = "airplane_chris_ass2"
spark.conf.set('temporaryGcsBucket', bucket)

# result.writeStream.format('bigquery') \
#   .option('table', 'prefab-clover-330908.airline_2.stream_chris_ass') \
#   .outputMode("complete") \


def my_foreach_batch_function(df, batch_id):
    """Write one micro-batch to BigQuery using the batch writer API.

    Args:
        df: The DataFrame holding the current micro-batch.
        batch_id: Identifier of the micro-batch. Accepted to match the
            two-argument callback signature; not used here.

    Returns:
        None. The batch is saved to the
        'prefab-clover-330908.airline_2.streaming_avg' table in "overwrite"
        mode, so each call replaces the table's previous contents.
    """
    target_table = 'prefab-clover-330908.airline_2.streaming_avg'

    # Configure the writer step by step, then trigger the actual save.
    batch_writer = df.write.format('bigquery')
    batch_writer = batch_writer.option('table', target_table)
    batch_writer = batch_writer.mode("overwrite")
    batch_writer.save()

# Start the streaming query: every 2 seconds of processing time the complete
# aggregation result is handed to my_foreach_batch_function.
batch_sink = (
    result.writeStream
    .outputMode("complete")
    .trigger(processingTime='2 seconds')
    .foreachBatch(my_foreach_batch_function)
)
airlineQuery = batch_sink.start()




# Block until the streaming query ends. Cleanup lives in `finally` so the query
# and the Spark context are released on every exit path -- a kernel interrupt,
# a query failure raised by awaitTermination(), or a normal return -- instead
# of only when the user interrupts the cell.
try:
    airlineQuery.awaitTermination()
except KeyboardInterrupt:
    # Interrupting the kernel is the expected way to end the stream; swallow
    # the interrupt and fall through to the cleanup below.
    pass
finally:
    airlineQuery.stop()
    # Stop the spark context
    spark.stop()
    print("Stoped the streaming query and the spark context")
# try:
#     for x in range(10):
#         spark.sql("SELECT * FROM avg_arr_dep_delay_per_dest").show()
#         sleep(10)
# except KeyboardInterrupt:
#         result.stop()
#         # stop the Spark context
#         print("Stopped the datastream")
#result.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)

+----+--------------------+-----+---------+------+--------------------+-------------+
| key|               value|topic|partition|offset|           timestamp|timestampType|
+----+--------------------+-----+---------+------+--------------------+-------------+
|null|[C3 AF C2 BB C2 B...| word|        0|     0|2021-10-31 10:02:...|            0|
|null|                [0A]| word|        0|     1|2021-10-31 10:02:...|            0|
|null|[4D 61 6E 79 20 6...| word|        0|     2|2021-10-31 10:02:...|            0|
|null|                [0A]| word|        0|     3|2021-10-31 10:02:...|            0|
|null|[49 6E 20 61 20 6...| word|        0|     4|2021-10-31 10:02:...|            0|
|null|                [0A]| w

In [None]:
# Stop the Spark context behind the `spark` session. Run this cell once the
# streaming query is no longer needed.
spark.stop()