In [None]:
# WINDOW COUNT - TO BQ

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, ShortType, DateType
from time import sleep

# Spark configuration for this job. SparkConf setters return the conf object
# itself, so the settings are chained into a single expression.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("count_window")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# Create the Spark session from that configuration (getOrCreate reuses an
# already-running session if there is one).
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()


# GCS access: this configuration is needed whenever gs:// paths are used.
# Register the Hadoop file-system implementations for the gs:// scheme.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for _gcs_key, _gcs_impl in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(_gcs_key, _gcs_impl)

# Cloud Storage bucket the connector uses for temporary BigQuery export data.
bucket = "temp_de2024_trs"
spark.conf.set("temporaryGcsBucket", bucket)


####################
# Get data
## Schema of the incoming JSON records: a city name and the outage date.
# NOTE(review): "datetime" is declared as DateType; if the source records carry
# a time-of-day part it is presumably not kept - confirm this is intended.
dataSchema = (
    StructType()
    .add("city", StringType(), nullable=True)
    .add("datetime", DateType(), nullable=True)
)

## Read from a source - File: https://raw.githubusercontent.com/DeskThom/DE2024_assignment2/refs/heads/main/stream/data/outage_data_20241127_121735.json
## Streaming read of the JSON files in the directory, one file per trigger.
sdf = (
    spark.readStream
    .schema(dataSchema)
    .option("maxFilesPerTrigger", 1)
    .json("/home/jovyan/data/stream-data")
)

sdf.printSchema()

##################

# Count all outages per one-day window on the "datetime" column.
# Optionally, cities can be separated by adding "city" to the grouping.
outagesPerWindow = (
    sdf
    .groupBy(window(timeColumn=col("datetime"), windowDuration="1 day"))
    .count()
)


def foreach_batch_function_windowed(df, batch_id):
    """Write one micro-batch of windowed outage counts to BigQuery.

    Intended as a ``foreachBatch`` callback: the micro-batch is handled with
    the regular batch writer API (``write`` / ``save``).

    Args:
        df: DataFrame holding the current micro-batch result.
        batch_id: Identifier of the micro-batch; accepted because
            ``foreachBatch`` passes it, but not used here.
    """
    target_table = 'data-engineering-435508.labdataset.outage_per_window'

    # Build the batch writer step by step: BigQuery sink, target table,
    # then "overwrite" save mode.
    writer = df.write.format('bigquery')
    writer = writer.option('table', target_table)
    writer = writer.mode("overwrite")
    writer.save()

# Write to a sink - here, every micro-batch is handed to
# foreach_batch_function_windowed, which writes it to a BigQuery table.
# "complete" output mode: the whole aggregated result is emitted on each trigger.
# ProcessingTime trigger with a two-second micro-batch interval.
activityQuery = outagesPerWindow.writeStream.outputMode("complete") \
                    .trigger(processingTime = '2 seconds').foreachBatch(foreach_batch_function_windowed).start()

try:
    # Block until the streaming query terminates (or fails).
    activityQuery.awaitTermination()
except KeyboardInterrupt:
    # A manual interrupt is the expected way to end the stream; the actual
    # shutdown is done in the finally clause below.
    pass
finally:
    # Always release the query and the Spark context, including when
    # awaitTermination() raises because the query failed; any such error
    # still propagates after the cleanup.
    activityQuery.stop()
    # Stop the spark context
    spark.stop()
    print("Stopped the streaming query and the spark context")
