In [1]:
import re
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType, BooleanType, IntegerType, TimestampType, StructField, ArrayType
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import serialization

with open("rsa_key_w.p8", "rb") as key_file:
    p_key = serialization.load_pem_private_key(
        key_file.read(),
        password="IF4044BigData".encode(),
        backend=default_backend()
    )

pkb = p_key.private_bytes(
    encoding=serialization.Encoding.PEM,
    format=serialization.PrivateFormat.PKCS8,
    encryption_algorithm=serialization.NoEncryption()
)
 
pkb = pkb.decode("UTF-8")

pkb = re.sub("-*(BEGIN|END) PRIVATE KEY-*\n","", pkb).replace("\n","")

credentials = {
    'user': 'WILLYWILSEN',
    'password': 'IF4044BigData',
    'account': 'trebcba-op98541',
    'warehouse': 'COMPUTE_WH',
    'database': 'UK_FLOODS',
    'schema': 'PUBLIC',
    'role': 'ACCOUNTADMIN',
    'table': 'FLOOD_WARNING_MONITORING'
}

# Initialize SparkSess
spark = SparkSession.builder \
    .appName("UKFloodsStructuredStreaming") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1,net.snowflake:snowflake-jdbc:3.16.0,net.snowflake:spark-snowflake_2.12:2.15.0-spark_3.4") \
    .getOrCreate()
spark.conf.set("spark.sql.shuffle.partitions", 3)  

# Define schema for nested field floodArea
flood_area_schema = StructType([
    StructField("@id", StringType()),
    StructField("county", StringType()),
    StructField("notation", StringType()),
    StructField("polygon", StringType()),
    StructField("riverOrSea", StringType(), True)  # Optional field, set nullable=True
])

# Define schema for JSON data including nested field floodArea
schema = StructType([
    StructField("@id", StringType()),
    StructField("description", StringType()),
    StructField("eaAreaName", StringType()),
    StructField("eaRegionName", StringType()),
    StructField("floodArea", flood_area_schema),
    StructField("floodAreaID", StringType()),
    StructField("isTidal", BooleanType()),
    StructField("message", StringType()),
    StructField("severity", StringType()),
    StructField("severityLevel", IntegerType()),
    StructField("timeMessageChanged", TimestampType()),
    StructField("timeRaised", TimestampType()),
    StructField("timeSeverityChanged", TimestampType())
])

# Read data from Kafka
kafkaStreamDF = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "uk-flood") \
    .load()

# Parse JSON data
parsedDF = kafkaStreamDF \
    .selectExpr("CAST(value AS STRING)") \
    .select(from_json(col("value"), ArrayType(schema)).alias("data")) \
    .selectExpr("explode(data) AS items") \
    .select("items.*")

# Perform aggregation
current_timestamp = F.current_timestamp()
resultDF = parsedDF.withColumn("currentTimestamp", current_timestamp) \
    .withWatermark("currentTimestamp", "10 seconds") \
    .withColumnRenamed("eaAreaName", "FLOOD_AREA") \
    .withColumnRenamed("severityLevel", "SEVERITY_LEVEL") \
    .groupBy(F.window("currentTimestamp", "10 seconds"), "FLOOD_AREA", "SEVERITY_LEVEL").count() \
    .withColumnRenamed("window", "FLOOD_WARNING_TIMESTAMP") \
    .withColumnRenamed("count", "SEVERITY_COUNT")

# Write the parsed data to console
query = resultDF \
    .writeStream \
    .format("net.snowflake.spark.snowflake") \
    .option("sfUser", credentials['user']) \
    .option("sfPassword", credentials['password']) \
    .option("sfURL", f"https://{credentials['account']}.snowflakecomputing.com") \
    .option("sfDatabase", credentials['database']) \
    .option("sfSchema", credentials['schema']) \
    .option("sfWarehouse", credentials['warehouse']) \
    .option("dbtable", credentials['table']) \
    .option("sfRole", credentials['role']) \
    .option("pem_private_key", pkb) \
    .option("streaming_stage", "SEVERITY_STAGE") \
    .option("checkpointLocation", "/tmp/checkpoint") \
    .outputMode("append") \
    .start()

# Start the streaming query
query.awaitTermination()

StreamingQueryException: [STREAM_FAILED] Query [id = d7a4f92b-30de-49e3-a340-22a5d8140bb4, runId = c9ce1e74-94a7-4ffe-b4dd-91c82105fe5b] terminated with exception: Internal error: fileExists() should not be called for GCS