In [None]:
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql.types import * # Import all types

# Point the executor to the same Python interpreter the driver is using
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
from pyspark.sql.functions import input_file_name

from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType




# Create (or reuse, if one already exists in this kernel) the Spark session
# for this notebook. It runs against the "local[*]" master under the
# application name "StreamingWeather".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("StreamingWeather")
    .getOrCreate()
)

# Explicit schema for the weather CSV files, listed in column order as
# (column name, Spark type) pairs. Every column is declared nullable.
_weather_columns = [
    ("STATION", StringType()),
    ("DATE", StringType()),
    ("LATITUDE", DoubleType()),
    ("LONGITUDE", DoubleType()),
    ("ELEVATION", DoubleType()),
    ("NAME", StringType()),
    ("TEMP", DoubleType()),
    ("TEMP_ATTRIBUTES", StringType()),
    ("DEWP", DoubleType()),
    ("DEWP_ATTRIBUTES", StringType()),
    ("SLP", DoubleType()),
    ("SLP_ATTRIBUTES", StringType()),
    ("STP", DoubleType()),
    ("STP_ATTRIBUTES", StringType()),
    ("VISIB", DoubleType()),
    ("VISIB_ATTRIBUTES", StringType()),
    ("WDSP", DoubleType()),
    ("WDSP_ATTRIBUTES", StringType()),
    ("MXSPD", DoubleType()),
    ("GUST", DoubleType()),
    ("MAX", DoubleType()),
    ("MAX_ATTRIBUTES", StringType()),
    ("MIN", DoubleType()),
    ("MIN_ATTRIBUTES", StringType()),
    ("PRCP", DoubleType()),
    ("PRCP_ATTRIBUTES", StringType()),
    ("SNDP", DoubleType()),
    # NOTE(review): if this is NOAA GSOD data, FRSHTT is a six-digit flag
    # string and an integer parse drops its leading zeros — confirm whether
    # StringType is wanted here.
    ("FRSHTT", IntegerType()),
]

schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in _weather_columns
    ]
)

# Open a streaming CSV source over the HDFS glob below. The explicit schema
# is supplied up front, the first line of each file is treated as a header,
# and only files whose names match "*.csv" are picked up.
_weather_stream_path = "hdfs://localhost:9000/streaming_weather/*"

df_stream = (
    spark.readStream
    .schema(schema)
    .option("pathGlobFilter", "*.csv")
    .option("header", True)
    .csv(_weather_stream_path)
)

# --- 2. Define Alerting Logic ---
# Alert thresholds. Units follow the alert messages printed by this notebook
# (°F, mph, in).
TEMP_ALERT_F = 95.0
GUST_ALERT_MPH = 30.0
PRCP_ALERT_IN = 0.5

# Placeholder values that mean "no reading". The column layout matches NOAA
# GSOD, which encodes missing TEMP as 9999.9, missing GUST as 999.9 and
# missing PRCP as 99.99. Without these bounds a missing gust (999.9) is
# compared as a real 999.9 mph gust and raises a false alert.
# NOTE(review): confirm the sentinels against the feed's documentation.
TEMP_MISSING = 9999.9
GUST_MISSING = 999.9
PRCP_MISSING = 99.99

# A row is an alert when at least one reading is above its threshold AND is a
# real measurement (strictly below its missing-value placeholder). Null
# readings compare as null and therefore never satisfy their condition.
alerts_df = df_stream.filter(
    ((df_stream["TEMP"] > TEMP_ALERT_F) & (df_stream["TEMP"] < TEMP_MISSING))
    | ((df_stream["GUST"] > GUST_ALERT_MPH) & (df_stream["GUST"] < GUST_MISSING))
    | ((df_stream["PRCP"] > PRCP_ALERT_IN) & (df_stream["PRCP"] < PRCP_MISSING))
)

# --- 3. Create a Function to Process and Print Alerts ---
def process_alerts(batch_df, epoch_id):
    """Print the alerts contained in one micro-batch.

    Intended as a ``foreachBatch`` callback: ``batch_df`` holds only the rows
    of the current micro-batch that passed the alert filter.

    Args:
        batch_df: DataFrame for the current micro-batch; must expose the
            NAME, DATE, TEMP, GUST and PRCP columns.
        epoch_id: Identifier of the micro-batch, used only in the printed
            header lines.

    Returns:
        None. Output is written to stdout.
    """
    # Collect once and branch on the result. Calling count() and then
    # collect() would run two separate Spark actions and evaluate the same
    # micro-batch twice.
    alerts_to_send = batch_df.select("NAME", "DATE", "TEMP", "GUST", "PRCP").collect()

    if not alerts_to_send:
        # This will print for any batch that has no alerts
        print(f"--- Batch {epoch_id}: No alerts. Conditions normal. ---")
        return

    print(f"--- Batch {epoch_id}: 🚨 SEVERE WEATHER ALERTS DETECTED! 🚨 ---")
    for alert in alerts_to_send:
        print(f"  -> ALERT at {alert['NAME']} on {alert['DATE']}: Temp={alert['TEMP']}°F, Gust={alert['GUST']}mph, Rain={alert['PRCP']}in")

# --- 4. Start the Streaming Query ---
# Each micro-batch of filtered rows is handed to process_alerts.
query = (
    alerts_df.writeStream
    .foreachBatch(process_alerts)
    .outputMode("append")
    .start()
)

# awaitTermination() blocks this cell until the query ends. Stop the query on
# the way out so that interrupting the kernel does not leave it running in the
# background, where a re-run of this cell would start a second copy.
try:
    query.awaitTermination()
except KeyboardInterrupt:
    print("Streaming query interrupted by user; stopping.")
finally:
    query.stop()




--- Batch 0: 🚨 SEVERE WEATHER ALERTS DETECTED! 🚨 ---
  -> ALERT at LOS ANGELES WHITEMAN AIRPORT, CA US on 2025-01-03: Temp=64.0°F, Gust=999.9mph, Rain=0.0in
  -> ALERT at LOS ANGELES WHITEMAN AIRPORT, CA US on 2025-01-06: Temp=68.0°F, Gust=999.9mph, Rain=0.0in
  -> ALERT at LOS ANGELES WHITEMAN AIRPORT, CA US on 2025-01-01: Temp=62.6°F, Gust=999.9mph, Rain=0.0in
  -> ALERT at LOS ANGELES WHITEMAN AIRPORT, CA US on 2025-01-05: Temp=65.4°F, Gust=999.9mph, Rain=0.0in
--- Batch 1: 🚨 SEVERE WEATHER ALERTS DETECTED! 🚨 ---
  -> ALERT at LOS ANGELES WHITEMAN AIRPORT, CA US on 2025-01-07: Temp=63.7°F, Gust=68.0mph, Rain=0.0in
  -> ALERT at LOS ANGELES WHITEMAN AIRPORT, CA US on 2025-01-08: Temp=62.8°F, Gust=40.0mph, Rain=0.0in
--- Batch 2: 🚨 SEVERE WEATHER ALERTS DETECTED! 🚨 ---
  -> ALERT at LOS ANGELES WHITEMAN AIRPORT, CA US on 2025-01-11: Temp=63.6°F, Gust=42.9mph, Rain=0.0in
  -> ALERT at LOS ANGELES WHITEMAN AIRPORT, CA US on 2025-01-12: Temp=62.9°F, Gust=40.0mph, Rain=0.0in
  -> ALERT at

In [None]:
# Separate cell: snapshot of the current contents of `live_weather_updates`.
# Re-run it to refresh the view; columns are printed in full (no truncation).
#
# NOTE(review): no cell visible here registers a table or view named
# `live_weather_updates` — confirm it is created elsewhere (e.g. by a memory
# sink with that query name). The preceding cell also blocks on
# awaitTermination(), so this cell cannot run while that one is active.
live_updates_df = spark.sql("SELECT * FROM live_weather_updates")
live_updates_df.show(truncate=False)