In [1]:
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql.types import * # Import all types

# Point the executor to the same Python interpreter the driver is using
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
spark = SparkSession.builder \
    .appName("StreamingWeather") \
    .master("local[*]") \
    .getOrCreate()

In [3]:
from pyspark.sql.functions import col, month, dayofyear, to_date
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressionModel
# 2. Load the Pre-Trained Decision Tree Model
model_path = "hdfs://localhost:9000/models/dt_weather_model" # <-- Path to the saved DT model
loaded_dt_model = DecisionTreeRegressionModel.load(model_path)
print(f"Successfully loaded Decision Tree model from {model_path}")

Successfully loaded Decision Tree model from hdfs://localhost:9000/models/dt_weather_model


In [4]:
streaming_schema = StructType([
    StructField("ID", StringType(), True),
    StructField("DATE", StringType(), True),
    StructField("TMAX", FloatType(), True),
    StructField("TMIN", FloatType(), True),
    StructField("EVAP", FloatType(), True),       # <-- ADDED
    StructField("PRCP", FloatType(), True),       # <-- ADDED
    StructField("Latitude", FloatType(), True),
    StructField("Longitude", FloatType(), True),
    StructField("Elevation", FloatType(), True),
    StructField("TMAX_actual", FloatType(), True),
    StructField("TMIN_actual", FloatType(), True),
    StructField("TRANGE", FloatType(), True)
])

In [5]:
streaming_dir = "hdfs://localhost:9000/streaming_weather/"
df_stream = spark.readStream \
    .schema(streaming_schema) \
    .option("header", True) \
    .option("pathGlobFilter", "*.csv") \
    .csv(streaming_dir + "*")

In [6]:
def predict_with_model(batch_df, epoch_id):
    if batch_df.count() == 0:
        print(f"--- Batch {epoch_id}: No new data. ---")
        return

    print(f"--- Processing Batch {epoch_id} ---")
    
    # 5.1: Apply the EXACT SAME Feature Engineering
    features_df = batch_df \
        .withColumn("date_formatted", to_date(col("DATE"), "M/d/yyyy")) \
        .withColumn("month", month(col("date_formatted"))) \
        .withColumn("day_of_year", dayofyear(col("date_formatted")))

    assembler = VectorAssembler(
        inputCols=["Latitude", "Longitude", "Elevation", "month", "day_of_year"],
        outputCol="features"
    )
    assembled_df = assembler.transform(features_df)

    # 5.2: Use the Loaded Model to Make Predictions
    predictions = loaded_dt_model.transform(assembled_df)
    
    # 5.3: Display the results
    print("Predictions for this batch:")
    predictions.select("DATE", "ID", "TMAX_actual", "prediction").show()

# 6. Start the Streaming Query
query = df_stream.writeStream \
    .foreachBatch(predict_with_model) \
    .outputMode("append") \
    .start()

query.awaitTermination()

--- Processing Batch 0 ---
Predictions for this batch:
+---------+-----------+-----------+------------------+
|     DATE|         ID|TMAX_actual|        prediction|
+---------+-----------+-----------+------------------+
|3/17/2021|USR0000ALOS|      -11.1|-13.38461736287152|
|3/31/2012|USC00466284|       21.7| 6.134710695439354|
| 4/2/2011|USC00249187|        5.6|0.8878608325323151|
|6/12/1993|USC00104670|       20.0| 2.743734399760597|
+---------+-----------+-----------+------------------+

--- Processing Batch 1 ---
Predictions for this batch:
+----------+-----------+-----------+-------------------+
|      DATE|         ID|TMAX_actual|         prediction|
+----------+-----------+-----------+-------------------+
|12/15/2003|USR0000MPPH|      -10.0| -2.049521042065798|
| 3/14/2006|USR0000ANOR|      -11.1| -13.38461736287152|
| 2/24/2003|USR0000IRAF|       -0.6|  2.743734399760597|
|12/13/1997|USR0000CDOW|        3.3| -1.420782989934751|
| 3/15/1997|USR0000MBIG|        2.2|-0.72859144742

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "C:\Users\Admin\miniconda3\envs\bigdata\lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "C:\Users\Admin\miniconda3\envs\bigdata\lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "C:\Users\Admin\miniconda3\envs\bigdata\lib\socket.py", line 716, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 