# Harnessing Weather Insights for Accurate Energy Load Forecasting

In [None]:
%pip install -r requirements.txt

### Import important libraries

In [71]:
import os
import platform
import findspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.linalg import DenseVector

from pathlib import Path

In [51]:
# Initialize via the full spark path
if platform.system() == 'Windows':
    print("Windows OS detected")
    findspark.init("C:/Spark/spark-3.5.4-bin-hadoop3") # For my local machine
else:
    findspark.init("/usr/local/spark/")

Windows OS detected


In [52]:
# Build the SparkSession
spark = SparkSession.builder \
      .master("local") \
      .appName("Linear Regression Model") \
      .config("spark.executor.memory", "1gb") \
      .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.15.0") \
      .config("spark.sql.session.timeZone", "UTC") \
      .getOrCreate()
   
# Main entry point for Spark functionality. A SparkContext represents the
# connection to a Spark cluster, and can be used to create :class:`RDD` and
# broadcast variables on that cluster.      
sc = spark.sparkContext

### Preprocessing I: Read in Weather and Load Data

In [None]:
# Read in the data

# Folder Structure
# data
# |-- geosphere
# |   |-- YYYY
# |      |-- MM.csv
# |      |-- MM.csv
# |
# |-- transparency
# |   |-- YYYY
# |      |-- MM.xml
# |      |-- MM.xml

# Loop through the geosphere folder and read in the data

# Define the base data folder
base_path = Path("./data/geosphere")

# Collect all data frames first to optimize the union operation
dfs = []

for year_folder in base_path.iterdir():
    if year_folder.is_dir():
        for month_file in year_folder.glob("*.csv"):
            print(f"Reading in {month_file}")

            df = spark.read.csv(str(month_file), header=True, inferSchema=True)
            
            # Convert the time column (string) to a timestamp
            df = df.withColumn("time", to_timestamp(col("time"), "yyyy-MM-dd'T'HH:mmXXX"))

            dfs.append(df)

if dfs:
    # Combine all DataFrames
    weather = dfs[0]
    for df in dfs[1:]:
        weather = weather.union(df)

    # Aggregate measurements (average from different stations)
    weather = (
        weather.groupBy("time")
        .agg(
            avg("rr").alias("avg_rr"),
            avg("tl_mittel").alias("avg_tl_mittel"),
            avg("bewm_mittel").alias("avg_bewm_mittel"),
            avg("so_h").alias("avg_so_h"),
            avg("vv_mittel").alias("avg_vv_mittel"),
        )
        .orderBy("time")
    )

    weather.show(10)
    weather.printSchema()
else:
    print("No data found")


Reading in data\geosphere\2024\01.csv
Reading in data\geosphere\2024\02.csv
Reading in data\geosphere\2024\03.csv
Reading in data\geosphere\2024\04.csv
Reading in data\geosphere\2024\05.csv
Reading in data\geosphere\2024\06.csv
Reading in data\geosphere\2024\07.csv
Reading in data\geosphere\2024\08.csv
Reading in data\geosphere\2024\09.csv
Reading in data\geosphere\2024\10.csv
Reading in data\geosphere\2024\11.csv
Reading in data\geosphere\2024\12.csv
+-------------------+-------------------+-------------------+---------------+-------------------+------------------+
|               time|             avg_rr|      avg_tl_mittel|avg_bewm_mittel|           avg_so_h|     avg_vv_mittel|
+-------------------+-------------------+-------------------+---------------+-------------------+------------------+
|2024-01-01 00:00:00|                0.2|                4.0|           66.5|               3.35|               3.3|
|2024-01-02 00:00:00|                3.8| 2.1999999999999997|           85.0

In [76]:
# Loop through the transparency folder and read in the energy data

# Define base path for transparency data
base_path = Path("./data/transparency")

# Collect DataFrames before performing union (optimization)
dfs = []

for year_folder in base_path.iterdir():
    if year_folder.is_dir():
        for month_file in year_folder.glob("*.xml"):
            print(f"Reading transparency data: {month_file}")

            # Read XML data
            df = spark.read.format('xml').option("rowTag", "GL_MarketDocument").load(str(month_file))

            # Extract and explode relevant fields
            df_filtered = df.select(
                col("TimeSeries.Period.timeInterval.start").alias("start_time"),
                col("TimeSeries.Period.timeInterval.end").alias("end_time"),
                col("TimeSeries.Period.resolution").alias("resolution"),
                explode(col("TimeSeries.Period.Point")).alias("Point")  # Flatten Points
            ).select(
                col("start_time"),
                col("end_time"),
                col("resolution"),
                col("Point.position").cast("int").alias("position"),
                col("Point.quantity").cast("double").alias("quantity")
            )

            # Convert ISO 8601 duration (e.g., "PT15M") to minutes dynamically
            df_fixed = df_filtered.withColumn(
                "interval_minutes",
                expr("CAST(SUBSTRING(resolution, 3, LENGTH(resolution) - 3) AS INT)")  # Extracts "15" from "PT15M"
            ).withColumn(
                "actual_time",
                expr("start_time + (position - 1) * interval_minutes * interval 1 minute")
            ).select(
                col("actual_time"),
                col("quantity")
            )

            # Append DataFrame to list
            dfs.append(df_fixed)

# Merge all collected DataFrames
if dfs:
    Load = dfs[0]
    for df in dfs[1:]:
        Load = Load.union(df)

    Load.show(10)
    Load.printSchema()
else:
    print("No data found.")



Reading transparency data: data\transparency\2024\01.xml
Reading transparency data: data\transparency\2024\02.xml
Reading transparency data: data\transparency\2024\03.xml
Reading transparency data: data\transparency\2024\04.xml
Reading transparency data: data\transparency\2024\05.xml
Reading transparency data: data\transparency\2024\06.xml
Reading transparency data: data\transparency\2024\07.xml
Reading transparency data: data\transparency\2024\08.xml
Reading transparency data: data\transparency\2024\09.xml
Reading transparency data: data\transparency\2024\10.xml
Reading transparency data: data\transparency\2024\11.xml
Reading transparency data: data\transparency\2024\12.xml
+-------------------+--------+
|        actual_time|quantity|
+-------------------+--------+
|2024-01-01 00:00:00|  5578.0|
|2024-01-01 00:15:00|  5511.0|
|2024-01-01 00:30:00|  5444.0|
|2024-01-01 00:45:00|  5390.0|
|2024-01-01 01:00:00|  5424.0|
|2024-01-01 01:15:00|  5350.0|
|2024-01-01 01:30:00|  5290.0|
|2024-

### Preprocessing II: Combine both Data Frames

In [85]:
if Load is not None and weather is not None:
    # Join the data into a single DataFrame
    data = Load.join(weather, Load.actual_time == weather.time, "inner").drop("time")
    
    # Rename columns for better understanding
    data = data.withColumnRenamed("actual_time", "time")
    data = data.withColumnRenamed("quantity", "load")

    data = data.withColumnRenamed("avg_rr", "rainfall")
    data = data.withColumnRenamed("avg_tl_mittel", "temperature")
    data = data.withColumnRenamed("avg_bewm_mittel", "cloudiness")
    data = data.withColumnRenamed("avg_so_h", "sunshine_duration")
    data = data.withColumnRenamed("avg_vv_mittel", "wind_speed")
    
    # Print the schema and stats
    data.describe().show()
    data.show(10)
    data.printSchema()

+-------+------------------+-----------------+------------------+------------------+------------------+------------------+
|summary|              load|         rainfall|       temperature|        cloudiness|          sunshine|        wind_speed|
+-------+------------------+-----------------+------------------+------------------+------------------+------------------+
|  count|               366|              366|               366|               366|               366|               366|
|   mean|5465.7213114754095|2.366666666666667|11.837295081967213| 61.48497267759563|5.1245901639344265|2.3968579234972673|
| stddev| 662.4324448860633|7.928408835741675| 7.767922876423139|23.567110083132476|3.6014679992167693|1.0763445069155058|
|    min|            4386.0|             -1.0|              -6.1|               0.0|               0.0|               0.5|
|    max|            7270.0|82.69999999999999|              26.5|             100.0|             13.55|               8.3|
+-------+-------

### Machine Learning

In [None]:
input_data = df.rdd.map(lambda x: (x[0], DenseVector(x[1:])))