# Harnessing Weather Insights for Accurate Energy Load Forecasting

In [None]:
# Install the packages listed in requirements.txt via the IPython %pip magic
%pip install -r requirements.txt

### Import important libraries

In [14]:
import os
import platform
import findspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [None]:
# Point findspark at the Spark installation, using its full path.
# Anything that is not Windows is assumed to use the /usr/local layout.
if platform.system() != 'Windows':
    findspark.init("/usr/local/spark/")
else:
    print("Windows OS detected")
    # Installation path on the author's local Windows machine
    findspark.init("C:/Spark/spark-3.5.4-bin-hadoop3")

In [16]:
# Create (or reuse) a local SparkSession. The spark-xml package is requested
# through spark.jars.packages so that XML sources can be read.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Linear Regression Model")
    .config("spark.executor.memory", "1gb")
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.15.0")
    .getOrCreate()
)

# The SparkContext is the main entry point for Spark functionality: it
# represents the connection to the Spark cluster and is used to create RDDs
# and broadcast variables on that cluster.
sc = spark.sparkContext

### Preprocessing I: Read Weather Data

In [None]:
# Read in the weather data
#
# Folder structure
# data
# |-- geosphere
# |   |-- YYYY
# |      |-- MM.csv
# |      |-- MM.csv
# |
# |-- transparency
# |   |-- YYYY
# |      |-- MM.xml
# |      |-- MM.xml

# Combined weather DataFrame; stays None when no CSV file is found.
weather = None

# Walk the year folders and month files in sorted order. os.listdir() returns
# entries in arbitrary order, which would make the row order of the combined
# DataFrame (and the preview printed below) vary between runs and machines.
for year_folder in sorted(os.listdir("./data/geosphere/")):
    year_path = os.path.join("./data/geosphere/", year_folder)

    # Skip stray files sitting next to the year folders
    if not os.path.isdir(year_path):
        continue

    for month_file in sorted(os.listdir(year_path)):
        if not month_file.endswith(".csv"):
            continue

        filepath = os.path.join(year_path, month_file)
        df = spark.read.csv(filepath, header=True, inferSchema=True)

        # Combine by column name rather than by position, so a monthly file
        # whose header lists the columns in a different order is not
        # silently misaligned with the data read so far.
        weather = df if weather is None else weather.unionByName(df)

# Preview the combined data and its inferred schema
if weather is not None:
    weather.show(100)
    weather.printSchema()
else:
    print("No data found")

### Preprocessing II: Read Energy Data

In [None]:
# Loop through the transparency folder and read in the energy data

# Combined load DataFrame; stays None when no XML file is found.
Load = None

# Walk the year folders and month files in sorted order so the combined
# DataFrame is built in a reproducible order (os.listdir() order is arbitrary).
for year_folder in sorted(os.listdir("./data/transparency/")):
    year_path = os.path.join("./data/transparency/", year_folder)

    # Skip stray files sitting next to the year folders
    if not os.path.isdir(year_path):
        continue

    for month_file in sorted(os.listdir(year_path)):
        if not month_file.endswith(".xml"):
            continue

        filepath = os.path.join(year_path, month_file)
        print(f"Reading transparency data: {filepath}")

        # Read XML data: one row per GL_MarketDocument element
        df = spark.read.format('xml').option("rowTag", "GL_MarketDocument").load(filepath)

        # Pull out the period metadata and flatten the Point array so that
        # every measurement becomes its own row.
        df_filtered = df.select(
            col("TimeSeries.Period.timeInterval.start").alias("start_time"),
            col("TimeSeries.Period.timeInterval.end").alias("end_time"),
            col("TimeSeries.Period.resolution").alias("resolution"),
            explode(col("TimeSeries.Period.Point")).alias("Point")  # Flatten Points
        ).select(
            col("start_time"),
            col("end_time"),
            col("resolution"),
            col("Point.position").cast("int").alias("position"),
            col("Point.quantity").cast("double").alias("quantity")
        )

        # Derive the step between two points from the period's resolution
        # instead of hard-coding it. The resolution is assumed to be an
        # ISO 8601 duration expressed in minutes (e.g. "PT15M", "PT60M");
        # anything else falls back to the previous fixed step of 15 minutes.
        # nullif() turns the empty string that regexp_extract returns on a
        # non-match into NULL so the cast never sees an invalid number.
        df_fixed = df_filtered.withColumn(
            "resolution_minutes",
            expr(
                "coalesce("
                "cast(nullif(regexp_extract(resolution, '^PT([0-9]+)M$', 1), '') as int), "
                "15)"
            )
        ).withColumn(
            "actual_time",
            # position is 1-based, so the first point sits exactly on start_time
            expr(
                "start_time + "
                "make_interval(0, 0, 0, 0, 0, (position - 1) * resolution_minutes, 0)"
            )
        ).select(
            col("actual_time"),
            col("position"),
            col("quantity")
        )

        # Append to the Load DataFrame; the three selected columns are always
        # in the same order, so a positional union is safe here.
        Load = df_fixed if Load is None else Load.union(df_fixed)

# Show final merged DataFrame
if Load is not None:
    Load.show(1000)
    Load.printSchema()
else:
    print("No data found.")
