# Harnessing Weather Insights for Accurate Energy Load Forecasting

In [22]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [23]:
import os

In [19]:
import os
import platform

# Import the findspark module and use it to initialize the PySpark environment
import findspark

# Fallback Spark installation directories, used only when SPARK_HOME is not
# already set in the environment.
DEFAULT_SPARK_HOME_WINDOWS = "C:/Spark/spark-3.5.4-bin-hadoop3"  # local development machine
DEFAULT_SPARK_HOME_OTHER = "/usr/local/spark/"

# Pick the per-OS fallback path
if platform.system() == 'Windows':
    print("Windows OS detected")
    default_spark_home = DEFAULT_SPARK_HOME_WINDOWS
else:
    default_spark_home = DEFAULT_SPARK_HOME_OTHER

# Prefer an explicitly configured SPARK_HOME so the notebook also runs on
# machines where Spark is installed somewhere else; `or` also covers the case
# of the variable being set to an empty string.
findspark.init(os.environ.get("SPARK_HOME") or default_spark_home)


Windows OS detected


In [20]:
# Entry point for the Spark DataFrame / SQL API
from pyspark.sql import SparkSession

# Create the session (or reuse one that is already running in this kernel):
# local master, fixed application name, executor memory set to "1gb".
spark = (
    SparkSession.builder
    .master("local")
    .appName("Linear Regression Model")
    .config("spark.executor.memory", "1gb")
    .getOrCreate()
)

# The SparkContext behind the session: the connection to the Spark cluster,
# used for lower-level work such as creating RDDs and broadcast variables.
sc = spark.sparkContext

In [29]:
# Read in the data

# Folder Structure
# data
# |-- geosphere
# |   |-- YYYY-MM-DD_YYYY-MM-DD.csv
# |   |-- YYYY-MM-DD_YYYY-MM-DD.csv
# |
# |-- transparency
# |   |-- YYYY
# |      |-- MM.csv
# |      |-- MM.csv

GEOSPHERE_DIR = "./data/geosphere"

# Loop through the geosphere folder and combine every CSV file into a single
# DataFrame. `weather` stays None until the first file has been read.
weather = None

# sorted() makes the read order (and therefore the row order) deterministic,
# since os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(GEOSPHERE_DIR)):
    file_path = os.path.join(GEOSPHERE_DIR, filename)

    # Skip sub-directories and anything that is not a CSV export
    if not os.path.isfile(file_path) or not filename.lower().endswith(".csv"):
        continue
    print(file_path)

    # Read the file currently being visited (not a fixed, hard-coded path)
    weather_part = spark.read.csv(file_path, header=True, inferSchema=True)

    # Append it to what has been collected so far; columns are matched by
    # name so a differing column order between files cannot misalign the data.
    weather = weather_part if weather is None else weather.unionByName(weather_part)

# Fail with a clear message instead of an AttributeError on None further down
if weather is None:
    raise FileNotFoundError(f"No CSV files found in {GEOSPHERE_DIR}")

# Print the schema of the DataFrame
weather.printSchema()

# show() prints the first rows itself and returns None, so it is not wrapped
# in print() (that would emit a stray "None" line).
weather.show()

print(weather.count())

./data/geosphere\2024-01-01_2024-12-31 copy.csv
./data/geosphere\2024-01-01_2024-12-31.csv
root
 |-- time: string (nullable = true)
 |-- station: integer (nullable = true)
 |-- rr: double (nullable = true)
 |-- tl_mittel: double (nullable = true)
 |-- bewm_mittel: double (nullable = true)
 |-- so_h: double (nullable = true)
 |-- vv_mittel: double (nullable = true)
 |-- substation: integer (nullable = true)

+--------------------+-------+----+---------+-----------+----+---------+----------+
|                time|station|  rr|tl_mittel|bewm_mittel|so_h|vv_mittel|substation|
+--------------------+-------+----+---------+-----------+----+---------+----------+
|2024-01-01T00:00+...|      1| 0.0|      2.2|       80.0| 2.7|      2.1|     10200|
|2024-01-01T00:00+...|    105| 0.4|      5.8|       53.0| 4.0|      4.5|      5904|
|2024-01-02T00:00+...|      1| 0.1|     -0.4|       73.0| 1.9|      0.6|     10200|
|2024-01-02T00:00+...|    105| 7.5|      4.8|       97.0| 0.0|      2.1|      5904|
|