In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

import statsmodels.api as sm



In [2]:
# Read the merged datasets, one CSV per time granularity (daily, monthly, yearly).
# Paths are relative to the directory the notebook is run from.
csv_paths = (
    "merged_data_granualityBased/daily_merged_data.csv",
    "merged_data_granualityBased/merged_monthyl_data.csv",
    "merged_data_granualityBased/merged_Yearly_data.csv",
)
daily_data, monthly_data, yearly_data = [pd.read_csv(csv_path) for csv_path in csv_paths]


In [3]:
# Parse the "date" column to datetimes and use it as the index of the daily frame.
# The guard keeps this cell idempotent: once "date" has become the index it is no
# longer a column, so an unguarded re-run would raise KeyError.
if "date" in daily_data.columns:
    daily_data = daily_data.assign(
        date=pd.to_datetime(daily_data["date"])
    ).set_index("date")

In [4]:
# Make sure Population is a float column.
# When the column is not numeric yet, "," characters (presumably thousands
# separators - confirm against the CSV) are stripped before the cast.
# The dtype check makes the cell safe to re-run: calling `.str` on a column that
# is already numeric would raise AttributeError.
population = yearly_data["Population"]
if not pd.api.types.is_numeric_dtype(population):
    # regex=False: "," is a literal character, not a pattern.
    population = population.str.replace(",", "", regex=False)
yearly_data["Population"] = population.astype(float)

In [5]:
# Column the model is fitted on and evaluated against
target_variable = 'loadConsumption'

# Exogenous regressors supplied to the model alongside the target
exog_variables = ['Population', 'TotalSolarEnergy (MWh)', 'Totaal windenergie (MWh)',
                  'AveragePrice_Electricity_NonHousehold', 'AveragePrice_Electricity_Household']

# Chronological train/test split: the last `n_holdout_rows` rows are held out.
# (An earlier comment said "last two years", but the split has always been 4 rows;
# presumably one row per year - confirm against the yearly CSV.)
# Must be >= 1: with 0, `iloc[:-0]` would produce an empty training set.
n_holdout_rows = 4
train_data = yearly_data.iloc[:-n_holdout_rows]
test_data = yearly_data.iloc[-n_holdout_rows:]

# Fit SARIMAX on the training rows only.
# NOTE(review): the seasonal period is 5 while the data is the yearly frame, and
# fitting emits "Too few observations to estimate starting parameters". Consider
# whether a seasonal component is justified for this series.
sarimax_model = sm.tsa.statespace.SARIMAX(train_data[target_variable],
                                          exog=train_data[exog_variables],
                                          order=(1,1,1),  # ARIMA parameters (p,d,q)
                                          seasonal_order=(1,1,1,5),  # Seasonal parameters (P,D,Q,s)
                                          enforce_stationarity=False,
                                          enforce_invertibility=False)

sarimax_results = sarimax_model.fit()


  warn('Too few observations to estimate starting parameters%s.'


In [7]:
# Out-of-sample forecast covering the held-out rows.
# Positions are counted from the start of the training sample, so the first
# forecast step is len(train_data) and the last is one per test row further on.
future_exog = test_data[exog_variables]  # regressors for the forecast horizon
forecast_start = len(train_data)
forecast_end = forecast_start + len(test_data) - 1
yearly_predictions = sarimax_results.predict(
    start=forecast_start,
    end=forecast_end,
    exog=future_exog,
)

# Keep the forecasts on the yearly frame (intended for use in the monthly model)
yearly_data.loc[test_data.index, 'Predicted_Yearly_Demand'] = yearly_predictions


In [8]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Observed target values for the held-out rows (rows with a missing target are
# dropped) and the forecasts selected by the same index labels.
actual_values = test_data[target_variable].dropna()
predicted_values = yearly_predictions.loc[actual_values.index]

# Error metrics on the held-out rows
mae = mean_absolute_error(actual_values, predicted_values)
mse = mean_squared_error(actual_values, predicted_values)
r2 = r2_score(actual_values, predicted_values)
absolute_pct_error = (actual_values - predicted_values).abs() / actual_values
mape = absolute_pct_error.mean() * 100

# Two-column summary table: metric label -> value
scores = {"MAE": mae, "MSE": mse, "R² Score": r2, "MAPE (%)": mape}
metrics_df = pd.DataFrame({
    "Metric": list(scores),
    "Value": list(scores.values()),
})



In [9]:
# Show the evaluation metrics table as this cell's output.
metrics_df

Unnamed: 0,Metric,Value
0,MAE,3199949.0
1,MSE,19077440000000.0
2,R² Score,-1.826509
3,MAPE (%),3.184572
