In [1]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [2]:
# Read the load table and the renewable-energy table from processed_data/
main_df, renewable_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "processed_data/data_2006_2023.csv",
        "processed_data/renwableEnergy.csv",
    )
)

In [3]:
# Preview the raw load table (pandas truncates the middle rows of a long frame)
main_df

Unnamed: 0,loadConsumption,date,datetime,Hour,Year
0,10215.0000,2006-01-01,2005-12-31 23:00:00,0,2006
1,9979.0000,2006-01-01,2006-01-01 00:00:00,1,2006
2,9460.0000,2006-01-01,2006-01-01 01:00:00,2,2006
3,8833.0000,2006-01-01,2006-01-01 02:00:00,3,2006
4,8525.0000,2006-01-01,2006-01-01 03:00:00,4,2006
...,...,...,...,...,...
157767,12673.4650,2023-12-31,2023-12-31 19:00:00,20,2023
157768,12366.1950,2023-12-31,2023-12-31 20:00:00,21,2023
157769,12143.1775,2023-12-31,2023-12-31 21:00:00,22,2023
157770,11993.1200,2023-12-31,2023-12-31 22:00:00,23,2023


In [4]:
# Convert the "datetime" column from its CSV text form to pandas datetimes
main_df = main_df.assign(datetime=pd.to_datetime(main_df["datetime"]))

In [5]:
# Index the frame by timestamp. Reassigning (rather than inplace=True) and
# guarding on the column's presence keeps this cell safe to re-run: a second
# in-place set_index would raise KeyError because "datetime" is already the index.
if "datetime" in main_df.columns:
    main_df = main_df.set_index("datetime")

In [6]:
# Total load per calendar day: sum loadConsumption over all rows sharing a "date"
dailyLoad_data = main_df.groupby("date", as_index=False)["loadConsumption"].sum()

In [7]:
# Preview the per-day totals (one row per date; display is truncated by pandas)
dailyLoad_data

Unnamed: 0,date,loadConsumption
0,2006-01-01,255022.0000
1,2006-01-02,314663.0000
2,2006-01-03,326082.0000
3,2006-01-04,328763.0000
4,2006-01-05,329469.0000
...,...,...
6569,2023-12-27,305610.2900
6570,2023-12-28,326312.3400
6571,2023-12-29,314575.3300
6572,2023-12-30,290563.1625


In [8]:
# Parse the date column, then derive the calendar year from it.
# "Year" is the key later used to join the yearly tables onto the daily rows.
dailyLoad_data = dailyLoad_data.assign(date=pd.to_datetime(dailyLoad_data['date']))
dailyLoad_data = dailyLoad_data.assign(Year=dailyLoad_data['date'].dt.year)

# Work on an independent copy from here on
daily_data = dailyLoad_data.copy()

In [9]:
# Attach the renewable table to every daily row of the matching year (left
# join on "Year"), then drop any row that ends up with a missing value.
merged_res = daily_data.merge(renewable_df, on="Year", how="left").dropna()

# Renewable columns offered to the model as exogenous regressors
renewable_features = ['TotalSolarEnergy', 'Totaal windenergie', 'TotalBiomass']

# Two date-indexed frames over the same rows: load only, and load + renewables
df_no_ren = merged_res[['date', 'loadConsumption']].set_index('date')
df_with_ren = merged_res[['date', 'loadConsumption', *renewable_features]].set_index('date')

In [30]:
# Function to evaluate SARIMAX for given dataset
def evaluate_sarimax(df, exog_features=None, forecast_horizon=1):
    """Fit SARIMAX on the first 80% of ``df`` and score a forecast on the rest.

    The split is positional (``iloc``), so rows are expected to already be in
    chronological order. The model is SARIMAX(1, 0, 1)x(1, 0, 1, 7) with
    stationarity and invertibility constraints disabled.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'loadConsumption'`` column plus every column named in
        ``exog_features``.
    exog_features : list of str, optional
        Columns passed to the model as exogenous regressors. ``None`` or an
        empty list fits the model without regressors.
    forecast_horizon : int, default 1
        Number of steps to forecast; scored against the first
        ``forecast_horizon`` rows of the held-out 20%.

    Returns
    -------
    dict
        ``'nRMSE (%)'``, ``'nMAE (%)'`` (RMSE / MAE divided by the mean of the
        actual values) and ``'MAPE (%)'``, each rounded to two decimals.

    Raises
    ------
    ValueError
        If ``forecast_horizon`` is below 1 or exceeds the size of the test split.
    """
    # Train/test split (80/20)
    split_idx = int(len(df) * 0.8)
    train = df.iloc[:split_idx]
    test = df.iloc[split_idx:]

    # Fail fast, before the (slow) fit, when the horizon cannot be scored.
    if forecast_horizon < 1 or forecast_horizon > len(test):
        raise ValueError(
            f"forecast_horizon must be between 1 and {len(test)} "
            f"(size of the test split), got {forecast_horizon}"
        )

    use_exog = bool(exog_features)

    # Fit SARIMAX
    model = SARIMAX(
        train['loadConsumption'],
        exog=train[exog_features] if use_exog else None,
        order=(1, 0, 1),
        seasonal_order=(1, 0, 1, 7),
        enforce_stationarity=False,
        enforce_invertibility=False
    )
    model_fit = model.fit(disp=False)

    # Forecast.
    # NOTE(review): the regressors for the forecast window are read from the
    # held-out rows, i.e. future exogenous values are treated as known.
    forecast = model_fit.forecast(
        steps=forecast_horizon,
        exog=test[exog_features].iloc[:forecast_horizon] if use_exog else None
    )

    # Ground truth
    actual = test['loadConsumption'].iloc[:forecast_horizon]

    # Compare by position on plain arrays: Series arithmetic aligns on the
    # index and would silently yield NaN if the forecast index ever differs
    # from the test index (e.g. when the date frequency cannot be inferred).
    actual_values = actual.to_numpy(dtype=float)
    forecast_values = np.asarray(forecast, dtype=float)

    # Evaluation metrics. RMSE is taken as sqrt(MSE) because the
    # `squared=False` argument was deprecated and later removed in scikit-learn.
    rmse = np.sqrt(mean_squared_error(actual_values, forecast_values))
    mae = mean_absolute_error(actual_values, forecast_values)
    mape = np.mean(np.abs((actual_values - forecast_values) / actual_values)) * 100

    # Normalize by mean of actuals
    mean_val = actual_values.mean()
    nrmse = (rmse / mean_val) * 100
    nmae = (mae / mean_val) * 100

    return {
        'nRMSE (%)': round(float(nrmse), 2),
        'nMAE (%)': round(float(nmae), 2),
        'MAPE (%)': round(float(mape), 2)
    }

# Score both feature sets, first one step ahead and then 180 steps ahead
results_no_ren_h1 = evaluate_sarimax(df_no_ren, forecast_horizon=1)
results_with_ren_h1 = evaluate_sarimax(df_with_ren, renewable_features, 1)

results_no_ren_h180 = evaluate_sarimax(df_no_ren, forecast_horizon=180)
results_with_ren_h180 = evaluate_sarimax(df_with_ren, renewable_features, 180)

# Print one labelled line per run, in the order the runs were made
for run_label, run_metrics in (
    ("SARIMAX without renewables h:1:", results_no_ren_h1),
    ("SARIMAX with renewables h:1:", results_with_ren_h1),
    ("SARIMAX without renewables h:180 :", results_no_ren_h180),
    ("SARIMAX with renewables h:180:", results_with_ren_h180),
):
    print(run_label, run_metrics)



  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


SARIMAX without renewables h:1: {'nRMSE (%)': 3.25, 'nMAE (%)': 3.25, 'MAPE (%)': 3.25}
SARIMAX with renewables h:1: {'nRMSE (%)': 3.89, 'nMAE (%)': 3.89, 'MAPE (%)': 3.89}
SARIMAX without renewables h:180 : {'nRMSE (%)': 11.87, 'nMAE (%)': 9.79, 'MAPE (%)': 9.42}
SARIMAX with renewables h:180: {'nRMSE (%)': 6.78, 'nMAE (%)': 5.5, 'MAPE (%)': 5.36}


In [31]:
# Reduce GDP to a single mean value per Year
# (presumably GDP.csv carries several rows per year — confirm against the file)
gdp = (
    pd.read_csv("processed_data/GDP.csv")
    .groupby("Year", as_index=False)['GDP']
    .mean()
)
pop = pd.read_csv("processed_data/populationNL.csv")

# Combine both tables on Year (inner join: only years present in both remain)
socio = gdp.merge(pop, on='Year')

In [32]:
# Attach each year's socio-economic values to the daily rows of that year,
# then drop rows left with any missing value
socia_df = daily_data.merge(socio, on='Year', how='left').dropna()

In [33]:
# Columns handed to evaluate_sarimax as exogenous regressors for the socio runs
socio_features = ['GDP', 'Population']

In [34]:
# Create datasets
# set_index is chained so that df_socio is a fresh frame; calling
# set_index(inplace=True) on a bare column selection of socia_df mutates a
# slice and can trigger pandas' SettingWithCopyWarning.
df_socio = socia_df[['date', 'loadConsumption'] + socio_features].set_index('date')

# Daily aggregation
# NOTE(review): the evaluation cell that follows is run on df_socio, not on
# df_socio_daily — confirm whether this resampled frame is still needed.
df_socio_daily = df_socio.resample('D').mean().dropna()

In [35]:
# Score the socio-economic regressors at a 1-step and a 180-step horizon
results_with_socio_1 = evaluate_sarimax(df_socio, socio_features, 1)
results_with_socio_180 = evaluate_sarimax(df_socio, socio_features, 180)

for run_label, run_metrics in (
    ("SARIMAX with socio_1:", results_with_socio_1),
    ("SARIMAX with socio_180   :", results_with_socio_180),
):
    print(run_label, run_metrics)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


SARIMAX with socio_1: {'nRMSE (%)': 4.0, 'nMAE (%)': 4.0, 'MAPE (%)': 4.0}
SARIMAX with socio_180   : {'nRMSE (%)': 7.07, 'nMAE (%)': 5.73, 'MAPE (%)': 5.58}
