In [1]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [6]:
# Read the hourly load history and the renewable-energy table from disk
main_df, renewable_df = map(
    pd.read_csv,
    ("processed_data/data_2006_2023.csv", "processed_data/renwableEnergy.csv"),
)

In [7]:
# Inspect the raw hourly load frame exactly as it was read from the CSV
main_df

Unnamed: 0,loadConsumption,date,datetime,Hour,Year
0,10215.0000,2006-01-01,2005-12-31 23:00:00,0,2006
1,9979.0000,2006-01-01,2006-01-01 00:00:00,1,2006
2,9460.0000,2006-01-01,2006-01-01 01:00:00,2,2006
3,8833.0000,2006-01-01,2006-01-01 02:00:00,3,2006
4,8525.0000,2006-01-01,2006-01-01 03:00:00,4,2006
...,...,...,...,...,...
157767,12673.4650,2023-12-31,2023-12-31 19:00:00,20,2023
157768,12366.1950,2023-12-31,2023-12-31 20:00:00,21,2023
157769,12143.1775,2023-12-31,2023-12-31 21:00:00,22,2023
157770,11993.1200,2023-12-31,2023-12-31 22:00:00,23,2023


In [8]:
# Parse the textual timestamps into real datetime64 values
main_df = main_df.assign(datetime=pd.to_datetime(main_df["datetime"]))

In [9]:
# Index the hourly frame by its timestamp.
# The guard plus rebinding (instead of inplace=True) keeps this cell safe to re-run:
# once "datetime" has moved into the index, a second set_index call would raise KeyError.
if "datetime" in main_df.columns:
    main_df = main_df.set_index("datetime")

In [10]:
# Total load per calendar date, with "date" kept as an ordinary column
dailyLoad_data = main_df.groupby("date", as_index=False)["loadConsumption"].sum()

In [11]:
# Inspect the per-date load totals produced by the groupby above
dailyLoad_data

Unnamed: 0,date,loadConsumption
0,2006-01-01,255022.0000
1,2006-01-02,314663.0000
2,2006-01-03,326082.0000
3,2006-01-04,328763.0000
4,2006-01-05,329469.0000
...,...,...
6569,2023-12-27,305610.2900
6570,2023-12-28,326312.3400
6571,2023-12-29,314575.3300
6572,2023-12-30,290563.1625


In [12]:
# Convert "date" to datetime64 and derive the calendar year that serves as the join key
dailyLoad_data = dailyLoad_data.assign(
    date=lambda frame: pd.to_datetime(frame['date']),
    Year=lambda frame: frame['date'].dt.year,
)

# Attach the renewable figures (keyed by Year) to each daily row;
# any row left with a missing value after the join is discarded
merged_df = pd.merge(dailyLoad_data, renewable_df, on="Year", how="left").dropna()

# Renewable columns to try as exogenous regressors
renewable_features = ['TotalSolarEnergy', 'Totaal windenergie', 'TotalBiomass']

# Two date-indexed views of the same rows: load only, and load plus renewables
df_no_ren = merged_df.loc[:, ['date', 'loadConsumption']].set_index('date')
df_with_ren = merged_df.loc[:, ['date', 'loadConsumption', *renewable_features]].set_index('date')

In [13]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Function to evaluate SARIMAX for given dataset
def evaluate_sarimax(df, exog_features=None, forecast_horizon=1):
    """Fit a SARIMAX model on the first 80% of ``df`` and score a forecast.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``loadConsumption`` column plus every column named in
        ``exog_features``. Rows are split by position, so they are assumed to
        be in chronological order (TODO confirm for new callers).
    exog_features : list of str, optional
        Columns passed to the model as exogenous regressors. ``None`` or an
        empty list fits the model without regressors.
    forecast_horizon : int, default 1
        Number of steps to forecast, scored against the first
        ``forecast_horizon`` rows of the hold-out 20%.

    Returns
    -------
    dict
        ``'nRMSE (%)'`` and ``'nMAE (%)'`` (RMSE / MAE divided by the mean of
        the actual values, in percent) and ``'MAPE (%)'``, each rounded to
        two decimals.

    Raises
    ------
    ValueError
        If ``forecast_horizon`` is below 1 or larger than the hold-out set.
    """
    # Train/test split (80/20), by position
    split_idx = int(len(df) * 0.8)
    train = df.iloc[:split_idx]
    test = df.iloc[split_idx:]

    # Fail fast with a clear message instead of a shape error after the fit
    if forecast_horizon < 1:
        raise ValueError("forecast_horizon must be at least 1")
    if forecast_horizon > len(test):
        raise ValueError(
            f"forecast_horizon={forecast_horizon} exceeds the "
            f"{len(test)} rows available in the hold-out set"
        )

    use_exog = bool(exog_features)

    # Fit SARIMAX
    model = SARIMAX(
        train['loadConsumption'],
        exog=train[exog_features] if use_exog else None,
        order=(2, 0, 3),
        seasonal_order=(1, 0, 1, 7),
        enforce_stationarity=False,
        enforce_invertibility=False
    )
    model_fit = model.fit(disp=False)

    # Forecast; the exogenous values for the forecast window come from the hold-out rows
    forecast = model_fit.forecast(
        steps=forecast_horizon,
        exog=test[exog_features].iloc[:forecast_horizon] if use_exog else None
    )

    # Compare positionally on plain arrays: subtracting two Series aligns on
    # index labels, which yields NaN whenever the forecast index differs from
    # the hold-out index.
    actual = test['loadConsumption'].iloc[:forecast_horizon].to_numpy(dtype=float)
    predicted = np.asarray(forecast, dtype=float)

    # `squared=False` is no longer accepted by recent scikit-learn releases,
    # so take the square root of the MSE explicitly.
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100

    # Normalize by mean of actuals
    mean_val = actual.mean()
    nrmse = (rmse / mean_val) * 100
    nmae = (mae / mean_val) * 100

    # Cast to built-in float so the printed dict shows plain numbers
    return {
        'nRMSE (%)': round(float(nrmse), 2),
        'nMAE (%)': round(float(nmae), 2),
        'MAPE (%)': round(float(mape), 2)
    }

# One-step-ahead comparison, without and with the renewable regressors
results_no_ren = evaluate_sarimax(df_no_ren, forecast_horizon=1)
results_with_ren = evaluate_sarimax(df_with_ren, renewable_features, forecast_horizon=1)

# Report both variants
for label, metrics in (
    ("SARIMAX without renewables:", results_no_ren),
    ("SARIMAX with renewables   :", results_with_ren),
):
    print(label, metrics)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


SARIMAX without renewables: {'nRMSE (%)': 3.14, 'nMAE (%)': 3.14, 'MAPE (%)': 3.14}
SARIMAX with renewables   : {'nRMSE (%)': 2.59, 'nMAE (%)': 2.59, 'MAPE (%)': 2.59}


In [14]:
# Repeat the comparison over a 180-step horizon
results_no_ren = evaluate_sarimax(df_no_ren, forecast_horizon=180)
results_with_ren = evaluate_sarimax(df_with_ren, renewable_features, forecast_horizon=180)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [15]:
# Report the most recently computed metrics for both variants
for label, metrics in (
    ("SARIMAX without renewables:", results_no_ren),
    ("SARIMAX with renewables   :", results_with_ren),
):
    print(label, metrics)

SARIMAX without renewables: {'nRMSE (%)': 24.88, 'nMAE (%)': 20.52, 'MAPE (%)': 20.49}
SARIMAX with renewables   : {'nRMSE (%)': 7.24, 'nMAE (%)': 5.75, 'MAPE (%)': 5.7}


In [16]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Hourly load: parse timestamps, recompute Year from them, keep only the needed columns
df = (
    pd.read_csv("processed_data/data_2006_2023.csv")
    .assign(
        datetime=lambda frame: pd.to_datetime(frame['datetime']),
        Year=lambda frame: frame['datetime'].dt.year,
    )
    .loc[:, ['datetime', 'loadConsumption', 'Year']]
)

# GDP averaged to one value per Year
gdp = (
    pd.read_csv("processed_data/GDP.csv")
    .groupby("Year", as_index=False)['GDP']
    .mean()
)

# Population is stored as text with a comma; swap it for a dot before casting to float
pop = pd.read_csv("processed_data/populationNL.csv").assign(
    Population=lambda frame: frame['Population'].str.replace(',', '.').astype(float)
)

# Join the socioeconomic tables on Year, attach them to the hourly rows,
# and discard rows left with any missing value
socio = pd.merge(gdp, pop, on='Year')
merged = pd.merge(df, socio, on='Year', how='left').dropna()

# Timestamp-indexed frames with and without the socioeconomic columns
df_socio = merged[['datetime', 'loadConsumption', 'GDP', 'Population']].set_index('datetime')
df_no_socio = merged[['datetime', 'loadConsumption']].set_index('datetime')

# Daily means; the load-only frame is restricted to the days kept in the socio frame
df_socio_daily = df_socio.resample('D').mean().dropna()
df_no_socio_daily = df_no_socio.resample('D').mean().loc[df_socio_daily.index]

# Forecast function
def evaluate_sarimax(df, exog_features=None, horizon=1, simple_model=False):
    """Fit a SARIMAX model on the first 90% of ``df`` and score a forecast.

    NOTE: this redefines the ``evaluate_sarimax`` from the earlier renewable
    analysis cell, with a different split ratio, different model orders and a
    ``horizon`` keyword instead of ``forecast_horizon``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``loadConsumption`` column plus every column named in
        ``exog_features``. Rows are split by position, so they are assumed to
        be in chronological order (TODO confirm for new callers).
    exog_features : list of str, optional
        Columns passed to the model as exogenous regressors. ``None`` or an
        empty list fits the model without regressors.
    horizon : int, default 1
        Number of steps to forecast, scored against the first ``horizon``
        rows of the hold-out 10%.
    simple_model : bool, default False
        If true, use order (1, 1, 1) with seasonal order (0, 1, 1, 7);
        otherwise order (2, 1, 2) with seasonal order (1, 1, 1, 7).

    Returns
    -------
    dict
        ``'nRMSE (%)'`` and ``'nMAE (%)'`` (RMSE / MAE divided by the mean of
        the actual values, in percent) and ``'MAPE (%)'``, each rounded to
        two decimals.

    Raises
    ------
    ValueError
        If ``horizon`` is below 1 or larger than the hold-out set.
    """
    split = int(len(df) * 0.9)
    train, test = df.iloc[:split], df.iloc[split:]

    # Fail fast with a clear message instead of a shape error after the fit
    if horizon < 1:
        raise ValueError("horizon must be at least 1")
    if horizon > len(test):
        raise ValueError(
            f"horizon={horizon} exceeds the {len(test)} rows available in the hold-out set"
        )

    order = (1, 1, 1) if simple_model else (2, 1, 2)
    seasonal = (0, 1, 1, 7) if simple_model else (1, 1, 1, 7)
    use_exog = bool(exog_features)

    model = SARIMAX(
        train['loadConsumption'],
        exog=train[exog_features] if use_exog else None,
        order=order,
        seasonal_order=seasonal,
        enforce_stationarity=False,
        enforce_invertibility=False
    )
    model_fit = model.fit(disp=False)

    # The exogenous values for the forecast window come from the hold-out rows
    forecast = model_fit.forecast(
        steps=horizon,
        exog=test[exog_features].iloc[:horizon] if use_exog else None
    )

    # Compare positionally on plain arrays: subtracting two Series aligns on
    # index labels, which yields NaN whenever the forecast index differs from
    # the hold-out index.
    actual = test['loadConsumption'].iloc[:horizon].to_numpy(dtype=float)
    predicted = np.asarray(forecast, dtype=float)

    # `squared=False` is no longer accepted by recent scikit-learn releases,
    # so take the square root of the MSE explicitly.
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    mean_val = actual.mean()

    # Cast to built-in float so the printed dict shows plain numbers
    return {
        'nRMSE (%)': round(float(rmse / mean_val * 100), 2),
        'nMAE (%)': round(float(mae / mean_val * 100), 2),
        'MAPE (%)': round(float(mape), 2)
    }

# One-day horizon with the default model orders
results_1_nosocio = evaluate_sarimax(df_no_socio_daily, horizon=1)
results_1_socio = evaluate_sarimax(df_socio_daily, ['GDP', 'Population'], horizon=1)

# 180-day horizon with the reduced orders selected by simple_model=True
results_180_nosocio = evaluate_sarimax(df_no_socio_daily, horizon=180, simple_model=True)
results_180_socio = evaluate_sarimax(df_socio_daily, ['GDP', 'Population'], horizon=180, simple_model=True)

# Report all four runs
for label, metrics in (
    ("1-Day Forecast WITHOUT socioeconomic:", results_1_nosocio),
    ("1-Day Forecast WITH socioeconomic:   ", results_1_socio),
    ("180-Day Forecast WITHOUT socioeconomic:", results_180_nosocio),
    ("180-Day Forecast WITH socioeconomic:   ", results_180_socio),
):
    print(label, metrics)




1-Day Forecast WITHOUT socioeconomic: {'nRMSE (%)': 3.26, 'nMAE (%)': 3.26, 'MAPE (%)': 3.26}
1-Day Forecast WITH socioeconomic:    {'nRMSE (%)': 2.91, 'nMAE (%)': 2.91, 'MAPE (%)': 2.91}
180-Day Forecast WITHOUT socioeconomic: {'nRMSE (%)': 6.59, 'nMAE (%)': 5.44, 'MAPE (%)': 5.53}
180-Day Forecast WITH socioeconomic:    {'nRMSE (%)': 6.42, 'nMAE (%)': 5.28, 'MAPE (%)': 5.32}
