In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from entsoe import EntsoePandasClient

In [2]:
my_api_key = os.environ.get('ENTSOE_API_KEY')
client = EntsoePandasClient(api_key=my_api_key)

# Parameters of the dataset

In [3]:
country_code = 'HU'
years = [('2017-01-01', '2017-12-31'),
         ('2018-01-01', '2018-12-31'),
         ('2019-01-01', '2019-12-31'),
         ('2020-01-01', '2020-12-31'),
         ('2021-01-01', '2021-12-31'),
         ('2022-01-01', '2022-12-31'),
         ('2023-01-01', '2023-12-31'),
         ('2024-01-01', '2024-03-24')]

In [4]:
prices_folder_path = './data/prices'
loads_folder_path = './data/loads'

base_df_filename = f'./data/base_{country_code}_{years[0]}_{years[-1]}.csv'
base_load_df_filename = f'./data/base_load_avgs_{country_code}_{years[0]}_{years[-1]}.csv'

In [5]:
def run_querry_day_ahead_prices(country_code, start_date, end_date):
    filename = f'price_{start_date}_{end_date}_{country_code}.csv'
    start_ts = pd.Timestamp(start_date, tz='Europe/Budapest')
    end_ts = pd.Timestamp(end_date, tz='Europe/Budapest')

    if os.path.exists(f'{prices_folder_path}/{filename}'):
        print(f'{prices_folder_path}/{filename} exists, reading from file')
        #load
        df = pd.read_csv(f'{prices_folder_path}/{filename}', index_col=0)
    else:
        print(f'{prices_folder_path}/{filename} does not exist, downloading from ENTSO-E')

        #set start time to 00:00:00 and end time to 23:59:59
        start_ts = pd.Timestamp(start_date, tz='Europe/Brussels')
        end_ts = pd.Timestamp(end_date, tz='Europe/Brussels') + pd.Timedelta(days=1) - pd.Timedelta(seconds=1)

        df = client.query_day_ahead_prices(country_code, start=start_ts, end=end_ts)        # Data from ENTSO-E
        
        df.to_csv(f'{prices_folder_path}/{filename}')

    return df

In [6]:
def run_querry_load(country_code, start_date, end_date):
    filename = f'load_{start_date}_{end_date}_{country_code}.csv'
    start_ts = pd.Timestamp(start_date, tz='Europe/Budapest')
    end_ts = pd.Timestamp(end_date, tz='Europe/Budapest')

    if os.path.exists(f'{loads_folder_path}/{filename}'):
        print(f'{loads_folder_path}/{filename} exists, reading from file')
        #load
        df = pd.read_csv(f'{loads_folder_path}/{filename}', index_col=0)
    else:
        print(f'{loads_folder_path}/{filename} does not exist, downloading from ENTSO-E')

        #set start time to 00:00:00 and end time to 23:59:59
        start_ts = pd.Timestamp(start_date, tz='Europe/Brussels')
        end_ts = pd.Timestamp(end_date, tz='Europe/Brussels') + pd.Timedelta(days=1)

        df =client.query_load(country_code, start=start_ts, end=end_ts)        # Data from ENTSO-E
        
        df.to_csv(f'{loads_folder_path}/{filename}')

    return df

# Create the base concatenated df

First we will create a base concatenated dataframe with all the data from the different files but only the prices

In [7]:
def get_base_prices(country_code, base_load_df_filename=base_load_df_filename, years=years):
    if os.path.exists(base_df_filename):
        print(f'{base_df_filename} exists, reading from file')
        df = pd.read_csv(base_df_filename, index_col=0, parse_dates=True)
    else:
        print(f'{base_df_filename} does not exist, concatenating from multiple files')
        df = pd.DataFrame()

        for (start_date, end_date) in years:
            df_temp = run_querry_day_ahead_prices(country_code, start_date, end_date)

            df = pd.concat([df, df_temp])

        df.columns = ['Price']
        df['Datetime'] = df.index
        df['Datetime'] = pd.to_datetime(df['Datetime'], utc=True)
        df.reset_index(drop=True, inplace=True)
        #order by datetime
        df = df.sort_values(by='Datetime')
        df = df.set_index('Datetime')
        #add 1 hour to the datetime
        df.index = df.index + pd.DateOffset(hours=1)
        df.to_csv(base_df_filename)
    return df

In [8]:
def load_15min_to_hourly(df):
    df['Date'] = df.index.floor('h')
    df['Load_avg'] = df.groupby('Date')['Actual Load'].transform('mean')
    df2 = df[['Date', 'Load_avg']]
    df2 = df2.drop_duplicates()
    return df

In [9]:
def get_base_load_avg(country_code, base_load_df_filename=base_load_df_filename, years=years):
    if os.path.exists(base_load_df_filename):
        print(f'{base_load_df_filename} exists, reading from file')
        df = pd.read_csv(base_load_df_filename, index_col=0, parse_dates=True)
    else:
        print(f'{base_load_df_filename} does not exist, concatenating from multiple files')
        df = pd.DataFrame()
        for (start_date, end_date) in years:

            df_temp = run_querry_load(country_code, start_date, end_date)

            df = pd.concat([df, df_temp])
            
        df['Datetime'] = df.index
        df['Datetime'] = pd.to_datetime(df['Datetime'], utc=True)
        df.reset_index(drop=True, inplace=True)
        #order by datetime
        df = df.sort_values(by='Datetime')
        df = df.set_index('Datetime')
        #add 1 hour to the datetime
        df.index = df.index + pd.DateOffset(hours=1)
        df = load_15min_to_hourly(df)
        df = df[['Date', 'Load_avg']].drop_duplicates()
        df.drop(columns=['Date'], inplace=True)
        df.columns = ['Load_avg']
        df.to_csv(base_load_df_filename)
    return df

In [10]:
def get_prices_loads(country_code):
    prices_df = get_base_prices(country_code)
    load_avg_df = get_base_load_avg(country_code)
    df = pd.concat([prices_df, load_avg_df], axis=1)
    df = df.dropna()
    df = df.reset_index()
    df = df.rename(columns={'index': 'Datetime'})
    #df = df.set_index('Datetime')
    return df

In [11]:
df = get_prices_loads(country_code)

./data/base_HU_('2017-01-01', '2017-12-31')_('2024-01-01', '2024-03-24').csv exists, reading from file
./data/base_load_avgs_HU_('2017-01-01', '2017-12-31')_('2024-01-01', '2024-03-24').csv exists, reading from file


In [12]:
df

Unnamed: 0,Datetime,Price,Load_avg
0,2017-01-01 00:00:00+00:00,57.25,4150.00
1,2017-01-01 01:00:00+00:00,50.21,4032.75
2,2017-01-01 02:00:00+00:00,44.04,3777.25
3,2017-01-01 03:00:00+00:00,32.81,3582.25
4,2017-01-01 04:00:00+00:00,28.41,3515.75
...,...,...,...
63353,2024-03-24 19:00:00+00:00,85.22,5588.50
63354,2024-03-24 20:00:00+00:00,76.51,5461.00
63355,2024-03-24 21:00:00+00:00,73.74,5193.50
63356,2024-03-24 22:00:00+00:00,70.07,4897.25


# Create more input features

In [13]:
df['month'] = df['Datetime'].dt.month
df['day'] = df['Datetime'].dt.day
df['hour'] = df['Datetime'].dt.hour
df['weekday'] = df['Datetime'].dt.weekday
df['dayoftheweek'] = df['Datetime'].dt.dayofweek
df['date'] = df['Datetime'].dt.date

In [36]:
month_dummies = pd.get_dummies(df['month'], prefix='month', drop_first=True)
dayofweek_dummies = pd.get_dummies(df['dayoftheweek'], prefix='dayofweek', drop_first=True)
df = pd.concat([df, month_dummies, dayofweek_dummies], axis=1)

In [37]:
dummy_columns = month_dummies.columns.tolist() + dayofweek_dummies.columns.tolist()

In [14]:
df['Datetime-2d'] = df['Datetime'] + pd.to_timedelta(-2, unit='day')
df['Datetime-7d'] = df['Datetime'] + pd.to_timedelta(-7, unit='day')

terheles_map = df.set_index('Datetime')['Price']

df['Price-2d'] = df['Datetime-2d'].map(terheles_map)
df['Price-7d'] = df['Datetime-7d'].map(terheles_map)

In [15]:
df.isnull().sum()

Datetime          0
Price             0
Load_avg          0
month             0
day               0
hour              0
weekday           0
dayoftheweek      0
date              0
Datetime-2d       0
Datetime-7d       0
Price-2d         50
Price-7d        170
dtype: int64

In [16]:
df = df.dropna().reset_index(drop=True)

In [17]:
df

Unnamed: 0,Datetime,Price,Load_avg,month,day,hour,weekday,dayoftheweek,date,Datetime-2d,Datetime-7d,Price-2d,Price-7d
0,2017-01-08 00:00:00+00:00,36.13,4542.25,1,8,0,6,6,2017-01-08,2017-01-06 00:00:00+00:00,2017-01-01 00:00:00+00:00,40.68,57.25
1,2017-01-08 01:00:00+00:00,33.50,4447.25,1,8,1,6,6,2017-01-08,2017-01-06 01:00:00+00:00,2017-01-01 01:00:00+00:00,40.10,50.21
2,2017-01-08 02:00:00+00:00,33.55,4278.25,1,8,2,6,6,2017-01-08,2017-01-06 02:00:00+00:00,2017-01-01 02:00:00+00:00,38.50,44.04
3,2017-01-08 03:00:00+00:00,32.70,4170.25,1,8,3,6,6,2017-01-08,2017-01-06 03:00:00+00:00,2017-01-01 03:00:00+00:00,37.36,32.81
4,2017-01-08 04:00:00+00:00,33.00,4205.00,1,8,4,6,6,2017-01-08,2017-01-06 04:00:00+00:00,2017-01-01 04:00:00+00:00,37.40,28.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63181,2024-03-24 19:00:00+00:00,85.22,5588.50,3,24,19,6,6,2024-03-24,2024-03-22 19:00:00+00:00,2024-03-17 19:00:00+00:00,114.84,87.46
63182,2024-03-24 20:00:00+00:00,76.51,5461.00,3,24,20,6,6,2024-03-24,2024-03-22 20:00:00+00:00,2024-03-17 20:00:00+00:00,90.77,77.89
63183,2024-03-24 21:00:00+00:00,73.74,5193.50,3,24,21,6,6,2024-03-24,2024-03-22 21:00:00+00:00,2024-03-17 21:00:00+00:00,78.11,72.33
63184,2024-03-24 22:00:00+00:00,70.07,4897.25,3,24,22,6,6,2024-03-24,2024-03-22 22:00:00+00:00,2024-03-17 22:00:00+00:00,75.69,72.10


In [18]:
from workalendar.europe import Hungary

cal = Hungary()
year_list = df['Datetime'].dt.year.unique()

In [19]:
holiday_df = pd.DataFrame()
for year in year_list:
    holidays = cal.holidays(year)
    temp_df = pd.DataFrame(holidays, columns=['date', 'holiday_name'])
    holiday_df = pd.concat([holiday_df, temp_df],
                           axis=0).reset_index(drop=True)

In [20]:
holiday_map = holiday_df.set_index('date')['holiday_name']
df['holiday_name'] = df['date'].map(holiday_map)
df['is_holiday'] = 0
df.loc[df['holiday_name'].notnull(), 'is_holiday'] = 1

df.drop(columns=['holiday_name'], inplace=True)

# Evaluation metric

In [21]:
def base_eval(y_true, y_pred):
    error = np.mean(abs(y_true - y_pred))
    print(f'Base evaluation (abs error): {error}')

In [22]:
def weighted_eval(y_true, y_pred, load):
    error = np.mean(abs((y_true - y_pred)*load))
    print(f'Weighted evaluation (abs error): {error}')

In [23]:
def do_modeling(model, train_df, test_df, input_cols, target_col):
    model.fit(train_df[input_cols], train_df[target_col])
    y_pred = model.predict(test_df[input_cols])
    y_true = test_df[target_col]
    base_eval(y_true, y_pred)
    weighted_eval(y_true, y_pred, test_df['Load_avg'])

# Testing on models

In [40]:
df.head(1).T

Unnamed: 0,0
Datetime,2017-01-08 00:00:00+00:00
Price,36.13
Load_avg,4542.25
month,1
day,8
hour,0
weekday,6
dayoftheweek,6
date,2017-01-08
Datetime-2d,2017-01-06 00:00:00+00:00


In [25]:
test_end_date = '2024-03-17'

In [41]:
in_cols = ['Price-2d', 'Price-7d', 'is_holiday'] + dummy_columns
target_col = 'Price'

## Baseline

In [28]:
def baseline_model(df):
    y_pred = df['Price'].shift(7*24)
    return y_pred

In [29]:
base_pred = baseline_model(df)
start_date = '2023-01-01'
end_date = '2024-03-17'
base_eval(df.loc[df['Datetime'] >= test_end_date, 'Price'], base_pred)
weighted_eval(df.loc[df['Datetime'] >= test_end_date, 'Price'], base_pred, df.loc[df['Datetime'] >= test_end_date, 'Load_avg'])

Base evaluation (abs error): 16.567239583333333
Weighted evaluation (abs error): 77618.14497395833


## ARIMA

As another base model

In [30]:
from statsmodels.tsa.arima.model import ARIMA

In [35]:
df_train = df.loc[df['Datetime'] < test_end_date]

In [32]:
#create an ARIMA model, train it on 2017-2022 and forecast 2023
model = ARIMA(df_train.loc[:,'Price'], order=(12,1,1))
model = model.fit()

In [33]:
arima_pred = model.forecast(steps=len(df.loc[df['Datetime'] >= test_end_date]))

In [34]:
# evaluate the model
base_eval(df.loc[df['Datetime'] >= test_end_date, 'Price'], arima_pred)
weighted_eval(df.loc[df['Datetime'] >= test_end_date, 'Price'], arima_pred, df.loc[df['Datetime'] >= test_end_date, 'Load_avg'])

Base evaluation (abs error): 21.949720526947583
Weighted evaluation (abs error): 107170.9952381644


## Gradient Boosting Regressor

In [45]:
from sklearn.ensemble import GradientBoostingRegressor

In [46]:
gbr = GradientBoostingRegressor(random_state=42)

In [47]:
train = df.loc[df['Datetime'] <= test_end_date]
test = df.loc[df['Datetime'] > test_end_date]

X_train = train[in_cols]
y_train = train[target_col]
X_test = test[in_cols]
y_test = test[target_col]

In [48]:
gbr.fit(X_train, y_train)

In [49]:
gbr_pred = gbr.predict(X_test)

In [50]:
base_eval(y_test, gbr_pred)
weighted_eval(y_test, gbr_pred, test['Load_avg'])

Base evaluation (abs error): 14.805669309386053
Weighted evaluation (abs error): 69621.44422175427


In [55]:
# zip the feature importances with the column names and sort them from the most to the least important
feature_importances = list(zip(X_train.columns, gbr.feature_importances_))
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
feature_importances

[('Price-7d', 0.5677836940360977),
 ('Price-2d', 0.39767717371214883),
 ('month_8', 0.008600543223050754),
 ('dayofweek_1', 0.007596043219591415),
 ('dayofweek_6', 0.0063811389845657195),
 ('dayofweek_5', 0.0043213506221762244),
 ('month_7', 0.002069617577271918),
 ('is_holiday', 0.0014900338905421378),
 ('month_12', 0.0013789523421455965),
 ('month_10', 0.0010307932365459867),
 ('dayofweek_4', 0.0004669926974629789),
 ('dayofweek_2', 0.00033339725757474575),
 ('month_11', 0.00022160011072792685),
 ('month_2', 0.00017413239434455665),
 ('month_9', 0.0001582493054246263),
 ('month_3', 0.00012084301488255585),
 ('dayofweek_3', 7.779759064690573e-05),
 ('month_4', 6.701791691636785e-05),
 ('month_6', 4.1029778171940225e-05),
 ('month_5', 9.599089711245945e-06)]

# Notes

Ma 8as adatokkal 9ig a holnapit (a mait mind ismerem)
- baseline1 az 1 heti adat
- baseline2 hasonló időjárású nap

Kiértékelés:
- v1 abs hiba (hány eurót tévedtünk)
- v2 adott órában mennyi a load (termelés / fogyasztás), hiba súlyozva a teljes fogyasztással

Opciók
- recurrent nn predictor (?)
- gbm regressor
-- (előzö napi adatok, napelemek termelése, román adatok, hőmérséklet..., körny ország árai)
-- walk forward opt

keretrendszer
feture inportance alapján feature selection 
- változásuk követése !!!

(talán osztrák is számít, meg kell nézni melyik számít)

időjárási adatok (első körben tényadatok, nem előrejelzés) próbálkozni kell, drága lehet, kb kizárt 

- végén fontos és ***nem fontos*** változók listája

3 fontos időjárás (régiós, a napi bontás is jó)
- hány fok van (fűtés / hűtés)
- besugárzás
- szélerősség


-Hányszor volt negatív ár - statisztika róla (Meg tudjuk-e mondani, hogy mikor lesz negatív ár)
-- Ez is lehet célváltozó és kiértékelés

- Napi egy órát kikapcsoljuk, cél: mikor legyen (mert a többi órában többet tudunk termelni)
-- Meg lehet nézni, hogy melyik lesz a legdrágább óra
