In [106]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from entsoe import EntsoePandasClient

In [107]:
my_api_key = os.environ.get('ENTSOE_API_KEY')
client = EntsoePandasClient(api_key=my_api_key)

# Parameters of the dataset

In [108]:
prices_folder_path = './data/prices'
loads_folder_path = './data/loads'

In [109]:
year = '2023'
start_date = f'{year}-01-01'
end_date = f'{year}-12-31'
country_code = 'HU'

start_ts = pd.Timestamp(start_date, tz='Europe/Brussels')
end_ts = pd.Timestamp(end_date, tz='Europe/Brussels')

In [110]:
years = ['2017', '2018', '2019', '2020', '2021', '2022', '2023']

In [111]:
def run_querry_day_ahead_prices(country_code, start_date, end_date):
    filename = f"{start_date}_{end_date}_{country_code}.csv"

    print(filename)

    if os.path.exists(f'{prices_folder_path}/{filename}'):
        print("The file exists, no need to download data from ENTSO-E")
    else:
        print("The file does not exist, download data from ENTSO-E")

        df = client.query_day_ahead_prices(country_code, start=start_ts, end=end_ts)        # Data from ENTSO-E
        df.to_csv(f'{prices_folder_path}/{filename}')

In [112]:
def run_querry_load(country_code, start_date, end_date):
    filename = f"load_{start_date}_{end_date}_{country_code}.csv"

    print(filename)

    if os.path.exists(f'{loads_folder_path}/{filename}'):
        print("The file exists, no need to download data from ENTSO-E")
    else:
        print("The file does not exist, download data from ENTSO-E")

        df =client.query_load(country_code, start=start_ts, end=end_ts)        # Data from ENTSO-E
        df.to_csv(f'{loads_folder_path}/{filename}')

In [113]:
for year in years:
    start_date = f'{year}-01-01'
    end_date = f'{year}-12-31'
    country_code = 'HU'

    start_ts = pd.Timestamp(start_date, tz='Europe/Brussels')
    end_ts = pd.Timestamp(end_date, tz='Europe/Brussels')
    run_querry_load(country_code, start_date, end_date)

load_2017-01-01_2017-12-31_HU.csv
The file exists, no need to download data from ENTSO-E
load_2018-01-01_2018-12-31_HU.csv
The file exists, no need to download data from ENTSO-E
load_2019-01-01_2019-12-31_HU.csv
The file exists, no need to download data from ENTSO-E
load_2020-01-01_2020-12-31_HU.csv
The file exists, no need to download data from ENTSO-E
load_2021-01-01_2021-12-31_HU.csv
The file exists, no need to download data from ENTSO-E
load_2022-01-01_2022-12-31_HU.csv
The file exists, no need to download data from ENTSO-E
load_2023-01-01_2023-12-31_HU.csv
The file exists, no need to download data from ENTSO-E


# Create the base concatenated df

First we will create a base concatenated dataframe with all the data from the different files but only the prices

In [114]:
base_df_filename = f'./data/base_{country_code}_{years[0]}_{years[-1]}.csv'

if os.path.exists(base_df_filename):
    print("The file exists, reading data from file")
    df = pd.read_csv(base_df_filename, index_col=0, parse_dates=True)
else:
    print("The file does not exist, concatenating data from ENTSO-E")
    df = pd.DataFrame()

    for year in years:
        filename = f"{year}-01-01_{year}-12-31_{country_code}.csv"
        if os.path.exists(f'{prices_folder_path}/{filename}'):
            df_temp = pd.read_csv(f'{prices_folder_path}/{filename}', index_col=0, parse_dates=True)
            df = pd.concat([df, df_temp])

    df.columns = ['Price']
    df['Datetime'] = df.index
    df['Datetime'] = pd.to_datetime(df['Datetime'], utc=True)
    df.reset_index(drop=True, inplace=True)
    #order by datetime
    df = df.sort_values(by='Datetime')
    df = df.set_index('Datetime')
    #add 1 hour to the datetime
    df.index = df.index + pd.DateOffset(hours=1)
    df.to_csv(base_df_filename)
    df

The file exists, reading data from file


In [115]:
df

Unnamed: 0_level_0,Price
Datetime,Unnamed: 1_level_1
2017-01-01 00:00:00+00:00,57.25
2017-01-01 01:00:00+00:00,50.21
2017-01-01 02:00:00+00:00,44.04
2017-01-01 03:00:00+00:00,32.81
2017-01-01 04:00:00+00:00,28.41
...,...
2023-12-30 20:00:00+00:00,62.01
2023-12-30 21:00:00+00:00,53.41
2023-12-30 22:00:00+00:00,50.10
2023-12-30 23:00:00+00:00,43.23


Concatenate the loads

In [116]:
def load_15min_to_hourly(df):
    df['Date'] = df.index.floor('h')
    df['Load_avg'] = df.groupby('Date')['Actual Load'].transform('mean')
    df2 = df[['Date', 'Load_avg']]
    df2 = df2.drop_duplicates()
    df2


    return df

In [117]:
base__price_df_filename = f'./data/base_loads_{country_code}_{years[0]}_{years[-1]}.csv'

if os.path.exists(base__price_df_filename):
    print("The file exists, reading data from file")
    load_df = pd.read_csv(base__price_df_filename, index_col=0, parse_dates=True)
else:
    print("The file does not exist, concatenating data from files")
    load_df = pd.DataFrame()
    for year in years:
        filename = f"load_{year}-01-01_{year}-12-31_{country_code}.csv"
        if os.path.exists(f'{loads_folder_path}/{filename}'):
            df_temp = pd.read_csv(f'{loads_folder_path}/{filename}', index_col=0, parse_dates=True)
            load_df = pd.concat([load_df, df_temp])

    #df.columns = ['Load']
    load_df['Datetime'] = load_df.index
    load_df['Datetime'] = pd.to_datetime(load_df['Datetime'], utc=True)
    load_df.reset_index(drop=True, inplace=True)
    #order by datetime
    load_df = load_df.sort_values(by='Datetime')
    load_df = load_df.set_index('Datetime')
    #add 1 hour to the datetime
    load_df.index = load_df.index + pd.DateOffset(hours=1)
    load_df.to_csv(base__price_df_filename)
    load_df

The file exists, reading data from file


In [118]:
load_df = load_15min_to_hourly(load_df)
load_df

Unnamed: 0_level_0,Actual Load,Date,Load_avg
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01 00:00:00+00:00,4197.0,2017-01-01 00:00:00+00:00,4150.00
2017-01-01 00:15:00+00:00,4176.0,2017-01-01 00:00:00+00:00,4150.00
2017-01-01 00:30:00+00:00,4121.0,2017-01-01 00:00:00+00:00,4150.00
2017-01-01 00:45:00+00:00,4106.0,2017-01-01 00:00:00+00:00,4150.00
2017-01-01 01:00:00+00:00,4113.0,2017-01-01 01:00:00+00:00,4032.75
...,...,...,...
2023-12-30 22:45:00+00:00,4755.0,2023-12-30 22:00:00+00:00,4845.50
2023-12-30 23:00:00+00:00,4687.0,2023-12-30 23:00:00+00:00,4587.75
2023-12-30 23:15:00+00:00,4597.0,2023-12-30 23:00:00+00:00,4587.75
2023-12-30 23:30:00+00:00,4578.0,2023-12-30 23:00:00+00:00,4587.75


In [119]:
load_df = load_df[['Date', 'Load_avg']].drop_duplicates()
load_df.drop(columns=['Date'], inplace=True)
load_df

Unnamed: 0_level_0,Load_avg
Datetime,Unnamed: 1_level_1
2017-01-01 00:00:00+00:00,4150.00
2017-01-01 01:00:00+00:00,4032.75
2017-01-01 02:00:00+00:00,3777.25
2017-01-01 03:00:00+00:00,3582.25
2017-01-01 04:00:00+00:00,3515.75
...,...
2023-12-30 19:00:00+00:00,5379.75
2023-12-30 20:00:00+00:00,5280.25
2023-12-30 21:00:00+00:00,5077.50
2023-12-30 22:00:00+00:00,4845.50


Concatenate the Load_avg to the base df

In [125]:
df = pd.concat([df, load_df], axis=1)
df

Unnamed: 0_level_0,Price,Load_avg
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-01 00:00:00+00:00,57.25,4150.00
2017-01-01 01:00:00+00:00,50.21,4032.75
2017-01-01 02:00:00+00:00,44.04,3777.25
2017-01-01 03:00:00+00:00,32.81,3582.25
2017-01-01 04:00:00+00:00,28.41,3515.75
...,...,...
2023-12-30 20:00:00+00:00,62.01,5280.25
2023-12-30 21:00:00+00:00,53.41,5077.50
2023-12-30 22:00:00+00:00,50.10,4845.50
2023-12-30 23:00:00+00:00,43.23,4587.75


# Evaluation metric

In [126]:
def base_eval(y_true, y_pred):
    error = np.mean(abs(y_true - y_pred))
    print(f'Base evaluation (abs error): {error}')

In [127]:
def weighted_eval(y_true, y_pred, load):
    error = np.mean(abs((y_true - y_pred)*load))
    print(f'Weighted evaluation (abs error): {error}')

# Baseline model

In [128]:
def baseline_model(df):
    y_pred = df['Price'].shift(7*24)
    df['Base_pred'] = y_pred
    return df

In [130]:
df = baseline_model(df)
start_date = '2023-01-01'
end_date = '2023-12-31'
base_eval(df.loc[start_date:end_date, 'Price'], df.loc[start_date:end_date, 'Base_pred'])
weighted_eval(df.loc[start_date:end_date, 'Price'], df.loc[start_date:end_date, 'Base_pred'], df.loc[start_date:end_date, 'Load_avg'])

Base evaluation (abs error): 27.488985921941172
Weighted evaluation (abs error): 132687.2091716281


# Notes

Ma 8as adatokkal 9ig a holnapit (a mait mind ismerem)
- baseline1 az 1 heti adat
- baseline2 hasonló időjárású nap

Kiértékelés:
- v1 abs hiba (hány eurót tévedtünk)
- v2 adott órában mennyi a load (termelés / fogyasztás), hiba súlyozva a teljes fogyasztással

Opciók
- recurrent nn predictor (?)
- gbm regressor
-- (előzö napi adatok, napelemek termelése, román adatok, hőmérséklet..., körny ország árai)
-- walk forward opt

keretrendszer
feture inportance alapján feature selection 
- változásuk követése !!!

(talán osztrák is számít, meg kell nézni melyik számít)

időjárási adatok (első körben tényadatok, nem előrejelzés) próbálkozni kell, drága lehet, kb kizárt 

- végén fontos és ***nem fontos*** változók listája

3 fontos időjárás (régiós, a napi bontás is jó)
- hány fok van (fűtés / hűtés)
- besugárzás
- szélerősség


-Hányszor volt negatív ár - statisztika róla (Meg tudjuk-e mondani, hogy mikor lesz negatív ár)
-- Ez is lehet célváltozó és kiértékelés

- Napi egy órát kikapcsoljuk, cél: mikor legyen (mert a többi órában többet tudunk termelni)
-- Meg lehet nézni, hogy melyik lesz a legdrágább óra
