In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from entsoe import EntsoePandasClient

In [2]:
my_api_key = os.environ.get('ENTSOE_API_KEY')
client = EntsoePandasClient(api_key=my_api_key)

# Parameters of the dataset

In [3]:
country_code = 'HU'
years = ['2017', '2018', '2019', '2020', '2021', '2022', '2023']

In [4]:
prices_folder_path = './data/prices'
loads_folder_path = './data/loads'

base_df_filename = f'./data/base_{country_code}_{years[0]}_{years[-1]}.csv'
base_load_df_filename = f'./data/base_load_avgs_{country_code}_{years[0]}_{years[-1]}.csv'

In [5]:
def run_querry_day_ahead_prices(country_code, start_date, end_date):
    filename = f'price_{start_date}_{end_date}_{country_code}.csv'
    start_ts = pd.Timestamp(start_date, tz='Europe/Budapest')
    end_ts = pd.Timestamp(end_date, tz='Europe/Budapest')

    if os.path.exists(f'{prices_folder_path}/{filename}'):
        print(f'{prices_folder_path}/{filename} exists, reading from file')
        #load
        df = pd.read_csv(f'{prices_folder_path}/{filename}', index_col=0)
    else:
        print(f'{prices_folder_path}/{filename} does not exist, downloading from ENTSO-E')

        #set start time to 00:00:00 and end time to 23:59:59
        start_ts = pd.Timestamp(start_date, tz='Europe/Brussels')
        end_ts = pd.Timestamp(end_date, tz='Europe/Brussels') + pd.Timedelta(days=1) - pd.Timedelta(seconds=1)

        df = client.query_day_ahead_prices(country_code, start=start_ts, end=end_ts)        # Data from ENTSO-E
        
        df.to_csv(f'{prices_folder_path}/{filename}')

    return df

In [6]:
def run_querry_load(country_code, start_date, end_date):
    filename = f'load_{start_date}_{end_date}_{country_code}.csv'
    start_ts = pd.Timestamp(start_date, tz='Europe/Budapest')
    end_ts = pd.Timestamp(end_date, tz='Europe/Budapest')

    if os.path.exists(f'{loads_folder_path}/{filename}'):
        print(f'{loads_folder_path}/{filename} exists, reading from file')
        #load
        df = pd.read_csv(f'{loads_folder_path}/{filename}', index_col=0)
    else:
        print(f'{loads_folder_path}/{filename} does not exist, downloading from ENTSO-E')

        #set start time to 00:00:00 and end time to 23:59:59
        start_ts = pd.Timestamp(start_date, tz='Europe/Brussels')
        end_ts = pd.Timestamp(end_date, tz='Europe/Brussels') + pd.Timedelta(days=1)

        df =client.query_load(country_code, start=start_ts, end=end_ts)        # Data from ENTSO-E
        
        df.to_csv(f'{loads_folder_path}/{filename}')

    return df

# Create the base concatenated df

First we will create a base concatenated dataframe with all the data from the different files but only the prices

In [7]:
def get_base_prices(country_code, base_load_df_filename=base_load_df_filename, years=years):
    if os.path.exists(base_df_filename):
        print(f'{base_df_filename} exists, reading from file')
        df = pd.read_csv(base_df_filename, index_col=0, parse_dates=True)
    else:
        print(f'{base_df_filename} does not exist, concatenating from multiple files')
        df = pd.DataFrame()

        for year in years:
            start_date = f'{year}-01-01'
            end_date = f'{year}-12-31'

            df_temp = run_querry_day_ahead_prices(country_code, start_date, end_date)

            df = pd.concat([df, df_temp])

        df.columns = ['Price']
        df['Datetime'] = df.index
        df['Datetime'] = pd.to_datetime(df['Datetime'], utc=True)
        df.reset_index(drop=True, inplace=True)
        #order by datetime
        df = df.sort_values(by='Datetime')
        df = df.set_index('Datetime')
        #add 1 hour to the datetime
        df.index = df.index + pd.DateOffset(hours=1)
        df.to_csv(base_df_filename)
    return df

In [8]:
def load_15min_to_hourly(df):
    df['Date'] = df.index.floor('h')
    df['Load_avg'] = df.groupby('Date')['Actual Load'].transform('mean')
    df2 = df[['Date', 'Load_avg']]
    df2 = df2.drop_duplicates()
    return df

In [9]:
def get_base_load_avg(country_code, base_load_df_filename=base_load_df_filename, years=years):

    if os.path.exists(base_load_df_filename):
        print(f'{base_load_df_filename} exists, reading from file')
        df = pd.read_csv(base_load_df_filename, index_col=0, parse_dates=True)
    else:
        print(f'{base_load_df_filename} does not exist, concatenating from multiple files')
        df = pd.DataFrame()
        for year in years:
            start_date = f'{year}-01-01'
            end_date = f'{year}-12-31'

            df_temp = run_querry_load(country_code, start_date, end_date)

            df = pd.concat([df, df_temp])

        #df.columns = ['Load']
        df['Datetime'] = df.index
        df['Datetime'] = pd.to_datetime(df['Datetime'], utc=True)
        df.reset_index(drop=True, inplace=True)
        #order by datetime
        df = df.sort_values(by='Datetime')
        df = df.set_index('Datetime')
        #add 1 hour to the datetime
        df.index = df.index + pd.DateOffset(hours=1)
        df = load_15min_to_hourly(df)
        df = df[['Date', 'Load_avg']].drop_duplicates()
        df.drop(columns=['Date'], inplace=True)
        df.to_csv(base_load_df_filename)
    return df

In [10]:
def get_prices_loads(country_code):
    prices_df = get_base_prices(country_code)
    load_avg_df = get_base_load_avg(country_code)
    df = pd.concat([prices_df, load_avg_df], axis=1)
    df = df.dropna()
    df = df.reset_index()
    df = df.rename(columns={'index': 'Datetime'})
    df = df.set_index('Datetime')
    return df

In [11]:
df = get_prices_loads(country_code)

./data/base_HU_2017_2023.csv does not exist, concatenating from multiple files
./data/prices/price_2017-01-01_2017-12-31_HU.csv does not exist, downloading from ENTSO-E


  df = pd.concat([df, df_temp])


./data/prices/price_2018-01-01_2018-12-31_HU.csv does not exist, downloading from ENTSO-E
./data/prices/price_2019-01-01_2019-12-31_HU.csv does not exist, downloading from ENTSO-E
./data/prices/price_2020-01-01_2020-12-31_HU.csv does not exist, downloading from ENTSO-E
./data/prices/price_2021-01-01_2021-12-31_HU.csv does not exist, downloading from ENTSO-E
./data/prices/price_2022-01-01_2022-12-31_HU.csv does not exist, downloading from ENTSO-E
./data/prices/price_2023-01-01_2023-12-31_HU.csv does not exist, downloading from ENTSO-E
./data/base_load_avgs_HU_2017_2023.csv does not exist, concatenating from multiple files
./data/loads/load_2017-01-01_2017-12-31_HU.csv does not exist, downloading from ENTSO-E
./data/loads/load_2018-01-01_2018-12-31_HU.csv does not exist, downloading from ENTSO-E
./data/loads/load_2019-01-01_2019-12-31_HU.csv does not exist, downloading from ENTSO-E
./data/loads/load_2020-01-01_2020-12-31_HU.csv does not exist, downloading from ENTSO-E
./data/loads/load_2

In [13]:
df

Unnamed: 0_level_0,Price,Load_avg
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-01 00:00:00+00:00,57.25,4150.00
2017-01-01 01:00:00+00:00,50.21,4032.75
2017-01-01 02:00:00+00:00,44.04,3777.25
2017-01-01 03:00:00+00:00,32.81,3582.25
2017-01-01 04:00:00+00:00,28.41,3515.75
...,...,...
2023-12-31 19:00:00+00:00,25.05,4947.00
2023-12-31 20:00:00+00:00,23.33,4745.75
2023-12-31 21:00:00+00:00,9.82,4623.50
2023-12-31 22:00:00+00:00,10.68,4530.25


# Evaluation metric

In [15]:
def base_eval(y_true, y_pred):
    error = np.mean(abs(y_true - y_pred))
    print(f'Base evaluation (abs error): {error}')

In [16]:
def weighted_eval(y_true, y_pred, load):
    error = np.mean(abs((y_true - y_pred)*load))
    print(f'Weighted evaluation (abs error): {error}')

In [19]:
def do_modeling(model, train_df, test_df, input_cols, target_col):
    model.fit(train_df[input_cols], train_df[target_col])
    y_pred = model.predict(test_df[input_cols])
    y_true = test_df[target_col]
    base_eval(y_true, y_pred)
    weighted_eval(y_true, y_pred, test_df['Load_avg'])

# Baseline model

In [17]:
def baseline_model(df):
    y_pred = df['Price'].shift(7*24)
    df['Base_pred'] = y_pred
    return df

In [18]:
df = baseline_model(df)
start_date = '2023-01-01'
end_date = '2023-12-31'
base_eval(df.loc[start_date:end_date, 'Price'], df.loc[start_date:end_date, 'Base_pred'])
weighted_eval(df.loc[start_date:end_date, 'Price'], df.loc[start_date:end_date, 'Base_pred'], df.loc[start_date:end_date, 'Load_avg'])

Base evaluation (abs error): 27.71133820506965
Weighted evaluation (abs error): 133708.64686343912


# Notes

Ma 8as adatokkal 9ig a holnapit (a mait mind ismerem)
- baseline1 az 1 heti adat
- baseline2 hasonló időjárású nap

Kiértékelés:
- v1 abs hiba (hány eurót tévedtünk)
- v2 adott órában mennyi a load (termelés / fogyasztás), hiba súlyozva a teljes fogyasztással

Opciók
- recurrent nn predictor (?)
- gbm regressor
-- (előzö napi adatok, napelemek termelése, román adatok, hőmérséklet..., körny ország árai)
-- walk forward opt

keretrendszer
feture inportance alapján feature selection 
- változásuk követése !!!

(talán osztrák is számít, meg kell nézni melyik számít)

időjárási adatok (első körben tényadatok, nem előrejelzés) próbálkozni kell, drága lehet, kb kizárt 

- végén fontos és ***nem fontos*** változók listája

3 fontos időjárás (régiós, a napi bontás is jó)
- hány fok van (fűtés / hűtés)
- besugárzás
- szélerősség


-Hányszor volt negatív ár - statisztika róla (Meg tudjuk-e mondani, hogy mikor lesz negatív ár)
-- Ez is lehet célváltozó és kiértékelés

- Napi egy órát kikapcsoljuk, cél: mikor legyen (mert a többi órában többet tudunk termelni)
-- Meg lehet nézni, hogy melyik lesz a legdrágább óra
