In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from entsoe import EntsoePandasClient

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
my_api_key = os.environ.get('ENTSOE_API_KEY')
client = EntsoePandasClient(api_key=my_api_key)

# Parameters of the dataset

In [4]:
country_code = 'HU'
years = [('2018-01-01', '2018-12-31'),
         ('2019-01-01', '2019-12-31'),
         ('2020-01-01', '2020-12-31'),
         ('2021-01-01', '2021-12-31'),
         ('2022-01-01', '2022-12-31'),
         ('2023-01-01', '2023-12-31'),
         ('2024-01-01', '2024-03-24')]

In [5]:
prices_folder_path = './data/prices'
loads_folder_path = './data/loads'
wind_solar_forecast_folder_path = './data/wind_solar_forecast'
weather_folder_path = './data/weather'
base_folder_path = './data/base'

In [6]:
def get_base_df_filename(country_code, years):
    return f'{base_folder_path}/base_price_{country_code}_{years[0]}_{years[-1]}.csv'

def get_base_load_df_filename(country_code, years):
    return f'{base_folder_path}/base_load_avgs_{country_code}_{years[0]}_{years[-1]}.csv'

def get_base_wind_solar_forecast_df_filename(country_code, years):
    return f'{base_folder_path}/base_wind_solar_forecast_avgs_{country_code}_{years[0]}_{years[-1]}.csv'

def get_base_weather_df_filename(country_code, years):
    return f'{base_folder_path}/base_weather_avgs_{country_code}_{years[0]}_{years[-1]}.csv'

In [7]:
def run_querry_day_ahead_prices(country_code, start_date, end_date):
    filename = f'price_{start_date}_{end_date}_{country_code}.csv'
    start_ts = pd.Timestamp(start_date, tz='Europe/Budapest')
    end_ts = pd.Timestamp(end_date, tz='Europe/Budapest')

    if os.path.exists(f'{prices_folder_path}/{filename}'):
        print(f'{prices_folder_path}/{filename} exists, reading from file')
        #load
        df = pd.read_csv(f'{prices_folder_path}/{filename}', index_col=0)
    else:
        print(f'{prices_folder_path}/{filename} does not exist, downloading from ENTSO-E')

        #set start time to 00:00:00 and end time to 23:59:59
        start_ts = pd.Timestamp(start_date, tz='Europe/Brussels')
        end_ts = pd.Timestamp(end_date, tz='Europe/Brussels') + pd.Timedelta(days=1) - pd.Timedelta(seconds=1)

        df = client.query_day_ahead_prices(country_code, start=start_ts, end=end_ts)        # Data from ENTSO-E
        
        df.to_csv(f'{prices_folder_path}/{filename}')

    return df

In [8]:
def run_querry_load(country_code, start_date, end_date):
    filename = f'load_{start_date}_{end_date}_{country_code}.csv'
    start_ts = pd.Timestamp(start_date, tz='Europe/Budapest')
    end_ts = pd.Timestamp(end_date, tz='Europe/Budapest')

    if os.path.exists(f'{loads_folder_path}/{filename}'):
        print(f'{loads_folder_path}/{filename} exists, reading from file')
        #load
        df = pd.read_csv(f'{loads_folder_path}/{filename}', index_col=0)
    else:
        print(f'{loads_folder_path}/{filename} does not exist, downloading from ENTSO-E')

        #set start time to 00:00:00 and end time to 23:59:59
        start_ts = pd.Timestamp(start_date, tz='Europe/Brussels')
        end_ts = pd.Timestamp(end_date, tz='Europe/Brussels') + pd.Timedelta(days=1)

        df =client.query_load(country_code, start=start_ts, end=end_ts)        # Data from ENTSO-E
        
        df.to_csv(f'{loads_folder_path}/{filename}')

    return df

In [9]:
def run_querry_wind_solar_forecast(country_code, start_date, end_date):
    filename = f'wind_solar_forecast_{start_date}_{end_date}_{country_code}.csv'
    start_ts = pd.Timestamp(start_date, tz='Europe/Budapest')
    end_ts = pd.Timestamp(end_date, tz='Europe/Budapest')

    if os.path.exists(f'{wind_solar_forecast_folder_path}/{filename}'):
        print(f'{wind_solar_forecast_folder_path}/{filename} exists, reading from file')
        #load
        df = pd.read_csv(f'{wind_solar_forecast_folder_path}/{filename}', index_col=0)
    else:
        print(f'{wind_solar_forecast_folder_path}/{filename} does not exist, downloading from ENTSO-E')

        #set start time to 00:00:00 and end time to 23:59:59
        start_ts = pd.Timestamp(start_date, tz='Europe/Brussels')
        end_ts = pd.Timestamp(end_date, tz='Europe/Brussels') + pd.Timedelta(days=1)

        df =client.query_wind_and_solar_forecast(country_code, start=start_ts, end=end_ts)        # Data from ENTSO-E
        
        df.to_csv(f'{wind_solar_forecast_folder_path}/{filename}')

    return df

# Create the base concatenated df

First we will create a base concatenated dataframe with all the data from the different files but only the prices

In [10]:
def get_base_prices(country_code, years=years):
    base_df_filename = get_base_df_filename(country_code, years)
    if os.path.exists(base_df_filename):
        print(f'{base_df_filename} exists, reading from file')
        df = pd.read_csv(base_df_filename, index_col=0, parse_dates=True)
    else:
        print(f'{base_df_filename} does not exist, concatenating from multiple files')

        for (start_date, end_date) in years:
            df_temp = run_querry_day_ahead_prices(country_code, start_date, end_date)

            df = df_temp if (start_date, end_date) == years[0] else pd.concat([df, df_temp])

        df.columns = [f'Price_{country_code}']
        df['Datetime'] = df.index
        df['Datetime'] = pd.to_datetime(df['Datetime'], utc=True)
        df.reset_index(drop=True, inplace=True)
        #order by datetime
        df = df.sort_values(by='Datetime')
        df = df.set_index('Datetime')
        #add 1 hour to the datetime
        df.index = df.index + pd.DateOffset(hours=1)
        df.to_csv(base_df_filename)
    return df

In [11]:
def load_15min_to_hourly(df, col_name, target_col_name):
    df['Date'] = df.index.floor('h')
    df[target_col_name] = df.groupby('Date')[col_name].transform('mean')
    df2 = df[['Date', target_col_name]]
    df2 = df2.drop_duplicates()
    return df

In [12]:
def get_base_load_avg(country_code, years=years):
    base_load_df_filename = get_base_load_df_filename(country_code, years)
    if os.path.exists(base_load_df_filename):
        print(f'{base_load_df_filename} exists, reading from file')
        df = pd.read_csv(base_load_df_filename, index_col=0, parse_dates=True)
    else:
        print(f'{base_load_df_filename} does not exist, concatenating from multiple files')

        for (start_date, end_date) in years:

            df_temp = run_querry_load(country_code, start_date, end_date)

            df = df_temp if (start_date, end_date) == years[0] else pd.concat([df, df_temp])
            
        df['Datetime'] = df.index
        df['Datetime'] = pd.to_datetime(df['Datetime'], utc=True)
        df.reset_index(drop=True, inplace=True)
        #order by datetime
        df = df.sort_values(by='Datetime')
        df = df.set_index('Datetime')
        #add 1 hour to the datetime
        df.index = df.index + pd.DateOffset(hours=1)
        df = load_15min_to_hourly(df, 'Actual Load', 'Load_avg')
        df = df[['Date', 'Load_avg']].drop_duplicates()
        df.drop(columns=['Date'], inplace=True)
        df.columns = [f'Load_avg_{country_code}']
        df.to_csv(base_load_df_filename)
    return df

In [13]:
def get_base_wind_solar_forecast_avg(country_code, years=years):
    base_wind_solar_forecast_df_filename = get_base_wind_solar_forecast_df_filename(country_code, years)
    if os.path.exists(base_wind_solar_forecast_df_filename):
        print(f'{base_wind_solar_forecast_df_filename} exists, reading from file')
        df = pd.read_csv(base_wind_solar_forecast_df_filename, index_col=0, parse_dates=True)
    else:
        print(f'{base_wind_solar_forecast_df_filename} does not exist, concatenating from multiple files')

        for (start_date, end_date) in years:

            df_temp = run_querry_wind_solar_forecast(country_code, start_date, end_date)

            df = df_temp if (start_date, end_date) == years[0] else pd.concat([df, df_temp])
            
        df['Datetime'] = df.index
        df['Datetime'] = pd.to_datetime(df['Datetime'], utc=True)
        df.reset_index(drop=True, inplace=True)
        #order by datetime
        df = df.sort_values(by='Datetime')
        df = df.set_index('Datetime')
        #add 1 hour to the datetime
        df.index = df.index + pd.DateOffset(hours=1)
        df = load_15min_to_hourly(df, 'Solar', 'Solar_Fcast_avg')
        df = load_15min_to_hourly(df, 'Wind Onshore', 'Wind_Onshore_avg')

        df = df[['Date', 'Solar_Fcast_avg', 'Wind_Onshore_avg']].drop_duplicates()
        df.drop(columns=['Date'], inplace=True)
        df.columns = [f'Solar_Fcast_avg_{country_code}', f'Wind_Onshore_avg_{country_code}']
        df.to_csv(base_wind_solar_forecast_df_filename)
    return df

In [14]:
def get_base_df(country_codes):
    for country_code in country_codes:
        print('Starting to collect data for', country_code)
        prices_df = get_base_prices(country_code)
        load_avg_df = get_base_load_avg(country_code)
        wind_solar_forecast_df = get_base_wind_solar_forecast_avg(country_code)
        
        df = pd.concat([prices_df, load_avg_df, wind_solar_forecast_df], axis=1)
        df = df.dropna()
        df = df.reset_index()
        df = df.rename(columns={'index': 'Datetime'})
        df_final = df if country_code == country_codes[0] else pd.merge(df_final, df, on='Datetime', how='inner')

    return df_final

In [15]:
country_codes = ['HU', 'RO', 'AT']
df = get_base_df(country_codes)

Starting to collect data for HU
./data/base/base_price_HU_('2018-01-01', '2018-12-31')_('2024-01-01', '2024-03-24').csv exists, reading from file
./data/base/base_load_avgs_HU_('2018-01-01', '2018-12-31')_('2024-01-01', '2024-03-24').csv exists, reading from file
./data/base/base_wind_solar_forecast_avgs_HU_('2018-01-01', '2018-12-31')_('2024-01-01', '2024-03-24').csv exists, reading from file
Starting to collect data for RO
./data/base/base_price_RO_('2018-01-01', '2018-12-31')_('2024-01-01', '2024-03-24').csv exists, reading from file
./data/base/base_load_avgs_RO_('2018-01-01', '2018-12-31')_('2024-01-01', '2024-03-24').csv exists, reading from file
./data/base/base_wind_solar_forecast_avgs_RO_('2018-01-01', '2018-12-31')_('2024-01-01', '2024-03-24').csv exists, reading from file
Starting to collect data for AT
./data/base/base_price_AT_('2018-01-01', '2018-12-31')_('2024-01-01', '2024-03-24').csv exists, reading from file
./data/base/base_load_avgs_AT_('2018-01-01', '2018-12-31')_(

In [16]:
df

Unnamed: 0,Datetime,Price_HU,Load_avg_HU,Solar_Fcast_avg_HU,Wind_Onshore_avg_HU,Price_RO,Load_avg_RO,Solar_Fcast_avg_RO,Wind_Onshore_avg_RO,Price_AT,Load_avg_AT,Solar_Fcast_avg_AT,Wind_Onshore_avg_AT
0,2019-01-01 00:00:00+00:00,59.90,4082.00,0.0,19.75,279.00,6110.00,0.0,88.00,33.48,6075.00,0.0,487.0
1,2019-01-01 01:00:00+00:00,52.71,3985.75,0.0,27.25,245.54,5856.00,0.0,95.00,39.76,5852.75,0.0,398.0
2,2019-01-01 02:00:00+00:00,36.49,3732.50,0.0,19.75,169.98,5675.00,0.0,90.00,39.78,5619.25,0.0,338.0
3,2019-01-01 03:00:00+00:00,31.24,3554.25,0.0,7.25,145.52,5570.00,0.0,86.00,27.87,5324.00,0.0,309.0
4,2019-01-01 04:00:00+00:00,25.98,3499.25,0.0,1.75,121.00,5524.00,0.0,83.00,-0.36,5273.50,0.0,320.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
45654,2024-03-24 19:00:00+00:00,85.22,5588.50,0.0,63.25,85.46,6560.25,1.0,1105.25,80.25,6573.75,0.0,1421.0
45655,2024-03-24 20:00:00+00:00,76.51,5461.00,0.0,65.75,76.65,6317.25,1.0,1016.75,72.20,6335.75,0.0,1618.0
45656,2024-03-24 21:00:00+00:00,73.74,5193.50,0.0,68.00,73.68,5877.25,1.0,928.25,69.52,6060.25,0.0,1760.0
45657,2024-03-24 22:00:00+00:00,70.07,4897.25,0.0,71.75,70.10,5498.75,1.0,938.75,67.32,5977.25,0.0,1823.0


## Read in weather data

In [17]:
#create a filenames list for the weather data
weather_filenames = os.listdir(weather_folder_path)

#concat the weather data
for weather_filename in weather_filenames:
    if weather_filename == weather_filenames[0]:
        df_weather = pd.read_csv(f'{weather_folder_path}/{weather_filename}', index_col=0)
    else:
        df_temp = pd.read_csv(f'{weather_folder_path}/{weather_filename}', index_col=0)
        df_weather = pd.concat([df_weather, df_temp])


In [18]:
df_weather['Datetime'] = pd.to_datetime(df_weather.index)
df_weather.reset_index(drop=True, inplace=True)

In [19]:
df_weather

Unnamed: 0,temperature_2m_BP,relative_humidity_2m_BP,pressure_msl_BP,surface_pressure_BP,precipitation_BP,rain_BP,cloud_cover_BP,cloud_cover_low_BP,cloud_cover_mid_BP,cloud_cover_high_BP,shortwave_radiation_BP,direct_radiation_BP,diffuse_radiation_BP,sunshine_duration_BP,wind_speed_10m_BP,wind_speed_100m_BP,wind_direction_10m_BP,wind_direction_100m_BP,wind_gusts_10m_BP,temperature_2m_Konstanca,relative_humidity_2m_Konstanca,pressure_msl_Konstanca,surface_pressure_Konstanca,precipitation_Konstanca,rain_Konstanca,cloud_cover_Konstanca,cloud_cover_low_Konstanca,cloud_cover_mid_Konstanca,cloud_cover_high_Konstanca,shortwave_radiation_Konstanca,direct_radiation_Konstanca,diffuse_radiation_Konstanca,sunshine_duration_Konstanca,wind_speed_10m_Konstanca,wind_speed_100m_Konstanca,wind_direction_10m_Konstanca,wind_direction_100m_Konstanca,wind_gusts_10m_Konstanca,Datetime
0,-3.579,95.965530,1029.6,1014.72410,0.0,0.0,100.000000,96.0,0.0,94.0,0.0,0.0,0.0,0.0,6.379216,9.178235,286.38962,295.55990,12.599999,-5.684500,85.782040,1025.1,1017.66900,0.0,0.0,0.900000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,16.055353,31.154043,289.65390,292.41620,27.720000,2017-01-01 00:00:00+00:00
1,-4.429,95.937294,1029.4,1014.48020,0.0,0.0,65.400000,44.0,0.0,86.0,0.0,0.0,0.0,0.0,5.091168,13.104198,315.00010,307.18478,10.080000,-4.734500,82.648636,1024.5,1017.09970,0.0,0.0,22.800001,0.0,38.0,0.0,0.0,0.0,0.0,0.0,17.317460,31.883888,290.69553,295.40768,29.519999,2017-01-01 01:00:00+00:00
2,-6.079,95.514830,1028.7,1013.69916,0.0,0.0,61.800000,41.0,0.0,83.0,0.0,0.0,0.0,0.0,7.200000,11.720751,323.13000,317.48960,10.799999,-4.734500,81.698490,1023.7,1016.30540,0.0,0.0,16.800001,0.0,28.0,0.0,0.0,0.0,0.0,0.0,17.992796,32.862694,289.88525,294.59976,30.960001,2017-01-01 02:00:00+00:00
3,-5.729,95.893540,1028.3,1013.32450,0.0,0.0,54.600000,32.0,0.0,86.0,0.0,0.0,0.0,0.0,5.860375,9.957108,317.48960,319.39877,10.799999,-4.634500,80.461500,1023.2,1015.81180,0.0,0.0,18.000000,0.0,30.0,0.0,0.0,0.0,0.0,0.0,18.899143,33.759480,287.74475,291.91492,32.399998,2017-01-01 03:00:00+00:00
4,-5.279,99.244804,1027.9,1012.95520,0.0,0.0,44.100000,47.0,0.0,6.0,0.0,0.0,0.0,0.0,2.741678,6.792466,336.80140,327.99466,9.000000,-4.684500,79.834880,1023.0,1015.61170,0.0,0.0,13.200001,0.0,22.0,0.0,0.0,0.0,0.0,0.0,19.211996,33.931790,282.99463,287.28160,32.760002,2017-01-01 04:00:00+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63355,5.421,80.713320,1007.2,993.11400,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.830519,18.079027,251.56496,257.34744,12.959999,10.815499,83.978160,1002.5,995.65356,0.0,0.0,22.200000,0.0,1.0,72.0,0.0,0.0,0.0,0.0,11.304229,26.987997,217.23491,223.91916,20.519999,2024-03-24 19:00:00+00:00
63356,4.171,88.378944,1007.1,992.95240,0.0,0.0,1.800000,2.0,0.0,0.0,0.0,0.0,0.0,0.0,6.830519,19.353140,251.56496,260.36252,12.240000,10.465500,83.084564,1001.9,995.04920,0.0,0.0,51.600002,0.0,36.0,100.0,0.0,0.0,0.0,0.0,10.799999,26.220753,216.86998,224.44382,18.359999,2024-03-24 20:00:00+00:00
63357,3.721,90.892494,1007.0,992.83105,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.072878,19.813087,255.25640,267.91748,12.959999,10.965500,78.470280,1001.3,994.46515,0.0,0.0,19.800001,0.0,33.0,0.0,0.0,0.0,0.0,0.0,12.313894,28.284640,217.87505,222.42080,20.160000,2024-03-24 21:00:00+00:00
63358,3.471,90.874850,1006.8,992.62103,0.0,0.0,11.400001,0.0,11.0,16.0,0.0,0.0,0.0,0.0,8.350138,22.505037,262.56870,277.35230,14.400000,10.815499,76.852480,1001.0,994.16376,0.0,0.0,28.500000,0.0,47.0,1.0,0.0,0.0,0.0,0.0,12.496719,28.817993,258.36640,257.00537,20.519999,2024-03-24 22:00:00+00:00


In [20]:
df_weather.isna().sum().sum()

0

In [21]:
df = pd.merge(df, df_weather, on='Datetime', how='inner')

In [22]:
df

Unnamed: 0,Datetime,Price_HU,Load_avg_HU,Solar_Fcast_avg_HU,Wind_Onshore_avg_HU,Price_RO,Load_avg_RO,Solar_Fcast_avg_RO,Wind_Onshore_avg_RO,Price_AT,Load_avg_AT,Solar_Fcast_avg_AT,Wind_Onshore_avg_AT,temperature_2m_BP,relative_humidity_2m_BP,pressure_msl_BP,surface_pressure_BP,precipitation_BP,rain_BP,cloud_cover_BP,cloud_cover_low_BP,cloud_cover_mid_BP,cloud_cover_high_BP,shortwave_radiation_BP,direct_radiation_BP,diffuse_radiation_BP,sunshine_duration_BP,wind_speed_10m_BP,wind_speed_100m_BP,wind_direction_10m_BP,wind_direction_100m_BP,wind_gusts_10m_BP,temperature_2m_Konstanca,relative_humidity_2m_Konstanca,pressure_msl_Konstanca,surface_pressure_Konstanca,precipitation_Konstanca,rain_Konstanca,cloud_cover_Konstanca,cloud_cover_low_Konstanca,cloud_cover_mid_Konstanca,cloud_cover_high_Konstanca,shortwave_radiation_Konstanca,direct_radiation_Konstanca,diffuse_radiation_Konstanca,sunshine_duration_Konstanca,wind_speed_10m_Konstanca,wind_speed_100m_Konstanca,wind_direction_10m_Konstanca,wind_direction_100m_Konstanca,wind_gusts_10m_Konstanca
0,2019-01-01 00:00:00+00:00,59.90,4082.00,0.0,19.75,279.00,6110.00,0.0,88.00,33.48,6075.00,0.0,487.0,-1.579,89.836760,1031.0,1016.21246,0.0,0.0,17.100000,5.0,1.0,40.0,0.0,0.0,0.0,0.0,6.109403,10.895577,315.00010,352.40543,11.159999,0.215500,93.328880,1027.5,1020.21160,0.0,0.0,14.400000,12.0,6.0,0.0,0.0,0.0,0.0,0.0,10.105681,20.326454,355.914460,22.932130,16.199999
1,2019-01-01 01:00:00+00:00,52.71,3985.75,0.0,27.25,245.54,5856.00,0.0,95.00,39.76,5852.75,0.0,398.0,-1.579,89.836760,1031.1,1016.31100,0.0,0.0,5.400000,5.0,0.0,3.0,0.0,0.0,0.0,0.0,5.991594,10.315115,302.73520,330.75128,11.879999,0.665500,94.718620,1028.1,1020.81920,0.0,0.0,29.100000,21.0,17.0,0.0,0.0,0.0,0.0,0.0,11.212135,22.862123,5.527458,26.161512,19.440000
2,2019-01-01 02:00:00+00:00,36.49,3732.50,0.0,19.75,169.98,5675.00,0.0,90.00,39.78,5619.25,0.0,338.0,-1.879,90.484474,1031.2,1016.39350,0.0,0.0,7.800000,8.0,1.0,0.0,0.0,0.0,0.0,0.0,6.120000,10.195057,298.07240,317.86246,10.799999,0.965500,93.707565,1027.9,1020.62880,0.0,0.0,17.400000,14.0,8.0,0.0,0.0,0.0,0.0,0.0,10.041354,21.370783,14.534496,32.619240,19.080000
3,2019-01-01 03:00:00+00:00,31.24,3554.25,0.0,7.25,145.52,5570.00,0.0,86.00,27.87,5324.00,0.0,309.0,-1.779,89.820750,1030.7,1015.90607,0.0,0.0,5.400000,6.0,0.0,0.0,0.0,0.0,0.0,0.0,8.217153,17.826363,298.81070,313.36346,15.119999,0.315500,93.674934,1027.6,1020.31354,0.0,0.0,26.099998,27.0,3.0,0.0,0.0,0.0,0.0,0.0,7.342588,18.218275,11.309895,37.775745,16.919998
4,2019-01-01 04:00:00+00:00,25.98,3499.25,0.0,1.75,121.00,5524.00,0.0,83.00,-0.36,5273.50,0.0,320.0,-2.279,90.116875,1030.1,1015.28766,0.0,0.0,29.700000,1.0,0.0,96.0,0.0,0.0,0.0,0.0,6.439876,15.546833,296.56497,312.18436,15.119999,-0.134500,93.995240,1027.7,1020.40100,0.0,0.0,42.900000,45.0,4.0,0.0,0.0,0.0,0.0,0.0,7.421590,15.021105,14.036275,45.970932,11.879999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45654,2024-03-24 19:00:00+00:00,85.22,5588.50,0.0,63.25,85.46,6560.25,1.0,1105.25,80.25,6573.75,0.0,1421.0,5.421,80.713320,1007.2,993.11400,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.830519,18.079027,251.56496,257.34744,12.959999,10.815499,83.978160,1002.5,995.65356,0.0,0.0,22.200000,0.0,1.0,72.0,0.0,0.0,0.0,0.0,11.304229,26.987997,217.234910,223.919160,20.519999
45655,2024-03-24 20:00:00+00:00,76.51,5461.00,0.0,65.75,76.65,6317.25,1.0,1016.75,72.20,6335.75,0.0,1618.0,4.171,88.378944,1007.1,992.95240,0.0,0.0,1.800000,2.0,0.0,0.0,0.0,0.0,0.0,0.0,6.830519,19.353140,251.56496,260.36252,12.240000,10.465500,83.084564,1001.9,995.04920,0.0,0.0,51.600002,0.0,36.0,100.0,0.0,0.0,0.0,0.0,10.799999,26.220753,216.869980,224.443820,18.359999
45656,2024-03-24 21:00:00+00:00,73.74,5193.50,0.0,68.00,73.68,5877.25,1.0,928.25,69.52,6060.25,0.0,1760.0,3.721,90.892494,1007.0,992.83105,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.072878,19.813087,255.25640,267.91748,12.959999,10.965500,78.470280,1001.3,994.46515,0.0,0.0,19.800001,0.0,33.0,0.0,0.0,0.0,0.0,0.0,12.313894,28.284640,217.875050,222.420800,20.160000
45657,2024-03-24 22:00:00+00:00,70.07,4897.25,0.0,71.75,70.10,5498.75,1.0,938.75,67.32,5977.25,0.0,1823.0,3.471,90.874850,1006.8,992.62103,0.0,0.0,11.400001,0.0,11.0,16.0,0.0,0.0,0.0,0.0,8.350138,22.505037,262.56870,277.35230,14.400000,10.815499,76.852480,1001.0,994.16376,0.0,0.0,28.500000,0.0,47.0,1.0,0.0,0.0,0.0,0.0,12.496719,28.817993,258.366400,257.005370,20.519999


In [23]:
df['Datetime'].duplicated().sum()

0

# Create more input features

In [24]:
target_country_code = 'HU'

In [25]:
df['year'] = df['Datetime'].dt.year
df['month'] = df['Datetime'].dt.month
df['day'] = df['Datetime'].dt.day
df['hour'] = df['Datetime'].dt.hour
df['weekday'] = df['Datetime'].dt.weekday
df['dayoftheweek'] = df['Datetime'].dt.dayofweek
df['date'] = df['Datetime'].dt.date

In [26]:
month_dummies = pd.get_dummies(df['month'], prefix='month', drop_first=True)
dayofweek_dummies = pd.get_dummies(df['dayoftheweek'], prefix='dayofweek', drop_first=True)
df = pd.concat([df, month_dummies, dayofweek_dummies], axis=1)

In [27]:
dummy_columns = month_dummies.columns.tolist() + dayofweek_dummies.columns.tolist()

In [28]:
df['Datetime-2d'] = df['Datetime'] + pd.to_timedelta(-2, unit='day')
df['Datetime-7d'] = df['Datetime'] + pd.to_timedelta(-7, unit='day')

for country in country_codes:
    load_map = df.set_index('Datetime')[f'Price_{country}'].to_dict()

    df[f'Price_2d_{country}'] = df['Datetime-2d'].map(load_map)
    df[f'Price_7d_{country}'] = df['Datetime-7d'].map(load_map)


In [30]:
df.isnull().sum()

Datetime                 0
Price_HU                 0
Load_avg_HU              0
Solar_Fcast_avg_HU       0
Wind_Onshore_avg_HU      0
                      ... 
Price_7d_HU            349
Price_2d_RO            216
Price_7d_RO            349
Price_2d_AT            216
Price_7d_AT            349
Length: 83, dtype: int64

In [31]:
df = df.dropna().reset_index(drop=True)

In [32]:
df

Unnamed: 0,Datetime,Price_HU,Load_avg_HU,Solar_Fcast_avg_HU,Wind_Onshore_avg_HU,Price_RO,Load_avg_RO,Solar_Fcast_avg_RO,Wind_Onshore_avg_RO,Price_AT,Load_avg_AT,Solar_Fcast_avg_AT,Wind_Onshore_avg_AT,temperature_2m_BP,relative_humidity_2m_BP,pressure_msl_BP,surface_pressure_BP,precipitation_BP,rain_BP,cloud_cover_BP,cloud_cover_low_BP,cloud_cover_mid_BP,cloud_cover_high_BP,shortwave_radiation_BP,direct_radiation_BP,diffuse_radiation_BP,sunshine_duration_BP,wind_speed_10m_BP,wind_speed_100m_BP,wind_direction_10m_BP,wind_direction_100m_BP,wind_gusts_10m_BP,temperature_2m_Konstanca,relative_humidity_2m_Konstanca,pressure_msl_Konstanca,surface_pressure_Konstanca,precipitation_Konstanca,rain_Konstanca,cloud_cover_Konstanca,cloud_cover_low_Konstanca,cloud_cover_mid_Konstanca,cloud_cover_high_Konstanca,shortwave_radiation_Konstanca,direct_radiation_Konstanca,diffuse_radiation_Konstanca,sunshine_duration_Konstanca,wind_speed_10m_Konstanca,wind_speed_100m_Konstanca,wind_direction_10m_Konstanca,wind_direction_100m_Konstanca,wind_gusts_10m_Konstanca,year,month,day,hour,weekday,dayoftheweek,date,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,Datetime-2d,Datetime-7d,Price_2d_HU,Price_7d_HU,Price_2d_RO,Price_7d_RO,Price_2d_AT,Price_7d_AT
0,2019-01-08 00:00:00+00:00,51.40,5069.50,0.0,22.50,181.78,6959.00,0.0,1456.00,42.02,7071.75,0.0,438.0,-8.129001,87.208970,1026.9,1011.81040,0.0,0.0,67.800000,30.0,68.0,0.0,0.0,0.0,0.0,0.0,7.594208,12.738099,211.42952,222.70937,9.360000,-4.884500,75.288025,1024.3,1016.89690,0.0,0.0,46.200000,17.0,51.0,1.0,0.0,0.0,0.0,0.0,17.068707,28.412645,332.35410,337.65930,34.200000,2019,1,8,0,1,1,2019-01-08,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,2019-01-06 00:00:00+00:00,2019-01-01 00:00:00+00:00,50.81,59.90,236.99,279.00,52.46,33.48
1,2019-01-08 01:00:00+00:00,42.99,4876.25,0.0,31.75,170.46,6874.00,0.0,1397.00,38.06,6868.75,0.0,464.0,-7.129000,85.619900,1026.3,1011.27545,0.0,0.0,100.000000,80.0,94.0,0.0,0.0,0.0,0.0,0.0,4.510787,12.768586,208.61037,229.57400,11.159999,-5.434500,76.075905,1024.3,1016.88190,0.0,0.0,74.700005,11.0,89.0,38.0,0.0,0.0,0.0,0.0,17.595861,29.625180,329.23734,334.05774,30.960001,2019,1,8,1,1,1,2019-01-08,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,2019-01-06 01:00:00+00:00,2019-01-01 01:00:00+00:00,47.95,52.71,223.69,245.54,49.02,39.76
2,2019-01-08 02:00:00+00:00,41.00,4672.50,0.0,32.00,161.23,6860.00,0.0,1347.00,39.05,6744.75,0.0,490.0,-5.929000,78.102620,1025.6,1010.65260,0.1,0.0,100.000000,75.0,100.0,0.0,0.0,0.0,0.0,0.0,4.379589,8.913181,189.46225,223.36346,12.240000,-5.384500,74.319214,1023.9,1016.48610,0.0,0.0,88.200005,0.0,98.0,98.0,0.0,0.0,0.0,0.0,18.218275,29.469470,330.39563,334.68716,32.760002,2019,1,8,2,1,1,2019-01-08,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,2019-01-06 02:00:00+00:00,2019-01-01 02:00:00+00:00,46.07,36.49,214.92,169.98,49.01,39.78
3,2019-01-08 03:00:00+00:00,39.76,4583.00,0.0,36.50,185.47,6923.00,0.0,1221.00,22.65,6597.00,0.0,536.0,-5.579000,77.855380,1024.7,1009.78503,0.2,0.0,100.000000,77.0,99.0,0.0,0.0,0.0,0.0,0.0,5.091168,8.225035,188.13002,203.19861,11.520000,-5.384500,73.450210,1023.3,1015.89040,0.0,0.0,90.900000,4.0,96.0,99.0,0.0,0.0,0.0,0.0,18.218275,28.979440,330.39563,333.43503,33.120000,2019,1,8,3,1,1,2019-01-08,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,2019-01-06 03:00:00+00:00,2019-01-01 03:00:00+00:00,42.86,31.24,199.93,145.52,48.02,27.87
4,2019-01-08 04:00:00+00:00,42.77,4685.75,0.0,49.75,199.50,7133.00,0.0,1048.00,38.05,6693.75,0.0,610.0,-5.429000,78.184370,1023.7,1008.80790,0.2,0.0,100.000000,89.0,89.0,1.0,0.0,0.0,0.0,0.0,5.771239,8.350138,176.42374,187.43132,11.879999,-5.734500,73.962290,1023.1,1015.68230,0.0,0.0,88.200005,0.0,97.0,100.0,0.0,0.0,0.0,0.0,16.375053,28.043050,326.65927,330.80260,32.399998,2019,1,8,4,1,1,2019-01-08,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,2019-01-06 04:00:00+00:00,2019-01-01 04:00:00+00:00,43.01,25.98,200.61,121.00,48.30,-0.36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45141,2024-03-24 19:00:00+00:00,85.22,5588.50,0.0,63.25,85.46,6560.25,1.0,1105.25,80.25,6573.75,0.0,1421.0,5.421000,80.713320,1007.2,993.11400,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.830519,18.079027,251.56496,257.34744,12.959999,10.815499,83.978160,1002.5,995.65356,0.0,0.0,22.200000,0.0,1.0,72.0,0.0,0.0,0.0,0.0,11.304229,26.987997,217.23491,223.91916,20.519999,2024,3,24,19,6,6,2024-03-24,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2024-03-22 19:00:00+00:00,2024-03-17 19:00:00+00:00,114.84,87.46,114.37,87.37,113.47,87.19
45142,2024-03-24 20:00:00+00:00,76.51,5461.00,0.0,65.75,76.65,6317.25,1.0,1016.75,72.20,6335.75,0.0,1618.0,4.171000,88.378944,1007.1,992.95240,0.0,0.0,1.800000,2.0,0.0,0.0,0.0,0.0,0.0,0.0,6.830519,19.353140,251.56496,260.36252,12.240000,10.465500,83.084564,1001.9,995.04920,0.0,0.0,51.600002,0.0,36.0,100.0,0.0,0.0,0.0,0.0,10.799999,26.220753,216.86998,224.44382,18.359999,2024,3,24,20,6,6,2024-03-24,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2024-03-22 20:00:00+00:00,2024-03-17 20:00:00+00:00,90.77,77.89,90.34,77.89,88.16,77.89
45143,2024-03-24 21:00:00+00:00,73.74,5193.50,0.0,68.00,73.68,5877.25,1.0,928.25,69.52,6060.25,0.0,1760.0,3.721000,90.892494,1007.0,992.83105,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.072878,19.813087,255.25640,267.91748,12.959999,10.965500,78.470280,1001.3,994.46515,0.0,0.0,19.800001,0.0,33.0,0.0,0.0,0.0,0.0,0.0,12.313894,28.284640,217.87505,222.42080,20.160000,2024,3,24,21,6,6,2024-03-24,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2024-03-22 21:00:00+00:00,2024-03-17 21:00:00+00:00,78.11,72.33,77.62,72.33,75.46,72.33
45144,2024-03-24 22:00:00+00:00,70.07,4897.25,0.0,71.75,70.10,5498.75,1.0,938.75,67.32,5977.25,0.0,1823.0,3.471000,90.874850,1006.8,992.62103,0.0,0.0,11.400001,0.0,11.0,16.0,0.0,0.0,0.0,0.0,8.350138,22.505037,262.56870,277.35230,14.400000,10.815499,76.852480,1001.0,994.16376,0.0,0.0,28.500000,0.0,47.0,1.0,0.0,0.0,0.0,0.0,12.496719,28.817993,258.36640,257.00537,20.519999,2024,3,24,22,6,6,2024-03-24,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2024-03-22 22:00:00+00:00,2024-03-17 22:00:00+00:00,75.69,72.10,75.14,72.10,74.76,72.10


In [33]:
from workalendar.europe import Hungary, Romania, Austria

cal_hu = Hungary()
cal_ro = Romania()
cal_at = Austria()
year_list = df['Datetime'].dt.year.unique()

In [34]:
holiday_df = pd.DataFrame()
for cal, cntry in [(cal_hu, 'HU'), (cal_ro, 'RO'), (cal_at, 'AT')]:
    for year in year_list:
        holidays = cal.holidays(year)
        temp_df = pd.DataFrame(holidays, columns=['date', f'holiday_name_{cntry}'])
        holiday_df = pd.concat([holiday_df, temp_df],
                            axis=0).reset_index(drop=True)

In [35]:
holiday_df

Unnamed: 0,date,holiday_name_HU,holiday_name_RO,holiday_name_AT
0,2019-01-01,New year,,
1,2019-03-15,National Day,,
2,2019-04-19,Good Friday,,
3,2019-04-21,Easter Sunday,,
4,2019-04-22,Easter Monday,,
...,...,...,...,...
241,2024-10-26,,,National Holiday
242,2024-11-01,,,All Saints Day
243,2024-12-08,,,Immaculate Conception
244,2024-12-25,,,Christmas Day


In [36]:
holiday_df.set_index('date', inplace=True)

for col in holiday_df.columns:
    holiday_map = holiday_df[col].to_dict()
    df[col] = df['date'].map(holiday_map)
    cntry = col.split('_')[-1]
    df[f'is_holiday_{cntry}'] = 0
    df.loc[df[f'is_holiday_{cntry}'].notnull(), f'is_holiday_{cntry}'] = 1

    df.drop(columns=[col], inplace=True)

In [37]:
for col in holiday_df.columns:
    cntry = col.split('_')[-1]
    df[f'is_workday_{cntry}'] = 1
    df.loc[df[f'is_holiday_{cntry}'] == 1, f'is_workday_{cntry}'] = 0
    df.loc[df['dayoftheweek'].isin([5, 6]), f'is_workday_{cntry}'] = 0

In [38]:
df

Unnamed: 0,Datetime,Price_HU,Load_avg_HU,Solar_Fcast_avg_HU,Wind_Onshore_avg_HU,Price_RO,Load_avg_RO,Solar_Fcast_avg_RO,Wind_Onshore_avg_RO,Price_AT,Load_avg_AT,Solar_Fcast_avg_AT,Wind_Onshore_avg_AT,temperature_2m_BP,relative_humidity_2m_BP,pressure_msl_BP,surface_pressure_BP,precipitation_BP,rain_BP,cloud_cover_BP,cloud_cover_low_BP,cloud_cover_mid_BP,cloud_cover_high_BP,shortwave_radiation_BP,direct_radiation_BP,diffuse_radiation_BP,sunshine_duration_BP,wind_speed_10m_BP,wind_speed_100m_BP,wind_direction_10m_BP,wind_direction_100m_BP,wind_gusts_10m_BP,temperature_2m_Konstanca,relative_humidity_2m_Konstanca,pressure_msl_Konstanca,surface_pressure_Konstanca,precipitation_Konstanca,rain_Konstanca,cloud_cover_Konstanca,cloud_cover_low_Konstanca,cloud_cover_mid_Konstanca,cloud_cover_high_Konstanca,shortwave_radiation_Konstanca,direct_radiation_Konstanca,diffuse_radiation_Konstanca,sunshine_duration_Konstanca,wind_speed_10m_Konstanca,wind_speed_100m_Konstanca,wind_direction_10m_Konstanca,wind_direction_100m_Konstanca,wind_gusts_10m_Konstanca,year,month,day,hour,weekday,dayoftheweek,date,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,Datetime-2d,Datetime-7d,Price_2d_HU,Price_7d_HU,Price_2d_RO,Price_7d_RO,Price_2d_AT,Price_7d_AT,is_holiday_HU,is_holiday_RO,is_holiday_AT,is_workday_HU,is_workday_RO,is_workday_AT
0,2019-01-08 00:00:00+00:00,51.40,5069.50,0.0,22.50,181.78,6959.00,0.0,1456.00,42.02,7071.75,0.0,438.0,-8.129001,87.208970,1026.9,1011.81040,0.0,0.0,67.800000,30.0,68.0,0.0,0.0,0.0,0.0,0.0,7.594208,12.738099,211.42952,222.70937,9.360000,-4.884500,75.288025,1024.3,1016.89690,0.0,0.0,46.200000,17.0,51.0,1.0,0.0,0.0,0.0,0.0,17.068707,28.412645,332.35410,337.65930,34.200000,2019,1,8,0,1,1,2019-01-08,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,2019-01-06 00:00:00+00:00,2019-01-01 00:00:00+00:00,50.81,59.90,236.99,279.00,52.46,33.48,1,1,1,0,0,0
1,2019-01-08 01:00:00+00:00,42.99,4876.25,0.0,31.75,170.46,6874.00,0.0,1397.00,38.06,6868.75,0.0,464.0,-7.129000,85.619900,1026.3,1011.27545,0.0,0.0,100.000000,80.0,94.0,0.0,0.0,0.0,0.0,0.0,4.510787,12.768586,208.61037,229.57400,11.159999,-5.434500,76.075905,1024.3,1016.88190,0.0,0.0,74.700005,11.0,89.0,38.0,0.0,0.0,0.0,0.0,17.595861,29.625180,329.23734,334.05774,30.960001,2019,1,8,1,1,1,2019-01-08,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,2019-01-06 01:00:00+00:00,2019-01-01 01:00:00+00:00,47.95,52.71,223.69,245.54,49.02,39.76,1,1,1,0,0,0
2,2019-01-08 02:00:00+00:00,41.00,4672.50,0.0,32.00,161.23,6860.00,0.0,1347.00,39.05,6744.75,0.0,490.0,-5.929000,78.102620,1025.6,1010.65260,0.1,0.0,100.000000,75.0,100.0,0.0,0.0,0.0,0.0,0.0,4.379589,8.913181,189.46225,223.36346,12.240000,-5.384500,74.319214,1023.9,1016.48610,0.0,0.0,88.200005,0.0,98.0,98.0,0.0,0.0,0.0,0.0,18.218275,29.469470,330.39563,334.68716,32.760002,2019,1,8,2,1,1,2019-01-08,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,2019-01-06 02:00:00+00:00,2019-01-01 02:00:00+00:00,46.07,36.49,214.92,169.98,49.01,39.78,1,1,1,0,0,0
3,2019-01-08 03:00:00+00:00,39.76,4583.00,0.0,36.50,185.47,6923.00,0.0,1221.00,22.65,6597.00,0.0,536.0,-5.579000,77.855380,1024.7,1009.78503,0.2,0.0,100.000000,77.0,99.0,0.0,0.0,0.0,0.0,0.0,5.091168,8.225035,188.13002,203.19861,11.520000,-5.384500,73.450210,1023.3,1015.89040,0.0,0.0,90.900000,4.0,96.0,99.0,0.0,0.0,0.0,0.0,18.218275,28.979440,330.39563,333.43503,33.120000,2019,1,8,3,1,1,2019-01-08,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,2019-01-06 03:00:00+00:00,2019-01-01 03:00:00+00:00,42.86,31.24,199.93,145.52,48.02,27.87,1,1,1,0,0,0
4,2019-01-08 04:00:00+00:00,42.77,4685.75,0.0,49.75,199.50,7133.00,0.0,1048.00,38.05,6693.75,0.0,610.0,-5.429000,78.184370,1023.7,1008.80790,0.2,0.0,100.000000,89.0,89.0,1.0,0.0,0.0,0.0,0.0,5.771239,8.350138,176.42374,187.43132,11.879999,-5.734500,73.962290,1023.1,1015.68230,0.0,0.0,88.200005,0.0,97.0,100.0,0.0,0.0,0.0,0.0,16.375053,28.043050,326.65927,330.80260,32.399998,2019,1,8,4,1,1,2019-01-08,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,2019-01-06 04:00:00+00:00,2019-01-01 04:00:00+00:00,43.01,25.98,200.61,121.00,48.30,-0.36,1,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45141,2024-03-24 19:00:00+00:00,85.22,5588.50,0.0,63.25,85.46,6560.25,1.0,1105.25,80.25,6573.75,0.0,1421.0,5.421000,80.713320,1007.2,993.11400,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.830519,18.079027,251.56496,257.34744,12.959999,10.815499,83.978160,1002.5,995.65356,0.0,0.0,22.200000,0.0,1.0,72.0,0.0,0.0,0.0,0.0,11.304229,26.987997,217.23491,223.91916,20.519999,2024,3,24,19,6,6,2024-03-24,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2024-03-22 19:00:00+00:00,2024-03-17 19:00:00+00:00,114.84,87.46,114.37,87.37,113.47,87.19,1,1,1,0,0,0
45142,2024-03-24 20:00:00+00:00,76.51,5461.00,0.0,65.75,76.65,6317.25,1.0,1016.75,72.20,6335.75,0.0,1618.0,4.171000,88.378944,1007.1,992.95240,0.0,0.0,1.800000,2.0,0.0,0.0,0.0,0.0,0.0,0.0,6.830519,19.353140,251.56496,260.36252,12.240000,10.465500,83.084564,1001.9,995.04920,0.0,0.0,51.600002,0.0,36.0,100.0,0.0,0.0,0.0,0.0,10.799999,26.220753,216.86998,224.44382,18.359999,2024,3,24,20,6,6,2024-03-24,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2024-03-22 20:00:00+00:00,2024-03-17 20:00:00+00:00,90.77,77.89,90.34,77.89,88.16,77.89,1,1,1,0,0,0
45143,2024-03-24 21:00:00+00:00,73.74,5193.50,0.0,68.00,73.68,5877.25,1.0,928.25,69.52,6060.25,0.0,1760.0,3.721000,90.892494,1007.0,992.83105,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.072878,19.813087,255.25640,267.91748,12.959999,10.965500,78.470280,1001.3,994.46515,0.0,0.0,19.800001,0.0,33.0,0.0,0.0,0.0,0.0,0.0,12.313894,28.284640,217.87505,222.42080,20.160000,2024,3,24,21,6,6,2024-03-24,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2024-03-22 21:00:00+00:00,2024-03-17 21:00:00+00:00,78.11,72.33,77.62,72.33,75.46,72.33,1,1,1,0,0,0
45144,2024-03-24 22:00:00+00:00,70.07,4897.25,0.0,71.75,70.10,5498.75,1.0,938.75,67.32,5977.25,0.0,1823.0,3.471000,90.874850,1006.8,992.62103,0.0,0.0,11.400001,0.0,11.0,16.0,0.0,0.0,0.0,0.0,8.350138,22.505037,262.56870,277.35230,14.400000,10.815499,76.852480,1001.0,994.16376,0.0,0.0,28.500000,0.0,47.0,1.0,0.0,0.0,0.0,0.0,12.496719,28.817993,258.36640,257.00537,20.519999,2024,3,24,22,6,6,2024-03-24,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2024-03-22 22:00:00+00:00,2024-03-17 22:00:00+00:00,75.69,72.10,75.14,72.10,74.76,72.10,1,1,1,0,0,0


# Save the whole dataset

In [39]:
df.to_csv(f'./data/all_data.csv')

## Baseline

The baseline model is just shifting the prices by 1 week

In [61]:
def baseline_model(df):
    y_pred = df[f'Price_{target_country_code}'].shift(-7*24)
    return y_pred

In [None]:
base_pred = baseline_model(df)
start_date = '2023-01-01'
end_date = '2024-03-17'
base_eval(df.loc[df['Datetime'] >= test_end_date, target_col], base_pred.loc[df['Datetime'] >= test_end_date])
weighted_eval(df.loc[df['Datetime'] >= test_end_date, target_col], base_pred.loc[df['Datetime'] >= test_end_date], df.loc[df['Datetime'] >= test_end_date, f'Load_avg_{target_country_code}'])

In [None]:
plot_predictions(df.loc[df['Datetime'] >= test_end_date, target_col].to_list(), base_pred.loc[df['Datetime'] >= test_end_date].to_list(), 'Baseline model')

## Baseline v2

The predicted value is based on the most similar hour in the history based on the input columns.

In [46]:
from sklearn.neighbors import NearestNeighbors

In [None]:
base2_cols = df.columns.tolist()
base2_cols = [col for col in base2_cols if col not in do_not_use_cols]
base2_cols.remove('Price_2d_HU')
base2_cols.remove('Price_7d_HU')
base2_cols

In [48]:
K = 15
METRIC = "cosine"

def get_neighbors(index):
    curr_date = df.loc[index, 'date']
    X = df.loc[df['date'] < curr_date, base2_cols].values
    knn = NearestNeighbors(n_neighbors=K, metric=METRIC).fit(X)
    
    _, indices = knn.kneighbors(df.loc[index, base2_cols].values.reshape(1, -1))
    indices = indices[0].tolist()
    return [i for i in indices if df.loc[i, 'is_workday'] == df.loc[index, 'is_workday']]

In [49]:
def baseline_model2(df, date, target_col=target_col):
    datetimes = df.loc[df['date'] >= date, 'Datetime']
    y_pred = []
    for dt in datetimes:
        idx = df[df['Datetime'] == dt].index[0]
        neighbors = get_neighbors(idx)
        y_pred.append(df.loc[neighbors, target_col].mean())
    return y_pred

In [None]:
test_end_date_base2 = pd.Timestamp(test_end_date).date()
base2_pred = baseline_model2(df, test_end_date_base2)
base2_true = df.loc[df['date'] >= test_end_date_base2, target_col]
base_eval(base2_true, base2_pred)
weighted_eval(base2_true, base2_pred, df.loc[df['date'] >= test_end_date_base2, f'Load_avg_{target_country_code}'])

In [None]:
plot_predictions(base2_true.to_list(), base2_pred, 'Baseline model v2')

## Recurrent Neural Network

In [74]:
#recurrent neural network
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.layers import Input
import tensorflow as tf

def create_dataset(X, y, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        v = X.iloc[i:(i + time_steps)].values
        Xs.append(v)
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)

time_steps = 24

X_train_lstm, y_train_lstm = create_dataset(X_train, y_train, time_steps)
X_test_lstm, y_test_lstm = create_dataset(X_test, y_test, time_steps)

In [75]:
X_train_lstm = X_train_lstm.astype('float32')
y_train_lstm = y_train_lstm.astype('float32')
X_test_lstm = X_test_lstm.astype('float32')
y_test_lstm = y_test_lstm.astype('float32')

In [76]:
lstm = Sequential()
lstm.add(Input(shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
lstm.add(LSTM(units=64))
lstm.add(Dropout(rate=0.2))
lstm.add(Dense(units=1))
lstm.compile(loss='mean_squared_error', optimizer='adam')

In [77]:
X_train_lstm = tf.convert_to_tensor(X_train_lstm)
X_test_lstm = tf.convert_to_tensor(X_test_lstm)
y_train_lstm = tf.convert_to_tensor(y_train_lstm)
y_test_lstm = tf.convert_to_tensor(y_test_lstm)

In [None]:
history = lstm.fit(
    X_train_lstm, y_train_lstm,
    epochs=30,
    batch_size=32,
    validation_split=0.1,
    shuffle=False
)

In [None]:
lstm_pred = lstm.predict(X_test_lstm)

lstm_pred = lstm_pred.reshape(lstm_pred.shape[0])

In [None]:
plt.figure(figsize=(10,5))
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.title('Loss')
plt.legend()

# Notes

Ma 8as adatokkal 9ig a holnapit (a mait mind ismerem)
- baseline1 az 1 heti adat
- baseline2 hasonló időjárású nap

Kiértékelés:
- v1 abs hiba (hány eurót tévedtünk)
- v2 adott órában mennyi a load (termelés / fogyasztás), hiba súlyozva a teljes fogyasztással

Opciók
- recurrent nn predictor (?)
- gbm regressor
-- (előzö napi adatok, napelemek termelése, román adatok, hőmérséklet..., körny ország árai)
-- walk forward opt

keretrendszer
feture inportance alapján feature selection 
- változásuk követése !!!

(talán osztrák is számít, meg kell nézni melyik számít)

időjárási adatok (első körben tényadatok, nem előrejelzés) próbálkozni kell, drága lehet, kb kizárt 

- végén fontos és ***nem fontos*** változók listája

3 fontos időjárás (régiós, a napi bontás is jó)
- hány fok van (fűtés / hűtés)
- besugárzás
- szélerősség


-Hányszor volt negatív ár - statisztika róla (Meg tudjuk-e mondani, hogy mikor lesz negatív ár)
-- Ez is lehet célváltozó és kiértékelés

- Napi egy órát kikapcsoljuk, cél: mikor legyen (mert a többi órában többet tudunk termelni)
-- Meg lehet nézni, hogy melyik lesz a legdrágább óra
