In [65]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from entsoe import EntsoePandasClient

In [66]:
pd.set_option('display.max_columns', None)

In [67]:
my_api_key = os.environ.get('ENTSOE_API_KEY')
client = EntsoePandasClient(api_key=my_api_key)

# Parameters of the dataset

In [68]:
country_code = 'HU'

In [69]:
years = []

for y in range(2019, 2020):
    for m in range (1, 13, 3):
        start_date = f'{y}-{m:02d}-01'
        if m+2 == 6 or m+2 == 9:
            end_date = f'{y}-{m+2:02d}-30'
        else:
            end_date = f'{y}-{m+2:02d}-31'
        years.append((start_date, end_date))

In [70]:
years

[('2019-01-01', '2019-03-31'),
 ('2019-04-01', '2019-06-30'),
 ('2019-07-01', '2019-09-30'),
 ('2019-10-01', '2019-12-31')]

In [71]:
prices_folder_path = './data/prices'
loads_folder_path = './data/loads'
wind_solar_forecast_folder_path = './data/wind_solar_forecast'
weather_folder_path = './data/weather'
base_folder_path = './data/base'

In [72]:
def get_base_df_filename(country_code, years):
    return f'{base_folder_path}/base_price_{country_code}_{years[0]}_{years[-1]}.csv'

def get_base_load_df_filename(country_code, years):
    return f'{base_folder_path}/base_load_avgs_{country_code}_{years[0]}_{years[-1]}.csv'

def get_base_wind_solar_forecast_df_filename(country_code, years):
    return f'{base_folder_path}/base_wind_solar_forecast_avgs_{country_code}_{years[0]}_{years[-1]}.csv'

def get_base_weather_df_filename(country_code, years):
    return f'{base_folder_path}/base_weather_avgs_{country_code}_{years[0]}_{years[-1]}.csv'

In [73]:
def run_querry_day_ahead_prices(country_code, start_date, end_date):
    filename = f'price_{start_date}_{end_date}_{country_code}.csv'
    start_ts = pd.Timestamp(start_date, tz='Europe/Budapest')
    end_ts = pd.Timestamp(end_date, tz='Europe/Budapest')

    end_ts = end_ts + pd.Timedelta(hours=23, minutes=59, seconds=59)

    if os.path.exists(f'{prices_folder_path}/{filename}'):
        print(f'{prices_folder_path}/{filename} exists, reading from file')
        #load
        df = pd.read_csv(f'{prices_folder_path}/{filename}', index_col=0)
    else:
        print(f'{prices_folder_path}/{filename} does not exist, downloading from ENTSO-E')

        df = client.query_day_ahead_prices(country_code, start=start_ts, end=end_ts)        # Data from ENTSO-E
        
        df.to_csv(f'{prices_folder_path}/{filename}')

    return df

In [74]:
def run_querry_load(country_code, start_date, end_date):
    filename = f'load_{start_date}_{end_date}_{country_code}.csv'
    start_ts = pd.Timestamp(start_date, tz='Europe/Budapest')
    end_ts = pd.Timestamp(end_date, tz='Europe/Budapest')

    end_ts = end_ts + pd.Timedelta(hours=23, minutes=59, seconds=59)


    if os.path.exists(f'{loads_folder_path}/{filename}'):
        print(f'{loads_folder_path}/{filename} exists, reading from file')
        #load
        df = pd.read_csv(f'{loads_folder_path}/{filename}', index_col=0)
    else:
        print(f'{loads_folder_path}/{filename} does not exist, downloading from ENTSO-E')

        df =client.query_load(country_code, start=start_ts, end=end_ts)        # Data from ENTSO-E
        
        df.to_csv(f'{loads_folder_path}/{filename}')

    return df

In [75]:
def run_querry_wind_solar_forecast(country_code, start_date, end_date):
    filename = f'wind_solar_forecast_{start_date}_{end_date}_{country_code}.csv'
    start_ts = pd.Timestamp(start_date, tz='Europe/Budapest')
    end_ts = pd.Timestamp(end_date, tz='Europe/Budapest')

    end_ts = end_ts + pd.Timedelta(hours=23, minutes=59, seconds=59)


    if os.path.exists(f'{wind_solar_forecast_folder_path}/{filename}'):
        print(f'{wind_solar_forecast_folder_path}/{filename} exists, reading from file')
        #load
        df = pd.read_csv(f'{wind_solar_forecast_folder_path}/{filename}', index_col=0)
    else:
        print(f'{wind_solar_forecast_folder_path}/{filename} does not exist, downloading from ENTSO-E')

        df =client.query_wind_and_solar_forecast(country_code, start=start_ts, end=end_ts)        # Data from ENTSO-E
        
        df.to_csv(f'{wind_solar_forecast_folder_path}/{filename}')

    return df

# Create the base concatenated df

First we will create a base concatenated dataframe with all the data from the different files but only the prices

In [76]:
def get_base_prices(country_code, years=years):
    base_df_filename = get_base_df_filename(country_code, years)
    if os.path.exists(base_df_filename):
        print(f'{base_df_filename} exists, reading from file')
        df = pd.read_csv(base_df_filename, index_col=0, parse_dates=True)
    else:
        print(f'{base_df_filename} does not exist, concatenating from multiple files')

        for (start_date, end_date) in years:
            df_temp = run_querry_day_ahead_prices(country_code, start_date, end_date)

            df = df_temp if (start_date, end_date) == years[0] else pd.concat([df, df_temp], join='outer')
        
        #convert df to a DataFrame
        df = pd.DataFrame(df)

        if(df.columns.size == 1):
            df.columns = [f'Price_{country_code}']
        else:
            df.columns = ['Datetime', f'Price_{country_code}']
        df['Datetime'] = df.index
        df['Datetime'] = pd.to_datetime(df['Datetime'], utc=True)
        # Ensure df is a DataFrame
        if isinstance(df, pd.Series):
            df = df.to_frame()
        df.reset_index(drop=True, inplace=True)
        #order by datetime
        df = df.sort_values(by='Datetime')
        df = df.set_index('Datetime')

        df = df[~df.index.duplicated(keep='first')]
        
        df.to_csv(base_df_filename)
    return df

In [77]:
def load_15min_to_hourly(df, col_name, target_col_name):
    df['Date'] = pd.to_datetime(df.index, utc=True).floor('h')
    df[target_col_name] = df.groupby('Date')[col_name].transform('mean')
    df2 = df[['Date', target_col_name]]
    df2 = df2.drop_duplicates()
    return df

In [78]:
def get_base_load_avg(country_code, years=years):
    base_load_df_filename = get_base_load_df_filename(country_code, years)
    if os.path.exists(base_load_df_filename):
        print(f'{base_load_df_filename} exists, reading from file')
        df = pd.read_csv(base_load_df_filename, index_col=0, parse_dates=True)
    else:
        print(f'{base_load_df_filename} does not exist, concatenating from multiple files')

        for (start_date, end_date) in years:

            df_temp = run_querry_load(country_code, start_date, end_date)

            df = df_temp if (start_date, end_date) == years[0] else pd.concat([df, df_temp])
            
        df['Datetime'] = df.index
        df['Datetime'] = pd.to_datetime(df['Datetime'], utc=True)
        df.reset_index(drop=True, inplace=True)
        #order by datetime
        df = df.sort_values(by='Datetime')
        df = df.set_index('Datetime')

        df = df[~df.index.duplicated(keep='first')]

        df = load_15min_to_hourly(df, 'Actual Load', 'Load_avg')
        df = df[['Date', 'Load_avg']].drop_duplicates()
        df.drop(columns=['Date'], inplace=True)
        df.columns = [f'Load_avg_{country_code}']
        df.to_csv(base_load_df_filename)
    return df

In [79]:
def get_base_wind_solar_forecast_avg(country_code, years=years):
    base_wind_solar_forecast_df_filename = get_base_wind_solar_forecast_df_filename(country_code, years)
    if os.path.exists(base_wind_solar_forecast_df_filename):
        print(f'{base_wind_solar_forecast_df_filename} exists, reading from file')
        df = pd.read_csv(base_wind_solar_forecast_df_filename, index_col=0, parse_dates=True)
    else:
        print(f'{base_wind_solar_forecast_df_filename} does not exist, concatenating from multiple files')

        for (start_date, end_date) in years:

            df_temp = run_querry_wind_solar_forecast(country_code, start_date, end_date)

            df = df_temp if (start_date, end_date) == years[0] else pd.concat([df, df_temp], join='outer')
            
        df['Datetime'] = df.index
        df['Datetime'] = pd.to_datetime(df['Datetime'], utc=True)
        df.reset_index(drop=True, inplace=True)
        #order by datetime
        df = df.sort_values(by='Datetime')
        df = df.set_index('Datetime')
        
        df = df[~df.index.duplicated(keep='first')]

        df = load_15min_to_hourly(df, 'Solar', 'Solar_Fcast_avg')
        df = load_15min_to_hourly(df, 'Wind Onshore', 'Wind_Onshore_avg')

        df = df[['Date', 'Solar_Fcast_avg', 'Wind_Onshore_avg']].drop_duplicates()
        df.drop(columns=['Date'], inplace=True)
        df.columns = [f'Solar_Fcast_avg_{country_code}', f'Wind_Onshore_avg_{country_code}']
        df.to_csv(base_wind_solar_forecast_df_filename)
    return df

In [80]:
def get_base_df(country_codes):
    for country_code in country_codes:
        print('Starting to collect data for', country_code)
        prices_df = get_base_prices(country_code)
        load_avg_df = get_base_load_avg(country_code)
        wind_solar_forecast_df = get_base_wind_solar_forecast_avg(country_code)
        
        df = pd.concat([prices_df, load_avg_df, wind_solar_forecast_df], axis=1, join='outer')
        df = df.dropna()
        df = df.reset_index()
        df = df.rename(columns={'index': 'Datetime'})
        df_final = df if country_code == country_codes[0] else pd.merge(df_final, df, on='Datetime', how='outer')

    return df_final

In [81]:
country_codes = ['HU', 'RO', 'AT']
df = get_base_df(country_codes)

Starting to collect data for HU
./data/base/base_price_HU_('2019-01-01', '2019-03-31')_('2019-10-01', '2019-12-31').csv does not exist, concatenating from multiple files
./data/prices/price_2019-01-01_2019-03-31_HU.csv does not exist, downloading from ENTSO-E
./data/prices/price_2019-04-01_2019-06-30_HU.csv does not exist, downloading from ENTSO-E
./data/prices/price_2019-07-01_2019-09-30_HU.csv does not exist, downloading from ENTSO-E
./data/prices/price_2019-10-01_2019-12-31_HU.csv does not exist, downloading from ENTSO-E
./data/base/base_load_avgs_HU_('2019-01-01', '2019-03-31')_('2019-10-01', '2019-12-31').csv does not exist, concatenating from multiple files
./data/loads/load_2019-01-01_2019-03-31_HU.csv exists, reading from file
./data/loads/load_2019-04-01_2019-06-30_HU.csv exists, reading from file
./data/loads/load_2019-07-01_2019-09-30_HU.csv exists, reading from file
./data/loads/load_2019-10-01_2019-12-31_HU.csv exists, reading from file
./data/base/base_wind_solar_forecast

In [90]:
df

Unnamed: 0,Datetime,Price_HU,Load_avg_HU,Solar_Fcast_avg_HU,Wind_Onshore_avg_HU,Price_RO,Load_avg_RO,Solar_Fcast_avg_RO,Wind_Onshore_avg_RO,Price_AT,Load_avg_AT,Solar_Fcast_avg_AT,Wind_Onshore_avg_AT
0,2018-12-31 23:00:00+00:00,59.90,4082.00,0.0,19.75,279.00,6110.0,0.0,88.0,33.48,6075.00,0.0,487.0
1,2019-01-01 00:00:00+00:00,52.71,3985.75,0.0,27.25,245.54,5856.0,0.0,95.0,39.76,5852.75,0.0,398.0
2,2019-01-01 01:00:00+00:00,36.49,3732.50,0.0,19.75,169.98,5675.0,0.0,90.0,39.78,5619.25,0.0,338.0
3,2019-01-01 02:00:00+00:00,31.24,3554.25,0.0,7.25,145.52,5570.0,0.0,86.0,27.87,5324.00,0.0,309.0
4,2019-01-01 03:00:00+00:00,25.98,3499.25,0.0,1.75,121.00,5524.0,0.0,83.0,-0.36,5273.50,0.0,320.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8664,2019-12-30 19:00:00+00:00,42.86,5007.25,0.0,48.50,204.86,7611.0,6.0,407.0,,,,
8665,2019-12-30 20:00:00+00:00,35.94,4930.75,0.0,48.50,171.78,7090.0,6.0,308.0,,,,
8666,2019-12-30 21:00:00+00:00,35.38,4805.75,0.0,50.75,169.12,6621.0,0.0,279.0,,,,
8667,2019-12-30 22:00:00+00:00,29.52,4578.25,0.0,44.25,141.12,6246.0,0.0,311.0,,,,


## Read in weather data

In [125]:
#create a filenames list for the weather data
weather_filenames = os.listdir(weather_folder_path)

#concat the weather data
for weather_filename in weather_filenames:
    if weather_filename == weather_filenames[0]:
        df_weather = pd.read_csv(f'{weather_folder_path}/{weather_filename}', index_col=0)
    else:
        df_temp = pd.read_csv(f'{weather_folder_path}/{weather_filename}', index_col=0)
        df_weather = pd.concat([df_weather, df_temp])


In [126]:
df_weather['Datetime'] = pd.to_datetime(df_weather.index)
df_weather.reset_index(drop=True, inplace=True)

In [127]:
df_weather

Unnamed: 0,temperature_2m_BP,relative_humidity_2m_BP,pressure_msl_BP,surface_pressure_BP,precipitation_BP,rain_BP,cloud_cover_BP,cloud_cover_low_BP,cloud_cover_mid_BP,cloud_cover_high_BP,shortwave_radiation_BP,direct_radiation_BP,diffuse_radiation_BP,sunshine_duration_BP,wind_speed_10m_BP,wind_speed_100m_BP,wind_direction_10m_BP,wind_direction_100m_BP,wind_gusts_10m_BP,temperature_2m_Konstanca,relative_humidity_2m_Konstanca,pressure_msl_Konstanca,surface_pressure_Konstanca,precipitation_Konstanca,rain_Konstanca,cloud_cover_Konstanca,cloud_cover_low_Konstanca,cloud_cover_mid_Konstanca,cloud_cover_high_Konstanca,shortwave_radiation_Konstanca,direct_radiation_Konstanca,diffuse_radiation_Konstanca,sunshine_duration_Konstanca,wind_speed_10m_Konstanca,wind_speed_100m_Konstanca,wind_direction_10m_Konstanca,wind_direction_100m_Konstanca,wind_gusts_10m_Konstanca,Datetime
0,-1.579,89.836760,1031.0,1016.21246,0.0,0.0,17.1,5.0,1.0,40.0,0.0,0.0,0.0,0.0,6.109403,10.895577,315.000100,352.405430,11.159999,0.215500,93.328880,1027.5,1020.21160,0.0,0.0,14.400000,12.0,6.0,0.0,0.0,0.0,0.0,0.0,10.105681,20.326454,355.914460,22.932130,16.199999,2019-01-01 00:00:00+00:00
1,-1.579,89.836760,1031.1,1016.31100,0.0,0.0,5.4,5.0,0.0,3.0,0.0,0.0,0.0,0.0,5.991594,10.315115,302.735200,330.751280,11.879999,0.665500,94.718620,1028.1,1020.81920,0.0,0.0,29.100000,21.0,17.0,0.0,0.0,0.0,0.0,0.0,11.212135,22.862123,5.527458,26.161512,19.440000,2019-01-01 01:00:00+00:00
2,-1.879,90.484474,1031.2,1016.39350,0.0,0.0,7.8,8.0,1.0,0.0,0.0,0.0,0.0,0.0,6.120000,10.195057,298.072400,317.862460,10.799999,0.965500,93.707565,1027.9,1020.62880,0.0,0.0,17.400000,14.0,8.0,0.0,0.0,0.0,0.0,0.0,10.041354,21.370783,14.534496,32.619240,19.080000,2019-01-01 02:00:00+00:00
3,-1.779,89.820750,1030.7,1015.90607,0.0,0.0,5.4,6.0,0.0,0.0,0.0,0.0,0.0,0.0,8.217153,17.826363,298.810700,313.363460,15.119999,0.315500,93.674934,1027.6,1020.31354,0.0,0.0,26.099998,27.0,3.0,0.0,0.0,0.0,0.0,0.0,7.342588,18.218275,11.309895,37.775745,16.919998,2019-01-01 03:00:00+00:00
4,-2.279,90.116875,1030.1,1015.28766,0.0,0.0,29.7,1.0,0.0,96.0,0.0,0.0,0.0,0.0,6.439876,15.546833,296.564970,312.184360,15.119999,-0.134500,93.995240,1027.7,1020.40100,0.0,0.0,42.900000,45.0,4.0,0.0,0.0,0.0,0.0,0.0,7.421590,15.021105,14.036275,45.970932,11.879999,2019-01-01 04:00:00+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50395,9.521,66.184006,1020.8,1006.72900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.640994,17.024040,316.909150,326.645970,15.119999,9.615500,89.150010,1010.3,1003.37090,1.4,1.4,100.000000,100.0,100.0,88.0,0.0,0.0,0.0,0.0,28.771297,46.328240,305.579440,309.007480,50.039997,2024-09-30 19:00:00+00:00
50396,8.521,72.820656,1020.6,1006.48220,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.279395,13.036026,332.700500,342.315520,12.599999,9.565499,89.145950,1010.1,1003.17110,2.0,2.0,100.000000,100.0,100.0,100.0,0.0,0.0,0.0,0.0,30.996084,49.531730,306.736900,310.578600,53.639996,2024-09-30 20:00:00+00:00
50397,7.871,78.544050,1020.2,1006.05550,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.582902,10.771461,339.227660,350.380340,10.799999,9.515500,89.141860,1009.5,1002.57390,2.7,2.7,100.000000,99.0,100.0,100.0,0.0,0.0,0.0,0.0,32.497850,51.738020,309.155430,312.179750,56.160000,2024-09-30 21:00:00+00:00
50398,6.721,87.986680,1020.0,1005.80070,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.421590,9.292254,22.833694,44.215275,10.080000,9.615500,89.150010,1009.1,1002.17914,2.4,2.4,100.000000,100.0,100.0,100.0,0.0,0.0,0.0,0.0,31.826931,50.800835,307.647670,310.689270,56.160000,2024-09-30 22:00:00+00:00


In [128]:
df_weather.isna().sum().sum()

0

In [129]:
df = pd.merge(df, df_weather, on='Datetime', how='inner')

In [130]:
df

Unnamed: 0,Datetime,Price_HU,Load_avg_HU,Solar_Fcast_avg_HU,Wind_Onshore_avg_HU,Price_RO,Load_avg_RO,Solar_Fcast_avg_RO,Wind_Onshore_avg_RO,Price_AT,Load_avg_AT,Solar_Fcast_avg_AT,Wind_Onshore_avg_AT,temperature_2m_BP,relative_humidity_2m_BP,pressure_msl_BP,surface_pressure_BP,precipitation_BP,rain_BP,cloud_cover_BP,cloud_cover_low_BP,cloud_cover_mid_BP,cloud_cover_high_BP,shortwave_radiation_BP,direct_radiation_BP,diffuse_radiation_BP,sunshine_duration_BP,wind_speed_10m_BP,wind_speed_100m_BP,wind_direction_10m_BP,wind_direction_100m_BP,wind_gusts_10m_BP,temperature_2m_Konstanca,relative_humidity_2m_Konstanca,pressure_msl_Konstanca,surface_pressure_Konstanca,precipitation_Konstanca,rain_Konstanca,cloud_cover_Konstanca,cloud_cover_low_Konstanca,cloud_cover_mid_Konstanca,cloud_cover_high_Konstanca,shortwave_radiation_Konstanca,direct_radiation_Konstanca,diffuse_radiation_Konstanca,sunshine_duration_Konstanca,wind_speed_10m_Konstanca,wind_speed_100m_Konstanca,wind_direction_10m_Konstanca,wind_direction_100m_Konstanca,wind_gusts_10m_Konstanca
0,2024-01-02 00:00:00+00:00,29.48,4072.00,0.0,54.00,29.38,4628.25,4.0,1843.00,29.22,5589.50,0.0,766.0,0.821,93.022575,1015.2,1000.76575,0.0,0.0,29.400002,0.0,7.0,84.0,0.0,0.0,0.0,0.0,7.172949,20.160000,252.47433,270.00000,13.320000,6.5155,83.467310,1013.1,1006.07513,0.0,0.0,32.400000,0.0,4.0,100.0,0.0,0.0,0.0,0.0,17.819090,34.754450,278.13000,283.172550,29.160000
1,2024-01-02 01:00:00+00:00,19.67,3929.25,0.0,47.50,19.64,4534.00,4.0,1616.00,19.58,5328.50,0.0,646.0,1.321,89.413180,1015.1,1000.69324,0.0,0.0,34.500000,0.0,27.0,61.0,0.0,0.0,0.0,0.0,8.049845,21.674870,259.69522,274.76355,16.199999,7.3655,81.266884,1012.1,1005.10310,0.0,0.0,37.800003,0.0,13.0,100.0,0.0,0.0,0.0,0.0,17.731598,33.971870,275.82623,279.761700,29.519999
2,2024-01-02 02:00:00+00:00,27.92,3784.75,0.0,40.25,27.65,4487.25,4.0,1280.50,27.57,5226.25,0.0,533.0,0.971,88.412130,1015.0,1000.57640,0.0,0.0,16.500000,0.0,1.0,53.0,0.0,0.0,0.0,0.0,8.311245,20.892412,252.34980,271.97488,14.040000,7.2155,83.844280,1012.2,1005.19867,0.0,0.0,32.400000,0.0,4.0,100.0,0.0,0.0,0.0,0.0,17.280000,33.636406,270.00000,275.527440,30.599998
3,2024-01-02 03:00:00+00:00,16.54,3749.25,0.0,33.50,16.38,4485.00,4.0,935.00,15.67,5179.25,0.0,478.0,0.721,87.425125,1014.9,1000.46470,0.0,0.0,31.800001,0.0,5.0,96.0,0.0,0.0,0.0,0.0,7.862518,21.288757,254.05453,273.87845,14.040000,6.7155,87.680016,1012.4,1005.38500,0.0,0.0,37.200000,0.0,12.0,100.0,0.0,0.0,0.0,0.0,15.137133,30.242140,267.27374,270.682040,28.800000
4,2024-01-02 04:00:00+00:00,9.95,3875.75,0.0,30.50,9.85,4568.00,4.0,641.25,9.69,5374.50,0.0,467.0,0.771,86.475070,1014.7,1000.27000,0.0,0.0,29.400002,3.0,5.0,79.0,0.0,0.0,0.0,0.0,8.121970,21.129885,257.19574,278.82030,14.040000,6.1155,91.377220,1012.9,1005.86640,0.1,0.1,55.200005,0.0,42.0,100.0,0.0,0.0,0.0,0.0,8.396570,21.407139,300.96368,289.653900,25.560000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,2024-02-19 19:00:00+00:00,83.57,6265.75,0.0,160.25,90.42,7701.75,4.0,575.25,81.73,8359.00,0.0,1317.0,7.471,91.782330,1023.4,1009.19116,0.1,0.1,70.800000,20.0,45.0,86.0,0.0,0.0,0.0,0.0,11.013882,24.203140,281.30990,292.75100,18.359999,3.9155,69.876200,1026.9,1019.71280,0.0,0.0,71.100000,79.0,0.0,0.0,0.0,0.0,0.0,0.0,5.760000,12.240000,90.00000,90.000000,9.360000
1172,2024-02-19 20:00:00+00:00,75.51,6071.00,0.0,157.25,84.79,7288.50,4.0,632.00,73.97,7812.50,0.0,1416.0,7.421,91.148094,1023.4,1009.18854,0.0,0.0,10.200001,4.0,10.0,2.0,0.0,0.0,0.0,0.0,13.276144,27.002400,310.60123,317.16110,24.480000,2.4655,80.292870,1026.9,1019.67520,0.0,0.0,90.000000,100.0,0.0,0.0,0.0,0.0,0.0,0.0,6.034700,12.849528,107.35411,101.309900,9.000000
1173,2024-02-19 21:00:00+00:00,68.68,5828.25,0.0,150.50,68.72,6726.25,4.0,661.50,68.00,7195.75,0.0,1471.0,6.521,89.828030,1023.7,1009.43900,0.0,0.0,37.500000,8.0,37.0,27.0,0.0,0.0,0.0,0.0,11.019764,25.721740,308.36752,316.70132,23.759998,2.3655,84.164900,1026.9,1019.67255,0.0,0.0,90.000000,100.0,0.0,0.0,0.0,0.0,0.0,0.0,5.351785,12.682018,137.72638,124.592354,8.640000
1174,2024-02-19 22:00:00+00:00,66.27,5532.50,0.0,141.50,66.27,6233.25,4.0,665.50,66.27,6919.25,0.0,1475.0,5.921,87.606080,1023.8,1009.50726,0.0,0.0,29.400000,2.0,0.0,92.0,0.0,0.0,0.0,0.0,10.685391,25.711662,302.61923,314.43280,20.160000,3.3655,79.553910,1026.7,1019.49980,0.0,0.0,85.500000,95.0,0.0,0.0,0.0,0.0,0.0,0.0,6.120000,13.044722,151.92761,140.599370,10.440001


In [131]:
df['Datetime'].duplicated().sum()

0

# Create more input features

In [132]:
target_country_code = 'HU'

In [133]:
df['year'] = df['Datetime'].dt.year
df['month'] = df['Datetime'].dt.month
df['day'] = df['Datetime'].dt.day
df['hour'] = df['Datetime'].dt.hour
df['weekday'] = df['Datetime'].dt.weekday
df['dayoftheweek'] = df['Datetime'].dt.dayofweek
df['date'] = df['Datetime'].dt.date

In [134]:
month_dummies = pd.get_dummies(df['month'], prefix='month', drop_first=True)
dayofweek_dummies = pd.get_dummies(df['dayoftheweek'], prefix='dayofweek', drop_first=True)
df = pd.concat([df, month_dummies, dayofweek_dummies], axis=1)

In [135]:
dummy_columns = month_dummies.columns.tolist() + dayofweek_dummies.columns.tolist()

In [136]:
df['Datetime-2d'] = df['Datetime'] + pd.to_timedelta(-2, unit='day')
df['Datetime-7d'] = df['Datetime'] + pd.to_timedelta(-7, unit='day')

for country in country_codes:
    load_map = df.set_index('Datetime')[f'Price_{country}'].to_dict()

    df[f'Price_2d_{country}'] = df['Datetime-2d'].map(load_map)
    df[f'Price_7d_{country}'] = df['Datetime-7d'].map(load_map)


In [137]:
df.isnull().sum()

Datetime                 0
Price_HU                 0
Load_avg_HU              0
Solar_Fcast_avg_HU       0
Wind_Onshore_avg_HU      0
                      ... 
Price_7d_HU            168
Price_2d_RO             48
Price_7d_RO            168
Price_2d_AT             48
Price_7d_AT            168
Length: 73, dtype: int64

In [138]:
df = df.dropna().reset_index(drop=True)

In [139]:
df

Unnamed: 0,Datetime,Price_HU,Load_avg_HU,Solar_Fcast_avg_HU,Wind_Onshore_avg_HU,Price_RO,Load_avg_RO,Solar_Fcast_avg_RO,Wind_Onshore_avg_RO,Price_AT,Load_avg_AT,Solar_Fcast_avg_AT,Wind_Onshore_avg_AT,temperature_2m_BP,relative_humidity_2m_BP,pressure_msl_BP,surface_pressure_BP,precipitation_BP,rain_BP,cloud_cover_BP,cloud_cover_low_BP,cloud_cover_mid_BP,cloud_cover_high_BP,shortwave_radiation_BP,direct_radiation_BP,diffuse_radiation_BP,sunshine_duration_BP,wind_speed_10m_BP,wind_speed_100m_BP,wind_direction_10m_BP,wind_direction_100m_BP,wind_gusts_10m_BP,temperature_2m_Konstanca,relative_humidity_2m_Konstanca,pressure_msl_Konstanca,surface_pressure_Konstanca,precipitation_Konstanca,rain_Konstanca,cloud_cover_Konstanca,cloud_cover_low_Konstanca,cloud_cover_mid_Konstanca,cloud_cover_high_Konstanca,shortwave_radiation_Konstanca,direct_radiation_Konstanca,diffuse_radiation_Konstanca,sunshine_duration_Konstanca,wind_speed_10m_Konstanca,wind_speed_100m_Konstanca,wind_direction_10m_Konstanca,wind_direction_100m_Konstanca,wind_gusts_10m_Konstanca,year,month,day,hour,weekday,dayoftheweek,date,month_2,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,Datetime-2d,Datetime-7d,Price_2d_HU,Price_7d_HU,Price_2d_RO,Price_7d_RO,Price_2d_AT,Price_7d_AT
0,2024-01-09 00:00:00+00:00,87.55,5368.25,0.0,134.25,87.53,6092.00,3.0,2338.25,87.36,7304.75,0.0,2422.0,-3.129,46.183468,1026.1,1011.29913,0.0,0.0,10.500001,2.0,14.0,1.0,0.0,0.0,0.0,0.0,14.561099,26.282465,8.530692,13.465230,25.560000,-6.5345,66.552660,1021.3,1013.87305,0.0,0.0,39.3,31.0,19.0,0.0,0.0,0.0,0.0,0.0,18.000000,27.859905,323.13000,327.131320,34.560000,2024,1,9,0,1,1,2024-01-09,False,True,False,False,False,False,False,2024-01-07 00:00:00+00:00,2024-01-02 00:00:00+00:00,84.08,29.48,84.08,29.38,84.08,29.22
1,2024-01-09 01:00:00+00:00,86.43,5147.75,0.0,128.50,86.18,5966.75,3.0,2312.75,86.22,6982.75,0.0,2285.0,-1.429,41.731754,1026.3,1011.58810,0.0,0.0,100.000000,80.0,100.0,12.0,0.0,0.0,0.0,0.0,14.759999,25.071096,12.680387,15.832451,29.880000,-7.1845,62.751854,1022.9,1015.44330,0.0,0.0,100.0,45.0,94.0,100.0,0.0,0.0,0.0,0.0,20.140705,29.686360,335.72556,337.166300,36.719997,2024,1,9,1,1,1,2024-01-09,False,True,False,False,False,False,False,2024-01-07 01:00:00+00:00,2024-01-02 01:00:00+00:00,79.82,19.67,79.82,19.64,79.82,19.58
2,2024-01-09 02:00:00+00:00,85.00,4972.00,0.0,122.50,85.00,5942.25,3.0,2301.00,85.00,6894.00,0.0,2152.0,-1.629,38.573810,1026.7,1011.97144,0.0,0.0,100.000000,82.0,100.0,36.0,0.0,0.0,0.0,0.0,14.058450,22.450264,13.324542,15.819268,26.280000,-6.8845,62.827103,1023.0,1015.55084,0.0,0.0,100.0,92.0,91.0,99.0,0.0,0.0,0.0,0.0,19.665360,28.883131,336.25050,337.270230,34.920000,2024,1,9,2,1,1,2024-01-09,False,True,False,False,False,False,False,2024-01-07 02:00:00+00:00,2024-01-02 02:00:00+00:00,76.76,27.92,76.76,27.65,76.76,27.57
3,2024-01-09 03:00:00+00:00,84.10,4917.75,0.0,116.50,84.08,5959.00,3.0,2305.00,84.09,6768.50,0.0,2025.0,-1.829,38.986088,1027.2,1012.45340,0.0,0.0,100.000000,73.0,100.0,10.0,0.0,0.0,0.0,0.0,11.966953,17.873556,6.911131,9.272532,25.560000,-6.3845,62.196503,1023.1,1015.66406,0.0,0.0,100.0,96.0,93.0,99.0,0.0,0.0,0.0,0.0,20.523157,29.495842,338.38516,340.016800,36.360000,2024,1,9,3,1,1,2024-01-09,False,True,False,False,False,False,False,2024-01-07 03:00:00+00:00,2024-01-02 03:00:00+00:00,73.46,16.54,73.46,16.38,73.46,15.67
4,2024-01-09 04:00:00+00:00,85.17,5056.50,0.0,108.50,85.15,6156.75,3.0,2308.25,85.01,6904.75,0.0,1915.0,-1.979,39.742280,1027.6,1012.83990,0.0,0.0,90.900000,35.0,98.0,2.0,0.0,0.0,0.0,0.0,10.895577,16.935310,352.405430,357.563400,21.599998,-6.0345,63.039200,1023.6,1016.17040,0.0,0.0,100.0,100.0,66.0,30.0,0.0,0.0,0.0,0.0,21.659918,31.520050,344.57776,346.122440,38.519997,2024,1,9,4,1,1,2024-01-09,False,True,False,False,False,False,False,2024-01-07 04:00:00+00:00,2024-01-02 04:00:00+00:00,71.86,9.95,71.86,9.85,71.86,9.69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,2024-02-19 19:00:00+00:00,83.57,6265.75,0.0,160.25,90.42,7701.75,4.0,575.25,81.73,8359.00,0.0,1317.0,7.471,91.782330,1023.4,1009.19116,0.1,0.1,70.800000,20.0,45.0,86.0,0.0,0.0,0.0,0.0,11.013882,24.203140,281.309900,292.751000,18.359999,3.9155,69.876200,1026.9,1019.71280,0.0,0.0,71.1,79.0,0.0,0.0,0.0,0.0,0.0,0.0,5.760000,12.240000,90.00000,90.000000,9.360000,2024,2,19,19,0,0,2024-02-19,True,False,False,False,False,False,False,2024-02-17 19:00:00+00:00,2024-02-12 19:00:00+00:00,89.57,89.08,89.47,89.08,89.22,89.08
1004,2024-02-19 20:00:00+00:00,75.51,6071.00,0.0,157.25,84.79,7288.50,4.0,632.00,73.97,7812.50,0.0,1416.0,7.421,91.148094,1023.4,1009.18854,0.0,0.0,10.200001,4.0,10.0,2.0,0.0,0.0,0.0,0.0,13.276144,27.002400,310.601230,317.161100,24.480000,2.4655,80.292870,1026.9,1019.67520,0.0,0.0,90.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,6.034700,12.849528,107.35411,101.309900,9.000000,2024,2,19,20,0,0,2024-02-19,True,False,False,False,False,False,False,2024-02-17 20:00:00+00:00,2024-02-12 20:00:00+00:00,80.41,79.15,80.38,79.15,80.09,79.15
1005,2024-02-19 21:00:00+00:00,68.68,5828.25,0.0,150.50,68.72,6726.25,4.0,661.50,68.00,7195.75,0.0,1471.0,6.521,89.828030,1023.7,1009.43900,0.0,0.0,37.500000,8.0,37.0,27.0,0.0,0.0,0.0,0.0,11.019764,25.721740,308.367520,316.701320,23.759998,2.3655,84.164900,1026.9,1019.67255,0.0,0.0,90.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,5.351785,12.682018,137.72638,124.592354,8.640000,2024,2,19,21,0,0,2024-02-19,True,False,False,False,False,False,False,2024-02-17 21:00:00+00:00,2024-02-12 21:00:00+00:00,74.30,71.54,74.30,71.54,74.04,71.54
1006,2024-02-19 22:00:00+00:00,66.27,5532.50,0.0,141.50,66.27,6233.25,4.0,665.50,66.27,6919.25,0.0,1475.0,5.921,87.606080,1023.8,1009.50726,0.0,0.0,29.400000,2.0,0.0,92.0,0.0,0.0,0.0,0.0,10.685391,25.711662,302.619230,314.432800,20.160000,3.3655,79.553910,1026.7,1019.49980,0.0,0.0,85.5,95.0,0.0,0.0,0.0,0.0,0.0,0.0,6.120000,13.044722,151.92761,140.599370,10.440001,2024,2,19,22,0,0,2024-02-19,True,False,False,False,False,False,False,2024-02-17 22:00:00+00:00,2024-02-12 22:00:00+00:00,72.05,65.87,72.00,65.87,71.86,65.87


In [140]:
from workalendar.europe import Hungary, Romania, Austria

cal_hu = Hungary()
cal_ro = Romania()
cal_at = Austria()
year_list = df['Datetime'].dt.year.unique()

In [141]:
holiday_df = pd.DataFrame()
for cal, cntry in [(cal_hu, 'HU'), (cal_ro, 'RO'), (cal_at, 'AT')]:
    for year in year_list:
        holidays = cal.holidays(year)
        temp_df = pd.DataFrame(holidays, columns=['date', f'holiday_name_{cntry}'])
        holiday_df = pd.concat([holiday_df, temp_df],
                            axis=0).reset_index(drop=True)

In [142]:
holiday_df

Unnamed: 0,date,holiday_name_HU,holiday_name_RO,holiday_name_AT
0,2024-01-01,New year,,
1,2024-03-15,National Day,,
2,2024-03-29,Good Friday,,
3,2024-03-31,Easter Sunday,,
4,2024-04-01,Easter Monday,,
5,2024-05-01,Labour Day,,
6,2024-05-19,Pentecost Sunday,,
7,2024-05-20,Pentecost Monday,,
8,2024-08-20,St Stephen's Day,,
9,2024-10-23,National Day,,


In [143]:
holiday_df.set_index('date', inplace=True)

for col in holiday_df.columns:
    holiday_map = holiday_df[col].to_dict()
    df[col] = df['date'].map(holiday_map)
    cntry = col.split('_')[-1]
    df[f'is_holiday_{cntry}'] = 0
    df.loc[df[f'is_holiday_{cntry}'].notnull(), f'is_holiday_{cntry}'] = 1

    df.drop(columns=[col], inplace=True)

In [144]:
for col in holiday_df.columns:
    cntry = col.split('_')[-1]
    df[f'is_workday_{cntry}'] = 1
    df.loc[df[f'is_holiday_{cntry}'] == 1, f'is_workday_{cntry}'] = 0
    df.loc[df['dayoftheweek'].isin([5, 6]), f'is_workday_{cntry}'] = 0

In [145]:
df

Unnamed: 0,Datetime,Price_HU,Load_avg_HU,Solar_Fcast_avg_HU,Wind_Onshore_avg_HU,Price_RO,Load_avg_RO,Solar_Fcast_avg_RO,Wind_Onshore_avg_RO,Price_AT,Load_avg_AT,Solar_Fcast_avg_AT,Wind_Onshore_avg_AT,temperature_2m_BP,relative_humidity_2m_BP,pressure_msl_BP,surface_pressure_BP,precipitation_BP,rain_BP,cloud_cover_BP,cloud_cover_low_BP,cloud_cover_mid_BP,cloud_cover_high_BP,shortwave_radiation_BP,direct_radiation_BP,diffuse_radiation_BP,sunshine_duration_BP,wind_speed_10m_BP,wind_speed_100m_BP,wind_direction_10m_BP,wind_direction_100m_BP,wind_gusts_10m_BP,temperature_2m_Konstanca,relative_humidity_2m_Konstanca,pressure_msl_Konstanca,surface_pressure_Konstanca,precipitation_Konstanca,rain_Konstanca,cloud_cover_Konstanca,cloud_cover_low_Konstanca,cloud_cover_mid_Konstanca,cloud_cover_high_Konstanca,shortwave_radiation_Konstanca,direct_radiation_Konstanca,diffuse_radiation_Konstanca,sunshine_duration_Konstanca,wind_speed_10m_Konstanca,wind_speed_100m_Konstanca,wind_direction_10m_Konstanca,wind_direction_100m_Konstanca,wind_gusts_10m_Konstanca,year,month,day,hour,weekday,dayoftheweek,date,month_2,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,Datetime-2d,Datetime-7d,Price_2d_HU,Price_7d_HU,Price_2d_RO,Price_7d_RO,Price_2d_AT,Price_7d_AT,is_holiday_HU,is_holiday_RO,is_holiday_AT,is_workday_HU,is_workday_RO,is_workday_AT
0,2024-01-09 00:00:00+00:00,87.55,5368.25,0.0,134.25,87.53,6092.00,3.0,2338.25,87.36,7304.75,0.0,2422.0,-3.129,46.183468,1026.1,1011.29913,0.0,0.0,10.500001,2.0,14.0,1.0,0.0,0.0,0.0,0.0,14.561099,26.282465,8.530692,13.465230,25.560000,-6.5345,66.552660,1021.3,1013.87305,0.0,0.0,39.3,31.0,19.0,0.0,0.0,0.0,0.0,0.0,18.000000,27.859905,323.13000,327.131320,34.560000,2024,1,9,0,1,1,2024-01-09,False,True,False,False,False,False,False,2024-01-07 00:00:00+00:00,2024-01-02 00:00:00+00:00,84.08,29.48,84.08,29.38,84.08,29.22,1,1,1,0,0,0
1,2024-01-09 01:00:00+00:00,86.43,5147.75,0.0,128.50,86.18,5966.75,3.0,2312.75,86.22,6982.75,0.0,2285.0,-1.429,41.731754,1026.3,1011.58810,0.0,0.0,100.000000,80.0,100.0,12.0,0.0,0.0,0.0,0.0,14.759999,25.071096,12.680387,15.832451,29.880000,-7.1845,62.751854,1022.9,1015.44330,0.0,0.0,100.0,45.0,94.0,100.0,0.0,0.0,0.0,0.0,20.140705,29.686360,335.72556,337.166300,36.719997,2024,1,9,1,1,1,2024-01-09,False,True,False,False,False,False,False,2024-01-07 01:00:00+00:00,2024-01-02 01:00:00+00:00,79.82,19.67,79.82,19.64,79.82,19.58,1,1,1,0,0,0
2,2024-01-09 02:00:00+00:00,85.00,4972.00,0.0,122.50,85.00,5942.25,3.0,2301.00,85.00,6894.00,0.0,2152.0,-1.629,38.573810,1026.7,1011.97144,0.0,0.0,100.000000,82.0,100.0,36.0,0.0,0.0,0.0,0.0,14.058450,22.450264,13.324542,15.819268,26.280000,-6.8845,62.827103,1023.0,1015.55084,0.0,0.0,100.0,92.0,91.0,99.0,0.0,0.0,0.0,0.0,19.665360,28.883131,336.25050,337.270230,34.920000,2024,1,9,2,1,1,2024-01-09,False,True,False,False,False,False,False,2024-01-07 02:00:00+00:00,2024-01-02 02:00:00+00:00,76.76,27.92,76.76,27.65,76.76,27.57,1,1,1,0,0,0
3,2024-01-09 03:00:00+00:00,84.10,4917.75,0.0,116.50,84.08,5959.00,3.0,2305.00,84.09,6768.50,0.0,2025.0,-1.829,38.986088,1027.2,1012.45340,0.0,0.0,100.000000,73.0,100.0,10.0,0.0,0.0,0.0,0.0,11.966953,17.873556,6.911131,9.272532,25.560000,-6.3845,62.196503,1023.1,1015.66406,0.0,0.0,100.0,96.0,93.0,99.0,0.0,0.0,0.0,0.0,20.523157,29.495842,338.38516,340.016800,36.360000,2024,1,9,3,1,1,2024-01-09,False,True,False,False,False,False,False,2024-01-07 03:00:00+00:00,2024-01-02 03:00:00+00:00,73.46,16.54,73.46,16.38,73.46,15.67,1,1,1,0,0,0
4,2024-01-09 04:00:00+00:00,85.17,5056.50,0.0,108.50,85.15,6156.75,3.0,2308.25,85.01,6904.75,0.0,1915.0,-1.979,39.742280,1027.6,1012.83990,0.0,0.0,90.900000,35.0,98.0,2.0,0.0,0.0,0.0,0.0,10.895577,16.935310,352.405430,357.563400,21.599998,-6.0345,63.039200,1023.6,1016.17040,0.0,0.0,100.0,100.0,66.0,30.0,0.0,0.0,0.0,0.0,21.659918,31.520050,344.57776,346.122440,38.519997,2024,1,9,4,1,1,2024-01-09,False,True,False,False,False,False,False,2024-01-07 04:00:00+00:00,2024-01-02 04:00:00+00:00,71.86,9.95,71.86,9.85,71.86,9.69,1,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,2024-02-19 19:00:00+00:00,83.57,6265.75,0.0,160.25,90.42,7701.75,4.0,575.25,81.73,8359.00,0.0,1317.0,7.471,91.782330,1023.4,1009.19116,0.1,0.1,70.800000,20.0,45.0,86.0,0.0,0.0,0.0,0.0,11.013882,24.203140,281.309900,292.751000,18.359999,3.9155,69.876200,1026.9,1019.71280,0.0,0.0,71.1,79.0,0.0,0.0,0.0,0.0,0.0,0.0,5.760000,12.240000,90.00000,90.000000,9.360000,2024,2,19,19,0,0,2024-02-19,True,False,False,False,False,False,False,2024-02-17 19:00:00+00:00,2024-02-12 19:00:00+00:00,89.57,89.08,89.47,89.08,89.22,89.08,1,1,1,0,0,0
1004,2024-02-19 20:00:00+00:00,75.51,6071.00,0.0,157.25,84.79,7288.50,4.0,632.00,73.97,7812.50,0.0,1416.0,7.421,91.148094,1023.4,1009.18854,0.0,0.0,10.200001,4.0,10.0,2.0,0.0,0.0,0.0,0.0,13.276144,27.002400,310.601230,317.161100,24.480000,2.4655,80.292870,1026.9,1019.67520,0.0,0.0,90.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,6.034700,12.849528,107.35411,101.309900,9.000000,2024,2,19,20,0,0,2024-02-19,True,False,False,False,False,False,False,2024-02-17 20:00:00+00:00,2024-02-12 20:00:00+00:00,80.41,79.15,80.38,79.15,80.09,79.15,1,1,1,0,0,0
1005,2024-02-19 21:00:00+00:00,68.68,5828.25,0.0,150.50,68.72,6726.25,4.0,661.50,68.00,7195.75,0.0,1471.0,6.521,89.828030,1023.7,1009.43900,0.0,0.0,37.500000,8.0,37.0,27.0,0.0,0.0,0.0,0.0,11.019764,25.721740,308.367520,316.701320,23.759998,2.3655,84.164900,1026.9,1019.67255,0.0,0.0,90.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,5.351785,12.682018,137.72638,124.592354,8.640000,2024,2,19,21,0,0,2024-02-19,True,False,False,False,False,False,False,2024-02-17 21:00:00+00:00,2024-02-12 21:00:00+00:00,74.30,71.54,74.30,71.54,74.04,71.54,1,1,1,0,0,0
1006,2024-02-19 22:00:00+00:00,66.27,5532.50,0.0,141.50,66.27,6233.25,4.0,665.50,66.27,6919.25,0.0,1475.0,5.921,87.606080,1023.8,1009.50726,0.0,0.0,29.400000,2.0,0.0,92.0,0.0,0.0,0.0,0.0,10.685391,25.711662,302.619230,314.432800,20.160000,3.3655,79.553910,1026.7,1019.49980,0.0,0.0,85.5,95.0,0.0,0.0,0.0,0.0,0.0,0.0,6.120000,13.044722,151.92761,140.599370,10.440001,2024,2,19,22,0,0,2024-02-19,True,False,False,False,False,False,False,2024-02-17 22:00:00+00:00,2024-02-12 22:00:00+00:00,72.05,65.87,72.00,65.87,71.86,65.87,1,1,1,0,0,0


# Save the whole dataset

In [146]:
df.to_csv(f'./data/all_data.csv')

## Baseline

The baseline model is just shifting the prices by 1 week

In [1]:
def baseline_model(df):
    y_pred = df[f'Price_{target_country_code}'].shift(-7*24)
    return y_pred

In [None]:
base_pred = baseline_model(df)
start_date = '2023-01-01'
end_date = '2024-03-17'
base_eval(df.loc[df['Datetime'] >= test_end_date, target_col], base_pred.loc[df['Datetime'] >= test_end_date])
weighted_eval(df.loc[df['Datetime'] >= test_end_date, target_col], base_pred.loc[df['Datetime'] >= test_end_date], df.loc[df['Datetime'] >= test_end_date, f'Load_avg_{target_country_code}'])

In [None]:
plot_predictions(df.loc[df['Datetime'] >= test_end_date, target_col].to_list(), base_pred.loc[df['Datetime'] >= test_end_date].to_list(), 'Baseline model')

## Baseline v2

The predicted value is based on the most similar hour in the history based on the input columns.

In [41]:
from sklearn.neighbors import NearestNeighbors

In [None]:
base2_cols = df.columns.tolist()
base2_cols = [col for col in base2_cols if col not in do_not_use_cols]
base2_cols.remove('Price_2d_HU')
base2_cols.remove('Price_7d_HU')
base2_cols

In [48]:
K = 15
METRIC = "cosine"

def get_neighbors(index):
    curr_date = df.loc[index, 'date']
    X = df.loc[df['date'] < curr_date, base2_cols].values
    knn = NearestNeighbors(n_neighbors=K, metric=METRIC).fit(X)
    
    _, indices = knn.kneighbors(df.loc[index, base2_cols].values.reshape(1, -1))
    indices = indices[0].tolist()
    return [i for i in indices if df.loc[i, 'is_workday'] == df.loc[index, 'is_workday']]

In [49]:
def baseline_model2(df, date, target_col=target_col):
    datetimes = df.loc[df['date'] >= date, 'Datetime']
    y_pred = []
    for dt in datetimes:
        idx = df[df['Datetime'] == dt].index[0]
        neighbors = get_neighbors(idx)
        y_pred.append(df.loc[neighbors, target_col].mean())
    return y_pred

In [None]:
test_end_date_base2 = pd.Timestamp(test_end_date).date()
base2_pred = baseline_model2(df, test_end_date_base2)
base2_true = df.loc[df['date'] >= test_end_date_base2, target_col]
base_eval(base2_true, base2_pred)
weighted_eval(base2_true, base2_pred, df.loc[df['date'] >= test_end_date_base2, f'Load_avg_{target_country_code}'])

In [None]:
plot_predictions(base2_true.to_list(), base2_pred, 'Baseline model v2')

## Recurrent Neural Network

In [74]:
#recurrent neural network
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.layers import Input
import tensorflow as tf

def create_dataset(X, y, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        v = X.iloc[i:(i + time_steps)].values
        Xs.append(v)
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)

time_steps = 24

X_train_lstm, y_train_lstm = create_dataset(X_train, y_train, time_steps)
X_test_lstm, y_test_lstm = create_dataset(X_test, y_test, time_steps)

In [75]:
X_train_lstm = X_train_lstm.astype('float32')
y_train_lstm = y_train_lstm.astype('float32')
X_test_lstm = X_test_lstm.astype('float32')
y_test_lstm = y_test_lstm.astype('float32')

In [76]:
lstm = Sequential()
lstm.add(Input(shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
lstm.add(LSTM(units=64))
lstm.add(Dropout(rate=0.2))
lstm.add(Dense(units=1))
lstm.compile(loss='mean_squared_error', optimizer='adam')

In [77]:
X_train_lstm = tf.convert_to_tensor(X_train_lstm)
X_test_lstm = tf.convert_to_tensor(X_test_lstm)
y_train_lstm = tf.convert_to_tensor(y_train_lstm)
y_test_lstm = tf.convert_to_tensor(y_test_lstm)

In [None]:
history = lstm.fit(
    X_train_lstm, y_train_lstm,
    epochs=30,
    batch_size=32,
    validation_split=0.1,
    shuffle=False
)

In [None]:
lstm_pred = lstm.predict(X_test_lstm)

lstm_pred = lstm_pred.reshape(lstm_pred.shape[0])

In [None]:
plt.figure(figsize=(10,5))
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.title('Loss')
plt.legend()

# Notes

Ma 8as adatokkal 9ig a holnapit (a mait mind ismerem)
- baseline1 az 1 heti adat
- baseline2 hasonló időjárású nap

Kiértékelés:
- v1 abs hiba (hány eurót tévedtünk)
- v2 adott órában mennyi a load (termelés / fogyasztás), hiba súlyozva a teljes fogyasztással

Opciók
- recurrent nn predictor (?)
- gbm regressor
-- (előzö napi adatok, napelemek termelése, román adatok, hőmérséklet..., körny ország árai)
-- walk forward opt

keretrendszer
feture inportance alapján feature selection 
- változásuk követése !!!

(talán osztrák is számít, meg kell nézni melyik számít)

időjárási adatok (első körben tényadatok, nem előrejelzés) próbálkozni kell, drága lehet, kb kizárt 

- végén fontos és ***nem fontos*** változók listája

3 fontos időjárás (régiós, a napi bontás is jó)
- hány fok van (fűtés / hűtés)
- besugárzás
- szélerősség


-Hányszor volt negatív ár - statisztika róla (Meg tudjuk-e mondani, hogy mikor lesz negatív ár)
-- Ez is lehet célváltozó és kiértékelés

- Napi egy órát kikapcsoljuk, cél: mikor legyen (mert a többi órában többet tudunk termelni)
-- Meg lehet nézni, hogy melyik lesz a legdrágább óra
