## Data Cleaning

### Day-Ahead Price

In [6]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import csv

In [7]:
def p_c(df):
    
    # Identify cycles by detecting where 'position' restarts
    df['group'] = (df['position'] == 1).cumsum() - 1
    
    # Create a DataFrame with the complete range of positions from 1 to 24, repeated for each cycle (group)
    num_groups = df['group'].max() + 1
    print('num of groups:', num_groups)
    full_positions = pd.DataFrame({
        'position': list(range(1, 25)) * num_groups,
        'group': np.repeat(range(num_groups), 24)
    })
    
    # Merge the original DataFrame with the full positions to ensure all positions are included in each group
    df_fixed = full_positions.merge(df, on=['position', 'group'], how='left')
    
    # Forward fill missing 'price' values within each group
    df_fixed['price'] = df_fixed.groupby('group')['price.amount'].ffill()
    
    # Drop the 'group' column if it's not needed anymore
    df_fixed = df_fixed.drop(columns=['group'])
    return df_fixed

In [8]:
def read_price_xml(year):
    data_xml = pd.DataFrame()
    for i in range(1,5):
       df = pd.read_xml(f'raw/day_ahead_price/price_{year}_{i}.xml', xpath=".//ns:Point", namespaces={'ns': 'urn:iec62325.351:tc57wg16:451-3:publicationdocument:7:3'}) 
       data_xml = pd.concat([data_xml,df], axis=0, ignore_index=True) 
    return data_xml

In [9]:
def read_price(start_year = 2019, end_year = 2019):
    data = pd.DataFrame()
    for i in range(start_year, end_year + 1):
        df_xml = read_price_xml(2019)
        df= p_c(df_xml)
    data = pd.concat([data,df], axis=0, ignore_index=True) 
    return df

In [10]:
df_price = read_price(2019, 2022)

num of groups: 365
num of groups: 365
num of groups: 365
num of groups: 365


In [355]:
df_19_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   position      8760 non-null   int64  
 1   price.amount  8679 non-null   float64
 2   price         8760 non-null   float64
dtypes: float64(2), int64(1)
memory usage: 205.4 KB


In [97]:
df_fin = pd.concat([df_19_fixed,df_20_fixed, df_21_fixed, df_22_fixed, df_23_fixed], axis=0, ignore_index=True) 

In [98]:
df_fin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43824 entries, 0 to 43823
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   position      43824 non-null  int64  
 1   price.amount  43500 non-null  float64
 2   price         43824 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 1.0 MB


In [99]:
# Generate an hourly DatetimeIndex from 2019-01-01 to 2021-12-31
date_index = pd.date_range(start="2019-01-01 01:00:00", end="2024-01-01 00:00:00", freq="h")

# Set the generated DatetimeIndex as the index for the DataFrame
df_fin.index = date_index

In [100]:
df_fin.tail()

Unnamed: 0,position,price.amount,price
2023-12-31 20:00:00,20,39.64,39.64
2023-12-31 21:00:00,21,34.89,34.89
2023-12-31 22:00:00,22,13.3,13.3
2023-12-31 23:00:00,23,10.68,10.68
2024-01-01 00:00:00,24,3.17,3.17


In [105]:
df_fin.loc['2022-05-20']

Unnamed: 0,position,price.amount,price
2022-05-20 00:00:00,24,213.31,213.31
2022-05-20 01:00:00,1,211.1,211.1
2022-05-20 02:00:00,2,195.98,195.98
2022-05-20 03:00:00,3,189.56,189.56
2022-05-20 04:00:00,4,188.0,188.0
2022-05-20 05:00:00,5,189.56,189.56
2022-05-20 06:00:00,6,208.62,208.62
2022-05-20 07:00:00,7,236.04,236.04
2022-05-20 08:00:00,8,263.32,263.32
2022-05-20 09:00:00,9,250.0,250.0


In [101]:
df_fin.to_pickle('output/df_price_19_23.pkl')

TODO: 23data

- {'start': '2022-03-26T23:00Z', 'end': '2022-03-27T22:00Z'}
- {'start': '2022-10-29T22:00Z', 'end': '2022-10-30T23:00Z'}
- {'start': '2023-03-25T23:00Z', 'end': '2023-03-26T22:00Z'}
- {'start': '2023-10-28T22:00Z', 'end': '2023-10-29T23:00Z'}
- {'start': '2024-03-30T23:00Z', 'end': '2024-03-31T22:00Z'}

### Forecast Energy Generation

#### Solar B16

In [11]:
date_series_19 = pd.date_range(start='2019-01-01 01:00', end='2020-01-01 00:00', freq='1h')
date_series_20 = pd.date_range(start='2020-01-01 01:00', end='2021-01-01 00:00', freq='1h')
date_series_21 = pd.date_range(start='2021-01-01 01:00', end='2022-01-01 00:00', freq='1h')
date_series_22 = pd.date_range(start='2022-01-01 01:00', end='2023-01-01 00:00', freq='1h')
date_series_23 = pd.date_range(start='2023-01-01 01:00', end='2024-01-01 00:00', freq='1h')

In [5]:
# t = date_series_23[date_series_23 < pd.to_datetime('2023-11-14') and date_series_23 > pd.to_datetime('2023-11-14')]
# t = date_series_23[(date_series_23 < pd.to_datetime('2023-11-13 23:30:00')) | (date_series_23 > pd.to_datetime('2023-11-14 23:30:00'))]

In [12]:
def min2h(df_xml, date_series):
    df = df_xml.copy()
    df.drop('position', axis = 1, inplace = True)
    # df.rename(columns={'quantity': col_name}, inplace=True)
    df_hourly_data = df.groupby(df.index // 4).sum()
    df_hourly_data['Date'] = date_series
    df_hourly_data.set_index('Date', inplace= True)
    # df_hourly_data.rename(columns={'quantity': 'Alpha'}, inplace=True)
    return df_hourly_data

In [14]:
def missing_1114(df):
    
    df_1113 = df[df.index.date == pd.to_datetime('2023-11-13').date()]
    df_1115 = df[df.index.date == pd.to_datetime('2023-11-15').date()]
    df_1114_avg= (df_1113['quantity'].values +  df_1115['quantity'].values) / 2
    df_1114_avg = np.array([int(x) for x in df_1114_avg])
    df_1114_avg = df_1114_avg.reshape(24,1)
    df[df.index.date == pd.to_datetime('2023-11-14').date()] = df_1114_avg
    return df

## Energy Generation

In [66]:
def gene(folder, energy_type, start_year, end_year):
    data = pd.DataFrame()
    for year in range(start_year, end_year +1 ):
        print(year)
        if year == 24:
            date_series = pd.date_range(start='2024-01-01 01:00', end='2024-11-01 00:00', freq='1h')
        else:
            date_series = pd.date_range(start=f'20{year}-01-01 01:00', end=f'20{year+1}-01-01 00:00', freq='1h')
        # print(date_series[-5:])
        # print('date_series',len(date_series))
        df_xml = pd.read_xml(f'raw/{folder}/{energy_type}/a_{energy_type}_{year}.xml', xpath=".//ns:Point", namespaces={'ns': 'urn:iec62325.351:tc57wg16:451-6:generationloaddocument:3:0'})
        df_xml = df_xml[: int(len(df_xml)/2)]
        # print('df_xml',len(df_xml))
        # print(df_xml.tail())
        df_hourly = min2h(df_xml, date_series)
        # print(df_hourly.tail())
        # if year == 23 and folder != 'total_load':
        #     print('Fill missing 2023-11-14')
        #     df_hourly = missing_1114(df_hourly)
        data = pd.concat([data,df_hourly], axis=0) 
    data.to_pickle(f'output/{folder}/{folder}_{energy_type}_{start_year}_{end_year}.pkl')
    print('data saved!')
    # return data

In [84]:
generations  = ['offshore', 'onshore', 'solar', 'fossil_gas', 'fossil_hard_coal']
for g in generations:
    print('Energy from', g)
    gene('actual',g, 22, 24)

Energy from offshore
22
23
24
data saved!
Energy from onshore
22
23
24
data saved!
Energy from solar
22
23
24
data saved!
Energy from fossil_gas
22
23
24
data saved!
Energy from fossil_hard_coal
22
23
24
data saved!


## All

In [85]:
# df_price = pd.read_pickle('output/actual/df_price_12_24.pkl')
# df_price.drop(columns= ['position', 'price.amount'], inplace = True)
df_solar = pd.read_pickle('output/actual/actual_solar_22_24.pkl')
df_onshore = pd.read_pickle('output/actual/actual_onshore_22_24.pkl')
df_offshore = pd.read_pickle('output/actual/actual_offshore_22_24.pkl')
df_fossil_gas = pd.read_pickle('output/actual/actual_fossil_gas_22_24.pkl')
df_fossil_hard_coal = pd.read_pickle('output/actual/actual_fossil_hard_coal_22_24.pkl')
# df_total_load = pd.read_pickle('output/actual/df_total_load_19_23.pkl')

In [86]:
# df_price.columns = ['price']
df_solar.columns = ['actual_solar']
df_onshore.columns = ['actual_wind_onshore']
df_offshore.columns = ['actual_wind_offshore']
df_fossil_gas.columns = ['actual_fossil_gas']
df_fossil_hard_coal.columns = ['actual_fossil_hard_coal']
# df_total_load.columns = ['total_load']

In [87]:
df_actual = pd.concat([df_solar, df_onshore, df_offshore, df_fossil_gas, df_fossil_hard_coal], axis=1)

In [88]:
df_actual.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 24840 entries, 2022-01-01 01:00:00 to 2024-11-01 00:00:00
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   actual_solar             24840 non-null  int64
 1   actual_wind_onshore      24840 non-null  int64
 2   actual_wind_offshore     24840 non-null  int64
 3   actual_fossil_gas        24840 non-null  int64
 4   actual_fossil_hard_coal  24840 non-null  int64
dtypes: int64(5)
memory usage: 1.1 MB


In [89]:
df_actual.to_pickle('output/one/generation_actual_22_24.pkl')

In [90]:
# 2019: Sun, Mar 31, 2019 â€“ Sun, Oct 27, 2019

## cleaning

In [91]:
actual = pd.read_pickle('output/one/generation_actual_22_24.pkl')

In [92]:
ori = pd.read_pickle('output/one/df_19_24_cleaned.pkl')

In [93]:
actual.tail()

Unnamed: 0_level_0,actual_solar,actual_wind_onshore,actual_wind_offshore,actual_fossil_gas,actual_fossil_hard_coal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-10-31 20:00:00,4,1552,4635,24525,11031
2024-10-31 21:00:00,4,1313,4741,24010,10758
2024-10-31 22:00:00,4,1396,4411,22337,10524
2024-10-31 23:00:00,4,1345,4343,20718,9902
2024-11-01 00:00:00,4,1482,4671,17489,9730


In [94]:
ori.tail()

Unnamed: 0,price,solar,wind_onshore,wind_offshore,total_load
2024-10-31 20:00:00,131.1,0,2943,934,58152
2024-10-31 21:00:00,110.0,0,3190,955,53694
2024-10-31 22:00:00,107.0,0,3445,1023,50703
2024-10-31 23:00:00,107.4,0,3599,1141,48258
2024-11-01 00:00:00,101.72,0,3540,1206,42045


In [95]:
# df = pd.concat([ori, actual], axis = 0)
df = ori.join(actual, how='inner') 

In [99]:
df.tail(5)

Unnamed: 0,price,solar,wind_onshore,wind_offshore,total_load,actual_solar,actual_wind_onshore,actual_wind_offshore,actual_fossil_gas,actual_fossil_hard_coal
2024-10-31 20:00:00,131.1,0,2943,934,58152,4,1552,4635,24525,11031
2024-10-31 21:00:00,110.0,0,3190,955,53694,4,1313,4741,24010,10758
2024-10-31 22:00:00,107.0,0,3445,1023,50703,4,1396,4411,22337,10524
2024-10-31 23:00:00,107.4,0,3599,1141,48258,4,1345,4343,20718,9902
2024-11-01 00:00:00,101.72,0,3540,1206,42045,4,1482,4671,17489,9730


In [98]:
df.shape

(24840, 10)

In [365]:
data_cleaned.to_pickle('output/df_22_23_cleaned.pkl')