## Data Cleaning 2024

### Day-Ahead Price

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import csv

In [2]:
df_p_24_1 = pd.read_xml('raw/day_ahead_price/price_2024_1.xml', xpath=".//ns:Point", namespaces={'ns': 'urn:iec62325.351:tc57wg16:451-3:publicationdocument:7:3'})
df_p_24_2 = pd.read_xml('raw/day_ahead_price/price_2024_2.xml', xpath=".//ns:Point", namespaces={'ns': 'urn:iec62325.351:tc57wg16:451-3:publicationdocument:7:3'})
df_p_24_3 = pd.read_xml('raw/day_ahead_price/price_2024_3.xml', xpath=".//ns:Point", namespaces={'ns': 'urn:iec62325.351:tc57wg16:451-3:publicationdocument:7:3'})
df_p_24_4 = pd.read_xml('raw/day_ahead_price/price_2024_4.xml', xpath=".//ns:Point", namespaces={'ns': 'urn:iec62325.351:tc57wg16:451-3:publicationdocument:7:3'})

In [3]:
def p_c(df):
    
    # Identify cycles by detecting where 'position' restarts
    df['group'] = (df['position'] == 1).cumsum() - 1
    
    # Create a DataFrame with the complete range of positions from 1 to 24, repeated for each cycle (group)
    num_groups = df['group'].max() + 1
    print('num of groups:', num_groups)
    full_positions = pd.DataFrame({
        'position': list(range(1, 25)) * num_groups,
        'group': np.repeat(range(num_groups), 24)
    })
    
    # Merge the original DataFrame with the full positions to ensure all positions are included in each group
    df_fixed = full_positions.merge(df, on=['position', 'group'], how='left')
    
    # Forward fill missing 'price' values within each group
    df_fixed['price'] = df_fixed.groupby('group')['price.amount'].ffill()
    
    # Drop the 'group' column if it's not needed anymore
    df_fixed = df_fixed.drop(columns=['group'])
    return df_fixed

In [4]:
def read_price_xml(year):
    data_xml = pd.DataFrame()
    for i in range(1,5):
       df = pd.read_xml(f'raw/day_ahead_price/price_{year}_{i}.xml', xpath=".//ns:Point", namespaces={'ns': 'urn:iec62325.351:tc57wg16:451-3:publicationdocument:7:3'}) 
       data_xml = pd.concat([data_xml,df], axis=0, ignore_index=True) 
    return data_xml

In [7]:
def read_price(start_year = 2019, end_year = 2019):
    df_xml = read_price_xml(start_year)
    df= p_c(df_xml)
    print(df.head())
    # if start_year == 2019:
    #     # df.loc['shift1']= df['price'].shift(-1)
    #     x = df['price'].shift(-1)
    #     df.loc['2019-04-01 00:00:00':, 'price'] = data.loc['2019-04-01 00:00:00':, 'shift1']
    return df

In [9]:
df_24 = read_price(2024, 2024)

num of groups: 305
   position  price.amount  price
0         1          0.10   0.10
1         2          0.01   0.01
2         3          0.00   0.00
3         4         -0.01  -0.01
4         5         -0.03  -0.03


In [11]:
df_24.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7320 entries, 0 to 7319
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   position      7320 non-null   int64  
 1   price.amount  7275 non-null   float64
 2   price         7320 non-null   float64
dtypes: float64(2), int64(1)
memory usage: 171.7 KB


In [97]:
df_fin = pd.concat([df_19_fixed,df_20_fixed, df_21_fixed, df_22_fixed, df_23_fixed], axis=0, ignore_index=True) 

In [98]:
df_fin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43824 entries, 0 to 43823
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   position      43824 non-null  int64  
 1   price.amount  43500 non-null  float64
 2   price         43824 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 1.0 MB


In [12]:
# Generate an hourly DatetimeIndex from 2019-01-01 to 2021-12-31
date_index = pd.date_range(start="2024-01-01 01:00:00", end="2024-11-01 00:00:00", freq="h")

# Set the generated DatetimeIndex as the index for the DataFrame
df_24.index = date_index

In [14]:
df_24.tail()

Unnamed: 0,position,price.amount,price
2024-10-31 20:00:00,20,131.1,131.1
2024-10-31 21:00:00,21,110.0,110.0
2024-10-31 22:00:00,22,107.0,107.0
2024-10-31 23:00:00,23,107.4,107.4
2024-11-01 00:00:00,24,101.72,101.72


In [16]:
df_24.loc['2024-01-20']

Unnamed: 0,position,price.amount,price
2024-01-20 00:00:00,24,64.75,64.75
2024-01-20 01:00:00,1,66.0,66.0
2024-01-20 02:00:00,2,67.51,67.51
2024-01-20 03:00:00,3,63.54,63.54
2024-01-20 04:00:00,4,61.82,61.82
2024-01-20 05:00:00,5,61.85,61.85
2024-01-20 06:00:00,6,61.75,61.75
2024-01-20 07:00:00,7,62.59,62.59
2024-01-20 08:00:00,8,66.9,66.9
2024-01-20 09:00:00,9,74.82,74.82


In [17]:
df_24.to_pickle('output/df_price_24.pkl')

TODO: 23data

- {'start': '2022-03-26T23:00Z', 'end': '2022-03-27T22:00Z'}
- {'start': '2022-10-29T22:00Z', 'end': '2022-10-30T23:00Z'}
- {'start': '2023-03-25T23:00Z', 'end': '2023-03-26T22:00Z'}
- {'start': '2023-10-28T22:00Z', 'end': '2023-10-29T23:00Z'}
- {'start': '2024-03-30T23:00Z', 'end': '2024-03-31T22:00Z'}

### Forecast Energy Generation

#### Solar B16

In [18]:
date_series_19 = pd.date_range(start='2019-01-01 01:00', end='2020-01-01 00:00', freq='1h')
date_series_20 = pd.date_range(start='2020-01-01 01:00', end='2021-01-01 00:00', freq='1h')
date_series_21 = pd.date_range(start='2021-01-01 01:00', end='2022-01-01 00:00', freq='1h')
date_series_22 = pd.date_range(start='2022-01-01 01:00', end='2023-01-01 00:00', freq='1h')
date_series_23 = pd.date_range(start='2023-01-01 01:00', end='2024-01-01 00:00', freq='1h')
date_series_24 = pd.date_range(start='2024-01-01 01:00', end='2024-11-01 00:00', freq='1h')

In [5]:
# t = date_series_23[date_series_23 < pd.to_datetime('2023-11-14') and date_series_23 > pd.to_datetime('2023-11-14')]
# t = date_series_23[(date_series_23 < pd.to_datetime('2023-11-13 23:30:00')) | (date_series_23 > pd.to_datetime('2023-11-14 23:30:00'))]

In [19]:
def min2h(df_xml, date_series):
    df = df_xml.copy()
    df.drop('position', axis = 1, inplace = True)
    # df.rename(columns={'quantity': col_name}, inplace=True)
    df_hourly_data = df.groupby(df.index // 4).sum()
    df_hourly_data['Date'] = date_series
    df_hourly_data.set_index('Date', inplace= True)
    # df_hourly_data.rename(columns={'quantity': 'Alpha'}, inplace=True)
    return df_hourly_data

In [60]:
# def min2h_23(df_xml, date_series):
#     df = df_xml.copy()
#     df.drop('position', axis = 1, inplace = True)
#     # df.rename(columns={'quantity': col_name}, inplace=True)
#     df_hourly_data = df.groupby(df.index // 4).sum()
#     # Remove the specific date
#     # filtered_date_series = date_series[date_series < pd.to_datetime('2023-11-14') and date_series > pd.to_datetime('2023-11-14')]
#     filtered_date_series = date_series_23[(date_series_23 < pd.to_datetime('2023-11-13 23:30:00')) | (date_series_23 > pd.to_datetime('2023-11-14 23:30:00'))]
#     df_hourly_data['Date'] = filtered_date_series
#     df_hourly_data.set_index('Date', inplace= True)
#     return df_hourly_data

In [20]:
def missing_1114(df):
    
    df_1113 = df[df.index.date == pd.to_datetime('2023-11-13').date()]
    df_1115 = df[df.index.date == pd.to_datetime('2023-11-15').date()]
    df_1114_avg= (df_1113['quantity'].values +  df_1115['quantity'].values) / 2
    df_1114_avg = np.array([int(x) for x in df_1114_avg])
    df_1114_avg = df_1114_avg.reshape(24,1)
    df[df.index.date == pd.to_datetime('2023-11-14').date()] = df_1114_avg
    return df

## Energy Generation

In [24]:
def gene(folder,start_year, end_year,file_name):
    data = pd.DataFrame()
    for year in range(start_year, end_year +1 ):
        print(year)
        if year == 24:
            date_series = pd.date_range(start='2024-01-01 01:00', end='2024-11-01 00:00', freq='1h')
        else:
            date_series = pd.date_range(start=f'20{year}-01-01 01:00', end=f'20{year+1}-01-01 00:00', freq='1h')
        # print(date_series[-5:])
        df_xml = pd.read_xml(f'raw/day_ahead_fore/{folder}/{folder}_{year}.xml', xpath=".//ns:Point", namespaces={'ns': 'urn:iec62325.351:tc57wg16:451-6:generationloaddocument:3:0'})
        print(len(df_xml))
        df_hourly = min2h(df_xml, date_series)
        print(df_hourly.tail())
        if year == 23 and folder != 'total_load':
            print('Fill missing 2023-11-14')
            df_hourly = missing_1114(df_hourly)
        data = pd.concat([data,df_hourly], axis=0) 
    data.to_pickle(f'output/df_{folder}_{file_name}.pkl')
    return data

In [25]:
features = ['onshore', 'offshore', 'solar', 'total_load']

In [27]:
for i in features:
    print('Data:',i )
    gene(i, 24,24, 24)

Data: onshore
24
29280
Data: offshore
24
29280
Data: solar
24
29280
Data: total_load
24
29280


## All

In [28]:
df_price = pd.read_pickle('output/df_price_24.pkl')
df_price.drop(columns= ['position', 'price.amount'], inplace = True)
df_solar = pd.read_pickle('output/df_solar_24.pkl')
df_onshore = pd.read_pickle('output/df_onshore_24.pkl')
df_offshore = pd.read_pickle('output/df_offshore_24.pkl')
df_total_load = pd.read_pickle('output/df_total_load_24.pkl')

In [29]:
df_price.columns = ['price']
df_solar.columns = ['solar']
df_onshore.columns = ['wind_onshore']
df_offshore.columns = ['wind_offshore']
df_total_load.columns = ['total_load']

In [30]:
df_fore = pd.concat([df_price, df_solar, df_onshore, df_offshore, df_total_load], axis=1)

In [31]:
df_fore.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7320 entries, 2024-01-01 01:00:00 to 2024-11-01 00:00:00
Freq: h
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          7320 non-null   float64
 1   solar          7320 non-null   int64  
 2   wind_onshore   7320 non-null   int64  
 3   wind_offshore  7320 non-null   int64  
 4   total_load     7320 non-null   int64  
dtypes: float64(1), int64(4)
memory usage: 343.1 KB


In [33]:
df_fore.loc['2024-03-31 21:00:00': '2024-04-01 03:00:00']

Unnamed: 0,price,solar,wind_onshore,wind_offshore,total_load
2024-03-31 21:00:00,65.2,0,5599,2896,47101
2024-03-31 22:00:00,60.55,0,6905,3428,45068
2024-03-31 23:00:00,54.9,0,7862,3955,43612
2024-04-01 00:00:00,54.9,0,7937,3278,46914
2024-04-01 01:00:00,51.03,0,7980,3308,40933
2024-04-01 02:00:00,48.98,0,7578,3324,40605
2024-04-01 03:00:00,45.1,0,6265,2918,40924


In [35]:
df_fore.loc['2024-10-27 01:00:00': '2024-10-27 06:00:00']

Unnamed: 0,price,solar,wind_onshore,wind_offshore,total_load
2024-10-27 01:00:00,91.56,0,1655,952,42509
2024-10-27 02:00:00,84.0,0,1447,867,41207
2024-10-27 03:00:00,82.23,0,1662,888,41207
2024-10-27 04:00:00,80.43,0,1969,927,40738
2024-10-27 05:00:00,81.12,0,2170,974,40906
2024-10-27 06:00:00,77.08,0,2011,994,41250


In [36]:
df_fore.to_pickle('output/df_24.pkl')

In [None]:
# 2019: Sun, Mar 31, 2019 – Sun, Oct 27, 2019

## cleaning

In [37]:
data = pd.read_pickle('output/df_24.pkl')

In [38]:
data.head()

Unnamed: 0,price,solar,wind_onshore,wind_offshore,total_load
2024-01-01 01:00:00,0.1,0,18504,5617,42649
2024-01-01 02:00:00,0.01,0,18289,5578,41317
2024-01-01 03:00:00,0.0,0,18030,5552,40696
2024-01-01 04:00:00,-0.01,0,17755,5529,40102
2024-01-01 05:00:00,-0.03,0,17335,5559,40371


In [39]:
def price_cleaner(info):
    data.loc[:,'shift1'] = data['price'].shift(-1)
    for start_date, end_date, value in info:
        print(start_date, end_date)
        data.loc[start_date : end_date, 'price'] = data.loc[start_date : end_date, 'shift1']
        data.loc[end_date, 'price'] = value
    data.drop(columns = ['shift1'],inplace = True)
    return data

In [361]:
info = [
    # start, dat light saving +1 day, 
    ['2019-04-01 00:00:00', '2019-10-28 00:00:00', 25.82],
    ['2020-03-30 00:00:00', '2020-10-26 00:00:00', 34.75],
    ['2021-03-29 00:00:00', '2021-11-01 00:00:00', 59.34],
    ['2022-03-28 00:00:00', '2022-10-31 00:00:00', 109.26],
    ['2023-03-27 00:00:00', '2023-10-30 00:00:00', 38.47],
]
info_24 = ['2024-04-01 00:00:00', '2024-10-28 00:00:00', 102.99]
data_cleaned = price_cleaner(info)

2019-04-01 00:00:00 2019-10-28 00:00:00
2020-03-30 00:00:00 2020-10-26 00:00:00
2021-03-29 00:00:00 2021-11-01 00:00:00
2022-03-28 00:00:00 2022-10-31 00:00:00
2023-03-27 00:00:00 2023-10-30 00:00:00


In [363]:
# data_cleaned.loc['2023-10-29 22:00:00': '2023-10-30 02:00:00']

Unnamed: 0,price,solar,wind_onshore,wind_offshore,total_load
2023-10-29 22:00:00,54.27,0,15805,6715,49518
2023-10-29 23:00:00,52.0,0,15103,6569,48285
2023-10-30 00:00:00,38.47,0,14484,6538,46748
2023-10-30 01:00:00,8.0,0,20160,6959,43967
2023-10-30 02:00:00,8.13,0,19183,6981,43740


In [364]:
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 43824 entries, 2019-01-01 01:00:00 to 2024-01-01 00:00:00
Freq: h
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          43824 non-null  float64
 1   solar          43824 non-null  int64  
 2   wind_onshore   43824 non-null  int64  
 3   wind_offshore  43824 non-null  int64  
 4   total_load     43824 non-null  int64  
dtypes: float64(1), int64(4)
memory usage: 3.0 MB


In [365]:
data_cleaned.to_pickle('output/df_22_23_cleaned.pkl')