# Feature Engineering - 16-dimensional State Vector
Build the complete feature set from market data according to features.yaml


In [1]:
import pandas as pd
import numpy as np
import yaml
from pathlib import Path
from datetime import datetime
import yfinance as yf


## Load Configuration


In [2]:
# Load features configuration
with open('../config/features.yaml', 'r') as f:
    features_config = yaml.safe_load(f)

print("Features to generate:")
for i, feat in enumerate(features_config['features'], 1):
    print(f"{i:2d}. {feat['key']:25s} - {feat['desc']}")
    
print(f"\nWindows config: {features_config['windows']}")
print(f"Normalization: {features_config['normalization']}")


Features to generate:
 1. k                         - Hour of day
 2. weekday                   - Day of week (0=Mon..6=Sun)
 3. season                    - Season (0=Winter,1=Spring,2=Summer,3=Fall)
 4. price_em                  - Day-ahead energy price (HUF/MWh)
 5. price_as                  - Reserve/ancillary price (HUF/MW/h)
 6. p_res_total               - RES available generation to AC bus (MW)
 7. soc                       - Battery state of charge (0..1)
 8. dod                       - DOD within current day-part (Eq.18)
 9. price_em_max_morning      - Max energy price in morning window
10. price_em_max_evening      - Max energy price in evening window
11. k_em_max_morning          - Hour index of morning max
12. k_em_max_evening          - Hour index of evening max
13. price_em_min              - Daily minimum energy price
14. k_em_min                  - Hour index of daily minimum price
15. price_as_min              - Daily min reserve price
16. price_as_max              - Da

## Load Data Sources


# Load reserve data


In [3]:
# Reserve API
%run 01_reserve_data_exploration.ipynb


Helper function loaded
Fetching aFRR Daily data for 4 seasonal weeks in 2025...

Week 1/4: 20250108 to 20250115
Fetching 202501080000 to 202501150000...
  ✓ Got 9747 bytes
  ✓ Week 1 data: 9747 bytes

Week 2/4: 20250408 to 20250415
Fetching 202504080000 to 202504150000...
  ✓ Got 10187 bytes
  ✓ Week 2 data: 10187 bytes

Week 3/4: 20250610 to 20250617
Fetching 202506100000 to 202506170000...
  ✓ Got 9653 bytes
  ✓ Week 3 data: 9653 bytes

Week 4/4: 20251007 to 20251014
Fetching 202510070000 to 202510140000...
  ✓ Got 25855 bytes
  ✓ Week 4 data: 25855 bytes

Combining all weeks...
✓ Total combined aFRR Daily data: 55292 bytes
Merged 21 XML files

Processing 0_week0_001-AMOUNT_AND_PRICES_PAID_OF_BALANCING_RESERVES_UNDER_CONTRACT_R3_202501072300-202501152300.xml...

Processing 0_week1_001-AMOUNT_AND_PRICES_PAID_OF_BALANCING_RESERVES_UNDER_CONTRACT_R3_202504072200-202504152200.xml...

Processing 0_week2_001-AMOUNT_AND_PRICES_PAID_OF_BALANCING_RESERVES_UNDER_CONTRACT_R3_202506092200-202506

Unnamed: 0,file,xml
0,0_week0_001-AMOUNT_AND_PRICES_PAID_OF_BALANCIN...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Balan..."
1,0_week1_001-AMOUNT_AND_PRICES_PAID_OF_BALANCIN...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Balan..."
2,0_week2_001-AMOUNT_AND_PRICES_PAID_OF_BALANCIN...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Balan..."
3,0_week3_001-AMOUNT_AND_PRICES_PAID_OF_BALANCIN...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Balan..."
4,0_week3_002-AMOUNT_AND_PRICES_PAID_OF_BALANCIN...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Balan..."


Imported LET from lxml.etree
(990, 39)


Unnamed: 0,doc.mRID,doc.revisionNumber,doc.type,doc.process.processType,doc.sender_MarketParticipant.mRID,doc.sender_MarketParticipant.marketRole.type,doc.receiver_MarketParticipant.mRID,doc.receiver_MarketParticipant.marketRole.type,doc.createdDateTime,doc.area_Domain.mRID,...,ts.currency_Unit.name,ts.quantity_Measure_Unit.name,ts.curveType,per.start,per.end,per.resolution,pt.position,pt.quantity,pt.procurement_Price.amount,pt.imbalance_Price.category
0,bac1b7be214e44bb81096ea57b06e2f9,1,A81,A51,10X1001A1001A450,A32,10X1001A1001A450,A33,2025-12-17T10:26:01Z,10YHU-MAVIR----U,...,HUF,MAW,A03,2025-01-07T23:00Z,2025-01-08T23:00Z,PT15M,1,55,101,A06
1,bac1b7be214e44bb81096ea57b06e2f9,1,A81,A51,10X1001A1001A450,A32,10X1001A1001A450,A33,2025-12-17T10:26:01Z,10YHU-MAVIR----U,...,HUF,MAW,A03,2025-01-07T23:00Z,2025-01-08T23:00Z,PT15M,5,55,103,A06
2,bac1b7be214e44bb81096ea57b06e2f9,1,A81,A51,10X1001A1001A450,A32,10X1001A1001A450,A33,2025-12-17T10:26:01Z,10YHU-MAVIR----U,...,HUF,MAW,A03,2025-01-07T23:00Z,2025-01-08T23:00Z,PT15M,9,55,105,A06
3,bac1b7be214e44bb81096ea57b06e2f9,1,A81,A51,10X1001A1001A450,A32,10X1001A1001A450,A33,2025-12-17T10:26:01Z,10YHU-MAVIR----U,...,HUF,MAW,A03,2025-01-07T23:00Z,2025-01-08T23:00Z,PT15M,13,55,100,A06
4,bac1b7be214e44bb81096ea57b06e2f9,1,A81,A51,10X1001A1001A450,A32,10X1001A1001A450,A33,2025-12-17T10:26:01Z,10YHU-MAVIR----U,...,HUF,MAW,A03,2025-01-07T23:00Z,2025-01-08T23:00Z,PT15M,21,55,114,A06


points_full_df combined: shape=(3707, 43), columns=43


Unnamed: 0,doc.mRID,doc.revisionNumber,doc.type,doc.process.processType,doc.sender_MarketParticipant.mRID,doc.sender_MarketParticipant.marketRole.type,doc.receiver_MarketParticipant.mRID,doc.receiver_MarketParticipant.marketRole.type,doc.createdDateTime,doc.area_Domain.mRID,...,per.end,per.resolution,pt.position,pt.quantity,pt.procurement_Price.amount,pt.imbalance_Price.category,file,doc.allocationDecision_DateAndOrTime.dateTime,doc.original_MarketProduct.marketProductType,ts.original_MarketProduct.marketProductType
0,bac1b7be214e44bb81096ea57b06e2f9,1,A81,A51,10X1001A1001A450,A32,10X1001A1001A450,A33,2025-12-17T10:26:01Z,10YHU-MAVIR----U,...,2025-01-08T23:00Z,PT15M,1,55,101,A06,0_week0_001-AMOUNT_AND_PRICES_PAID_OF_BALANCIN...,,,
1,bac1b7be214e44bb81096ea57b06e2f9,1,A81,A51,10X1001A1001A450,A32,10X1001A1001A450,A33,2025-12-17T10:26:01Z,10YHU-MAVIR----U,...,2025-01-08T23:00Z,PT15M,5,55,103,A06,0_week0_001-AMOUNT_AND_PRICES_PAID_OF_BALANCIN...,,,
2,bac1b7be214e44bb81096ea57b06e2f9,1,A81,A51,10X1001A1001A450,A32,10X1001A1001A450,A33,2025-12-17T10:26:01Z,10YHU-MAVIR----U,...,2025-01-08T23:00Z,PT15M,9,55,105,A06,0_week0_001-AMOUNT_AND_PRICES_PAID_OF_BALANCIN...,,,
3,bac1b7be214e44bb81096ea57b06e2f9,1,A81,A51,10X1001A1001A450,A32,10X1001A1001A450,A33,2025-12-17T10:26:01Z,10YHU-MAVIR----U,...,2025-01-08T23:00Z,PT15M,13,55,100,A06,0_week0_001-AMOUNT_AND_PRICES_PAID_OF_BALANCIN...,,,
4,bac1b7be214e44bb81096ea57b06e2f9,1,A81,A51,10X1001A1001A450,A32,10X1001A1001A450,A33,2025-12-17T10:26:01Z,10YHU-MAVIR----U,...,2025-01-08T23:00Z,PT15M,21,55,114,A06,0_week0_001-AMOUNT_AND_PRICES_PAID_OF_BALANCIN...,,,


DataFrame shape: (43, 1)


Unnamed: 0,n_unique
pt.procurement_Price.amount,1614
pt.quantity,180
ts.mRID,69
per.end,32
per.start,32
pt.position,24
file,21
doc.mRID,21
doc.allocationDecision_DateAndOrTime.dateTime,17
doc.start,11


DataFrame shape: (13, 1)


Unnamed: 0,n_unique
pt.procurement_Price.amount,1614
time_end_dt,756
time_dt,756
pt.quantity,180
per.start_dt,32
per.end_dt,32
pt.position,24
ts.mktPSRType.psrType,3
ts.flowDirection.direction,2
per.resolution,1


DataFrame shape: (13, 1)


Unnamed: 0,n_unique
pt.procurement_Price.amount,855
time_end_dt,737
time_dt,737
pt.quantity,133
per.start_dt,32
per.end_dt,32
pt.position,24
ts.flowDirection.direction,2
per.resolution,1
ts.mktPSRType.psrType,1


In [4]:
## Fix Reserve Data
# 1. Replace ffill NaN
# 2. Create complete hourly time range with missing hours as NaN
# 3. Average prices when there are 2 flow directions in the same hour


In [5]:
# Step 1: Extract reserve data with all relevant columns (commented out)
reserve_df_raw = points_full_df_test[[
    'time_dt',
    'ts.mktPSRType.psrType',
    'pt.procurement_Price.amount',
    'ts.flowDirection.direction'
]].copy()

reserve_df_raw = reserve_df_raw.rename(columns={
    'time_dt': 'datetime',
    'ts.mktPSRType.psrType': 'psr_type',
    'pt.procurement_Price.amount': 'reserve_price',
    'ts.flowDirection.direction': 'flow_direction'
})

# Convert price to numeric
reserve_df_raw['reserve_price'] = pd.to_numeric(reserve_df_raw['reserve_price'], errors='coerce')

# Ensure timezone awareness
if reserve_df_raw['datetime'].dt.tz is None:
    reserve_df_raw['datetime'] = reserve_df_raw['datetime'].dt.tz_localize('Europe/Budapest')

print(f"Raw reserve data: {len(reserve_df_raw)} records")
print(f"Date range: {reserve_df_raw['datetime'].min()} to {reserve_df_raw['datetime'].max()}")
print(f"PSR types: {reserve_df_raw['psr_type'].unique()}")
print(f"Flow directions: {reserve_df_raw['flow_direction'].unique()}")
reserve_df_raw.head(10)


Raw reserve data: 1289 records
Date range: 2025-01-08 00:00:00+01:00 to 2025-10-14 23:00:00+02:00
PSR types: ['A03']
Flow directions: ['A01' 'A02']


Unnamed: 0,datetime,psr_type,reserve_price,flow_direction
0,2025-01-08 00:00:00+01:00,A03,101,A01
1,2025-01-08 01:00:00+01:00,A03,103,A01
2,2025-01-08 02:00:00+01:00,A03,105,A01
3,2025-01-08 03:00:00+01:00,A03,100,A01
4,2025-01-08 05:00:00+01:00,A03,114,A01
5,2025-01-08 06:00:00+01:00,A03,149,A01
6,2025-01-08 07:00:00+01:00,A03,549,A01
7,2025-01-08 08:00:00+01:00,A03,2051,A01
8,2025-01-08 09:00:00+01:00,A03,2925,A01
9,2025-01-08 10:00:00+01:00,A03,2734,A01


In [6]:
# Step 2: Forward-fill NaN values (no longer replacing zero with NaN)
print(f"NaN prices before forward fill: {reserve_df_raw['reserve_price'].isna().sum()}")
reserve_df_raw['reserve_price'] = reserve_df_raw['reserve_price'].ffill()
print(f"NaN prices after forward fill: {reserve_df_raw['reserve_price'].isna().sum()}")
print()

# Step 3: Average prices when there are multiple flow directions in the same hour
# Group by datetime, psr_type and average across flow directions
reserve_df_hourly = reserve_df_raw.groupby(['datetime', 'psr_type'], as_index=False).agg({
    'reserve_price': 'mean',  # Average across flow directions
    'flow_direction': lambda x: ','.join(x.unique())  # Keep track of which directions were averaged
})

print(f"After averaging flow directions: {len(reserve_df_hourly)} records")
print(f"Sample with multiple flow directions:")
multi_flow = reserve_df_hourly[reserve_df_hourly['flow_direction'].str.contains(',', na=False)]
if len(multi_flow) > 0:
    print(multi_flow.head(10))
else:
    print("No hours with multiple flow directions found")
print()

reserve_df_hourly.head(10)


NaN prices before forward fill: 0
NaN prices after forward fill: 0

After averaging flow directions: 737 records
Sample with multiple flow directions:
                    datetime psr_type  reserve_price flow_direction
0  2025-01-08 00:00:00+01:00      A03           50.5        A01,A02
5  2025-01-08 06:00:00+01:00      A03          164.5        A01,A02
6  2025-01-08 07:00:00+01:00      A03          530.0        A01,A02
7  2025-01-08 08:00:00+01:00      A03         1594.5        A01,A02
8  2025-01-08 09:00:00+01:00      A03         2060.5        A01,A02
9  2025-01-08 10:00:00+01:00      A03         1894.5        A01,A02
10 2025-01-08 11:00:00+01:00      A03         1819.0        A01,A02
11 2025-01-08 12:00:00+01:00      A03         1831.0        A01,A02
12 2025-01-08 13:00:00+01:00      A03         1439.0        A01,A02
13 2025-01-08 14:00:00+01:00      A03          872.5        A01,A02



Unnamed: 0,datetime,psr_type,reserve_price,flow_direction
0,2025-01-08 00:00:00+01:00,A03,50.5,"A01,A02"
1,2025-01-08 01:00:00+01:00,A03,103.0,A01
2,2025-01-08 02:00:00+01:00,A03,105.0,A01
3,2025-01-08 03:00:00+01:00,A03,100.0,A01
4,2025-01-08 05:00:00+01:00,A03,114.0,A01
5,2025-01-08 06:00:00+01:00,A03,164.5,"A01,A02"
6,2025-01-08 07:00:00+01:00,A03,530.0,"A01,A02"
7,2025-01-08 08:00:00+01:00,A03,1594.5,"A01,A02"
8,2025-01-08 09:00:00+01:00,A03,2060.5,"A01,A02"
9,2025-01-08 10:00:00+01:00,A03,1894.5,"A01,A02"


In [7]:
# Step 4: Create complete hourly time range and forward-fill missing hours

min_date = reserve_df_hourly['datetime'].min()
max_date = reserve_df_hourly['datetime'].max()

complete_hours = pd.date_range(
    start=min_date.floor('H'),
    end=max_date.ceil('H'),
    freq='H',
    tz='Europe/Budapest'
)

print(f"Original datetime range: {min_date} to {max_date}")
print(f"Complete hourly range: {complete_hours[0]} to {complete_hours[-1]}")
print(f"Total hours in range: {len(complete_hours)}")
print(f"Original records: {len(reserve_df_hourly)}")
print()

psr_types = reserve_df_hourly['psr_type'].unique()
print(f"PSR types: {psr_types}")
print()

complete_template = pd.DataFrame([
    {'datetime': dt, 'psr_type': psr}
    for dt in complete_hours
    for psr in psr_types
])

print(f"Complete template: {len(complete_template)} records ({len(complete_hours)} hours × {len(psr_types)} PSR types)")

reserve_df_complete = complete_template.merge(
    reserve_df_hourly[['datetime', 'psr_type', 'reserve_price']],
    on=['datetime', 'psr_type'],
    how='left'
)

# Forward fill missing values by psr_type (so each PSR type is forward filled independently)
reserve_df_complete = reserve_df_complete.sort_values(['psr_type', 'datetime']).reset_index(drop=True)
reserve_df_complete['reserve_price'] = reserve_df_complete.groupby('psr_type')['reserve_price'].ffill()

print(f"After adding and forward filling missing hours: {len(reserve_df_complete)} records")
print(f"NaN prices (missing hours): {reserve_df_complete['reserve_price'].isna().sum()}")
print(f"Non-NaN prices: {reserve_df_complete['reserve_price'].notna().sum()}")
print()

print("Sample of complete data (showing possibly some NaN at the start of each psr_type):")
reserve_df_complete.head(20)


Original datetime range: 2025-01-08 00:00:00+01:00 to 2025-10-14 23:00:00+02:00
Complete hourly range: 2025-01-08 00:00:00+01:00 to 2025-10-14 23:00:00+02:00
Total hours in range: 6719
Original records: 737

PSR types: ['A03']

Complete template: 6719 records (6719 hours × 1 PSR types)
After adding and forward filling missing hours: 6719 records
NaN prices (missing hours): 0
Non-NaN prices: 6719

Sample of complete data (showing possibly some NaN at the start of each psr_type):


  start=min_date.floor('H'),
  end=max_date.ceil('H'),
  complete_hours = pd.date_range(


Unnamed: 0,datetime,psr_type,reserve_price
0,2025-01-08 00:00:00+01:00,A03,50.5
1,2025-01-08 01:00:00+01:00,A03,103.0
2,2025-01-08 02:00:00+01:00,A03,105.0
3,2025-01-08 03:00:00+01:00,A03,100.0
4,2025-01-08 04:00:00+01:00,A03,100.0
5,2025-01-08 05:00:00+01:00,A03,114.0
6,2025-01-08 06:00:00+01:00,A03,164.5
7,2025-01-08 07:00:00+01:00,A03,530.0
8,2025-01-08 08:00:00+01:00,A03,1594.5
9,2025-01-08 09:00:00+01:00,A03,2060.5


In [8]:
# Step 5: Pivot by PSR type for easier merging with other data
reserve_df = reserve_df_complete.pivot(
    index='datetime',
    columns='psr_type',
    values='reserve_price'
).reset_index()

reserve_df.columns = ['datetime'] + [f'reserve_{col}' for col in reserve_df.columns[1:]]

print(f"Final reserve data shape: {reserve_df.shape}")
print(f"Columns: {reserve_df.columns.tolist()}")
print(f"Date range: {reserve_df['datetime'].min()} to {reserve_df['datetime'].max()}")
print(f"Total hours: {len(reserve_df)}")
print()

print("Reserve price statistics:")
for col in reserve_df.columns:
    if col.startswith('reserve_'):
        non_nan = reserve_df[col].notna().sum()
        nan_count = reserve_df[col].isna().sum()
        print(f"{col}:")
        print(f"  Total rows: {len(reserve_df)}")
        print(f"  Non-NaN: {non_nan}")
        print(f"  NaN: {nan_count} ({nan_count/len(reserve_df)*100:.1f}%)")
        if non_nan > 0:
            print(f"  Mean: {reserve_df[col].mean():.2f}")
            print(f"  Min: {reserve_df[col].min():.2f}")
            print(f"  Max: {reserve_df[col].max():.2f}")
        print()

print("\nFirst 24 hours (showing NaN values):")
reserve_df.head(24)


Final reserve data shape: (6719, 2)
Columns: ['datetime', 'reserve_A03']
Date range: 2025-01-08 00:00:00+01:00 to 2025-10-14 23:00:00+02:00
Total hours: 6719

Reserve price statistics:
reserve_A03:
  Total rows: 6719
  Non-NaN: 6719
  NaN: 0 (0.0%)
  Mean: 311.04
  Min: 0.00
  Max: 14202.00


First 24 hours (showing NaN values):


Unnamed: 0,datetime,reserve_A03
0,2025-01-08 00:00:00+01:00,50.5
1,2025-01-08 01:00:00+01:00,103.0
2,2025-01-08 02:00:00+01:00,105.0
3,2025-01-08 03:00:00+01:00,100.0
4,2025-01-08 04:00:00+01:00,100.0
5,2025-01-08 05:00:00+01:00,114.0
6,2025-01-08 06:00:00+01:00,164.5
7,2025-01-08 07:00:00+01:00,530.0
8,2025-01-08 08:00:00+01:00,1594.5
9,2025-01-08 09:00:00+01:00,2060.5


## Convert Reserve Prices from HUF to EUR

Fetch EUR/HUF exchange rates from yfinance and convert reserve prices to EUR for consistency with day-ahead prices.


In [9]:
# Fetch EUR/HUF exchange rates from yfinance
min_date = reserve_df['datetime'].min().date()
max_date = reserve_df['datetime'].max().date()
print(f"Fetching EUR/HUF exchange rates from {min_date} to {max_date}...")
eurhuf = yf.download('EURHUF=X', start=min_date, end=max_date, progress=False)
fx_rates = eurhuf[['Close']].copy()
fx_rates = fx_rates.reset_index()
fx_rates.columns = ['date', 'eurhuf_rate']
fx_rates['date'] = pd.to_datetime(fx_rates['date']).dt.tz_localize('Europe/Budapest')
print(f"\nFX rates downloaded: {len(fx_rates)} days")
print(f"Date range: {fx_rates['date'].min()} to {fx_rates['date'].max()}")
print(f"EUR/HUF range: {fx_rates['eurhuf_rate'].min():.4f} - {fx_rates['eurhuf_rate'].max():.4f}")
print(f"Average EUR/HUF: {fx_rates['eurhuf_rate'].mean():.4f}")
print()
fx_rates.head(10)


  eurhuf = yf.download('EURHUF=X', start=min_date, end=max_date, progress=False)


Fetching EUR/HUF exchange rates from 2025-01-08 to 2025-10-14...

FX rates downloaded: 197 days
Date range: 2025-01-08 00:00:00+01:00 to 2025-10-13 00:00:00+02:00
EUR/HUF range: 387.7070 - 414.4650
Average EUR/HUF: 400.3095



Unnamed: 0,date,eurhuf_rate
0,2025-01-08 00:00:00+01:00,414.464996
1,2025-01-09 00:00:00+01:00,414.023987
2,2025-01-10 00:00:00+01:00,413.458008
3,2025-01-13 00:00:00+01:00,412.529999
4,2025-01-14 00:00:00+01:00,412.196014
5,2025-01-15 00:00:00+01:00,411.806
6,2025-01-16 00:00:00+01:00,410.725006
7,2025-01-17 00:00:00+01:00,412.041992
8,2025-01-20 00:00:00+01:00,412.389008
9,2025-01-21 00:00:00+01:00,410.845001


In [10]:
# Merge FX rates with reserve data and convert HUF to EUR
reserve_df['date'] = reserve_df['datetime'].dt.date
reserve_df['date'] = pd.to_datetime(reserve_df['date']).dt.tz_localize('Europe/Budapest')
reserve_df = reserve_df.merge(fx_rates, on='date', how='left')
reserve_df['eurhuf_rate'] = reserve_df['eurhuf_rate'].fillna(method='ffill')
print(f"FX rates merged. Missing values after forward fill: {reserve_df['eurhuf_rate'].isna().sum()}")
reserve_cols_huf = [col for col in reserve_df.columns if col.startswith('reserve_')]
print(f"Converting {len(reserve_cols_huf)} reserve price column(s) from HUF to EUR...")
for col in reserve_cols_huf:
    col_eur = col + '_eur'
    reserve_df[col_eur] = reserve_df[col] / reserve_df['eurhuf_rate']
    valid_count = reserve_df[col_eur].notna().sum()
    if valid_count > 0:
        print(f"\n{col} -> {col_eur}:")
        print(f"  Mean (HUF): {reserve_df[col].mean():.2f}")
        print(f"  Mean (EUR): {reserve_df[col_eur].mean():.2f}")
        print(f"  Sample conversion: {reserve_df[col].iloc[0]:.2f} HUF / {reserve_df['eurhuf_rate'].iloc[0]:.4f} = {reserve_df[col_eur].iloc[0]:.2f} EUR")
reserve_df = reserve_df.drop(columns=reserve_cols_huf + ['date'])
rename_dict = {col + '_eur': col for col in reserve_cols_huf}
reserve_df = reserve_df.rename(columns=rename_dict)
print(f"\n✓ Conversion complete. Reserve prices are now in EUR.")
print(f"Final columns: {reserve_df.columns.tolist()}")
reserve_df.tail(10)


FX rates merged. Missing values after forward fill: 0
Converting 1 reserve price column(s) from HUF to EUR...

reserve_A03 -> reserve_A03_eur:
  Mean (HUF): 311.04
  Mean (EUR): 0.77
  Sample conversion: 50.50 HUF / 414.4650 = 0.12 EUR

✓ Conversion complete. Reserve prices are now in EUR.
Final columns: ['datetime', 'eurhuf_rate', 'reserve_A03']


  reserve_df['eurhuf_rate'] = reserve_df['eurhuf_rate'].fillna(method='ffill')


Unnamed: 0,datetime,eurhuf_rate,reserve_A03
6709,2025-10-14 14:00:00+02:00,391.820007,1.120412
6710,2025-10-14 15:00:00+02:00,391.820007,1.393497
6711,2025-10-14 16:00:00+02:00,391.820007,2.42203
6712,2025-10-14 17:00:00+02:00,391.820007,1.877137
6713,2025-10-14 18:00:00+02:00,391.820007,0.565311
6714,2025-10-14 19:00:00+02:00,391.820007,0.0
6715,2025-10-14 20:00:00+02:00,391.820007,1.842683
6716,2025-10-14 21:00:00+02:00,391.820007,0.576795
6717,2025-10-14 22:00:00+02:00,391.820007,0.227145
6718,2025-10-14 23:00:00+02:00,391.820007,0.224593


### Note: Reserve Data Processing

The reserve data (`reserve_df`) has been processed in the cells above with the following transformations:
1. **Zero prices converted to NaN** - Prices of 0 are now NaN 
2. **Missing hours filled** - Complete hourly timeline created with NaN for missing hours
3. **Flow directions averaged** - When multiple flow directions exist in same hour, prices are averaged
4. **Pivoted to wide format** - Data is in wide format with columns like `reserve_A03`, `reserve_A04`, etc.
5. **Currency converted to EUR** - Original HUF prices converted to EUR using daily EUR/HUF rates from yfinance

The `reserve_df` is ready to merge and already timezone-aware (Europe/Budapest). All reserve prices are now in EUR/MW/h.


# Load day-ahead data


In [11]:
# Load day-ahead data from Montel source
%run day_ahead_montel.ipynb


Week 1/4: 20250108 to 20250115 -> 169 rows
Week 2/4: 20250408 to 20250415 -> 169 rows
Week 3/4: 20250610 to 20250617 -> 169 rows
Week 4/4: 20251007 to 20251014 -> 169 rows

Total downloaded: 676 rows from 4 weeks across 2024 seasons
Sample date values:
0                   NaN
1    [08/01/2025 00:00]
2    [08/01/2025 01:00]
3    [08/01/2025 02:00]
4    [08/01/2025 03:00]
Name: Date (CET), dtype: object

First date type: <class 'float'>
First date value: nan

Converted date column to datetime. Date range: 2025-01-08 00:00:00 to 2025-10-13 23:00:00
Failed conversions: 4
Price column converted to float. Data shape: (676, 1)
Index is datetime: True

Sample data:
                     HUNGARY (HU)
Date (CET)                       
NaT                           NaN
2025-01-08 00:00:00         84.48
2025-01-08 01:00:00         75.07
2025-01-08 02:00:00         72.94
2025-01-08 03:00:00         69.99
Dropped 4 rows with NaN prices
Remaining rows: 672
NaN values remaining: 0


In [12]:
dayahead_df = montel_df

In [13]:
dayahead_df.reset_index(inplace=True)
dayahead_df.rename(columns={"HUNGARY (HU)":"price_dayahead", "Date (CET)":"datetime"}, inplace=True)

In [14]:
dayahead_df 

Unnamed: 0,datetime,price_dayahead
0,2025-01-08 00:00:00,84.48
1,2025-01-08 01:00:00,75.07
2,2025-01-08 02:00:00,72.94
3,2025-01-08 03:00:00,69.99
4,2025-01-08 04:00:00,83.14
...,...,...
667,2025-10-13 19:00:00,394.32
668,2025-10-13 20:00:00,336.14
669,2025-10-13 21:00:00,233.39
670,2025-10-13 22:00:00,139.37


## Reserve Data Dummy

The reserve API is temporarily unavailable. We generate a dummy reserve dataset in EUR/MW/h aligned to the day-ahead timestamps. The series uses a deterministic diurnal pattern with mild weekday effects and noise for realism.


In [15]:
# # Build dummy reserve data aligned with day-ahead datetimes (EUR/MW/h)
# # Copy datetimes from day-ahead to ensure perfect alignment
# reserve_df = pd.DataFrame({'datetime': dayahead_df['datetime'].copy()})

# # Ensure timezone alignment with day-ahead
# if dayahead_df['datetime'].dt.tz is not None:
#     if reserve_df['datetime'].dt.tz is None:
#         reserve_df['datetime'] = reserve_df['datetime'].dt.tz_localize(dayahead_df['datetime'].dt.tz)
#     elif str(reserve_df['datetime'].dt.tz) != str(dayahead_df['datetime'].dt.tz):
#         reserve_df['datetime'] = reserve_df['datetime'].dt.tz_convert(dayahead_df['datetime'].dt.tz)

# # Deterministic dummy reserve price in EUR/MW/h (seeded for reproducibility)
# rng = np.random.default_rng(42)
# hours = reserve_df['datetime'].dt.hour.to_numpy()
# weekday = reserve_df['datetime'].dt.weekday.to_numpy()
# month = reserve_df['datetime'].dt.month.to_numpy()

# # Mild correlation with day-ahead energy prices
# if 'price_em' in dayahead_df.columns:
#     energy_series = dayahead_df['price_em'].to_numpy(dtype=float)
# else:
#     energy_series = dayahead_df['price_dayahead'].to_numpy(dtype=float)
# emin, emax = np.nanmin(energy_series), np.nanmax(energy_series)
# energy_norm = (energy_series - emin) / (max(emax - emin, 1e-6))  # [0,1]
# energy_norm = np.nan_to_num(energy_norm, nan=0.5)

# # Base diurnal with gentle evening uplift → typical base 0.4–1.3 EUR/MW/h
# base_sin = 0.85 + 0.35 * np.sin((hours - 6) / 24 * 2 * np.pi)        # ~[0.5, 1.2]
# evening_sig = 0.20 / (1.0 + np.exp(-(hours - 17.0) / 1.2))           # adds up to ~0.2 near evening
# _diurnal = base_sin + evening_sig

# # Seasonal/monthly adjustment (mild)
# season_factor = np.ones_like(hours, dtype=float)
# season_factor = np.where(np.isin(month, [12, 1, 2]), 1.10, season_factor)  # winter slight uplift
# season_factor = np.where(np.isin(month, [6, 7, 8]), 0.95, season_factor)   # summer slight discount
# season_factor = np.where(np.isin(month, [9, 10, 11]), 1.05, season_factor) # fall slight uplift

# # Weekday/weekend variation (mild)
# weekday_uplift = np.where(weekday < 5, 1.03, 0.97)

# # Noise (small) and mild energy correlation
# noise = rng.normal(0.0, 0.05, size=len(reserve_df))                    # small base noise
# energy_component = 0.15 * (energy_norm - 0.5)                          # ~[-0.075, +0.075]

# # Combine components → clamp to typical base range 0.4–1.3
# base_price = (_diurnal * season_factor * weekday_uplift) + energy_component + noise
# base_price = np.clip(base_price, 0.4, 1.3)

# # Occasional spikes (~5%) set directly between 3–7 EUR/MW/h
# spike_mask = rng.random(len(reserve_df)) < 0.05
# spike_values = rng.uniform(3.0, 7.0, size=len(reserve_df))
# reserve_price = np.where(spike_mask, spike_values, base_price)

# # Hard clip: 0.2 to 8.0 EUR/MW/h (by HUF cap)
# reserve_price = np.clip(reserve_price, 0.2, 8.0)

# reserve_df['reserve_A03'] = reserve_price.astype(float)

# print(f"Dummy reserve data (EUR/MWh) rows: {len(reserve_df)}")
# print(reserve_df.head(10))


In [16]:
# # Adjustment: double reserve dummy values (prior guidance was half) and re-clip to [0.4, 16.0]
# reserve_df['reserve_A03'] = np.clip(reserve_df['reserve_A03'] * 2.0, 0.4, 16.0)
# print("Adjusted reserve_A03 sample:")
# print(reserve_df.head(10))


# Load production data


In [17]:
# Load real PV production data from data/raw/yearly_production.csv
import pandas as pd

# Read the real production data (assume file is CSV with datetime and production columns)
prod_path = "../data/raw/yearly_production.csv"
yearly_prod = pd.read_csv(prod_path)

# Expect column: ['datetime', 'pv_power_mw'] or similar -- adjust column names if different
# If the first column is not named 'datetime', rename it
if yearly_prod.columns[0] != "datetime":
    yearly_prod.rename(columns={yearly_prod.columns[0]: "datetime"}, inplace=True)

# Fix the misentered year (change 1990 --> 2025)
# If datetime is string, parse
if not np.issubdtype(yearly_prod['datetime'].dtype, np.datetime64):
    yearly_prod['datetime'] = pd.to_datetime(yearly_prod['datetime'])

yearly_prod['datetime'] = yearly_prod['datetime'].apply(
    lambda dt: dt.replace(year=2025) if dt.year == 1990 else dt
)

# If there are still 1990 dates, fix those too (safety)
yearly_prod['datetime'] = yearly_prod['datetime'].apply(
    lambda dt: dt.replace(year=2025) if dt.year != 2025 else dt
)

# Match with date range from dayahead_df -- ensure hourly alignment
match_idx = yearly_prod['datetime'].isin(dayahead_df['datetime'])
production_df_hourly = yearly_prod.loc[match_idx].copy()

# Ensure correct timezone: match to dayahead_df['datetime']
if dayahead_df['datetime'].dt.tz is not None:
    if production_df_hourly['datetime'].dt.tz is None:
        production_df_hourly['datetime'] = production_df_hourly['datetime'].dt.tz_localize('Europe/Budapest')
    elif str(production_df_hourly['datetime'].dt.tz) != str(dayahead_df['datetime'].dt.tz):
        production_df_hourly['datetime'] = production_df_hourly['datetime'].dt.tz_convert(dayahead_df['datetime'].dt.tz)
else:
    if production_df_hourly['datetime'].dt.tz is not None:
        production_df_hourly['datetime'] = production_df_hourly['datetime'].dt.tz_localize(None)

# Rename columns to be consistent (should be ['datetime', 'pv_power_mw'] for downstream processing)
production_df_hourly = production_df_hourly[['datetime', 'pv_power_mw']].copy()

print(f"Real production (hourly): {len(production_df_hourly)} rows")
print(production_df_hourly.head(24))


Real production (hourly): 672 rows
               datetime  pv_power_mw
168 2025-01-08 00:00:00     0.000000
169 2025-01-08 01:00:00     0.000000
170 2025-01-08 02:00:00     0.000000
171 2025-01-08 03:00:00     0.000000
172 2025-01-08 04:00:00     0.000000
173 2025-01-08 05:00:00     0.000000
174 2025-01-08 06:00:00     0.000000
175 2025-01-08 07:00:00     0.000000
176 2025-01-08 08:00:00     3.302382
177 2025-01-08 09:00:00     8.092850
178 2025-01-08 10:00:00     8.044490
179 2025-01-08 11:00:00     6.990625
180 2025-01-08 12:00:00     7.333425
181 2025-01-08 13:00:00     8.542492
182 2025-01-08 14:00:00     6.198839
183 2025-01-08 15:00:00     1.327328
184 2025-01-08 16:00:00     0.000000
185 2025-01-08 17:00:00     0.000000
186 2025-01-08 18:00:00     0.000000
187 2025-01-08 19:00:00     0.000000
188 2025-01-08 20:00:00     0.000000
189 2025-01-08 21:00:00     0.000000
190 2025-01-08 22:00:00     0.000000
191 2025-01-08 23:00:00     0.000000


In [18]:
# Load production data - COMMENTED OUT (using dummy data below)
# %run 01_production_data.ipynb


# Merge Data Sources (Hourly Granularity)

All data sources are now at hourly resolution:
- Production: aggregated from 15-min to hourly (mean of 4 intervals)
- Day-ahead prices: native hourly
- Reserve prices: native hourly (with NaN for missing hours)


In [19]:
# Start with production (hourly granularity)
df = production_df_hourly[['datetime', 'pv_power_mw']].copy()

# Ensure dayahead_df is timezone-aware
if dayahead_df['datetime'].dt.tz is None:
    dayahead_df['datetime'] = dayahead_df['datetime'].dt.tz_localize('Europe/Budapest')

# CRITICAL: Ensure df['datetime'] matches dayahead_df['datetime'] timezone exactly
# This fixes the merge error (datetime64[ns] vs datetime64[ns, Europe/Budapest])
if dayahead_df['datetime'].dt.tz is not None:
    # dayahead_df is timezone-aware, make df match
    if df['datetime'].dt.tz is None:
        df['datetime'] = df['datetime'].dt.tz_localize('Europe/Budapest')
    elif str(df['datetime'].dt.tz) != str(dayahead_df['datetime'].dt.tz):
        df['datetime'] = df['datetime'].dt.tz_convert(dayahead_df['datetime'].dt.tz)
else:
    # dayahead_df is naive, make df naive too
    if df['datetime'].dt.tz is not None:
        df['datetime'] = df['datetime'].dt.tz_localize(None)

# Ensure reserve_df datetime also matches
if 'datetime' in reserve_df.columns:
    if dayahead_df['datetime'].dt.tz is not None:
        if reserve_df['datetime'].dt.tz is None:
            reserve_df['datetime'] = reserve_df['datetime'].dt.tz_localize('Europe/Budapest')
        elif str(reserve_df['datetime'].dt.tz) != str(dayahead_df['datetime'].dt.tz):
            reserve_df['datetime'] = reserve_df['datetime'].dt.tz_convert(dayahead_df['datetime'].dt.tz)
    else:
        if reserve_df['datetime'].dt.tz is not None:
            reserve_df['datetime'] = reserve_df['datetime'].dt.tz_localize(None)

# Merge day-ahead prices (hourly -> exact match)
df = pd.merge(
    df,
    dayahead_df[['datetime', 'price_dayahead']],
    on='datetime',
    how='left'
)

# Merge reserve prices (already pivoted by PSR type, with NaN for missing hours)
df = pd.merge(
    df,
    reserve_df,
    on='datetime',
    how='left'
)

print(f"Merged data: {len(df)} records (hourly)")
print(f"Columns: {df.columns.tolist()}")
print(f"Date range: {df['datetime'].min()} to {df['datetime'].max()}")
print(f"\nData completeness:")
print(f"  Production (pv_power_mw): {df['pv_power_mw'].notna().sum()} valid, {df['pv_power_mw'].isna().sum()} NaN")
print(f"  Day-ahead prices: {df['price_dayahead'].notna().sum()} valid, {df['price_dayahead'].isna().sum()} NaN")
for col in df.columns:
    if col.startswith('reserve_'):
        nan_count = df[col].isna().sum()
        valid_count = df[col].notna().sum()
        print(f"  {col}: {valid_count} valid, {nan_count} NaN ({nan_count/len(df)*100:.1f}%)")
        
print("\nFirst 24 hours:")
df.head(24)


Merged data: 672 records (hourly)
Columns: ['datetime', 'pv_power_mw', 'price_dayahead', 'eurhuf_rate', 'reserve_A03']
Date range: 2025-01-08 00:00:00+01:00 to 2025-10-13 23:00:00+02:00

Data completeness:
  Production (pv_power_mw): 672 valid, 0 NaN
  Day-ahead prices: 672 valid, 0 NaN
  reserve_A03: 672 valid, 0 NaN (0.0%)

First 24 hours:


Unnamed: 0,datetime,pv_power_mw,price_dayahead,eurhuf_rate,reserve_A03
0,2025-01-08 00:00:00+01:00,0.0,84.48,414.464996,0.121844
1,2025-01-08 01:00:00+01:00,0.0,75.07,414.464996,0.248513
2,2025-01-08 02:00:00+01:00,0.0,72.94,414.464996,0.253339
3,2025-01-08 03:00:00+01:00,0.0,69.99,414.464996,0.241275
4,2025-01-08 04:00:00+01:00,0.0,83.14,414.464996,0.241275
5,2025-01-08 05:00:00+01:00,0.0,110.43,414.464996,0.275053
6,2025-01-08 06:00:00+01:00,0.0,132.36,414.464996,0.396897
7,2025-01-08 07:00:00+01:00,0.0,158.96,414.464996,1.278757
8,2025-01-08 08:00:00+01:00,3.302382,152.95,414.464996,3.847128
9,2025-01-08 09:00:00+01:00,8.09285,124.37,414.464996,4.971469


In [20]:
df.head(50)

Unnamed: 0,datetime,pv_power_mw,price_dayahead,eurhuf_rate,reserve_A03
0,2025-01-08 00:00:00+01:00,0.0,84.48,414.464996,0.121844
1,2025-01-08 01:00:00+01:00,0.0,75.07,414.464996,0.248513
2,2025-01-08 02:00:00+01:00,0.0,72.94,414.464996,0.253339
3,2025-01-08 03:00:00+01:00,0.0,69.99,414.464996,0.241275
4,2025-01-08 04:00:00+01:00,0.0,83.14,414.464996,0.241275
5,2025-01-08 05:00:00+01:00,0.0,110.43,414.464996,0.275053
6,2025-01-08 06:00:00+01:00,0.0,132.36,414.464996,0.396897
7,2025-01-08 07:00:00+01:00,0.0,158.96,414.464996,1.278757
8,2025-01-08 08:00:00+01:00,3.302382,152.95,414.464996,3.847128
9,2025-01-08 09:00:00+01:00,8.09285,124.37,414.464996,4.971469


## Feature Engineering - Temporal Features (3)


In [21]:
# 1. k - Hour of day (0-23)
df['k'] = df['datetime'].dt.hour

# 2. weekday - Day of week (0=Mon, 6=Sun)
df['weekday'] = df['datetime'].dt.weekday

# 3. season - Season (0=Winter, 1=Spring, 2=Summer, 3=Fall)
def get_season(month):
    if month in [12, 1, 2]:
        return 0  # Winter
    elif month in [3, 4, 5]:
        return 1  # Spring
    elif month in [6, 7, 8]:
        return 2  # Summer
    else:
        return 3  # Fall

df['season'] = df['datetime'].dt.month.apply(get_season)

print("Temporal features:")
print(df[['datetime', 'k', 'weekday', 'season']].head(10))


Temporal features:
                   datetime  k  weekday  season
0 2025-01-08 00:00:00+01:00  0        2       0
1 2025-01-08 01:00:00+01:00  1        2       0
2 2025-01-08 02:00:00+01:00  2        2       0
3 2025-01-08 03:00:00+01:00  3        2       0
4 2025-01-08 04:00:00+01:00  4        2       0
5 2025-01-08 05:00:00+01:00  5        2       0
6 2025-01-08 06:00:00+01:00  6        2       0
7 2025-01-08 07:00:00+01:00  7        2       0
8 2025-01-08 08:00:00+01:00  8        2       0
9 2025-01-08 09:00:00+01:00  9        2       0


## Feature Engineering - Price & Operations (3 direct + 2 env_state placeholders)


In [22]:
# 4. price_em - Day-ahead energy price (HUF/MWh)
df['price_em'] = df['price_dayahead']

# 5. price_as - Reserve price (use A03 as primary, fallback to others)
if 'reserve_A03' in df.columns:
    df['price_as'] = df['reserve_A03']
elif 'reserve_A04' in df.columns:
    df['price_as'] = df['reserve_A04']
elif 'reserve_A05' in df.columns:
    df['price_as'] = df['reserve_A05']
else:
    df['price_as'] = 0.0

# 6. p_res_total - RES available generation (MW)
df['p_res_total'] = df['pv_power_mw']

# 7. soc - Battery SOC (placeholder, will be filled by environment)
df['soc'] = 0.5  # Initial placeholder

# 8. dod - DOD (placeholder, will be filled by environment)
df['dod'] = 0.0  # Initial placeholder

print("Price & operations features:")
print(df[['datetime', 'price_em', 'price_as', 'p_res_total', 'soc', 'dod']].head())


Price & operations features:
                   datetime  price_em  price_as  p_res_total  soc  dod
0 2025-01-08 00:00:00+01:00     84.48  0.121844          0.0  0.5  0.0
1 2025-01-08 01:00:00+01:00     75.07  0.248513          0.0  0.5  0.0
2 2025-01-08 02:00:00+01:00     72.94  0.253339          0.0  0.5  0.0
3 2025-01-08 03:00:00+01:00     69.99  0.241275          0.0  0.5  0.0
4 2025-01-08 04:00:00+01:00     83.14  0.241275          0.0  0.5  0.0


In [23]:
df

Unnamed: 0,datetime,pv_power_mw,price_dayahead,eurhuf_rate,reserve_A03,k,weekday,season,price_em,price_as,p_res_total,soc,dod
0,2025-01-08 00:00:00+01:00,0.0,84.48,414.464996,0.121844,0,2,0,84.48,0.121844,0.0,0.5,0.0
1,2025-01-08 01:00:00+01:00,0.0,75.07,414.464996,0.248513,1,2,0,75.07,0.248513,0.0,0.5,0.0
2,2025-01-08 02:00:00+01:00,0.0,72.94,414.464996,0.253339,2,2,0,72.94,0.253339,0.0,0.5,0.0
3,2025-01-08 03:00:00+01:00,0.0,69.99,414.464996,0.241275,3,2,0,69.99,0.241275,0.0,0.5,0.0
4,2025-01-08 04:00:00+01:00,0.0,83.14,414.464996,0.241275,4,2,0,83.14,0.241275,0.0,0.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,2025-10-13 19:00:00+02:00,0.0,394.32,391.820007,0.000000,19,0,3,394.32,0.000000,0.0,0.5,0.0
668,2025-10-13 20:00:00+02:00,0.0,336.14,391.820007,1.561942,20,0,3,336.14,1.561942,0.0,0.5,0.0
669,2025-10-13 21:00:00+02:00,0.0,233.39,391.820007,0.245010,21,0,3,233.39,0.245010,0.0,0.5,0.0
670,2025-10-13 22:00:00+02:00,0.0,139.37,391.820007,0.193967,22,0,3,139.37,0.193967,0.0,0.5,0.0


## Feature Engineering - Daily Market Context (8)


In [24]:
# Add date column for grouping
df['date'] = df['datetime'].dt.date

# Window definitions from config (half-day split at hour 12)
morning_hours = range(0, 12)  # [0-11]
evening_hours = range(12, 24) # [12-23]

# Function to compute daily features
def compute_daily_features(group):
    morning_mask = group['k'].isin(morning_hours)
    evening_mask = group['k'].isin(evening_hours)
    
    # Morning max price and hour
    morning_prices = group.loc[morning_mask, 'price_em']
    if len(morning_prices) > 0 and not morning_prices.isna().all():
        group['price_em_max_morning'] = morning_prices.max()
        group['k_em_max_morning'] = group.loc[morning_mask & (group['price_em'] == morning_prices.max()), 'k'].iloc[0]
    else:
        group['price_em_max_morning'] = 0.0
        group['k_em_max_morning'] = 6  # Default morning peak
    
    # Evening max price and hour
    evening_prices = group.loc[evening_mask, 'price_em']
    if len(evening_prices) > 0 and not evening_prices.isna().all():
        group['price_em_max_evening'] = evening_prices.max()
        group['k_em_max_evening'] = group.loc[evening_mask & (group['price_em'] == evening_prices.max()), 'k'].iloc[0]
    else:
        group['price_em_max_evening'] = 0.0
        group['k_em_max_evening'] = 18  # Default evening peak
    
    # Daily min price and hour
    if not group['price_em'].isna().all():
        group['price_em_min'] = group['price_em'].min()
        group['k_em_min'] = group.loc[group['price_em'] == group['price_em'].min(), 'k'].iloc[0]
    else:
        group['price_em_min'] = 0.0
        group['k_em_min'] = 3  # Default night hour
    
    # Daily reserve price min/max
    if not group['price_as'].isna().all():
        group['price_as_min'] = group['price_as'].min()
        group['price_as_max'] = group['price_as'].max()
    else:
        group['price_as_min'] = 0.0
        group['price_as_max'] = 0.0
    
    return group

# Apply daily feature computation
df = df.groupby('date', group_keys=False).apply(compute_daily_features)

print("Daily market context features:")
print(df[['datetime', 'price_em_max_morning', 'k_em_max_morning', 
          'price_em_max_evening', 'k_em_max_evening',
          'price_em_min', 'k_em_min', 'price_as_min', 'price_as_max']].head(24))


Daily market context features:
                    datetime  price_em_max_morning  k_em_max_morning  \
0  2025-01-08 00:00:00+01:00                158.96                 7   
1  2025-01-08 01:00:00+01:00                158.96                 7   
2  2025-01-08 02:00:00+01:00                158.96                 7   
3  2025-01-08 03:00:00+01:00                158.96                 7   
4  2025-01-08 04:00:00+01:00                158.96                 7   
5  2025-01-08 05:00:00+01:00                158.96                 7   
6  2025-01-08 06:00:00+01:00                158.96                 7   
7  2025-01-08 07:00:00+01:00                158.96                 7   
8  2025-01-08 08:00:00+01:00                158.96                 7   
9  2025-01-08 09:00:00+01:00                158.96                 7   
10 2025-01-08 10:00:00+01:00                158.96                 7   
11 2025-01-08 11:00:00+01:00                158.96                 7   
12 2025-01-08 12:00:00+01:00     

  df = df.groupby('date', group_keys=False).apply(compute_daily_features)


In [25]:
## Feature Engineering - Future-aware signals (4)

# Compute per-day future deltas and peak-aware signals
def compute_future_features(group: pd.DataFrame) -> pd.DataFrame:
    prices = group['price_em'].to_numpy(dtype=float)
    k_vals = group['k'].to_numpy(dtype=int)
    n = len(group)

    if n == 0:
        group['max_future_price_delta'] = np.nan
        group['avg_future_price_delta'] = np.nan
        group['peak_hour_price_delta'] = np.nan
        group['time_to_peak_hour'] = 0
        return group

    # Determine daily peak hour directly from the data
    peak_idx = int(np.nanargmax(prices))
    peak_hour = int(k_vals[peak_idx])
    peak_price = float(prices[peak_idx])

    max_future_delta = np.zeros(n, dtype=float)
    avg_future_delta = np.zeros(n, dtype=float)
    peak_hour_delta = np.zeros(n, dtype=float)
    time_to_peak = np.zeros(n, dtype=int)

    for i in range(n):
        if i < n - 1:
            future = prices[i+1:]
            max_future = float(np.nanmax(future))
            avg_future = float(np.nanmean(future))
            max_future_delta[i] = max_future - prices[i]
            avg_future_delta[i] = avg_future - prices[i]
        else:
            # No future information for the last hour
            max_future_delta[i] = 0.0
            avg_future_delta[i] = 0.0
        peak_hour_delta[i] = peak_price - prices[i]
        t = int(k_vals[i])
        time_to_peak[i] = max(peak_hour - t, 0)

    group['max_future_price_delta'] = max_future_delta
    group['avg_future_price_delta'] = avg_future_delta
    group['peak_hour_price_delta'] = peak_hour_delta
    group['time_to_peak_hour'] = time_to_peak
    return group

# Apply per-day future-aware features
df = df.groupby('date', group_keys=False).apply(compute_future_features)

print("Added future-aware features: max_future_price_delta, avg_future_price_delta, peak_hour_price_delta, time_to_peak_hour")
print(df[['datetime','price_em','k','max_future_price_delta','avg_future_price_delta','peak_hour_price_delta','time_to_peak_hour']].head(24))


Added future-aware features: max_future_price_delta, avg_future_price_delta, peak_hour_price_delta, time_to_peak_hour
                    datetime  price_em   k  max_future_price_delta  \
0  2025-01-08 00:00:00+01:00     84.48   0                   74.48   
1  2025-01-08 01:00:00+01:00     75.07   1                   83.89   
2  2025-01-08 02:00:00+01:00     72.94   2                   86.02   
3  2025-01-08 03:00:00+01:00     69.99   3                   88.97   
4  2025-01-08 04:00:00+01:00     83.14   4                   75.82   
5  2025-01-08 05:00:00+01:00    110.43   5                   48.53   
6  2025-01-08 06:00:00+01:00    132.36   6                   26.60   
7  2025-01-08 07:00:00+01:00    158.96   7                   -3.04   
8  2025-01-08 08:00:00+01:00    152.95   8                    2.97   
9  2025-01-08 09:00:00+01:00    124.37   9                   31.55   
10 2025-01-08 10:00:00+01:00    107.36  10                   48.56   
11 2025-01-08 11:00:00+01:00    101.94  11

  df = df.groupby('date', group_keys=False).apply(compute_future_features)


## Create Final Feature Vector (20 dimensions)


In [26]:
# Define features in exact order (extended with 4 future-aware signals)
feature_columns = [
    'k',                      # 1. Hour of day
    'weekday',                # 2. Day of week
    'season',                 # 3. Season
    'price_em',               # 4. Day-ahead price
    'price_as',               # 5. Reserve price
    'p_res_total',            # 6. RES generation
    'soc',                    # 7. Battery SOC (env state)
    'dod',                    # 8. DOD (env state)
    'price_em_max_morning',   # 9. Morning max price
    'price_em_max_evening',   # 10. Evening max price
    'k_em_max_morning',       # 11. Morning max hour
    'k_em_max_evening',       # 12. Evening max hour
    'price_em_min',           # 13. Daily min price
    'k_em_min',               # 14. Daily min hour
    'price_as_min',           # 15. Daily reserve min
    'price_as_max',           # 16. Daily reserve max
    'max_future_price_delta', # 17. Future-aware: max(price[t+1:]) - price[t]
    'avg_future_price_delta', # 18. Future-aware: mean(price[t+1:]) - price[t]
    'peak_hour_price_delta',  # 19. Future-aware: price[peak_hour] - price[t]
    'time_to_peak_hour',      # 20. Future-aware: peak_hour - t (>=0)
]

# Select final features
features_df = df[['datetime'] + feature_columns].copy()

print(f"Final feature vector shape: {features_df.shape}")
print(f"Features ({len(feature_columns)}): {feature_columns}")
print("\nFirst 24 hours:")
print(features_df.head(24))


Final feature vector shape: (672, 21)
Features (20): ['k', 'weekday', 'season', 'price_em', 'price_as', 'p_res_total', 'soc', 'dod', 'price_em_max_morning', 'price_em_max_evening', 'k_em_max_morning', 'k_em_max_evening', 'price_em_min', 'k_em_min', 'price_as_min', 'price_as_max', 'max_future_price_delta', 'avg_future_price_delta', 'peak_hour_price_delta', 'time_to_peak_hour']

First 24 hours:
                    datetime   k  weekday  season  price_em  price_as  \
0  2025-01-08 00:00:00+01:00   0        2       0     84.48  0.121844   
1  2025-01-08 01:00:00+01:00   1        2       0     75.07  0.248513   
2  2025-01-08 02:00:00+01:00   2        2       0     72.94  0.253339   
3  2025-01-08 03:00:00+01:00   3        2       0     69.99  0.241275   
4  2025-01-08 04:00:00+01:00   4        2       0     83.14  0.241275   
5  2025-01-08 05:00:00+01:00   5        2       0    110.43  0.275053   
6  2025-01-08 06:00:00+01:00   6        2       0    132.36  0.396897   
7  2025-01-08 07:00:

In [27]:
# Handle NaN values before normalization
print("Handling NaN values:\n")

# Day-ahead price features: interpolate (linear)
dayahead_features = ['price_em', 'price_em_max_morning', 'price_em_max_evening', 'price_em_min']

print("Day-ahead price features (interpolated):")
for feature in dayahead_features:
    if feature in features_df.columns:
        nan_indices = features_df[features_df[feature].isna()].index
        nan_count = len(nan_indices)
        
        if nan_count > 0:
            # Store original NaN values info
            print(f"\n  {feature}:")
            print(f"    NaN count: {nan_count}")
            print(f"    NaN at indices: {nan_indices.tolist()}")
            print(f"    NaN at hours: {features_df.loc[nan_indices, 'datetime'].tolist()}")
            
            # Interpolate
            features_df[feature] = features_df[feature].interpolate(method='linear', limit_direction='both')
            
            # Show interpolated values
            print(f"    Interpolated values: {features_df.loc[nan_indices, feature].tolist()}")

# Reserve price features: fill with 0 (no market agreement)
reserve_features = ['price_as', 'price_as_min', 'price_as_max']

print("\n\nReserve price features (filled with 0):")
for feature in reserve_features:
    if feature in features_df.columns:
        nan_count = features_df[feature].isna().sum()
        if nan_count > 0:
            print(f"  {feature:25s}: {nan_count} NaN → 0")
            features_df[feature] = features_df[feature].fillna(0)

# Verify no NaN values remain
total_nan = features_df[feature_columns].isna().sum().sum()
print(f"\n{'='*80}")
print(f"✓ Total NaN values after processing: {total_nan}")
print(f"✓ Day-ahead prices: interpolated")
print(f"✓ Reserve prices: filled with 0")
print(f"{'='*80}")

features_df.head(30)


Handling NaN values:

Day-ahead price features (interpolated):


Reserve price features (filled with 0):

✓ Total NaN values after processing: 0
✓ Day-ahead prices: interpolated
✓ Reserve prices: filled with 0


Unnamed: 0,datetime,k,weekday,season,price_em,price_as,p_res_total,soc,dod,price_em_max_morning,...,k_em_max_morning,k_em_max_evening,price_em_min,k_em_min,price_as_min,price_as_max,max_future_price_delta,avg_future_price_delta,peak_hour_price_delta,time_to_peak_hour
0,2025-01-08 00:00:00+01:00,0,2,0,84.48,0.121844,0.0,0.5,0.0,158.96,...,7,17,69.99,3,0.121844,4.971469,74.48,36.375217,74.48,7
1,2025-01-08 01:00:00+01:00,1,2,0,75.07,0.248513,0.0,0.5,0.0,158.96,...,7,17,69.99,3,0.121844,4.971469,83.89,47.866364,83.89,6
2,2025-01-08 02:00:00+01:00,2,2,0,72.94,0.253339,0.0,0.5,0.0,158.96,...,7,17,69.99,3,0.121844,4.971469,86.02,52.377143,86.02,5
3,2025-01-08 03:00:00+01:00,3,2,0,69.99,0.241275,0.0,0.5,0.0,158.96,...,7,17,69.99,3,0.121844,4.971469,88.97,58.0935,88.97,4
4,2025-01-08 04:00:00+01:00,4,2,0,83.14,0.241275,0.0,0.5,0.0,158.96,...,7,17,69.99,3,0.121844,4.971469,75.82,47.308947,75.82,3
5,2025-01-08 05:00:00+01:00,5,2,0,110.43,0.275053,0.0,0.5,0.0,158.96,...,7,17,69.99,3,0.121844,4.971469,48.53,21.131111,48.53,2
6,2025-01-08 06:00:00+01:00,6,2,0,132.36,0.396897,0.0,0.5,0.0,158.96,...,7,17,69.99,3,0.121844,4.971469,26.6,-0.845882,26.6,1
7,2025-01-08 07:00:00+01:00,7,2,0,158.96,1.278757,0.0,0.5,0.0,158.96,...,7,17,69.99,3,0.121844,4.971469,-3.04,-29.16125,0.0,0
8,2025-01-08 08:00:00+01:00,8,2,0,152.95,3.847128,3.302382,0.5,0.0,158.96,...,7,17,69.99,3,0.121844,4.971469,2.97,-24.694667,6.01,0
9,2025-01-08 09:00:00+01:00,9,2,0,124.37,4.971469,8.09285,0.5,0.0,158.96,...,7,17,69.99,3,0.121844,4.971469,31.55,4.162857,34.59,0


## Normalization to [-1, 1] Range

Apply min-max normalization to all features following the article's specification.

Formula: `normalized = 2 * (value - min) / (max - min) - 1`

This maps [min, max] → [-1, 1]


In [28]:
# Step 1: Calculate normalization parameters from the dataset
# Skip soc and dod (env_state placeholders)
# Skip datetime (not a feature)

normalization_params = {}

# Fixed ranges for features that should not be data-dependent
fixed_ranges = {
    'k': (0, 23),           # Hour of day always 0-23
    'weekday': (0, 6),      # Day of week always 0-6
    'season': (0, 3),       # Season always 0-3 (even if test data only has one season)
}

features_to_normalize = [
    'k', 'weekday', 'season',                          # Temporal (3)
    'price_em', 'price_as', 'p_res_total',            # Prices & operations (3, skip soc/dod)
    'price_em_max_morning', 'price_em_max_evening',   # Daily context (8)
    'k_em_max_morning', 'k_em_max_evening',
    'price_em_min', 'k_em_min',
    'price_as_min', 'price_as_max',
    # Future-aware signals (4)
    'max_future_price_delta', 'avg_future_price_delta',
    'peak_hour_price_delta', 'time_to_peak_hour'
]

print("Calculating normalization parameters:\n")
print("(* = fixed range, not calculated from data)\n")

for feature in features_to_normalize:
    if feature in fixed_ranges:
        # Use fixed range
        feature_min, feature_max = fixed_ranges[feature]
        source = "*"
    else:
        # Calculate min/max from data, ignoring NaN values
        feature_min = features_df[feature].min()
        feature_max = features_df[feature].max()
        source = " "
    
    normalization_params[feature] = {
        'min': float(feature_min),
        'max': float(feature_max),
        'range': float(feature_max - feature_min)
    }
    
    print(f"{source} {feature:25s}: min={feature_min:12.4f}, max={feature_max:12.4f}, range={feature_max - feature_min:12.4f}")

print(f"\n✓ Calculated normalization parameters for {len(normalization_params)} features")
print(f"✓ Fixed ranges used for: {', '.join(fixed_ranges.keys())}")


Calculating normalization parameters:

(* = fixed range, not calculated from data)

* k                        : min=      0.0000, max=     23.0000, range=     23.0000
* weekday                  : min=      0.0000, max=      6.0000, range=      6.0000
* season                   : min=      0.0000, max=      3.0000, range=      3.0000
  price_em                 : min=    -34.5300, max=    394.3200, range=    428.8500
  price_as                 : min=      0.0000, max=     33.9517, range=     33.9517
  p_res_total              : min=      0.0000, max=     15.0858, range=     15.0858
  price_em_max_morning     : min=     89.7200, max=    280.0600, range=    190.3400
  price_em_max_evening     : min=    145.6300, max=    394.3200, range=    248.6900
  k_em_max_morning         : min=      0.0000, max=      9.0000, range=      9.0000
  k_em_max_evening         : min=     16.0000, max=     21.0000, range=      5.0000
  price_em_min             : min=    -34.5300, max=    100.5000, range=    1

In [29]:
# Step 2: Apply min-max normalization to [-1, 1]
# Formula: normalized = 2 * (value - min) / (max - min) - 1
# Preserve NaN values in reserve prices (no filling)

features_df_normalized = features_df.copy()

print("Applying min-max normalization to [-1, 1]:\n")

for feature in features_to_normalize:
    params = normalization_params[feature]
    min_val = params['min']
    max_val = params['max']
    range_val = params['range']
    
    # Apply normalization formula
    # Handle case where min == max (constant feature)
    if range_val > 0:
        features_df_normalized[feature] = 2 * (features_df[feature] - min_val) / range_val - 1
    else:
        # If constant, map to 0 (center of [-1, 1])
        features_df_normalized[feature] = 0.0
    
    # Show before/after stats
    original_range = f"[{min_val:.4f}, {max_val:.4f}]"
    norm_min = features_df_normalized[feature].min()
    norm_max = features_df_normalized[feature].max()
    norm_range = f"[{norm_min:.4f}, {norm_max:.4f}]"
    nan_count = features_df_normalized[feature].isna().sum()
    
    print(f"{feature:25s}: {original_range:30s} → {norm_range:25s} ({nan_count} NaN)")

print(f"\n✓ Normalization complete. All features scaled to [-1, 1] range.")
print(f"✓ NaN values preserved (no imputation for missing reserve prices)")


Applying min-max normalization to [-1, 1]:

k                        : [0.0000, 23.0000]              → [-1.0000, 1.0000]         (0 NaN)
weekday                  : [0.0000, 6.0000]               → [-1.0000, 1.0000]         (0 NaN)
season                   : [0.0000, 3.0000]               → [-1.0000, 1.0000]         (0 NaN)
price_em                 : [-34.5300, 394.3200]           → [-1.0000, 1.0000]         (0 NaN)
price_as                 : [0.0000, 33.9517]              → [-1.0000, 1.0000]         (0 NaN)
p_res_total              : [0.0000, 15.0858]              → [-1.0000, 1.0000]         (0 NaN)
price_em_max_morning     : [89.7200, 280.0600]            → [-1.0000, 1.0000]         (0 NaN)
price_em_max_evening     : [145.6300, 394.3200]           → [-1.0000, 1.0000]         (0 NaN)
k_em_max_morning         : [0.0000, 9.0000]               → [-1.0000, 1.0000]         (0 NaN)
k_em_max_evening         : [16.0000, 21.0000]             → [-1.0000, 1.0000]         (0 NaN)
price_em_min    

In [30]:
# Step 3: Save normalization parameters for future use
import yaml

norm_params_dict = {
    'normalization': {
        'method': 'minmax',
        'target_range': [-1, 1],
        'formula': '2 * (value - min) / (max - min) - 1'
    },
    'features': {}
}

# Add each feature's parameters
for feature, params in normalization_params.items():
    norm_params_dict['features'][feature] = {
        'min': params['min'],
        'max': params['max'],
        'range': params['range']
    }

# Save to file
output_path = Path('../data/processed/normalization_params.yaml')
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, 'w') as f:
    yaml.dump(norm_params_dict, f, default_flow_style=False, sort_keys=False)

print(f"✓ Saved normalization parameters to {output_path}")
print(f"  Total features: {len(normalization_params)}")
print(f"  File size: {output_path.stat().st_size} bytes")


✓ Saved normalization parameters to ..\data\processed\normalization_params.yaml
  Total features: 18
  File size: 1440 bytes


In [31]:
# Step 4: Data Quality Check

print("=" * 80)
print("NORMALIZED FEATURE DATASET - QUALITY REPORT")
print("=" * 80)

print(f"\nDataset shape: {features_df_normalized.shape}")
print(f"Date range: {features_df_normalized['datetime'].min()} to {features_df_normalized['datetime'].max()}")
print(f"Total hours: {len(features_df_normalized)}")

print("\n" + "=" * 80)
print("MISSING VALUES (NaN) PER FEATURE")
print("=" * 80)

for feature in feature_columns:
    nan_count = features_df_normalized[feature].isna().sum()
    nan_pct = (nan_count / len(features_df_normalized)) * 100
    print(f"{feature:25s}: {nan_count:4d} NaN ({nan_pct:5.2f}%)")

print("\n" + "=" * 80)
print("NORMALIZED FEATURE RANGES (should be ≈ [-1, 1])")
print("=" * 80)

for feature in features_to_normalize:
    min_val = features_df_normalized[feature].min()
    max_val = features_df_normalized[feature].max()
    mean_val = features_df_normalized[feature].mean()
    std_val = features_df_normalized[feature].std()
    print(f"{feature:25s}: min={min_val:7.4f}, max={max_val:7.4f}, mean={mean_val:7.4f}, std={std_val:7.4f}")

print("\n" + "=" * 80)
print("SAMPLE DATA (First 24 hours)")
print("=" * 80)
print(features_df_normalized.head(24))


NORMALIZED FEATURE DATASET - QUALITY REPORT

Dataset shape: (672, 21)
Date range: 2025-01-08 00:00:00+01:00 to 2025-10-13 23:00:00+02:00
Total hours: 672

MISSING VALUES (NaN) PER FEATURE
k                        :    0 NaN ( 0.00%)
weekday                  :    0 NaN ( 0.00%)
season                   :    0 NaN ( 0.00%)
price_em                 :    0 NaN ( 0.00%)
price_as                 :    0 NaN ( 0.00%)
p_res_total              :    0 NaN ( 0.00%)
soc                      :    0 NaN ( 0.00%)
dod                      :    0 NaN ( 0.00%)
price_em_max_morning     :    0 NaN ( 0.00%)
price_em_max_evening     :    0 NaN ( 0.00%)
k_em_max_morning         :    0 NaN ( 0.00%)
k_em_max_evening         :    0 NaN ( 0.00%)
price_em_min             :    0 NaN ( 0.00%)
k_em_min                 :    0 NaN ( 0.00%)
price_as_min             :    0 NaN ( 0.00%)
price_as_max             :    0 NaN ( 0.00%)
max_future_price_delta   :    0 NaN ( 0.00%)
avg_future_price_delta   :    0 NaN ( 0.00%)
pe

In [32]:
# Step 5: Save normalized dataset

output_path = Path('../data/processed/training_features_normalized.parquet')
output_path.parent.mkdir(parents=True, exist_ok=True)

features_df_normalized.to_parquet(output_path, index=False)

print("=" * 80)
print("DATASET SAVED")
print("=" * 80)
print(f"\n✓ Saved normalized feature dataset to: {output_path}")
print(f"  File size: {output_path.stat().st_size / 1024:.2f} KB")
print(f"  Shape: {features_df_normalized.shape}")
print(f"  Columns: {len(features_df_normalized.columns)}")
print(f"  - datetime (1)")
print(f"  - features (16)")
print(f"\nFeature order (matches features.yaml):")
for i, feature in enumerate(feature_columns, 1):
    print(f"  {i:2d}. {feature}")

print("\n" + "=" * 80)
print("READY FOR TRAINING")
print("=" * 80)
print("\nNext steps:")
print("  1. Load this dataset in your training script")
print("  2. Use normalization_params.yaml to normalize new/test data")
print("  3. soc and dod placeholders will be replaced by environment during training")
print("  4. NaN values in reserve prices indicate no market agreement at those hours")


DATASET SAVED

✓ Saved normalized feature dataset to: ..\data\processed\training_features_normalized.parquet
  File size: 52.76 KB
  Shape: (672, 21)
  Columns: 21
  - datetime (1)
  - features (16)

Feature order (matches features.yaml):
   1. k
   2. weekday
   3. season
   4. price_em
   5. price_as
   6. p_res_total
   7. soc
   8. dod
   9. price_em_max_morning
  10. price_em_max_evening
  11. k_em_max_morning
  12. k_em_max_evening
  13. price_em_min
  14. k_em_min
  15. price_as_min
  16. price_as_max
  17. max_future_price_delta
  18. avg_future_price_delta
  19. peak_hour_price_delta
  20. time_to_peak_hour

READY FOR TRAINING

Next steps:
  1. Load this dataset in your training script
  2. Use normalization_params.yaml to normalize new/test data
  3. soc and dod placeholders will be replaced by environment during training
  4. NaN values in reserve prices indicate no market agreement at those hours


In [33]:
features_df_normalized

Unnamed: 0,datetime,k,weekday,season,price_em,price_as,p_res_total,soc,dod,price_em_max_morning,...,k_em_max_morning,k_em_max_evening,price_em_min,k_em_min,price_as_min,price_as_max,max_future_price_delta,avg_future_price_delta,peak_hour_price_delta,time_to_peak_hour
0,2025-01-08 00:00:00+01:00,-1.000000,-0.333333,-1.0,-0.444981,-0.992823,-1.0,0.5,0.0,-0.272460,...,0.555556,-0.6,0.548100,-0.818182,-0.813548,-0.894396,-0.088772,0.298916,-0.506543,-0.333333
1,2025-01-08 01:00:00+01:00,-0.913043,-0.333333,-1.0,-0.488866,-0.985361,-1.0,0.5,0.0,-0.272460,...,0.555556,-0.6,0.548100,-0.818182,-0.813548,-0.894396,-0.043716,0.365782,-0.444198,-0.428571
2,2025-01-08 02:00:00+01:00,-0.826087,-0.333333,-1.0,-0.498799,-0.985077,-1.0,0.5,0.0,-0.272460,...,0.555556,-0.6,0.548100,-0.818182,-0.813548,-0.894396,-0.033517,0.392030,-0.430086,-0.523810
3,2025-01-08 03:00:00+01:00,-0.739130,-0.333333,-1.0,-0.512557,-0.985787,-1.0,0.5,0.0,-0.272460,...,0.555556,-0.6,0.548100,-0.818182,-0.813548,-0.894396,-0.019392,0.425293,-0.410541,-0.619048
4,2025-01-08 04:00:00+01:00,-0.652174,-0.333333,-1.0,-0.451230,-0.985787,-1.0,0.5,0.0,-0.272460,...,0.555556,-0.6,0.548100,-0.818182,-0.813548,-0.894396,-0.082356,0.362538,-0.497665,-0.714286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,2025-10-13 19:00:00+02:00,0.652174,-1.000000,1.0,1.000000,-1.000000,-1.0,0.5,0.0,0.858779,...,0.555556,0.2,0.880767,-1.000000,-1.000000,-0.987142,-0.723965,-1.000000,-1.000000,-1.000000
668,2025-10-13 20:00:00+02:00,0.739130,-1.000000,1.0,0.728670,-0.907990,-1.0,0.5,0.0,0.858779,...,0.555556,0.2,0.880767,-1.000000,-1.000000,-0.987142,-0.937371,-0.911024,-0.614536,-1.000000
669,2025-10-13 21:00:00+02:00,0.826087,-1.000000,1.0,0.249481,-0.985567,-1.0,0.5,0.0,0.858779,...,0.555556,0.2,0.880767,-1.000000,-1.000000,-0.987142,-0.895571,-0.513320,0.066221,-1.000000
670,2025-10-13 22:00:00+02:00,0.913043,-1.000000,1.0,-0.188994,-0.988574,-1.0,0.5,0.0,0.858779,...,0.555556,0.2,0.880767,-1.000000,-1.000000,-0.987142,-0.533397,-0.019701,0.689138,-1.000000


## Train-Test Split with Seasonal Balance

In [34]:
# Split data into train (22 days) and test (6 days) sets
# Test set must include at least 1 day from each season (Winter, Spring, Summer, Fall)

# Extract unique dates and their seasons
features_df_normalized['date'] = features_df_normalized['datetime'].dt.date
unique_dates = sorted(features_df_normalized['date'].unique())

# Group dates by season
# Season mapping: 0=Winter (Dec,Jan,Feb), 1=Spring (Mar,Apr,May), 2=Summer (Jun,Jul,Aug), 3=Fall (Sep,Oct,Nov)
def get_season_from_date(date):
    month = date.month
    if month in [12, 1, 2]:
        return 0  # Winter
    elif month in [3, 4, 5]:
        return 1  # Spring
    elif month in [6, 7, 8]:
        return 2  # Summer
    else:
        return 3  # Fall

# Group dates by season
dates_by_season = {0: [], 1: [], 2: [], 3: []}
for date in unique_dates:
    season = get_season_from_date(date)
    dates_by_season[season].append(date)

print("Dates by season:")
season_names = {0: "Winter", 1: "Spring", 2: "Summer", 3: "Fall"}
for season, dates in dates_by_season.items():
    print(f"  {season_names[season]}: {len(dates)} days - {[str(d) for d in sorted(dates)]}")

# Select test days: 1 from each season (4 days) + 2 more = 6 days total
test_dates = []

# Select 1 day from each season (middle of week for each)
for season in [0, 1, 2, 3]:
    if dates_by_season[season]:
        # Take middle day of the week (index 3 out of 7, or closest)
        season_dates = sorted(dates_by_season[season])
        mid_idx = len(season_dates) // 2
        test_dates.append(season_dates[mid_idx])
        print(f"  Selected {season_names[season]} test day: {season_dates[mid_idx]}")

# Add 2 more days (one from Winter, one from Summer for balance)
if len(dates_by_season[0]) > 1:
    # Add another Winter day (skip the one already selected)
    winter_dates = sorted([d for d in dates_by_season[0] if d not in test_dates])
    if winter_dates:
        test_dates.append(winter_dates[len(winter_dates)//2])
        print(f"  Selected additional Winter test day: {winter_dates[len(winter_dates)//2]}")

if len(dates_by_season[2]) > 1:
    # Add another Summer day
    summer_dates = sorted([d for d in dates_by_season[2] if d not in test_dates])
    if summer_dates:
        test_dates.append(summer_dates[len(summer_dates)//2])
        print(f"  Selected additional Summer test day: {summer_dates[len(summer_dates)//2]}")

test_dates = sorted(test_dates)
print(f"\n✓ Selected {len(test_dates)} test days: {[str(d) for d in test_dates]}")

# Verify each season has at least 1 day in test set
test_seasons = [get_season_from_date(d) for d in test_dates]
for season in [0, 1, 2, 3]:
    count = test_seasons.count(season)
    if count == 0:
        print(f"⚠️  WARNING: {season_names[season]} has no test days!")
    else:
        print(f"  ✓ {season_names[season]}: {count} test day(s)")

# Split the dataframe
train_df = features_df_normalized[~features_df_normalized['date'].isin(test_dates)].copy()
test_df = features_df_normalized[features_df_normalized['date'].isin(test_dates)].copy()

# Remove date column (not needed in final dataset)
train_df = train_df.drop(columns=['date'])
test_df = test_df.drop(columns=['date'])

# Verify split
train_dates = sorted(features_df_normalized[~features_df_normalized['date'].isin(test_dates)]['date'].unique())
print(f"\n{'='*80}")
print(f"SPLIT SUMMARY")
print(f"{'='*80}")
print(f"Train set: {len(train_dates)} days × 24 hours = {len(train_df)} rows")
print(f"Test set:  {len(test_dates)} days × 24 hours = {len(test_df)} rows")
print(f"Total:     {len(unique_dates)} days × 24 hours = {len(features_df_normalized)} rows")
print(f"\nTrain dates: {[str(d) for d in train_dates]}")
print(f"Test dates:  {[str(d) for d in test_dates]}")

# Save train and test sets
train_path = Path('../data/processed/training_features_normalized_train.parquet')
test_path = Path('../data/processed/training_features_normalized_test.parquet')

train_path.parent.mkdir(parents=True, exist_ok=True)
test_path.parent.mkdir(parents=True, exist_ok=True)

train_df.to_parquet(train_path, index=False)
test_df.to_parquet(test_path, index=False)

print(f"\n{'='*80}")
print(f"FILES SAVED")
print(f"{'='*80}")
print(f"✓ Train set: {train_path}")
print(f"  Size: {train_path.stat().st_size / 1024:.2f} KB, Shape: {train_df.shape}")
print(f"✓ Test set:  {test_path}")
print(f"  Size: {test_path.stat().st_size / 1024:.2f} KB, Shape: {test_df.shape}")
print(f"\n✓ Train-test split complete with seasonal balance!")


Dates by season:
  Winter: 7 days - ['2025-01-08', '2025-01-09', '2025-01-10', '2025-01-11', '2025-01-12', '2025-01-13', '2025-01-14']
  Spring: 7 days - ['2025-04-08', '2025-04-09', '2025-04-10', '2025-04-11', '2025-04-12', '2025-04-13', '2025-04-14']
  Summer: 7 days - ['2025-06-10', '2025-06-11', '2025-06-12', '2025-06-13', '2025-06-14', '2025-06-15', '2025-06-16']
  Fall: 7 days - ['2025-10-07', '2025-10-08', '2025-10-09', '2025-10-10', '2025-10-11', '2025-10-12', '2025-10-13']
  Selected Winter test day: 2025-01-11
  Selected Spring test day: 2025-04-11
  Selected Summer test day: 2025-06-13
  Selected Fall test day: 2025-10-10
  Selected additional Winter test day: 2025-01-12
  Selected additional Summer test day: 2025-06-14

✓ Selected 6 test days: ['2025-01-11', '2025-01-12', '2025-04-11', '2025-06-13', '2025-06-14', '2025-10-10']
  ✓ Winter: 2 test day(s)
  ✓ Spring: 1 test day(s)
  ✓ Summer: 2 test day(s)
  ✓ Fall: 1 test day(s)

SPLIT SUMMARY
Train set: 22 days × 24 hours = 