# Feature Engineering - 16-dimensional State Vector
Build the complete feature set from market data according to features.yaml


In [1]:
import pandas as pd
import numpy as np
import yaml
from pathlib import Path
from datetime import datetime
import yfinance as yf


## Load Configuration


In [2]:
# Load features configuration
with open('../config/features.yaml', 'r') as f:
    features_config = yaml.safe_load(f)

print("Features to generate:")
for i, feat in enumerate(features_config['features'], 1):
    print(f"{i:2d}. {feat['key']:25s} - {feat['desc']}")
    
print(f"\nWindows config: {features_config['windows']}")
print(f"Normalization: {features_config['normalization']}")


Features to generate:
 1. k                         - Hour of day
 2. weekday                   - Day of week (0=Mon..6=Sun)
 3. season                    - Season (0=Winter,1=Spring,2=Summer,3=Fall)
 4. price_em                  - Day-ahead energy price (HUF/MWh)
 5. price_as                  - Reserve/ancillary price (HUF/MW/h)
 6. p_res_total               - RES available generation to AC bus (MW)
 7. soc                       - Battery state of charge (0..1)
 8. dod                       - DOD within current day-part (Eq.18)
 9. price_em_max_morning      - Max energy price in morning window
10. price_em_max_evening      - Max energy price in evening window
11. k_em_max_morning          - Hour index of morning max
12. k_em_max_evening          - Hour index of evening max
13. price_em_min              - Daily minimum energy price
14. k_em_min                  - Hour index of daily minimum price
15. price_as_min              - Daily min reserve price
16. price_as_max              - Da

## Load Data Sources


# Load reserve data


In [3]:
# Reserve API (commented out; API currently broken)
# %run 01_reserve_data_exploration.ipynb


In [4]:
## Fix Reserve Data
# 1. Replace 0 prices with NaN
# 2. Create complete hourly time range with missing hours as NaN
# 3. Average prices when there are 2 flow directions in the same hour


In [5]:
# Step 1: Extract reserve data with all relevant columns (commented out)
# reserve_df_raw = points_full_df_test[[
#     'time_dt',
#     'ts.mktPSRType.psrType',
#     'pt.procurement_Price.amount',
#     'ts.flowDirection.direction'
# ]].copy()
# 
# reserve_df_raw = reserve_df_raw.rename(columns={
#     'time_dt': 'datetime',
#     'ts.mktPSRType.psrType': 'psr_type',
#     'pt.procurement_Price.amount': 'reserve_price',
#     'ts.flowDirection.direction': 'flow_direction'
# })
# 
# # Convert price to numeric
# reserve_df_raw['reserve_price'] = pd.to_numeric(reserve_df_raw['reserve_price'], errors='coerce')
# 
# # Ensure timezone awareness
# if reserve_df_raw['datetime'].dt.tz is None:
#     reserve_df_raw['datetime'] = reserve_df_raw['datetime'].dt.tz_localize('Europe/Budapest')
# 
# print(f"Raw reserve data: {len(reserve_df_raw)} records")
# print(f"Date range: {reserve_df_raw['datetime'].min()} to {reserve_df_raw['datetime'].max()}")
# print(f"PSR types: {reserve_df_raw['psr_type'].unique()}")
# print(f"Flow directions: {reserve_df_raw['flow_direction'].unique()}")
# reserve_df_raw.head(10)


In [6]:
# Step 2: Replace 0 prices with NaN (commented out)
# print(f"Zero prices before: {(reserve_df_raw['reserve_price'] == 0).sum()}")
# reserve_df_raw.loc[reserve_df_raw['reserve_price'] == 0, 'reserve_price'] = np.nan
# print(f"Zero prices after: {(reserve_df_raw['reserve_price'] == 0).sum()}")
# print(f"NaN prices: {reserve_df_raw['reserve_price'].isna().sum()}")
# print()
# 
# # Step 3: Average prices when there are multiple flow directions in the same hour
# # Group by datetime, psr_type and average across flow directions
# reserve_df_hourly = reserve_df_raw.groupby(['datetime', 'psr_type'], as_index=False).agg({
#     'reserve_price': 'mean',  # Average across flow directions
#     'flow_direction': lambda x: ','.join(x.unique())  # Keep track of which directions were averaged
# })
# 
# print(f"After averaging flow directions: {len(reserve_df_hourly)} records")
# print(f"Sample with multiple flow directions:")
# multi_flow = reserve_df_hourly[reserve_df_hourly['flow_direction'].str.contains(',', na=False)]
# if len(multi_flow) > 0:
#     print(multi_flow.head(10))
# else:
#     print("No hours with multiple flow directions found")
# print()
# 
# reserve_df_hourly.head(10)


In [7]:
# Step 4: Create complete hourly time range and fill missing hours with NaN (commented out)
# min_date = reserve_df_hourly['datetime'].min()
# max_date = reserve_df_hourly['datetime'].max()
# 
# complete_hours = pd.date_range(
#     start=min_date.floor('H'),
#     end=max_date.ceil('H'),
#     freq='H',
#     tz='Europe/Budapest'
# )
# 
# print(f"Original datetime range: {min_date} to {max_date}")
# print(f"Complete hourly range: {complete_hours[0]} to {complete_hours[-1]}")
# print(f"Total hours in range: {len(complete_hours)}")
# print(f"Original records: {len(reserve_df_hourly)}")
# print()
# 
# psr_types = reserve_df_hourly['psr_type'].unique()
# print(f"PSR types: {psr_types}")
# print()
# 
# complete_template = pd.DataFrame([
#     {'datetime': dt, 'psr_type': psr}
#     for dt in complete_hours
#     for psr in psr_types
# ])
# 
# print(f"Complete template: {len(complete_template)} records ({len(complete_hours)} hours × {len(psr_types)} PSR types)")
# 
# reserve_df_complete = complete_template.merge(
#     reserve_df_hourly[['datetime', 'psr_type', 'reserve_price']],
#     on=['datetime', 'psr_type'],
#     how='left'
# )
# 
# print(f"After adding missing hours: {len(reserve_df_complete)} records")
# print(f"NaN prices (missing hours): {reserve_df_complete['reserve_price'].isna().sum()}")
# print(f"Non-NaN prices: {reserve_df_complete['reserve_price'].notna().sum()}")
# print()
# 
# print("Sample of complete data (showing some NaN hours):")
# reserve_df_complete.head(20)


In [8]:
# Step 5: Pivot by PSR type for easier merging with other data (commented out)
# reserve_df = reserve_df_complete.pivot(
#     index='datetime',
#     columns='psr_type',
#     values='reserve_price'
# ).reset_index()
# 
# reserve_df.columns = ['datetime'] + [f'reserve_{col}' for col in reserve_df.columns[1:]]
# 
# print(f"Final reserve data shape: {reserve_df.shape}")
# print(f"Columns: {reserve_df.columns.tolist()}")
# print(f"Date range: {reserve_df['datetime'].min()} to {reserve_df['datetime'].max()}")
# print(f"Total hours: {len(reserve_df)}")
# print()
# 
# print("Reserve price statistics:")
# for col in reserve_df.columns:
#     if col.startswith('reserve_'):
#         non_nan = reserve_df[col].notna().sum()
#         nan_count = reserve_df[col].isna().sum()
#         print(f"{col}:")
#         print(f"  Total rows: {len(reserve_df)}")
#         print(f"  Non-NaN: {non_nan}")
#         print(f"  NaN: {nan_count} ({nan_count/len(reserve_df)*100:.1f}%)")
#         if non_nan > 0:
#             print(f"  Mean: {reserve_df[col].mean():.2f}")
#             print(f"  Min: {reserve_df[col].min():.2f}")
#             print(f"  Max: {reserve_df[col].max():.2f}")
#         print()
# 
# print("\nFirst 24 hours (showing NaN values):")
# reserve_df.head(24)


## Convert Reserve Prices from HUF to EUR

Fetch EUR/HUF exchange rates from yfinance and convert reserve prices to EUR for consistency with day-ahead prices.


In [9]:
# Fetch EUR/HUF exchange rates from yfinance (commented out)
# min_date = reserve_df['datetime'].min().date()
# max_date = reserve_df['datetime'].max().date()
# print(f"Fetching EUR/HUF exchange rates from {min_date} to {max_date}...")
# eurhuf = yf.download('EURHUF=X', start=min_date, end=max_date, progress=False)
# fx_rates = eurhuf[['Close']].copy()
# fx_rates = fx_rates.reset_index()
# fx_rates.columns = ['date', 'eurhuf_rate']
# fx_rates['date'] = pd.to_datetime(fx_rates['date']).dt.tz_localize('Europe/Budapest')
# print(f"\nFX rates downloaded: {len(fx_rates)} days")
# print(f"Date range: {fx_rates['date'].min()} to {fx_rates['date'].max()}")
# print(f"EUR/HUF range: {fx_rates['eurhuf_rate'].min():.4f} - {fx_rates['eurhuf_rate'].max():.4f}")
# print(f"Average EUR/HUF: {fx_rates['eurhuf_rate'].mean():.4f}")
# print()
# fx_rates.head(10)


In [10]:
# Merge FX rates with reserve data and convert HUF to EUR (commented out)
# reserve_df['date'] = reserve_df['datetime'].dt.date
# reserve_df['date'] = pd.to_datetime(reserve_df['date']).dt.tz_localize('Europe/Budapest')
# reserve_df = reserve_df.merge(fx_rates, on='date', how='left')
# reserve_df['eurhuf_rate'] = reserve_df['eurhuf_rate'].fillna(method='ffill')
# print(f"FX rates merged. Missing values after forward fill: {reserve_df['eurhuf_rate'].isna().sum()}")
# reserve_cols_huf = [col for col in reserve_df.columns if col.startswith('reserve_')]
# print(f"Converting {len(reserve_cols_huf)} reserve price column(s) from HUF to EUR...")
# for col in reserve_cols_huf:
#     col_eur = col + '_eur'
#     reserve_df[col_eur] = reserve_df[col] / reserve_df['eurhuf_rate']
#     valid_count = reserve_df[col_eur].notna().sum()
#     if valid_count > 0:
#         print(f"\n{col} -> {col_eur}:")
#         print(f"  Mean (HUF): {reserve_df[col].mean():.2f}")
#         print(f"  Mean (EUR): {reserve_df[col_eur].mean():.2f}")
#         print(f"  Sample conversion: {reserve_df[col].iloc[0]:.2f} HUF / {reserve_df['eurhuf_rate'].iloc[0]:.4f} = {reserve_df[col_eur].iloc[0]:.2f} EUR")
# reserve_df = reserve_df.drop(columns=reserve_cols_huf + ['date'])
# rename_dict = {col + '_eur': col for col in reserve_cols_huf}
# reserve_df = reserve_df.rename(columns=rename_dict)
# print(f"\n✓ Conversion complete. Reserve prices are now in EUR.")
# print(f"Final columns: {reserve_df.columns.tolist()}")
# reserve_df.tail(10)


### Note: Reserve Data Processing

The reserve data (`reserve_df`) has been processed in the cells above with the following transformations:
1. **Zero prices converted to NaN** - Prices of 0 are now NaN 
2. **Missing hours filled** - Complete hourly timeline created with NaN for missing hours
3. **Flow directions averaged** - When multiple flow directions exist in same hour, prices are averaged
4. **Pivoted to wide format** - Data is in wide format with columns like `reserve_A03`, `reserve_A04`, etc.
5. **Currency converted to EUR** - Original HUF prices converted to EUR using daily EUR/HUF rates from yfinance

The `reserve_df` is ready to merge and already timezone-aware (Europe/Budapest). All reserve prices are now in EUR/MW/h.


# Load day-ahead data


In [11]:
# Load day-ahead data from Montel source
%run day_ahead_montel.ipynb


Week 1/4: 20240108 to 20240115 -> 169 rows
Week 2/4: 20240408 to 20240415 -> 169 rows
Week 3/4: 20240610 to 20240617 -> 169 rows
Week 4/4: 20241007 to 20241014 -> 169 rows

Total downloaded: 676 rows from 4 weeks across 2024 seasons
Sample date values:
0                   NaN
1    [08/01/2024 00:00]
2    [08/01/2024 01:00]
3    [08/01/2024 02:00]
4    [08/01/2024 03:00]
Name: Date (CET), dtype: object

First date type: <class 'float'>
First date value: nan

Converted date column to datetime. Date range: 2024-01-08 00:00:00 to 2024-10-13 23:00:00
Failed conversions: 4
Price column converted to float. Data shape: (676, 1)
Index is datetime: True

Sample data:
                     HUNGARY (HU)
Date (CET)                       
NaT                           NaN
2024-01-08 00:00:00         83.15
2024-01-08 01:00:00         82.73
2024-01-08 02:00:00         77.47
2024-01-08 03:00:00         75.22
Dropped 4 rows with NaN prices
Remaining rows: 672
NaN values remaining: 0


In [12]:
dayahead_df = montel_df

In [13]:
dayahead_df.reset_index(inplace=True)
dayahead_df.rename(columns={"HUNGARY (HU)":"price_dayahead", "Date (CET)":"datetime"}, inplace=True)

In [14]:
dayahead_df 

Unnamed: 0,datetime,price_dayahead
0,2024-01-08 00:00:00,83.15
1,2024-01-08 01:00:00,82.73
2,2024-01-08 02:00:00,77.47
3,2024-01-08 03:00:00,75.22
4,2024-01-08 04:00:00,78.69
...,...,...
667,2024-10-13 19:00:00,121.77
668,2024-10-13 20:00:00,100.22
669,2024-10-13 21:00:00,75.67
670,2024-10-13 22:00:00,56.03


## Reserve Data Dummy

The reserve API is temporarily unavailable. We generate a dummy reserve dataset in EUR/MW/h aligned to the day-ahead timestamps. The series uses a deterministic diurnal pattern with mild weekday effects and noise for realism.


In [15]:
# Build dummy reserve data aligned with day-ahead datetimes (EUR/MW/h)
# Copy datetimes from day-ahead to ensure perfect alignment
reserve_df = pd.DataFrame({'datetime': dayahead_df['datetime'].copy()})

# Ensure timezone alignment with day-ahead
if dayahead_df['datetime'].dt.tz is not None:
    if reserve_df['datetime'].dt.tz is None:
        reserve_df['datetime'] = reserve_df['datetime'].dt.tz_localize(dayahead_df['datetime'].dt.tz)
    elif str(reserve_df['datetime'].dt.tz) != str(dayahead_df['datetime'].dt.tz):
        reserve_df['datetime'] = reserve_df['datetime'].dt.tz_convert(dayahead_df['datetime'].dt.tz)

# Deterministic dummy reserve price in EUR/MW/h (seeded for reproducibility)
rng = np.random.default_rng(42)
hours = reserve_df['datetime'].dt.hour.to_numpy()
weekday = reserve_df['datetime'].dt.weekday.to_numpy()
month = reserve_df['datetime'].dt.month.to_numpy()

# Mild correlation with day-ahead energy prices
if 'price_em' in dayahead_df.columns:
    energy_series = dayahead_df['price_em'].to_numpy(dtype=float)
else:
    energy_series = dayahead_df['price_dayahead'].to_numpy(dtype=float)
emin, emax = np.nanmin(energy_series), np.nanmax(energy_series)
energy_norm = (energy_series - emin) / (max(emax - emin, 1e-6))  # [0,1]
energy_norm = np.nan_to_num(energy_norm, nan=0.5)

# Base diurnal with gentle evening uplift → typical base 0.4–1.3 EUR/MW/h
base_sin = 0.85 + 0.35 * np.sin((hours - 6) / 24 * 2 * np.pi)        # ~[0.5, 1.2]
evening_sig = 0.20 / (1.0 + np.exp(-(hours - 17.0) / 1.2))           # adds up to ~0.2 near evening
_diurnal = base_sin + evening_sig

# Seasonal/monthly adjustment (mild)
season_factor = np.ones_like(hours, dtype=float)
season_factor = np.where(np.isin(month, [12, 1, 2]), 1.10, season_factor)  # winter slight uplift
season_factor = np.where(np.isin(month, [6, 7, 8]), 0.95, season_factor)   # summer slight discount
season_factor = np.where(np.isin(month, [9, 10, 11]), 1.05, season_factor) # fall slight uplift

# Weekday/weekend variation (mild)
weekday_uplift = np.where(weekday < 5, 1.03, 0.97)

# Noise (small) and mild energy correlation
noise = rng.normal(0.0, 0.05, size=len(reserve_df))                    # small base noise
energy_component = 0.15 * (energy_norm - 0.5)                          # ~[-0.075, +0.075]

# Combine components → clamp to typical base range 0.4–1.3
base_price = (_diurnal * season_factor * weekday_uplift) + energy_component + noise
base_price = np.clip(base_price, 0.4, 1.3)

# Occasional spikes (~5%) set directly between 3–7 EUR/MW/h
spike_mask = rng.random(len(reserve_df)) < 0.05
spike_values = rng.uniform(3.0, 7.0, size=len(reserve_df))
reserve_price = np.where(spike_mask, spike_values, base_price)

# Hard clip: 0.2 to 8.0 EUR/MW/h (by HUF cap)
reserve_price = np.clip(reserve_price, 0.2, 8.0)

reserve_df['reserve_A03'] = reserve_price.astype(float)

print(f"Dummy reserve data (EUR/MWh) rows: {len(reserve_df)}")
print(reserve_df.head(10))


Dummy reserve data (EUR/MWh) rows: 672
             datetime  reserve_A03
0 2024-01-08 00:00:00     0.582244
1 2024-01-08 01:00:00     0.528331
2 2024-01-08 02:00:00     0.655082
3 2024-01-08 03:00:00     0.726588
4 2024-01-08 04:00:00     0.665712
5 2024-01-08 05:00:00     0.795348
6 2024-01-08 06:00:00     0.975032
7 2024-01-08 07:00:00     1.063251
8 2024-01-08 08:00:00     1.178339
9 2024-01-08 09:00:00     1.217330


In [16]:
# Adjustment: double reserve dummy values (prior guidance was half) and re-clip to [0.4, 16.0]
reserve_df['reserve_A03'] = np.clip(reserve_df['reserve_A03'] * 2.0, 0.4, 16.0)
print("Adjusted reserve_A03 sample:")
print(reserve_df.head(10))


Adjusted reserve_A03 sample:
             datetime  reserve_A03
0 2024-01-08 00:00:00     1.164488
1 2024-01-08 01:00:00     1.056662
2 2024-01-08 02:00:00     1.310165
3 2024-01-08 03:00:00     1.453175
4 2024-01-08 04:00:00     1.331425
5 2024-01-08 05:00:00     1.590697
6 2024-01-08 06:00:00     1.950064
7 2024-01-08 07:00:00     2.126502
8 2024-01-08 08:00:00     2.356679
9 2024-01-08 09:00:00     2.434660


# Load production data


In [17]:
# Refined PV generation: asymmetric curve, daily weather, seasonal noise, weekend factor
# Build daily cloud cover factor (reproducible)
production_df_hourly = dayahead_df.copy()
rng_cloud = np.random.default_rng(12345)
unique_prod_dates = dayahead_df['datetime'].dt.date.unique()
cloud_factor_by_date = {d: 0.5 + 0.5 * rng_cloud.beta(5.0, 2.0) for d in unique_prod_dates}


def generate_pv_power_refined(row):
    dt = row['datetime']
    month = dt.month
    hour = dt.hour
    weekday = dt.weekday()

    # Seasonal capacity (MW)
    if month in [12, 1, 2]:
        max_capacity = 15.0
    elif month in [3, 4, 5]:
        max_capacity = 18.0
    elif month in [6, 7, 8]:
        max_capacity = 20.0
    else:
        max_capacity = 16.0

    # Asymmetric daily shape with noon plateau (6–18)
    if 6 <= hour < 10:
        base = ((hour - 6) / 4.0) ** 1.5
    elif 10 <= hour <= 14:
        base = 1.0
    elif 14 < hour <= 18:
        base = ((18.0 - hour) / 4.0) ** 1.2
    else:
        base = 0.0
    base = float(np.clip(base, 0.0, 1.0))

    # Weather (daily cloud clearness 0.5–1.0) and mild weekend factor
    clear = float(cloud_factor_by_date.get(dt.date(), 0.8))
    weekend_factor = 0.98 if weekday >= 5 else 1.0

    # Seasonal noise amplitude
    if month in [12, 1, 2]:
        sigma = 0.07 * max_capacity
    elif month in [6, 7, 8]:
        sigma = 0.03 * max_capacity
    else:
        sigma = 0.05 * max_capacity

    # Deterministic seed above affects np.random noise
    power = max_capacity * base * clear * weekend_factor
    power += np.random.normal(0.0, sigma)
    return float(np.clip(power, 0.0, max_capacity))

# Recompute PV with refined generator (reproducible noise)
np.random.seed(42)
production_df_hourly['pv_power_mw'] = production_df_hourly.apply(generate_pv_power_refined, axis=1)

print(f"Refined PV: range {production_df_hourly['pv_power_mw'].min():.2f}–{production_df_hourly['pv_power_mw'].max():.2f} MW")
production_df_hourly.head(24)


Refined PV: range 0.00–20.00 MW


Unnamed: 0,datetime,price_dayahead,pv_power_mw
0,2024-01-08 00:00:00,83.15,0.52155
1,2024-01-08 01:00:00,82.73,0.0
2,2024-01-08 02:00:00,77.47,0.680073
3,2024-01-08 03:00:00,75.22,1.599181
4,2024-01-08 04:00:00,78.69,0.0
5,2024-01-08 05:00:00,82.1,0.0
6,2024-01-08 06:00:00,94.3,1.658173
7,2024-01-08 07:00:00,111.4,2.437657
8,2024-01-08 08:00:00,121.11,4.122623
9,2024-01-08 09:00:00,117.83,9.049034


In [18]:
# Load production data - COMMENTED OUT (using dummy data below)
# %run 01_production_data.ipynb


# Merge Data Sources (Hourly Granularity)

All data sources are now at hourly resolution:
- Production: aggregated from 15-min to hourly (mean of 4 intervals)
- Day-ahead prices: native hourly
- Reserve prices: native hourly (with NaN for missing hours)


In [19]:
# Start with production (hourly granularity)
df = production_df_hourly[['datetime', 'pv_power_mw']].copy()

# Ensure dayahead_df is timezone-aware
if dayahead_df['datetime'].dt.tz is None:
    dayahead_df['datetime'] = dayahead_df['datetime'].dt.tz_localize('Europe/Budapest')

# CRITICAL: Ensure df['datetime'] matches dayahead_df['datetime'] timezone exactly
# This fixes the merge error (datetime64[ns] vs datetime64[ns, Europe/Budapest])
if dayahead_df['datetime'].dt.tz is not None:
    # dayahead_df is timezone-aware, make df match
    if df['datetime'].dt.tz is None:
        df['datetime'] = df['datetime'].dt.tz_localize('Europe/Budapest')
    elif str(df['datetime'].dt.tz) != str(dayahead_df['datetime'].dt.tz):
        df['datetime'] = df['datetime'].dt.tz_convert(dayahead_df['datetime'].dt.tz)
else:
    # dayahead_df is naive, make df naive too
    if df['datetime'].dt.tz is not None:
        df['datetime'] = df['datetime'].dt.tz_localize(None)

# Ensure reserve_df datetime also matches
if 'datetime' in reserve_df.columns:
    if dayahead_df['datetime'].dt.tz is not None:
        if reserve_df['datetime'].dt.tz is None:
            reserve_df['datetime'] = reserve_df['datetime'].dt.tz_localize('Europe/Budapest')
        elif str(reserve_df['datetime'].dt.tz) != str(dayahead_df['datetime'].dt.tz):
            reserve_df['datetime'] = reserve_df['datetime'].dt.tz_convert(dayahead_df['datetime'].dt.tz)
    else:
        if reserve_df['datetime'].dt.tz is not None:
            reserve_df['datetime'] = reserve_df['datetime'].dt.tz_localize(None)

# Merge day-ahead prices (hourly -> exact match)
df = pd.merge(
    df,
    dayahead_df[['datetime', 'price_dayahead']],
    on='datetime',
    how='left'
)

# Merge reserve prices (already pivoted by PSR type, with NaN for missing hours)
df = pd.merge(
    df,
    reserve_df,
    on='datetime',
    how='left'
)

print(f"Merged data: {len(df)} records (hourly)")
print(f"Columns: {df.columns.tolist()}")
print(f"Date range: {df['datetime'].min()} to {df['datetime'].max()}")
print(f"\nData completeness:")
print(f"  Production (pv_power_mw): {df['pv_power_mw'].notna().sum()} valid, {df['pv_power_mw'].isna().sum()} NaN")
print(f"  Day-ahead prices: {df['price_dayahead'].notna().sum()} valid, {df['price_dayahead'].isna().sum()} NaN")
for col in df.columns:
    if col.startswith('reserve_'):
        nan_count = df[col].isna().sum()
        valid_count = df[col].notna().sum()
        print(f"  {col}: {valid_count} valid, {nan_count} NaN ({nan_count/len(df)*100:.1f}%)")
        
print("\nFirst 24 hours:")
df.head(24)


Merged data: 672 records (hourly)
Columns: ['datetime', 'pv_power_mw', 'price_dayahead', 'reserve_A03']
Date range: 2024-01-08 00:00:00+01:00 to 2024-10-13 23:00:00+02:00

Data completeness:
  Production (pv_power_mw): 672 valid, 0 NaN
  Day-ahead prices: 672 valid, 0 NaN
  reserve_A03: 672 valid, 0 NaN (0.0%)

First 24 hours:


Unnamed: 0,datetime,pv_power_mw,price_dayahead,reserve_A03
0,2024-01-08 00:00:00+01:00,0.52155,83.15,1.164488
1,2024-01-08 01:00:00+01:00,0.0,82.73,1.056662
2,2024-01-08 02:00:00+01:00,0.680073,77.47,1.310165
3,2024-01-08 03:00:00+01:00,1.599181,75.22,1.453175
4,2024-01-08 04:00:00+01:00,0.0,78.69,1.331425
5,2024-01-08 05:00:00+01:00,0.0,82.1,1.590697
6,2024-01-08 06:00:00+01:00,1.658173,94.3,1.950064
7,2024-01-08 07:00:00+01:00,2.437657,111.4,2.126502
8,2024-01-08 08:00:00+01:00,4.122623,121.11,2.356679
9,2024-01-08 09:00:00+01:00,9.049034,117.83,2.43466


In [20]:
df.head(50)

Unnamed: 0,datetime,pv_power_mw,price_dayahead,reserve_A03
0,2024-01-08 00:00:00+01:00,0.52155,83.15,1.164488
1,2024-01-08 01:00:00+01:00,0.0,82.73,1.056662
2,2024-01-08 02:00:00+01:00,0.680073,77.47,1.310165
3,2024-01-08 03:00:00+01:00,1.599181,75.22,1.453175
4,2024-01-08 04:00:00+01:00,0.0,78.69,1.331425
5,2024-01-08 05:00:00+01:00,0.0,82.1,1.590697
6,2024-01-08 06:00:00+01:00,1.658173,94.3,1.950064
7,2024-01-08 07:00:00+01:00,2.437657,111.4,2.126502
8,2024-01-08 08:00:00+01:00,4.122623,121.11,2.356679
9,2024-01-08 09:00:00+01:00,9.049034,117.83,2.43466


## Feature Engineering - Temporal Features (3)


In [21]:
# 1. k - Hour of day (0-23)
df['k'] = df['datetime'].dt.hour

# 2. weekday - Day of week (0=Mon, 6=Sun)
df['weekday'] = df['datetime'].dt.weekday

# 3. season - Season (0=Winter, 1=Spring, 2=Summer, 3=Fall)
def get_season(month):
    if month in [12, 1, 2]:
        return 0  # Winter
    elif month in [3, 4, 5]:
        return 1  # Spring
    elif month in [6, 7, 8]:
        return 2  # Summer
    else:
        return 3  # Fall

df['season'] = df['datetime'].dt.month.apply(get_season)

print("Temporal features:")
print(df[['datetime', 'k', 'weekday', 'season']].head(10))


Temporal features:
                   datetime  k  weekday  season
0 2024-01-08 00:00:00+01:00  0        0       0
1 2024-01-08 01:00:00+01:00  1        0       0
2 2024-01-08 02:00:00+01:00  2        0       0
3 2024-01-08 03:00:00+01:00  3        0       0
4 2024-01-08 04:00:00+01:00  4        0       0
5 2024-01-08 05:00:00+01:00  5        0       0
6 2024-01-08 06:00:00+01:00  6        0       0
7 2024-01-08 07:00:00+01:00  7        0       0
8 2024-01-08 08:00:00+01:00  8        0       0
9 2024-01-08 09:00:00+01:00  9        0       0


## Feature Engineering - Price & Operations (3 direct + 2 env_state placeholders)


In [22]:
# 4. price_em - Day-ahead energy price (HUF/MWh)
df['price_em'] = df['price_dayahead']

# 5. price_as - Reserve price (use A03 as primary, fallback to others)
if 'reserve_A03' in df.columns:
    df['price_as'] = df['reserve_A03']
elif 'reserve_A04' in df.columns:
    df['price_as'] = df['reserve_A04']
elif 'reserve_A05' in df.columns:
    df['price_as'] = df['reserve_A05']
else:
    df['price_as'] = 0.0

# 6. p_res_total - RES available generation (MW)
df['p_res_total'] = df['pv_power_mw']

# 7. soc - Battery SOC (placeholder, will be filled by environment)
df['soc'] = 0.5  # Initial placeholder

# 8. dod - DOD (placeholder, will be filled by environment)
df['dod'] = 0.0  # Initial placeholder

print("Price & operations features:")
print(df[['datetime', 'price_em', 'price_as', 'p_res_total', 'soc', 'dod']].head())


Price & operations features:
                   datetime  price_em  price_as  p_res_total  soc  dod
0 2024-01-08 00:00:00+01:00     83.15  1.164488     0.521550  0.5  0.0
1 2024-01-08 01:00:00+01:00     82.73  1.056662     0.000000  0.5  0.0
2 2024-01-08 02:00:00+01:00     77.47  1.310165     0.680073  0.5  0.0
3 2024-01-08 03:00:00+01:00     75.22  1.453175     1.599181  0.5  0.0
4 2024-01-08 04:00:00+01:00     78.69  1.331425     0.000000  0.5  0.0


In [23]:
df

Unnamed: 0,datetime,pv_power_mw,price_dayahead,reserve_A03,k,weekday,season,price_em,price_as,p_res_total,soc,dod
0,2024-01-08 00:00:00+01:00,0.521550,83.15,1.164488,0,0,0,83.15,1.164488,0.521550,0.5,0.0
1,2024-01-08 01:00:00+01:00,0.000000,82.73,1.056662,1,0,0,82.73,1.056662,0.000000,0.5,0.0
2,2024-01-08 02:00:00+01:00,0.680073,77.47,1.310165,2,0,0,77.47,1.310165,0.680073,0.5,0.0
3,2024-01-08 03:00:00+01:00,1.599181,75.22,1.453175,3,0,0,75.22,1.453175,1.599181,0.5,0.0
4,2024-01-08 04:00:00+01:00,0.000000,78.69,1.331425,4,0,0,78.69,1.331425,0.000000,0.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
667,2024-10-13 19:00:00+02:00,1.203194,121.77,1.973866,19,6,3,121.77,1.973866,1.203194,0.5,0.0
668,2024-10-13 20:00:00+02:00,0.000000,100.22,1.715703,20,6,3,100.22,1.715703,0.000000,0.5,0.0
669,2024-10-13 21:00:00+02:00,0.873205,75.67,1.358440,21,6,3,75.67,1.358440,0.873205,0.5,0.0
670,2024-10-13 22:00:00+02:00,0.996868,56.03,1.570091,22,6,3,56.03,1.570091,0.996868,0.5,0.0


## Feature Engineering - Daily Market Context (8)


In [24]:
# Add date column for grouping
df['date'] = df['datetime'].dt.date

# Window definitions from config (half-day split at hour 12)
morning_hours = range(0, 12)  # [0-11]
evening_hours = range(12, 24) # [12-23]

# Function to compute daily features
def compute_daily_features(group):
    morning_mask = group['k'].isin(morning_hours)
    evening_mask = group['k'].isin(evening_hours)
    
    # Morning max price and hour
    morning_prices = group.loc[morning_mask, 'price_em']
    if len(morning_prices) > 0 and not morning_prices.isna().all():
        group['price_em_max_morning'] = morning_prices.max()
        group['k_em_max_morning'] = group.loc[morning_mask & (group['price_em'] == morning_prices.max()), 'k'].iloc[0]
    else:
        group['price_em_max_morning'] = 0.0
        group['k_em_max_morning'] = 6  # Default morning peak
    
    # Evening max price and hour
    evening_prices = group.loc[evening_mask, 'price_em']
    if len(evening_prices) > 0 and not evening_prices.isna().all():
        group['price_em_max_evening'] = evening_prices.max()
        group['k_em_max_evening'] = group.loc[evening_mask & (group['price_em'] == evening_prices.max()), 'k'].iloc[0]
    else:
        group['price_em_max_evening'] = 0.0
        group['k_em_max_evening'] = 18  # Default evening peak
    
    # Daily min price and hour
    if not group['price_em'].isna().all():
        group['price_em_min'] = group['price_em'].min()
        group['k_em_min'] = group.loc[group['price_em'] == group['price_em'].min(), 'k'].iloc[0]
    else:
        group['price_em_min'] = 0.0
        group['k_em_min'] = 3  # Default night hour
    
    # Daily reserve price min/max
    if not group['price_as'].isna().all():
        group['price_as_min'] = group['price_as'].min()
        group['price_as_max'] = group['price_as'].max()
    else:
        group['price_as_min'] = 0.0
        group['price_as_max'] = 0.0
    
    return group

# Apply daily feature computation
df = df.groupby('date', group_keys=False).apply(compute_daily_features)

print("Daily market context features:")
print(df[['datetime', 'price_em_max_morning', 'k_em_max_morning', 
          'price_em_max_evening', 'k_em_max_evening',
          'price_em_min', 'k_em_min', 'price_as_min', 'price_as_max']].head(24))


Daily market context features:
                    datetime  price_em_max_morning  k_em_max_morning  \
0  2024-01-08 00:00:00+01:00                121.11                 8   
1  2024-01-08 01:00:00+01:00                121.11                 8   
2  2024-01-08 02:00:00+01:00                121.11                 8   
3  2024-01-08 03:00:00+01:00                121.11                 8   
4  2024-01-08 04:00:00+01:00                121.11                 8   
5  2024-01-08 05:00:00+01:00                121.11                 8   
6  2024-01-08 06:00:00+01:00                121.11                 8   
7  2024-01-08 07:00:00+01:00                121.11                 8   
8  2024-01-08 08:00:00+01:00                121.11                 8   
9  2024-01-08 09:00:00+01:00                121.11                 8   
10 2024-01-08 10:00:00+01:00                121.11                 8   
11 2024-01-08 11:00:00+01:00                121.11                 8   
12 2024-01-08 12:00:00+01:00     

  df = df.groupby('date', group_keys=False).apply(compute_daily_features)


In [25]:
## Feature Engineering - Future-aware signals (4)

# Compute per-day future deltas and peak-aware signals
def compute_future_features(group: pd.DataFrame) -> pd.DataFrame:
    prices = group['price_em'].to_numpy(dtype=float)
    k_vals = group['k'].to_numpy(dtype=int)
    n = len(group)

    if n == 0:
        group['max_future_price_delta'] = np.nan
        group['avg_future_price_delta'] = np.nan
        group['peak_hour_price_delta'] = np.nan
        group['time_to_peak_hour'] = 0
        return group

    # Determine daily peak hour directly from the data
    peak_idx = int(np.nanargmax(prices))
    peak_hour = int(k_vals[peak_idx])
    peak_price = float(prices[peak_idx])

    max_future_delta = np.zeros(n, dtype=float)
    avg_future_delta = np.zeros(n, dtype=float)
    peak_hour_delta = np.zeros(n, dtype=float)
    time_to_peak = np.zeros(n, dtype=int)

    for i in range(n):
        if i < n - 1:
            future = prices[i+1:]
            max_future = float(np.nanmax(future))
            avg_future = float(np.nanmean(future))
            max_future_delta[i] = max_future - prices[i]
            avg_future_delta[i] = avg_future - prices[i]
        else:
            # No future information for the last hour
            max_future_delta[i] = 0.0
            avg_future_delta[i] = 0.0
        peak_hour_delta[i] = peak_price - prices[i]
        t = int(k_vals[i])
        time_to_peak[i] = max(peak_hour - t, 0)

    group['max_future_price_delta'] = max_future_delta
    group['avg_future_price_delta'] = avg_future_delta
    group['peak_hour_price_delta'] = peak_hour_delta
    group['time_to_peak_hour'] = time_to_peak
    return group

# Apply per-day future-aware features
df = df.groupby('date', group_keys=False).apply(compute_future_features)

print("Added future-aware features: max_future_price_delta, avg_future_price_delta, peak_hour_price_delta, time_to_peak_hour")
print(df[['datetime','price_em','k','max_future_price_delta','avg_future_price_delta','peak_hour_price_delta','time_to_peak_hour']].head(24))


Added future-aware features: max_future_price_delta, avg_future_price_delta, peak_hour_price_delta, time_to_peak_hour
                    datetime  price_em   k  max_future_price_delta  \
0  2024-01-08 00:00:00+01:00     83.15   0                   47.45   
1  2024-01-08 01:00:00+01:00     82.73   1                   47.87   
2  2024-01-08 02:00:00+01:00     77.47   2                   53.13   
3  2024-01-08 03:00:00+01:00     75.22   3                   55.38   
4  2024-01-08 04:00:00+01:00     78.69   4                   51.91   
5  2024-01-08 05:00:00+01:00     82.10   5                   48.50   
6  2024-01-08 06:00:00+01:00     94.30   6                   36.30   
7  2024-01-08 07:00:00+01:00    111.40   7                   19.20   
8  2024-01-08 08:00:00+01:00    121.11   8                    9.49   
9  2024-01-08 09:00:00+01:00    117.83   9                   12.77   
10 2024-01-08 10:00:00+01:00    110.91  10                   19.69   
11 2024-01-08 11:00:00+01:00    103.09  11

  df = df.groupby('date', group_keys=False).apply(compute_future_features)


## Create Final Feature Vector (20 dimensions)


In [26]:
# Define features in exact order (extended with 4 future-aware signals)
feature_columns = [
    'k',                      # 1. Hour of day
    'weekday',                # 2. Day of week
    'season',                 # 3. Season
    'price_em',               # 4. Day-ahead price
    'price_as',               # 5. Reserve price
    'p_res_total',            # 6. RES generation
    'soc',                    # 7. Battery SOC (env state)
    'dod',                    # 8. DOD (env state)
    'price_em_max_morning',   # 9. Morning max price
    'price_em_max_evening',   # 10. Evening max price
    'k_em_max_morning',       # 11. Morning max hour
    'k_em_max_evening',       # 12. Evening max hour
    'price_em_min',           # 13. Daily min price
    'k_em_min',               # 14. Daily min hour
    'price_as_min',           # 15. Daily reserve min
    'price_as_max',           # 16. Daily reserve max
    'max_future_price_delta', # 17. Future-aware: max(price[t+1:]) - price[t]
    'avg_future_price_delta', # 18. Future-aware: mean(price[t+1:]) - price[t]
    'peak_hour_price_delta',  # 19. Future-aware: price[peak_hour] - price[t]
    'time_to_peak_hour',      # 20. Future-aware: peak_hour - t (>=0)
]

# Select final features
features_df = df[['datetime'] + feature_columns].copy()

print(f"Final feature vector shape: {features_df.shape}")
print(f"Features ({len(feature_columns)}): {feature_columns}")
print("\nFirst 24 hours:")
print(features_df.head(24))


Final feature vector shape: (672, 21)
Features (20): ['k', 'weekday', 'season', 'price_em', 'price_as', 'p_res_total', 'soc', 'dod', 'price_em_max_morning', 'price_em_max_evening', 'k_em_max_morning', 'k_em_max_evening', 'price_em_min', 'k_em_min', 'price_as_min', 'price_as_max', 'max_future_price_delta', 'avg_future_price_delta', 'peak_hour_price_delta', 'time_to_peak_hour']

First 24 hours:
                    datetime   k  weekday  season  price_em   price_as  \
0  2024-01-08 00:00:00+01:00   0        0       0     83.15   1.164488   
1  2024-01-08 01:00:00+01:00   1        0       0     82.73   1.056662   
2  2024-01-08 02:00:00+01:00   2        0       0     77.47   1.310165   
3  2024-01-08 03:00:00+01:00   3        0       0     75.22   1.453175   
4  2024-01-08 04:00:00+01:00   4        0       0     78.69   1.331425   
5  2024-01-08 05:00:00+01:00   5        0       0     82.10   1.590697   
6  2024-01-08 06:00:00+01:00   6        0       0     94.30   1.950064   
7  2024-01-0

In [27]:
# Handle NaN values before normalization
print("Handling NaN values:\n")

# Day-ahead price features: interpolate (linear)
dayahead_features = ['price_em', 'price_em_max_morning', 'price_em_max_evening', 'price_em_min']

print("Day-ahead price features (interpolated):")
for feature in dayahead_features:
    if feature in features_df.columns:
        nan_indices = features_df[features_df[feature].isna()].index
        nan_count = len(nan_indices)
        
        if nan_count > 0:
            # Store original NaN values info
            print(f"\n  {feature}:")
            print(f"    NaN count: {nan_count}")
            print(f"    NaN at indices: {nan_indices.tolist()}")
            print(f"    NaN at hours: {features_df.loc[nan_indices, 'datetime'].tolist()}")
            
            # Interpolate
            features_df[feature] = features_df[feature].interpolate(method='linear', limit_direction='both')
            
            # Show interpolated values
            print(f"    Interpolated values: {features_df.loc[nan_indices, feature].tolist()}")

# Reserve price features: fill with 0 (no market agreement)
reserve_features = ['price_as', 'price_as_min', 'price_as_max']

print("\n\nReserve price features (filled with 0):")
for feature in reserve_features:
    if feature in features_df.columns:
        nan_count = features_df[feature].isna().sum()
        if nan_count > 0:
            print(f"  {feature:25s}: {nan_count} NaN → 0")
            features_df[feature] = features_df[feature].fillna(0)

# Verify no NaN values remain
total_nan = features_df[feature_columns].isna().sum().sum()
print(f"\n{'='*80}")
print(f"✓ Total NaN values after processing: {total_nan}")
print(f"✓ Day-ahead prices: interpolated")
print(f"✓ Reserve prices: filled with 0")
print(f"{'='*80}")

features_df.head(30)


Handling NaN values:

Day-ahead price features (interpolated):


Reserve price features (filled with 0):

✓ Total NaN values after processing: 0
✓ Day-ahead prices: interpolated
✓ Reserve prices: filled with 0


Unnamed: 0,datetime,k,weekday,season,price_em,price_as,p_res_total,soc,dod,price_em_max_morning,...,k_em_max_morning,k_em_max_evening,price_em_min,k_em_min,price_as_min,price_as_max,max_future_price_delta,avg_future_price_delta,peak_hour_price_delta,time_to_peak_hour
0,2024-01-08 00:00:00+01:00,0,0,0,83.15,1.164488,0.52155,0.5,0.0,121.11,...,8,17,75.22,3,1.056662,12.677661,47.45,19.64087,47.45,17
1,2024-01-08 01:00:00+01:00,1,0,0,82.73,1.056662,0.0,0.5,0.0,121.11,...,8,17,75.22,3,1.056662,12.677661,47.87,20.972727,47.87,16
2,2024-01-08 02:00:00+01:00,2,0,0,77.47,1.310165,0.680073,0.5,0.0,121.11,...,8,17,75.22,3,1.056662,12.677661,53.13,27.481905,53.13,15
3,2024-01-08 03:00:00+01:00,3,0,0,75.22,1.453175,1.599181,0.5,0.0,121.11,...,8,17,75.22,3,1.056662,12.677661,55.38,31.2185,55.38,14
4,2024-01-08 04:00:00+01:00,4,0,0,78.69,1.331425,0.0,0.5,0.0,121.11,...,8,17,75.22,3,1.056662,12.677661,51.91,29.208947,51.91,13
5,2024-01-08 05:00:00+01:00,5,0,0,82.1,1.590697,0.0,0.5,0.0,121.11,...,8,17,75.22,3,1.056662,12.677661,48.5,27.232222,48.5,12
6,2024-01-08 06:00:00+01:00,6,0,0,94.3,1.950064,1.658173,0.5,0.0,121.11,...,8,17,75.22,3,1.056662,12.677661,36.3,15.916471,36.3,11
7,2024-01-08 07:00:00+01:00,7,0,0,111.4,2.126502,2.437657,0.5,0.0,121.11,...,8,17,75.22,3,1.056662,12.677661,19.2,-1.2575,19.2,10
8,2024-01-08 08:00:00+01:00,8,0,0,121.11,2.356679,4.122623,0.5,0.0,121.11,...,8,17,75.22,3,1.056662,12.677661,9.49,-11.698667,9.49,9
9,2024-01-08 09:00:00+01:00,9,0,0,117.83,2.43466,9.049034,0.5,0.0,121.11,...,8,17,75.22,3,1.056662,12.677661,12.77,-9.02,12.77,8


## Normalization to [-1, 1] Range

Apply min-max normalization to all features following the article's specification.

Formula: `normalized = 2 * (value - min) / (max - min) - 1`

This maps [min, max] → [-1, 1]


In [28]:
# Step 1: Calculate normalization parameters from the dataset
# Skip soc and dod (env_state placeholders)
# Skip datetime (not a feature)

normalization_params = {}

# Fixed ranges for features that should not be data-dependent
fixed_ranges = {
    'k': (0, 23),           # Hour of day always 0-23
    'weekday': (0, 6),      # Day of week always 0-6
    'season': (0, 3),       # Season always 0-3 (even if test data only has one season)
}

features_to_normalize = [
    'k', 'weekday', 'season',                          # Temporal (3)
    'price_em', 'price_as', 'p_res_total',            # Prices & operations (3, skip soc/dod)
    'price_em_max_morning', 'price_em_max_evening',   # Daily context (8)
    'k_em_max_morning', 'k_em_max_evening',
    'price_em_min', 'k_em_min',
    'price_as_min', 'price_as_max',
    # Future-aware signals (4)
    'max_future_price_delta', 'avg_future_price_delta',
    'peak_hour_price_delta', 'time_to_peak_hour'
]

print("Calculating normalization parameters:\n")
print("(* = fixed range, not calculated from data)\n")

for feature in features_to_normalize:
    if feature in fixed_ranges:
        # Use fixed range
        feature_min, feature_max = fixed_ranges[feature]
        source = "*"
    else:
        # Calculate min/max from data, ignoring NaN values
        feature_min = features_df[feature].min()
        feature_max = features_df[feature].max()
        source = " "
    
    normalization_params[feature] = {
        'min': float(feature_min),
        'max': float(feature_max),
        'range': float(feature_max - feature_min)
    }
    
    print(f"{source} {feature:25s}: min={feature_min:12.4f}, max={feature_max:12.4f}, range={feature_max - feature_min:12.4f}")

print(f"\n✓ Calculated normalization parameters for {len(normalization_params)} features")
print(f"✓ Fixed ranges used for: {', '.join(fixed_ranges.keys())}")


Calculating normalization parameters:

(* = fixed range, not calculated from data)

* k                        : min=      0.0000, max=     23.0000, range=     23.0000
* weekday                  : min=      0.0000, max=      6.0000, range=      6.0000
* season                   : min=      0.0000, max=      3.0000, range=      3.0000
  price_em                 : min=    -83.2900, max=    247.3500, range=    330.6400
  price_as                 : min=      0.8000, max=     13.8175, range=     13.0175
  p_res_total              : min=      0.0000, max=     20.0000, range=     20.0000
  price_em_max_morning     : min=      5.1900, max=    175.5100, range=    170.3200
  price_em_max_evening     : min=     77.1500, max=    247.3500, range=    170.2000
  k_em_max_morning         : min=      0.0000, max=     11.0000, range=     11.0000
  k_em_max_evening         : min=     16.0000, max=     21.0000, range=      5.0000
  price_em_min             : min=    -83.2900, max=     85.8100, range=    1

In [29]:
# Step 2: Apply min-max normalization to [-1, 1]
# Formula: normalized = 2 * (value - min) / (max - min) - 1
# Preserve NaN values in reserve prices (no filling)

features_df_normalized = features_df.copy()

print("Applying min-max normalization to [-1, 1]:\n")

for feature in features_to_normalize:
    params = normalization_params[feature]
    min_val = params['min']
    max_val = params['max']
    range_val = params['range']
    
    # Apply normalization formula
    # Handle case where min == max (constant feature)
    if range_val > 0:
        features_df_normalized[feature] = 2 * (features_df[feature] - min_val) / range_val - 1
    else:
        # If constant, map to 0 (center of [-1, 1])
        features_df_normalized[feature] = 0.0
    
    # Show before/after stats
    original_range = f"[{min_val:.4f}, {max_val:.4f}]"
    norm_min = features_df_normalized[feature].min()
    norm_max = features_df_normalized[feature].max()
    norm_range = f"[{norm_min:.4f}, {norm_max:.4f}]"
    nan_count = features_df_normalized[feature].isna().sum()
    
    print(f"{feature:25s}: {original_range:30s} → {norm_range:25s} ({nan_count} NaN)")

print(f"\n✓ Normalization complete. All features scaled to [-1, 1] range.")
print(f"✓ NaN values preserved (no imputation for missing reserve prices)")


Applying min-max normalization to [-1, 1]:

k                        : [0.0000, 23.0000]              → [-1.0000, 1.0000]         (0 NaN)
weekday                  : [0.0000, 6.0000]               → [-1.0000, 1.0000]         (0 NaN)
season                   : [0.0000, 3.0000]               → [-1.0000, 1.0000]         (0 NaN)
price_em                 : [-83.2900, 247.3500]           → [-1.0000, 1.0000]         (0 NaN)
price_as                 : [0.8000, 13.8175]              → [-1.0000, 1.0000]         (0 NaN)
p_res_total              : [0.0000, 20.0000]              → [-1.0000, 1.0000]         (0 NaN)
price_em_max_morning     : [5.1900, 175.5100]             → [-1.0000, 1.0000]         (0 NaN)
price_em_max_evening     : [77.1500, 247.3500]            → [-1.0000, 1.0000]         (0 NaN)
k_em_max_morning         : [0.0000, 11.0000]              → [-1.0000, 1.0000]         (0 NaN)
k_em_max_evening         : [16.0000, 21.0000]             → [-1.0000, 1.0000]         (0 NaN)
price_em_min    

In [30]:
# Step 3: Save normalization parameters for future use
import yaml

norm_params_dict = {
    'normalization': {
        'method': 'minmax',
        'target_range': [-1, 1],
        'formula': '2 * (value - min) / (max - min) - 1'
    },
    'features': {}
}

# Add each feature's parameters
for feature, params in normalization_params.items():
    norm_params_dict['features'][feature] = {
        'min': params['min'],
        'max': params['max'],
        'range': params['range']
    }

# Save to file
output_path = Path('../data/processed/normalization_params.yaml')
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, 'w') as f:
    yaml.dump(norm_params_dict, f, default_flow_style=False, sort_keys=False)

print(f"✓ Saved normalization parameters to {output_path}")
print(f"  Total features: {len(normalization_params)}")
print(f"  File size: {output_path.stat().st_size} bytes")


✓ Saved normalization parameters to ..\data\processed\normalization_params.yaml
  Total features: 18
  File size: 1465 bytes


In [31]:
# Step 4: Data Quality Check

print("=" * 80)
print("NORMALIZED FEATURE DATASET - QUALITY REPORT")
print("=" * 80)

print(f"\nDataset shape: {features_df_normalized.shape}")
print(f"Date range: {features_df_normalized['datetime'].min()} to {features_df_normalized['datetime'].max()}")
print(f"Total hours: {len(features_df_normalized)}")

print("\n" + "=" * 80)
print("MISSING VALUES (NaN) PER FEATURE")
print("=" * 80)

for feature in feature_columns:
    nan_count = features_df_normalized[feature].isna().sum()
    nan_pct = (nan_count / len(features_df_normalized)) * 100
    print(f"{feature:25s}: {nan_count:4d} NaN ({nan_pct:5.2f}%)")

print("\n" + "=" * 80)
print("NORMALIZED FEATURE RANGES (should be ≈ [-1, 1])")
print("=" * 80)

for feature in features_to_normalize:
    min_val = features_df_normalized[feature].min()
    max_val = features_df_normalized[feature].max()
    mean_val = features_df_normalized[feature].mean()
    std_val = features_df_normalized[feature].std()
    print(f"{feature:25s}: min={min_val:7.4f}, max={max_val:7.4f}, mean={mean_val:7.4f}, std={std_val:7.4f}")

print("\n" + "=" * 80)
print("SAMPLE DATA (First 24 hours)")
print("=" * 80)
print(features_df_normalized.head(24))


NORMALIZED FEATURE DATASET - QUALITY REPORT

Dataset shape: (672, 21)
Date range: 2024-01-08 00:00:00+01:00 to 2024-10-13 23:00:00+02:00
Total hours: 672

MISSING VALUES (NaN) PER FEATURE
k                        :    0 NaN ( 0.00%)
weekday                  :    0 NaN ( 0.00%)
season                   :    0 NaN ( 0.00%)
price_em                 :    0 NaN ( 0.00%)
price_as                 :    0 NaN ( 0.00%)
p_res_total              :    0 NaN ( 0.00%)
soc                      :    0 NaN ( 0.00%)
dod                      :    0 NaN ( 0.00%)
price_em_max_morning     :    0 NaN ( 0.00%)
price_em_max_evening     :    0 NaN ( 0.00%)
k_em_max_morning         :    0 NaN ( 0.00%)
k_em_max_evening         :    0 NaN ( 0.00%)
price_em_min             :    0 NaN ( 0.00%)
k_em_min                 :    0 NaN ( 0.00%)
price_as_min             :    0 NaN ( 0.00%)
price_as_max             :    0 NaN ( 0.00%)
max_future_price_delta   :    0 NaN ( 0.00%)
avg_future_price_delta   :    0 NaN ( 0.00%)
pe

In [32]:
# Step 5: Save normalized dataset

output_path = Path('../data/processed/training_features_normalized.parquet')
output_path.parent.mkdir(parents=True, exist_ok=True)

features_df_normalized.to_parquet(output_path, index=False)

print("=" * 80)
print("DATASET SAVED")
print("=" * 80)
print(f"\n✓ Saved normalized feature dataset to: {output_path}")
print(f"  File size: {output_path.stat().st_size / 1024:.2f} KB")
print(f"  Shape: {features_df_normalized.shape}")
print(f"  Columns: {len(features_df_normalized.columns)}")
print(f"  - datetime (1)")
print(f"  - features (16)")
print(f"\nFeature order (matches features.yaml):")
for i, feature in enumerate(feature_columns, 1):
    print(f"  {i:2d}. {feature}")

print("\n" + "=" * 80)
print("READY FOR TRAINING")
print("=" * 80)
print("\nNext steps:")
print("  1. Load this dataset in your training script")
print("  2. Use normalization_params.yaml to normalize new/test data")
print("  3. soc and dod placeholders will be replaced by environment during training")
print("  4. NaN values in reserve prices indicate no market agreement at those hours")


DATASET SAVED

✓ Saved normalized feature dataset to: ..\data\processed\training_features_normalized.parquet
  File size: 54.19 KB
  Shape: (672, 21)
  Columns: 21
  - datetime (1)
  - features (16)

Feature order (matches features.yaml):
   1. k
   2. weekday
   3. season
   4. price_em
   5. price_as
   6. p_res_total
   7. soc
   8. dod
   9. price_em_max_morning
  10. price_em_max_evening
  11. k_em_max_morning
  12. k_em_max_evening
  13. price_em_min
  14. k_em_min
  15. price_as_min
  16. price_as_max
  17. max_future_price_delta
  18. avg_future_price_delta
  19. peak_hour_price_delta
  20. time_to_peak_hour

READY FOR TRAINING

Next steps:
  1. Load this dataset in your training script
  2. Use normalization_params.yaml to normalize new/test data
  3. soc and dod placeholders will be replaced by environment during training
  4. NaN values in reserve prices indicate no market agreement at those hours


In [33]:
features_df_normalized

Unnamed: 0,datetime,k,weekday,season,price_em,price_as,p_res_total,soc,dod,price_em_max_morning,...,k_em_max_morning,k_em_max_evening,price_em_min,k_em_min,price_as_min,price_as_max,max_future_price_delta,avg_future_price_delta,peak_hour_price_delta,time_to_peak_hour
0,2024-01-08 00:00:00+01:00,-1.000000,-1.0,-1.0,0.006775,-0.944000,-0.947845,0.5,0.0,0.361202,...,0.454545,-0.6,0.874749,-0.904762,0.383343,0.804594,-0.199855,0.060395,-0.638228,0.619048
1,2024-01-08 01:00:00+01:00,-0.913043,-1.0,-1.0,0.004234,-0.960567,-1.000000,0.5,0.0,0.361202,...,0.454545,-0.6,0.874749,-0.904762,0.383343,0.804594,-0.197509,0.069083,-0.635026,0.523810
2,2024-01-08 02:00:00+01:00,-0.826087,-1.0,-1.0,-0.027583,-0.921619,-0.931993,0.5,0.0,0.361202,...,0.454545,-0.6,0.874749,-0.904762,0.383343,0.804594,-0.168137,0.111544,-0.594922,0.428571
3,2024-01-08 03:00:00+01:00,-0.739130,-1.0,-1.0,-0.041193,-0.899647,-0.840082,0.5,0.0,0.361202,...,0.454545,-0.6,0.874749,-0.904762,0.383343,0.804594,-0.155573,0.135919,-0.577768,0.333333
4,2024-01-08 04:00:00+01:00,-0.652174,-1.0,-1.0,-0.020203,-0.918352,-1.000000,0.5,0.0,0.361202,...,0.454545,-0.6,0.874749,-0.904762,0.383343,0.804594,-0.174950,0.122810,-0.604224,0.238095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,2024-10-13 19:00:00+02:00,0.652174,1.0,1.0,0.240382,-0.819649,-0.879681,0.5,0.0,-1.000000,...,0.454545,0.2,-0.165464,0.047619,-0.303051,0.733435,-0.585157,-0.395212,-1.000000,-1.000000
668,2024-10-13 20:00:00+02:00,0.739130,1.0,1.0,0.110029,-0.859312,-1.000000,0.5,0.0,-1.000000,...,0.454545,0.2,-0.165464,0.047619,-0.303051,0.733435,-0.601910,-0.316938,-0.835697,-1.000000
669,2024-10-13 21:00:00+02:00,0.826087,1.0,1.0,-0.038471,-0.914202,-0.912679,0.5,0.0,-1.000000,...,0.454545,0.2,-0.165464,0.047619,-0.303051,0.733435,-0.574492,-0.201324,-0.648521,-1.000000
670,2024-10-13 22:00:00+02:00,0.913043,1.0,1.0,-0.157271,-0.881684,-0.900313,0.5,0.0,-1.000000,...,0.454545,0.2,-0.165464,0.047619,-0.303051,0.733435,-0.474201,-0.078687,-0.498780,-1.000000


## Train-Test Split with Seasonal Balance

In [34]:
# Split data into train (22 days) and test (6 days) sets
# Test set must include at least 1 day from each season (Winter, Spring, Summer, Fall)

# Extract unique dates and their seasons
features_df_normalized['date'] = features_df_normalized['datetime'].dt.date
unique_dates = sorted(features_df_normalized['date'].unique())

# Group dates by season
# Season mapping: 0=Winter (Dec,Jan,Feb), 1=Spring (Mar,Apr,May), 2=Summer (Jun,Jul,Aug), 3=Fall (Sep,Oct,Nov)
def get_season_from_date(date):
    month = date.month
    if month in [12, 1, 2]:
        return 0  # Winter
    elif month in [3, 4, 5]:
        return 1  # Spring
    elif month in [6, 7, 8]:
        return 2  # Summer
    else:
        return 3  # Fall

# Group dates by season
dates_by_season = {0: [], 1: [], 2: [], 3: []}
for date in unique_dates:
    season = get_season_from_date(date)
    dates_by_season[season].append(date)

print("Dates by season:")
season_names = {0: "Winter", 1: "Spring", 2: "Summer", 3: "Fall"}
for season, dates in dates_by_season.items():
    print(f"  {season_names[season]}: {len(dates)} days - {[str(d) for d in sorted(dates)]}")

# Select test days: 1 from each season (4 days) + 2 more = 6 days total
test_dates = []

# Select 1 day from each season (middle of week for each)
for season in [0, 1, 2, 3]:
    if dates_by_season[season]:
        # Take middle day of the week (index 3 out of 7, or closest)
        season_dates = sorted(dates_by_season[season])
        mid_idx = len(season_dates) // 2
        test_dates.append(season_dates[mid_idx])
        print(f"  Selected {season_names[season]} test day: {season_dates[mid_idx]}")

# Add 2 more days (one from Winter, one from Summer for balance)
if len(dates_by_season[0]) > 1:
    # Add another Winter day (skip the one already selected)
    winter_dates = sorted([d for d in dates_by_season[0] if d not in test_dates])
    if winter_dates:
        test_dates.append(winter_dates[len(winter_dates)//2])
        print(f"  Selected additional Winter test day: {winter_dates[len(winter_dates)//2]}")

if len(dates_by_season[2]) > 1:
    # Add another Summer day
    summer_dates = sorted([d for d in dates_by_season[2] if d not in test_dates])
    if summer_dates:
        test_dates.append(summer_dates[len(summer_dates)//2])
        print(f"  Selected additional Summer test day: {summer_dates[len(summer_dates)//2]}")

test_dates = sorted(test_dates)
print(f"\n✓ Selected {len(test_dates)} test days: {[str(d) for d in test_dates]}")

# Verify each season has at least 1 day in test set
test_seasons = [get_season_from_date(d) for d in test_dates]
for season in [0, 1, 2, 3]:
    count = test_seasons.count(season)
    if count == 0:
        print(f"⚠️  WARNING: {season_names[season]} has no test days!")
    else:
        print(f"  ✓ {season_names[season]}: {count} test day(s)")

# Split the dataframe
train_df = features_df_normalized[~features_df_normalized['date'].isin(test_dates)].copy()
test_df = features_df_normalized[features_df_normalized['date'].isin(test_dates)].copy()

# Remove date column (not needed in final dataset)
train_df = train_df.drop(columns=['date'])
test_df = test_df.drop(columns=['date'])

# Verify split
train_dates = sorted(features_df_normalized[~features_df_normalized['date'].isin(test_dates)]['date'].unique())
print(f"\n{'='*80}")
print(f"SPLIT SUMMARY")
print(f"{'='*80}")
print(f"Train set: {len(train_dates)} days × 24 hours = {len(train_df)} rows")
print(f"Test set:  {len(test_dates)} days × 24 hours = {len(test_df)} rows")
print(f"Total:     {len(unique_dates)} days × 24 hours = {len(features_df_normalized)} rows")
print(f"\nTrain dates: {[str(d) for d in train_dates]}")
print(f"Test dates:  {[str(d) for d in test_dates]}")

# Save train and test sets
train_path = Path('../data/processed/training_features_normalized_train.parquet')
test_path = Path('../data/processed/training_features_normalized_test.parquet')

train_path.parent.mkdir(parents=True, exist_ok=True)
test_path.parent.mkdir(parents=True, exist_ok=True)

train_df.to_parquet(train_path, index=False)
test_df.to_parquet(test_path, index=False)

print(f"\n{'='*80}")
print(f"FILES SAVED")
print(f"{'='*80}")
print(f"✓ Train set: {train_path}")
print(f"  Size: {train_path.stat().st_size / 1024:.2f} KB, Shape: {train_df.shape}")
print(f"✓ Test set:  {test_path}")
print(f"  Size: {test_path.stat().st_size / 1024:.2f} KB, Shape: {test_df.shape}")
print(f"\n✓ Train-test split complete with seasonal balance!")


Dates by season:
  Winter: 7 days - ['2024-01-08', '2024-01-09', '2024-01-10', '2024-01-11', '2024-01-12', '2024-01-13', '2024-01-14']
  Spring: 7 days - ['2024-04-08', '2024-04-09', '2024-04-10', '2024-04-11', '2024-04-12', '2024-04-13', '2024-04-14']
  Summer: 7 days - ['2024-06-10', '2024-06-11', '2024-06-12', '2024-06-13', '2024-06-14', '2024-06-15', '2024-06-16']
  Fall: 7 days - ['2024-10-07', '2024-10-08', '2024-10-09', '2024-10-10', '2024-10-11', '2024-10-12', '2024-10-13']
  Selected Winter test day: 2024-01-11
  Selected Spring test day: 2024-04-11
  Selected Summer test day: 2024-06-13
  Selected Fall test day: 2024-10-10
  Selected additional Winter test day: 2024-01-12
  Selected additional Summer test day: 2024-06-14

✓ Selected 6 test days: ['2024-01-11', '2024-01-12', '2024-04-11', '2024-06-13', '2024-06-14', '2024-10-10']
  ✓ Winter: 2 test day(s)
  ✓ Spring: 1 test day(s)
  ✓ Summer: 2 test day(s)
  ✓ Fall: 1 test day(s)

SPLIT SUMMARY
Train set: 22 days × 24 hours = 