# Feature Selection

## Requirements

In [1]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV
from sklearn.model_selection import TimeSeriesSplit, train_test_split, GridSearchCV
from sklearn.svm import SVR

import lightgbm as lgb

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

## Data Setup

### Raw Data Setup

In [3]:
df = pd.read_csv('../data/raw/dataset_2019_2025.csv')
print(df.shape)
df.head()

(52608, 39)


Unnamed: 0,datetime,precipitation,cloud_cover,sunshine,temperature,relative_humidity,price,nuclear,hydro_run_of_river,biomass,fossil_brown_coal_/_lignite,fossil_hard_coal,fossil_oil,fossil_coal_derived_gas,fossil_gas,geothermal,hydro_water_reservoir,hydro_pumped_storage,others,waste,wind_offshore,wind_onshore,solar,load,residual_load,renewable_share_of_load,renewable_share_of_generation,austria_cbet,belgium_cbet,czech_republic_cbet,denmark_cbet,france_cbet,luxembourg_cbet,netherlands_cbet,norway_cbet,poland_cbet,sweden_cbet,switzerland_cbet,sum_cbet
0,2019-01-01 00:00:00+00:00,0.0,93.5,,7.25,87.25,10.07,8040.8,1752.9,4740.5,5414.9,2496.4,482.5,447.1,2876.2,16.6,118.6,327.7,382.8,1302.2,2568.5,22911.9,0.0,40567.8,15087.4,80.6,60.7,-4.264,,-0.71,-0.183,-3.642,-0.36,-1.176,,0.0,-0.076,-0.8,-11.211
1,2019-01-01 01:00:00+00:00,0.0,100.0,,7.175,85.0,-4.08,7562.4,1751.9,4724.0,5360.3,2355.4,482.7,477.2,2735.8,16.4,104.7,41.4,368.0,1289.8,2688.6,23856.0,0.0,39550.0,13005.4,85.3,62.7,-4.27,,-1.1,-0.307,-3.083,-0.35,-1.428,,0.0,-0.067,-0.8,-11.405
2,2019-01-01 02:00:00+00:00,0.025,100.0,0.0,6.875,87.75,-9.91,7079.6,1759.8,4723.7,5248.6,2381.4,489.9,509.9,2763.7,16.3,104.8,0.1,366.1,1294.0,2569.4,25655.5,0.0,39140.7,10915.8,90.5,64.5,-4.961,,-1.137,-0.412,-3.13,-0.344,-1.492,,0.0,-0.067,-0.8,-12.343
3,2019-01-01 03:00:00+00:00,0.05,100.0,0.0,6.625,90.5,-7.41,7117.7,1661.4,4731.7,5143.7,2407.6,490.7,523.6,2798.3,15.9,43.2,4.0,365.7,1303.5,2428.7,27414.8,0.1,38897.3,9053.7,94.9,65.4,-4.984,,-1.1,-0.676,-4.06,-0.36,-0.862,,0.0,-0.067,-0.8,-12.91
4,2019-01-01 04:00:00+00:00,0.1,100.0,0.0,6.45,90.5,-12.55,7027.8,1677.3,4729.2,5129.6,2239.6,492.0,484.4,2813.7,15.7,59.1,0.0,365.3,1299.9,1918.8,29040.5,0.1,37879.9,6920.5,100.4,66.4,-4.817,,-1.0,-1.441,-4.641,-0.336,-0.844,,0.0,-0.067,-0.8,-13.946


In [4]:
og_columns = [
    'precipitation', 'cloud_cover', 'temperature', 'relative_humidity',
    'load',
    'sum_cbet',
    'price'
]

df_temp = df[['datetime']+og_columns].set_index('datetime')

datetime = pd.to_datetime(df['datetime'], utc=True)

X = df[[
    'datetime',
    'precipitation', 'cloud_cover', 'temperature', 'relative_humidity',
    'load',
    'sum_cbet',
    'price'
]].set_index('datetime')
X = pd.concat([X[col].shift(lag).rename(f'{col}_{lag}') for col in X.columns for lag in range(24, 49)], axis=1).dropna()

y = df.set_index('datetime').loc[X.index, 'price'].to_frame()

print('---- Datetime index (`datetime`) ----')
display(datetime.head())
print(datetime.shape)

print('\n---- Feature matrix (`X`) ----')
display(X.head())
print(X.shape)

print('\n---- Target Array (`y`) ----')
display(y.head())
print(y.shape)

---- Datetime index (`datetime`) ----


0   2019-01-01 00:00:00+00:00
1   2019-01-01 01:00:00+00:00
2   2019-01-01 02:00:00+00:00
3   2019-01-01 03:00:00+00:00
4   2019-01-01 04:00:00+00:00
Name: datetime, dtype: datetime64[ns, UTC]

(52608,)

---- Feature matrix (`X`) ----


Unnamed: 0_level_0,precipitation_24,precipitation_25,precipitation_26,precipitation_27,precipitation_28,precipitation_29,precipitation_30,precipitation_31,precipitation_32,precipitation_33,precipitation_34,precipitation_35,precipitation_36,precipitation_37,precipitation_38,precipitation_39,precipitation_40,precipitation_41,precipitation_42,precipitation_43,precipitation_44,precipitation_45,precipitation_46,precipitation_47,precipitation_48,cloud_cover_24,cloud_cover_25,cloud_cover_26,cloud_cover_27,cloud_cover_28,cloud_cover_29,cloud_cover_30,cloud_cover_31,cloud_cover_32,cloud_cover_33,cloud_cover_34,cloud_cover_35,cloud_cover_36,cloud_cover_37,cloud_cover_38,cloud_cover_39,cloud_cover_40,cloud_cover_41,cloud_cover_42,cloud_cover_43,cloud_cover_44,cloud_cover_45,cloud_cover_46,cloud_cover_47,cloud_cover_48,...,sum_cbet_24,sum_cbet_25,sum_cbet_26,sum_cbet_27,sum_cbet_28,sum_cbet_29,sum_cbet_30,sum_cbet_31,sum_cbet_32,sum_cbet_33,sum_cbet_34,sum_cbet_35,sum_cbet_36,sum_cbet_37,sum_cbet_38,sum_cbet_39,sum_cbet_40,sum_cbet_41,sum_cbet_42,sum_cbet_43,sum_cbet_44,sum_cbet_45,sum_cbet_46,sum_cbet_47,sum_cbet_48,price_24,price_25,price_26,price_27,price_28,price_29,price_30,price_31,price_32,price_33,price_34,price_35,price_36,price_37,price_38,price_39,price_40,price_41,price_42,price_43,price_44,price_45,price_46,price_47,price_48
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
2019-01-03 00:00:00+00:00,0.0,0.175,0.175,0.025,0.075,0.025,0.05,0.0,0.0,0.0,0.2,0.225,0.075,0.075,0.05,0.075,0.025,0.0,0.15,0.325,0.1,0.05,0.025,0.0,0.0,56.0,56.0,81.0,87.0,75.0,87.5,84.25,71.5,62.25,87.25,84.25,96.75,87.5,87.5,96.75,96.75,81.25,96.75,100.0,100.0,100.0,100.0,100.0,100.0,93.5,...,-12.315,-11.83,-11.299,-12.443,-12.115,-13.425,-13.553,-13.421,-13.875,-14.83,-15.827,-15.877,-15.127,-14.214,-13.379,-13.935,-13.808,-13.43,-14.182,-14.598,-13.946,-12.91,-12.343,-11.405,-11.211,-45.92,-33.57,-28.93,-4.87,-24.93,-6.98,-4.97,0.07,9.06,1.97,-0.03,0.0,-0.02,0.12,0.45,-4.93,-6.33,-4.93,-15.07,-17.25,-12.55,-7.41,-9.91,-4.08,10.07
2019-01-03 01:00:00+00:00,0.0,0.0,0.175,0.175,0.025,0.075,0.025,0.05,0.0,0.0,0.0,0.2,0.225,0.075,0.075,0.05,0.075,0.025,0.0,0.15,0.325,0.1,0.05,0.025,0.0,49.5,56.0,56.0,81.0,87.0,75.0,87.5,84.25,71.5,62.25,87.25,84.25,96.75,87.5,87.5,96.75,96.75,81.25,96.75,100.0,100.0,100.0,100.0,100.0,100.0,...,-12.402,-12.315,-11.83,-11.299,-12.443,-12.115,-13.425,-13.553,-13.421,-13.875,-14.83,-15.827,-15.877,-15.127,-14.214,-13.379,-13.935,-13.808,-13.43,-14.182,-14.598,-13.946,-12.91,-12.343,-11.405,-48.29,-45.92,-33.57,-28.93,-4.87,-24.93,-6.98,-4.97,0.07,9.06,1.97,-0.03,0.0,-0.02,0.12,0.45,-4.93,-6.33,-4.93,-15.07,-17.25,-12.55,-7.41,-9.91,-4.08
2019-01-03 02:00:00+00:00,0.0,0.0,0.0,0.175,0.175,0.025,0.075,0.025,0.05,0.0,0.0,0.0,0.2,0.225,0.075,0.075,0.05,0.075,0.025,0.0,0.15,0.325,0.1,0.05,0.025,59.0,49.5,56.0,56.0,81.0,87.0,75.0,87.5,84.25,71.5,62.25,87.25,84.25,96.75,87.5,87.5,96.75,96.75,81.25,96.75,100.0,100.0,100.0,100.0,100.0,...,-11.896,-12.402,-12.315,-11.83,-11.299,-12.443,-12.115,-13.425,-13.553,-13.421,-13.875,-14.83,-15.827,-15.877,-15.127,-14.214,-13.379,-13.935,-13.808,-13.43,-14.182,-14.598,-13.946,-12.91,-12.343,-44.99,-48.29,-45.92,-33.57,-28.93,-4.87,-24.93,-6.98,-4.97,0.07,9.06,1.97,-0.03,0.0,-0.02,0.12,0.45,-4.93,-6.33,-4.93,-15.07,-17.25,-12.55,-7.41,-9.91
2019-01-03 03:00:00+00:00,0.0,0.0,0.0,0.0,0.175,0.175,0.025,0.075,0.025,0.05,0.0,0.0,0.0,0.2,0.225,0.075,0.075,0.05,0.075,0.025,0.0,0.15,0.325,0.1,0.05,53.0,59.0,49.5,56.0,56.0,81.0,87.0,75.0,87.5,84.25,71.5,62.25,87.25,84.25,96.75,87.5,87.5,96.75,96.75,81.25,96.75,100.0,100.0,100.0,100.0,...,-12.147,-11.896,-12.402,-12.315,-11.83,-11.299,-12.443,-12.115,-13.425,-13.553,-13.421,-13.875,-14.83,-15.827,-15.877,-15.127,-14.214,-13.379,-13.935,-13.808,-13.43,-14.182,-14.598,-13.946,-12.91,-48.93,-44.99,-48.29,-45.92,-33.57,-28.93,-4.87,-24.93,-6.98,-4.97,0.07,9.06,1.97,-0.03,0.0,-0.02,0.12,0.45,-4.93,-6.33,-4.93,-15.07,-17.25,-12.55,-7.41
2019-01-03 04:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.175,0.175,0.025,0.075,0.025,0.05,0.0,0.0,0.0,0.2,0.225,0.075,0.075,0.05,0.075,0.025,0.0,0.15,0.325,0.1,62.25,53.0,59.0,49.5,56.0,56.0,81.0,87.0,75.0,87.5,84.25,71.5,62.25,87.25,84.25,96.75,87.5,87.5,96.75,96.75,81.25,96.75,100.0,100.0,100.0,...,-11.743,-12.147,-11.896,-12.402,-12.315,-11.83,-11.299,-12.443,-12.115,-13.425,-13.553,-13.421,-13.875,-14.83,-15.827,-15.877,-15.127,-14.214,-13.379,-13.935,-13.808,-13.43,-14.182,-14.598,-13.946,-29.91,-48.93,-44.99,-48.29,-45.92,-33.57,-28.93,-4.87,-24.93,-6.98,-4.97,0.07,9.06,1.97,-0.03,0.0,-0.02,0.12,0.45,-4.93,-6.33,-4.93,-15.07,-17.25,-12.55


(52560, 175)

---- Target Array (`y`) ----


Unnamed: 0_level_0,price
datetime,Unnamed: 1_level_1
2019-01-03 00:00:00+00:00,45.22
2019-01-03 01:00:00+00:00,45.63
2019-01-03 02:00:00+00:00,44.0
2019-01-03 03:00:00+00:00,43.88
2019-01-03 04:00:00+00:00,45.92


(52560, 1)


### Train-Test Split

In [5]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)

train_size = int(len(X_train_val) * (8/9))  # 8/9 because validation needs to be 10% of total dataset & train_val set is 90% of total dataset
tscv_rfecv = TimeSeriesSplit(n_splits=2, max_train_size=train_size)

## Recursive Feature Selection

In [6]:
%%time

rfecv = RFECV(
    estimator=lgb.LGBMRegressor(n_jobs=-1, verbose=-1),
    cv=tscv_rfecv,
    step=2,
    n_jobs=-1
)
rfecv.fit(X_train_val, y_train_val.to_numpy().flatten())

rfecv_features = rfecv.feature_names_in_[rfecv.support_]
X = X[rfecv_features]



CPU times: user 4min 44s, sys: 1min 23s, total: 6min 8s
Wall time: 5min 3s


## Reconcatenation

In [7]:
df_processed = y.merge(
    X,
    left_index=True, right_index=True
)
print(df_processed.shape)
df_processed.head()

(52560, 30)


Unnamed: 0_level_0,price,temperature_24,temperature_28,temperature_33,temperature_40,temperature_46,relative_humidity_29,relative_humidity_36,relative_humidity_48,load_24,load_28,load_30,load_34,load_36,load_42,load_48,sum_cbet_24,sum_cbet_28,sum_cbet_32,sum_cbet_36,sum_cbet_43,sum_cbet_48,price_24,price_31,price_37,price_41,price_43,price_45,price_47,price_48
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
2019-01-03 00:00:00+00:00,45.22,3.6,4.45,5.775,6.2,6.875,72.5,77.25,87.25,41999.3,49667.4,52233.4,49527.7,48690.7,39151.5,40567.8,-12.315,-12.115,-13.875,-15.127,-14.598,-11.211,-45.92,0.07,0.12,-4.93,-17.25,-7.41,-4.08,10.07
2019-01-03 01:00:00+00:00,45.63,3.575,4.55,5.4,6.575,6.625,75.75,80.25,85.0,41813.9,47096.3,50141.2,53251.4,48874.6,41058.6,39550.0,-12.402,-12.443,-13.421,-15.877,-14.182,-11.405,-48.29,-4.97,-0.02,-6.33,-15.07,-12.55,-9.91,-4.08
2019-01-03 02:00:00+00:00,44.0,3.225,4.225,4.9,6.8,6.45,74.75,79.5,87.75,43015.6,44543.5,49667.4,54680.7,49527.7,43925.3,39140.7,-11.896,-11.299,-13.553,-15.827,-13.43,-12.343,-44.99,-6.98,0.0,-4.93,-4.93,-17.25,-7.41,-9.91
2019-01-03 03:00:00+00:00,43.88,3.15,3.925,4.9,6.75,6.45,77.25,77.0,90.5,45367.2,42768.9,47096.3,54425.7,53251.4,46800.1,38897.3,-12.147,-11.83,-13.425,-14.83,-13.808,-12.91,-48.93,-24.93,-0.03,0.45,-6.33,-15.07,-12.55,-7.41
2019-01-03 04:00:00+00:00,45.92,2.925,3.6,4.85,6.875,6.525,77.0,76.25,90.5,50485.4,41999.3,44543.5,52233.4,54680.7,48938.6,37879.9,-11.743,-12.315,-12.115,-13.875,-13.935,-13.946,-29.91,-4.87,1.97,0.12,-4.93,-4.93,-17.25,-12.55


In [8]:
df_processed.to_csv('../data/processed/rfe_dataset_2019_2025.csv')