<a href="https://colab.research.google.com/github/souhirbenamor/EPF/blob/main/2025_Random_Forest_Bridging_paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

All Experiments

In [None]:
import os
import random

# --- Reproducibility ---
# NOTE: PYTHONHASHSEED is read when an interpreter starts, so assigning it here
# does not change hash randomization in this already-running kernel; the value
# is only inherited by processes launched afterwards. Export it in the
# environment before starting the kernel if the notebook process itself must
# be hash-deterministic.
os.environ['PYTHONHASHSEED'] = '0'
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

# Seed the stdlib and NumPy global generators from a single value so both
# sources of randomness start from the same, known state on every run.
GLOBAL_SEED = 42
random.seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)

# -------------------------------------------------------------
# 1. Load & Preprocess Data
# -------------------------------------------------------------
DATA_PATH = 'EPF_2015_2020.xlsx'

# Read the workbook, parse 'Date', keep the first row per timestamp, index the
# frame chronologically by 'Date', and drop everything before 2015-01-01.
df = (
    pd.read_excel(DATA_PATH)
    .assign(Date=lambda raw: pd.to_datetime(raw['Date']))
    .drop_duplicates(subset='Date')
    .set_index('Date')
    .sort_index()
    .loc['2015-01-01':]
)

# -------------------------------------------------------------
# 2. Split by Year
# -------------------------------------------------------------
# Column the models are trained to predict.
target = 'Price'

# Fixed calendar-year partition; every other year present in the data is
# held out for testing.
train_years = [2015, 2016, 2017]
val_years   = [2018]
years       = sorted(df.index.year.unique())
test_years  = [yr for yr in years if yr not in (train_years + val_years)]

# Select the rows of each partition by the calendar year of the index.
_row_year = df.index.year
train_df  = df[_row_year.isin(train_years)]
val_df    = df[_row_year.isin(val_years)]
test_df   = df[_row_year.isin(test_years)]

# Training and validation rows merged back into chronological order.
train_val_df = pd.concat([train_df, val_df]).sort_index()

# -------------------------------------------------------------
# 3. Experiment Loop
# -------------------------------------------------------------
# Building blocks for the feature sets used by the experiments below.
_fundamental_cols = ['Demand Day-ahead DE', 'Wind and PV Day ahead (MWh/h)', 'Gas', 'Coal', 'CO2']
_mcp_col = ['MCP']

# Each experiment is a (feature columns, label) pair; the label names the
# forecast column and the metrics row produced for that run.
experiments = [
    (list(_fundamental_cols), 'RF'),
    (_mcp_col + _fundamental_cols, 'RF_ESM+'),
    (list(_mcp_col), 'RF_ESM'),
]

# Hyper-parameter candidates for the grid search: 1 x 2 x 2 x 2 = 8 combinations.
param_grid = dict(
    bootstrap=[True],
    max_depth=[10, 250],
    min_samples_leaf=[2, 5],
    n_estimators=[100, 200],
)

# Time-ordered CV splitter with 3 splits, shared by every grid search below.
tscv = TimeSeriesSplit(n_splits=3)

forecast_frames = []  # one single-column frame of test predictions per experiment
metrics = []          # one {'Label', 'RMSE', 'MAE'} record per experiment

for feats, label in experiments:
    # Prepare data: fit on the train+validation years, evaluate on the test years
    X_train = train_val_df[feats]
    y_train = train_val_df[target]
    X_test  = test_df[feats]
    y_test  = test_df[target]

    # Grid search CV over param_grid.
    # verbose=1 keeps the one-line search summary without one log line per fit.
    # NOTE(review): the search runs with the estimator's default split criterion
    # and GridSearchCV's default scoring, while the final model below is trained
    # with criterion='absolute_error' -- confirm this mismatch is intentional.
    rf = RandomForestRegressor(random_state=42)
    grid = GridSearchCV(rf, param_grid, cv=tscv, verbose=1, n_jobs=-1)
    grid.fit(X_train, y_train)

    # Final model: best grid parameters, refit on all train+validation rows
    best_params = grid.best_params_
    rf_final = RandomForestRegressor(**best_params,
                                     criterion='absolute_error',
                                     random_state=42)
    rf_final.fit(X_train, y_train)

    # Predict. Gaps in the test features are filled forward, then backward for
    # any leading gaps. DataFrame.ffill()/bfill() replace the deprecated
    # fillna(method=...) form.
    # NOTE(review): only the test features are imputed; X_train is passed to
    # fit() as-is -- confirm the training features contain no missing values.
    if X_test.isna().any().any():
        X_test = X_test.ffill().bfill()
    y_pred = rf_final.predict(X_test)
    df_pred = pd.DataFrame({label: y_pred}, index=X_test.index)

    # Collect forecasts (combined once after the loop)
    forecast_frames.append(df_pred)

    # Metrics on the held-out test years
    rmse = math.sqrt(mean_squared_error(y_test, y_pred))
    mae  = mean_absolute_error(y_test, y_pred)
    metrics.append({'Label': label, 'RMSE': rmse, 'MAE': mae})

# Align all forecast columns on the union of their indexes in a single concat,
# instead of repeatedly outer-joining onto an initially empty DataFrame.
# Falls back to an empty frame when no experiment was run.
all_forecasts = pd.concat(forecast_frames, axis=1) if forecast_frames else pd.DataFrame()

# -------------------------------------------------------------
# 4. Save Combined Results
# -------------------------------------------------------------
# Single source of truth for the output workbook, so the file that is written
# and the file named in the completion message cannot drift apart.
OUTPUT_PATH = 'Combined_RF_Experiments 2019-2020.xlsx'

# Write forecasts and error metrics as two sheets of one workbook.
metrics_df = pd.DataFrame(metrics).set_index('Label')
with pd.ExcelWriter(OUTPUT_PATH) as writer:
    all_forecasts.to_excel(writer, sheet_name='Forecasts')
    metrics_df.to_excel(writer, sheet_name='Metrics')

# Report the actual output path and the experiment labels that were written
# (the previous message named a file that was never created).
print(f"Done! Saved {OUTPUT_PATH} with {', '.join(map(str, metrics_df.index))}")




Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=5, n_estimators=100; total time=   7.6s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, n_estimators=100; total time=  10.1s
[CV] END bootstrap=True, max_depth=250, min_samples_leaf=5, n_estimators=100; total time=  10.0s
[CV] END bootstrap=True, max_depth=250, min_samples_leaf=5, n_estimators=200; total time=  11.1s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, n_estimators=200; total time=  13.6s
[CV] END bootstrap=True, max_depth=250, min_samples_leaf=2, n_estimators=100; total time=  13.7s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, n_estimators=100; total time=  14.5s
[CV] END bootstrap=True, max_depth=250, min_samples_leaf=5, n_estimators=100; total time=  15.2s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=5, n_estimators=200; total time=  17.7s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=5, n_estimators=