
# 03 — Modeling Tabular

## Baselines + RNNs (LSTM, GRU, Dilated, Clockwork) con datos tabulados.

**Optimización Bayesiana**

## Setup

In [1]:
from pathlib import Path
import os, json, math, time, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

## Config

In [2]:
# Single seed shared by every random number generator used in this notebook.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# cuDNN flags: deterministic mode on, benchmark mode off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
_has_cuda = torch.cuda.is_available()
DEVICE = torch.device("cuda") if _has_cuda else torch.device("cpu")
print("Device:", DEVICE)
if _has_cuda:
    print("GPU:", torch.cuda.get_device_name(0))

Device: cuda
GPU: NVIDIA GeForce RTX 3050 Laptop GPU


In [3]:
# Input dataset and output folders; the output folders are created if missing.
DATA_CLEAN = Path("../data/clean/base_dataset.csv")
OUT_DIR = Path("../outputs"); OUT_DIR.mkdir(parents=True, exist_ok=True)
ART_DIR = OUT_DIR / "artifacts"; ART_DIR.mkdir(parents=True, exist_ok=True)
FIG_DIR = OUT_DIR / "figures"; FIG_DIR.mkdir(parents=True, exist_ok=True)

TARGET_COL = "GHI"
# "T" is deprecated as the minute alias since pandas 2.2; "min" is accepted by
# old and new pandas versions alike.
FREQ = "10min"
INPUT_STEPS   = 36   # look-back window: 36 steps x 10 min = 6 h of history
HORIZON_STEPS = 6    # forecast horizon: 6 steps x 10 min = 1 h ahead
BATCH_SIZE    = 256
EPOCHS        = 40   # maximum number of training epochs
PATIENCE      = 6    # early-stopping patience (epochs without improvement)

## Data

In [4]:
# Load the cleaned dataset: the first CSV column is parsed as dates and used
# as the index, which is then named "time" and sorted.
df = (
    pd.read_csv(DATA_CLEAN, index_col=0, parse_dates=[0])
    .rename_axis("time")
    .sort_index()
)

# Quick sanity check: shape and covered time span.
first_ts, last_ts = df.index.min(), df.index.max()
print(df.shape, first_ts, "→", last_ts)
df.head()

(107172, 56) 2022-02-21 18:00:00+00:00 → 2024-03-06 23:50:00+00:00


Unnamed: 0_level_0,CSI,GHI,Presion,TempAmb,Wind Y,Wind X,DoY Sin,DoY Cos,horas,__missing_target,...,solar_elevation,ETR,clear_sky_ghi,CSI_advanced,ghi_1min_change,ghi_5min_std,ghi_persistence_1h,temp_pressure_ratio,wind_temp_interaction,wind_cloud_effect
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-02-21 18:00:00+00:00,2.0,3.0352,1000.7912,29.9672,2.832954,-0.093612,0.778764,0.627317,18,False,...,66.355849,1397.379435,1280.074962,0.002371,,,,0.029944,84.942028,0.006721
2022-02-21 18:10:00+00:00,0.0,0.3562,1000.9321,29.5689,3.387552,0.796801,0.778764,0.627317,18,False,...,64.849089,1397.379435,1264.896009,0.000282,2.679,,,0.029541,102.899772,0.00098
2022-02-21 18:20:00+00:00,0.0,0.0,1001.1479,29.2593,2.091197,-0.87868,0.778764,0.627317,18,False,...,63.19567,1397.379435,1247.233445,0.0,0.3562,,,0.029226,66.36887,0.0
2022-02-21 18:30:00+00:00,0.0,0.0,1001.2992,28.9183,-0.487957,-1.478562,0.778764,0.627317,18,False,...,61.421038,1397.379435,1227.120886,0.0,0.0,,,0.028881,45.025793,0.0
2022-02-21 18:40:00+00:00,0.0,0.0,1001.4676,28.5578,0.891171,-2.047462,0.778764,0.627317,18,False,...,59.546382,1397.379435,1204.59661,0.0,0.0,1.326556,,0.028516,63.769567,0.0


In [5]:
# Display every column name of the loaded frame (candidate features + target).
df.columns

Index(['CSI', 'GHI', 'Presion', 'TempAmb', 'Wind Y', 'Wind X', 'DoY Sin',
       'DoY Cos', 'horas', '__missing_target', 'flag_GHI_range',
       'flag_TempAmb_range', 'flag_Presion_range', 'flag_CSI_range', 'hour',
       'dow', 'month', 'minute', 'is_weekend', 'hour_sin', 'hour_cos',
       'WindSpeed', 'WindDirection', 'GHI_roll1h_mean', 'GHI_roll3h_mean',
       'GHI_roll6h_mean', 'GHI_roll1h_max', 'TempAmb_roll1h_mean',
       'TempAmb_roll3h_mean', 'TempAmb_roll6h_mean', 'TempAmb_roll1h_max',
       'Presion_roll1h_mean', 'Presion_roll3h_mean', 'Presion_roll6h_mean',
       'Presion_roll1h_max', 'WindSpeed_roll1h_mean', 'WindSpeed_roll3h_mean',
       'WindSpeed_roll6h_mean', 'WindSpeed_roll1h_max', 'GHI_lag1', 'GHI_lag3',
       'GHI_lag6', 'GHI_lag12', 'GHI_lag36', 'solar_zenith', 'solar_azimuth',
       'solar_elevation', 'ETR', 'clear_sky_ghi', 'CSI_advanced',
       'ghi_1min_change', 'ghi_5min_std', 'ghi_persistence_1h',
       'temp_pressure_ratio', 'wind_temp_interaction'

In [6]:
# Input columns fed to every model; the target column itself is not included.
feat_cols = [
    # instantaneous weather variables
    'Presion',
    'TempAmb',
    'WindSpeed',
    'WindDirection',
    # cyclical time encodings and weekend flag
    'hour_sin',
    'hour_cos',
    'DoY Sin',
    'DoY Cos',
    'is_weekend',
    # solar geometry
    'solar_zenith',
    'solar_azimuth',
    'solar_elevation',
    # 1 h / 6 h rolling means
    'TempAmb_roll1h_mean',
    'TempAmb_roll6h_mean',
    'Presion_roll1h_mean',
    'Presion_roll6h_mean',
    'WindSpeed_roll1h_mean',
    'WindSpeed_roll6h_mean',
    # remaining engineered columns
    'ghi_5min_std',
    'wind_temp_interaction',
]

## Split

In [7]:
# NOTE: this whole cell is disabled. It sketches mean imputation with
# SimpleImputer as an alternative to dropping the rows that contain NaN.
# As written it reads X_train, y_train, X_val and X_test, which are only
# assigned in a later cell, so it cannot simply be uncommented in place.

# from sklearn.impute import SimpleImputer

# # Imputer configuration
# X_imputer = SimpleImputer(strategy='mean')
# y_imputer = SimpleImputer(strategy='mean')

# # Impute NaN values
# X_train_imp = X_imputer.fit_transform(X_train)
# y_train_imp = y_imputer.fit_transform(y_train.reshape(-1, 1)).ravel()
# X_val_imp = X_imputer.transform(X_val)
# X_test_imp = X_imputer.transform(X_test)

# # Check that no NaN remains
# print("NaN después de imputación:")
# print("X_train:", np.isnan(X_train_imp).sum())
# print("y_train:", np.isnan(y_train_imp).sum())
# print("X_val:", np.isnan(X_val_imp).sum())
# print("X_test:", np.isnan(X_test_imp).sum())

# # Use the imputed data instead of the original arrays
# X_train_clean, y_train_clean = X_train_imp, y_train_imp
# X_val_clean, X_test_clean = X_val_imp, X_test_imp

In [8]:
# Positional 70 / 15 / 15 split of the frame into train / validation / test.
n = len(df)
i_tr, i_va = int(0.7 * n), int(0.85 * n)
df_train = df.iloc[:i_tr]
df_val = df.iloc[i_tr:i_va]
df_test = df.iloc[i_va:]

# Scalers are fitted on the training partition only and reused for val/test.
X_scaler = StandardScaler()
y_scaler = StandardScaler()

X_train = X_scaler.fit_transform(df_train[feat_cols].to_numpy())
y_train = y_scaler.fit_transform(df_train[[TARGET_COL]].to_numpy()).ravel()

X_val = X_scaler.transform(df_val[feat_cols].to_numpy())
y_val = y_scaler.transform(df_val[[TARGET_COL]].to_numpy()).ravel()

X_test = X_scaler.transform(df_test[feat_cols].to_numpy())
y_test = y_scaler.transform(df_test[[TARGET_COL]].to_numpy()).ravel()

In [9]:
# Count NaN entries in the scaled arrays produced by the split.
print("Valores NaN en X_train:", np.count_nonzero(np.isnan(X_train)))
print("Valores NaN en y_train:", np.count_nonzero(np.isnan(y_train)))
print("Valores NaN en X_val:", np.count_nonzero(np.isnan(X_val)))
print("Valores NaN en X_test:", np.count_nonzero(np.isnan(X_test)))

# A training row is discarded when any feature or its target is NaN.
nan_mask = np.logical_or(np.isnan(X_train).any(axis=1), np.isnan(y_train))
valid_rows = np.logical_not(nan_mask)
X_train_clean, y_train_clean = X_train[valid_rows], y_train[valid_rows]

print(f"Eliminadas {nan_mask.sum()} filas con NaN de entrenamiento")

Valores NaN en X_train: 61
Valores NaN en y_train: 0
Valores NaN en X_val: 0
Valores NaN en X_test: 0
Eliminadas 17 filas con NaN de entrenamiento


## Baselines

In [10]:
def metrics_from_scaled(pred_scaled, true_scaled, y_scaler):
    """Compute MAE, RMSE and MAPE after mapping values back to original units.

    Args:
        pred_scaled: 1-D array of predictions in the scaled target space.
        true_scaled: 1-D array of ground-truth values in the scaled target space.
        y_scaler: fitted scaler whose ``inverse_transform`` accepts an ``(n, 1)`` array.

    Returns:
        ``(metrics, (t, p))`` where ``metrics`` is a dict with keys ``"MAE"``,
        ``"RMSE"`` and ``"MAPE"`` (percent) and ``t`` / ``p`` are the
        inverse-transformed true and predicted values.
    """
    p = y_scaler.inverse_transform(np.asarray(pred_scaled).reshape(-1, 1)).ravel()
    t = y_scaler.inverse_transform(np.asarray(true_scaled).reshape(-1, 1)).ravel()
    mae = mean_absolute_error(t, p)
    rmse = math.sqrt(mean_squared_error(t, p))
    # The epsilon belongs only in the denominator (guard against division by
    # zero); adding it to the numerator biased every absolute error.
    # NOTE: MAPE still explodes when true values are ~0, so read it with care.
    eps = 1e-6
    mape = np.mean(np.abs(t - p) / (np.abs(t) + eps)) * 100
    return {"MAE": mae, "RMSE": rmse, "MAPE": mape}, (t, p)

# Linear regression baseline, fitted on the NaN-free training rows.
lin = LinearRegression()
lin.fit(X_train_clean, y_train_clean)
lin_pred_scaled = lin.predict(X_test)
lin_metrics, (y_true_lin, y_pred_lin) = metrics_from_scaled(lin_pred_scaled, y_test, y_scaler)

# Random forest baseline with fixed (untuned) hyper-parameters.
rf0 = RandomForestRegressor(n_estimators=300, random_state=SEED, n_jobs=-1)
rf0.fit(X_train_clean, y_train_clean)
rf0_pred_scaled = rf0.predict(X_test)
rf0_metrics, (y_true_rf0, y_pred_rf0) = metrics_from_scaled(rf0_pred_scaled, y_test, y_scaler)

print("Linear:", lin_metrics, "\nRF baseline:", rf0_metrics)

Linear: {'MAE': 128.58392354897063, 'RMSE': 168.46142137925324, 'MAPE': np.float64(2041760184.8979485)} 
RF baseline: {'MAE': 37.016858900378246, 'RMSE': 79.77865583750265, 'MAPE': np.float64(110234072.6442202)}


## Sequence models

In [11]:
class SeqDataset(Dataset):
    """Sliding-window dataset: ``input_steps`` rows of X -> one future value of y.

    Item ``idx`` is the pair ``(X[idx : idx + input_steps],
    y[idx + input_steps + horizon - 1])``, i.e. the target lies ``horizon``
    steps after the last row of the input window.

    Args:
        X: array sliceable along its first axis (one row per time step).
        y: array indexable by integer position, aligned with ``X``.
        input_steps: number of consecutive rows in each input window.
        horizon: number of steps between the last input row and the target.
    """

    def __init__(self, X, y, input_steps=36, horizon=6):
        self.X, self.y = X, y
        self.input_steps, self.horizon = input_steps, horizon
        # The last valid start index i satisfies
        #   i + input_steps + horizon - 1 == len(X) - 1,
        # so there are len(X) - input_steps - horizon + 1 windows. Without the
        # "+ 1" the final valid window was silently dropped.
        self.max_i = len(X) - input_steps - horizon + 1
        assert self.max_i > 0, "No hay suficientes muestras."

    def __len__(self):
        return self.max_i

    def __getitem__(self, idx):
        # Reject out-of-range indices instead of returning a truncated window.
        if not 0 <= idx < self.max_i:
            raise IndexError(f"index {idx} out of range for {self.max_i} windows")
        i0, i1 = idx, idx + self.input_steps
        ih = i1 + self.horizon - 1
        return (torch.tensor(self.X[i0:i1], dtype=torch.float32),
                torch.tensor(self.y[ih], dtype=torch.float32))

def make_loaders(X_tr, y_tr, X_va, y_va, X_te, y_te, steps, horizon, batch=256):
    """Wrap the three partitions in SeqDataset windows and return their loaders.

    Args:
        X_tr, y_tr: training features / targets.
        X_va, y_va: validation features / targets.
        X_te, y_te: test features / targets.
        steps: input window length passed to ``SeqDataset``.
        horizon: forecast horizon passed to ``SeqDataset``.
        batch: batch size used by all three loaders.

    Returns:
        ``(train_loader, val_loader, test_loader)``. Only the training loader
        shuffles and drops the last incomplete batch.
    """
    train_ds = SeqDataset(X_tr, y_tr, steps, horizon)
    val_ds = SeqDataset(X_va, y_va, steps, horizon)
    test_ds = SeqDataset(X_te, y_te, steps, horizon)

    train_loader = DataLoader(train_ds, batch_size=batch, shuffle=True, drop_last=True)
    # Evaluation loaders keep the original order and every sample.
    eval_kwargs = dict(batch_size=batch, shuffle=False, drop_last=False)
    val_loader = DataLoader(val_ds, **eval_kwargs)
    test_loader = DataLoader(test_ds, **eval_kwargs)
    return train_loader, val_loader, test_loader

# Build the default loaders from the NaN-free training rows: X_train still
# contains NaN values, and one NaN inside a window turns the loss (and, after
# the optimizer step, the weights) into NaN.
# NOTE(review): dropping rows assumes the NaN rows sit at the start of the
# series (rolling-window warm-up) — confirm, otherwise a window may span a gap.
dl_train, dl_val, dl_test = make_loaders(X_train_clean, y_train_clean, X_val, y_val, X_test, y_test,
                                         INPUT_STEPS, HORIZON_STEPS, BATCH_SIZE)

### PyTorch

In [12]:
class LSTMModel(nn.Module):
    """LSTM encoder followed by a linear head producing one value per sequence.

    Args:
        in_dim: number of input features per time step.
        hidden: hidden size of each LSTM direction.
        num_layers: number of stacked LSTM layers.
        dropout: dropout between LSTM layers; forced to 0 when ``num_layers == 1``.
        bidirectional: whether to run the LSTM in both directions.
    """

    def __init__(self, in_dim, hidden=64, num_layers=1, dropout=0.0, bidirectional=False):
        super().__init__()
        self.rnn = nn.LSTM(in_dim, hidden, num_layers=num_layers, batch_first=True,
                           dropout=(dropout if num_layers>1 else 0.0), bidirectional=bidirectional)
        out_dim = hidden * (2 if bidirectional else 1)
        self.fc = nn.Linear(out_dim, 1)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, in_dim) to predictions of shape (batch,)."""
        out, _ = self.rnn(x)
        if self.rnn.bidirectional:
            # The backward direction finishes at time step 0; at the last time
            # step it has only seen a single input. Combine forward-final and
            # backward-final states instead of slicing both at t = -1.
            h = self.rnn.hidden_size
            summary = torch.cat([out[:, -1, :h], out[:, 0, h:]], dim=1)
        else:
            summary = out[:, -1, :]
        return self.fc(summary).squeeze(1)

class GRUModel(nn.Module):
    """GRU encoder followed by a linear head producing one value per sequence.

    Args:
        in_dim: number of input features per time step.
        hidden: hidden size of each GRU direction.
        num_layers: number of stacked GRU layers.
        dropout: dropout between GRU layers; forced to 0 when ``num_layers == 1``.
        bidirectional: whether to run the GRU in both directions.
    """

    def __init__(self, in_dim, hidden=64, num_layers=1, dropout=0.0, bidirectional=False):
        super().__init__()
        self.rnn = nn.GRU(in_dim, hidden, num_layers=num_layers, batch_first=True,
                          dropout=(dropout if num_layers>1 else 0.0), bidirectional=bidirectional)
        out_dim = hidden * (2 if bidirectional else 1)
        self.fc = nn.Linear(out_dim, 1)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, in_dim) to predictions of shape (batch,)."""
        out, _ = self.rnn(x)
        if self.rnn.bidirectional:
            # The backward direction finishes at time step 0; at the last time
            # step it has only seen a single input. Combine forward-final and
            # backward-final states instead of slicing both at t = -1.
            h = self.rnn.hidden_size
            summary = torch.cat([out[:, -1, :h], out[:, 0, h:]], dim=1)
        else:
            summary = out[:, -1, :]
        return self.fc(summary).squeeze(1)

### Training

In [13]:
def train_torch_model(model, dl_train, dl_val, epochs=40, lr=1e-3, patience=6, device=DEVICE, trial=None):
    """Train ``model`` with Adam and MSE loss, early stopping and optional pruning.

    Args:
        model: ``nn.Module`` mapping a batch ``xb`` to predictions shaped like ``yb``.
        dl_train: iterable of ``(xb, yb)`` training batches.
        dl_val: iterable of ``(xb, yb)`` validation batches.
        epochs: maximum number of epochs.
        lr: Adam learning rate.
        patience: number of consecutive non-improving epochs that stops training.
        device: device the model and every batch are moved to.
        trial: optional ``optuna.Trial``; when given, the validation loss is
            reported after every epoch and the trial may be pruned.

    Returns:
        ``(model, best_val, history)``: the model with the best validation
        weights restored, the best validation loss (mean of per-batch MSE) and
        a dict with per-epoch ``"train"`` / ``"val"`` losses.

    Raises:
        optuna.TrialPruned: if ``trial.should_prune()`` returns true.
    """
    model = model.to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    best_val, best_state, no_improve = float("inf"), None, 0
    history = {"train": [], "val": []}

    for ep in range(1, epochs+1):
        # Training
        model.train()
        train_losses = []
        for xb, yb in dl_train:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            pred = model(xb)
            loss = loss_fn(pred, yb)
            loss.backward()
            opt.step()
            train_losses.append(loss.item())

        # Validation
        model.eval()
        val_losses = []
        with torch.no_grad():
            for xb, yb in dl_val:
                xb, yb = xb.to(device), yb.to(device)
                pred = model(xb)
                val_losses.append(loss_fn(pred, yb).item())

        train_mse = np.mean(train_losses)
        val_mse = np.mean(val_losses)
        history["train"].append(train_mse)
        history["val"].append(val_mse)

        if trial is not None:
            trial.report(val_mse, ep)
            if trial.should_prune():
                raise optuna.TrialPruned()

        if val_mse < best_val - 1e-6:
            # state_dict() returns references to the live parameter tensors, so
            # they must be cloned; otherwise later epochs overwrite the "best"
            # weights and the final load_state_dict() restores nothing.
            best_state = {name: tensor.detach().clone()
                          for name, tensor in model.state_dict().items()}
            best_val, no_improve = val_mse, 0
        else:
            no_improve += 1
            if no_improve >= patience:
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model, best_val, history

def eval_sequence_model(model, dl, y_scaler, device=DEVICE):
    """Run ``model`` over ``dl`` and compute metrics in the target's original units.

    Args:
        model: trained ``nn.Module``; it is switched to eval mode here.
        dl: iterable of ``(xb, yb)`` batches.
        y_scaler: fitted scaler whose ``inverse_transform`` accepts an ``(n, 1)`` array.
        device: device the input batches are moved to.

    Returns:
        ``(metrics, (t_o, p_o))`` where ``metrics`` has keys ``"MAE"``,
        ``"RMSE"`` and ``"MAPE"`` (percent) and ``t_o`` / ``p_o`` are the
        inverse-transformed targets and predictions.
    """
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for xb, yb in dl:
            xb = xb.to(device)
            preds.append(model(xb).cpu().numpy())
            trues.append(yb.cpu().numpy())
    p = np.concatenate(preds); t = np.concatenate(trues)
    p_o = y_scaler.inverse_transform(p.reshape(-1,1)).ravel()
    t_o = y_scaler.inverse_transform(t.reshape(-1,1)).ravel()
    mae  = mean_absolute_error(t_o, p_o)
    rmse = math.sqrt(mean_squared_error(t_o, p_o))
    # The epsilon belongs only in the denominator (guard against division by
    # zero); adding it to the numerator biased every absolute error.
    # NOTE: MAPE still explodes when true values are ~0, so read it with care.
    eps = 1e-6
    mape = np.mean(np.abs(t_o - p_o) / (np.abs(t_o) + eps)) * 100
    return {"MAE":mae, "RMSE":rmse, "MAPE":mape}, (t_o, p_o)


## Optuna

### Random Forest

In [14]:
def objective_rf(trial: optuna.Trial):
    """Optuna objective: validation RMSE (original units) of a random forest.

    Args:
        trial: Optuna trial used to sample ``n_estimators``, ``max_depth``,
            ``min_samples_split`` and ``min_samples_leaf``.

    Returns:
        RMSE on the validation partition after inverse-transforming the target.
    """
    rf = RandomForestRegressor(
        n_estimators      = trial.suggest_int("n_estimators", 200, 700),
        max_depth         = trial.suggest_int("max_depth", 6, 28),
        min_samples_split = trial.suggest_int("min_samples_split", 2, 20),
        min_samples_leaf  = trial.suggest_int("min_samples_leaf", 1, 10),
        n_jobs=-1, random_state=SEED
    )
    # Fit on the NaN-free rows, like the baselines: X_train still contains NaN,
    # which scikit-learn versions without native NaN support reject.
    rf.fit(X_train_clean, y_train_clean)
    pred_val = rf.predict(X_val)
    p_o = y_scaler.inverse_transform(pred_val.reshape(-1,1)).ravel()
    t_o = y_scaler.inverse_transform(y_val.reshape(-1,1)).ravel()
    return math.sqrt(mean_squared_error(t_o, p_o))

# Seed the sampler so the search is reproducible, like every other RNG here.
study_rf = optuna.create_study(direction="minimize", study_name="RF_RMSE",
                               sampler=optuna.samplers.TPESampler(seed=SEED))
study_rf.optimize(objective_rf, n_trials=30)

# Refit the best configuration on train + validation, using the NaN-free
# training rows (same data the baselines were fitted on), then score on test.
best_rf = RandomForestRegressor(random_state=SEED, n_jobs=-1, **study_rf.best_trial.params)
best_rf.fit(np.vstack([X_train_clean, X_val]), np.concatenate([y_train_clean, y_val]))
rf_pred = best_rf.predict(X_test)
rf_metrics, (y_true_rf, y_pred_rf) = metrics_from_scaled(rf_pred, y_test, y_scaler)
print("RF (Optuna):", rf_metrics)

[W 2025-09-19 09:11:01,556] Trial 1 failed with parameters: {'n_estimators': 697, 'max_depth': 24, 'min_samples_split': 16, 'min_samples_leaf': 6} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\Esteban\AppData\Local\Programs\Python\Python312\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\Esteban\AppData\Local\Temp\ipykernel_26372\2799514303.py", line 9, in objective_rf
    rf.fit(X_train, y_train)
  File "c:\Users\Esteban\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Esteban\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\ensemble\_forest.py", line 486, in fit
    trees = Parallel(
            ^^^^^^^^^
  File "c:\Users\Esteban\AppData\Loc

KeyboardInterrupt: 

### RNN (LSTM / GRU)

In [None]:
def make_loaders_from_arrays(X_tr, y_tr, X_va, y_va, steps, horizon, batch=256):
    """Return ``(train_loader, val_loader)`` over SeqDataset windows.

    The training loader shuffles and drops the last incomplete batch; the
    validation loader keeps the original order and every sample.
    """
    train_ds = SeqDataset(X_tr, y_tr, steps, horizon)
    val_ds = SeqDataset(X_va, y_va, steps, horizon)
    train_loader = DataLoader(train_ds, batch_size=batch, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_ds, batch_size=batch, shuffle=False, drop_last=False)
    return train_loader, val_loader

def objective_rnn(model_kind="LSTM"):
    """Build an Optuna objective that trains an LSTM or GRU and returns its best val loss.

    Args:
        model_kind: ``"LSTM"`` selects ``LSTMModel``; any other value selects ``GRUModel``.

    Returns:
        A callable ``trial -> float`` suitable for ``study.optimize``; the
        float is the best validation loss returned by ``train_torch_model``.
    """
    def _obj(trial: optuna.Trial):
        hidden = trial.suggest_int("hidden", 32, 160, step=32)
        layers = trial.suggest_int("num_layers", 1, 3)
        dropout = trial.suggest_float("dropout", 0.0, 0.4)
        bidir   = trial.suggest_categorical("bidirectional", [False, True])
        lr      = trial.suggest_float("lr", 1e-4, 5e-3, log=True)
        steps   = trial.suggest_categorical("input_steps", [24, 36, 48])
        # The forecast horizon defines the task, not the model: searching over
        # it makes validation losses incomparable across trials and rewards
        # the easiest (shortest) horizon. It is pinned to HORIZON_STEPS while
        # keeping the "horizon_steps" key available in best_trial.params.
        horizon = trial.suggest_categorical("horizon_steps", [HORIZON_STEPS])
        batch   = trial.suggest_categorical("batch", [128, 256, 512])

        # Train on the NaN-free rows: X_train still contains NaN, and a single
        # NaN inside a window turns the loss and then the weights into NaN.
        # NOTE(review): assumes the dropped rows are at the start of the series — confirm.
        dl_tr, dl_va = make_loaders_from_arrays(X_train_clean, y_train_clean, X_val, y_val,
                                                steps, horizon, batch=batch)

        in_dim = X_train_clean.shape[1]
        if model_kind == "LSTM":
            model = LSTMModel(in_dim, hidden=hidden, num_layers=layers, dropout=dropout, bidirectional=bidir)
        else:
            model = GRUModel(in_dim, hidden=hidden, num_layers=layers, dropout=dropout, bidirectional=bidir)

        _, best_val, _ = train_torch_model(model, dl_tr, dl_va,
                                          epochs=EPOCHS, lr=lr, patience=PATIENCE,
                                          device=DEVICE, trial=trial)
        return best_val
    return _obj

# Seed the samplers so both searches are reproducible, like every other RNG here.
study_lstm = optuna.create_study(direction="minimize", study_name="LSTM_MSEval",
                                 sampler=optuna.samplers.TPESampler(seed=SEED))
study_lstm.optimize(objective_rnn("LSTM"), n_trials=30)

study_gru = optuna.create_study(direction="minimize", study_name="GRU_MSEval",
                                sampler=optuna.samplers.TPESampler(seed=SEED))
study_gru.optimize(objective_rnn("GRU"), n_trials=30)

print("Best LSTM:", study_lstm.best_trial.params)
print("Best GRU :", study_gru.best_trial.params)

## Best Model

In [None]:
def build_best(model_kind, params, in_dim):
    """Instantiate an LSTM or GRU model from a dict of tuned hyper-parameters.

    ``model_kind == "LSTM"`` yields ``LSTMModel``; anything else yields
    ``GRUModel``. Only the architecture keys of ``params`` are consumed.
    """
    arch_kwargs = {key: params[key]
                   for key in ("hidden", "num_layers", "dropout", "bidirectional")}
    if model_kind == "LSTM":
        return LSTMModel(in_dim, **arch_kwargs)
    return GRUModel(in_dim, **arch_kwargs)

def retrain_and_test(model_kind, best_params):
    """Retrain the tuned model on train + validation, save it and score it on test.

    Args:
        model_kind: ``"LSTM"`` or any other value for GRU (see ``build_best``).
        best_params: dict with keys ``input_steps``, ``horizon_steps``,
            ``batch``, ``lr`` plus the architecture keys used by ``build_best``.

    Returns:
        ``((metrics, (y_true, y_pred)), history)``: the result of
        ``eval_sequence_model`` on the test loader and the training history.
    """
    steps, horizon = best_params["input_steps"], best_params["horizon_steps"]
    batch, lr = best_params["batch"], best_params["lr"]
    # Use the NaN-free training rows: X_train still contains NaN, and a single
    # NaN inside a window turns the loss and then the weights into NaN.
    X_trv = np.vstack([X_train_clean, X_val]); y_trv = np.concatenate([y_train_clean, y_val])
    # NOTE(review): the validation rows are also part of the training data
    # here, so early stopping is monitored on data the model has seen.
    dl_tr, dl_va, dl_te = make_loaders(X_trv, y_trv, X_val, y_val, X_test, y_test, steps, horizon, batch)
    model = build_best(model_kind, best_params, in_dim=X_train_clean.shape[1])
    model, best_val, history = train_torch_model(model, dl_tr, dl_va, epochs=EPOCHS, lr=lr, patience=PATIENCE, device=DEVICE)

    # Save best model
    torch.save(model.state_dict(), ART_DIR / f"best_{model_kind.lower()}_model.pt")

    return eval_sequence_model(model, dl_te, y_scaler), history

# Retrain each tuned architecture and keep only the evaluation part of the
# result (the training history is not used further).
lstm_eval, _lstm_history = retrain_and_test("LSTM", study_lstm.best_trial.params)
lstm_metrics, (yt_lstm, yp_lstm) = lstm_eval

gru_eval, _gru_history = retrain_and_test("GRU", study_gru.best_trial.params)
gru_metrics, (yt_gru, yp_gru) = gru_eval

print("LSTM (Optuna):", lstm_metrics)
print("GRU  (Optuna):", gru_metrics)

## Results

In [None]:
# Collect the test metrics of every model, one entry per model name.
results = dict(
    LinearRegression=lin_metrics,
    RandomForest_baseline=rf0_metrics,
    RandomForest_Optuna=rf_metrics,
    LSTM_Optuna=lstm_metrics,
    GRU_Optuna=gru_metrics,
)

# One row per model, best (lowest) RMSE first.
res_df = pd.DataFrame(results).transpose().sort_values(by="RMSE")
display(res_df.round(3))

# Disabled: persist the metrics as JSON next to the other artifacts.
# with open(ART_DIR/"tabular_results_optuna.json","w") as f:
#     json.dump({k:{m:float(vv) for m,vv in v.items()} for k,v in results.items()}, f, indent=2)
# print("Saved:", ART_DIR/"tabular_results_optuna.json")

## Visualization

In [None]:
# Plot optimization history
fig = optuna.visualization.plot_optimization_history(study_lstm)
fig.show()

# Plot parameter importances
fig = optuna.visualization.plot_param_importances(study_lstm)
fig.show()