In [16]:
# %% Imports and paths
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib

# Input and output locations, resolved relative to the notebook's working directory.
DATA_DIR = Path("../data_processed")
OUT_DIR = Path("../models")
# Create the output directory (and any missing parents); a no-op if it already exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# One parquet file per split.
TRAIN_PQ = DATA_DIR.joinpath("ground_train_h6.parquet")
VAL_PQ = DATA_DIR.joinpath("ground_val_h6.parquet")
TEST_PQ = DATA_DIR.joinpath("ground_test_h6.parquet")

# Name of the column to predict, and the seed passed to estimators that accept one.
TARGET = "y_k_h6"
SEED = 42

In [17]:
# %% Load the three splits and run basic sanity checks
train, val, test = (pd.read_parquet(path) for path in (TRAIN_PQ, VAL_PQ, TEST_PQ))

print("train:", train.shape, "val:", val.shape, "test:", test.shape)

# The target must be present and the three splits must expose the same set of
# columns, because the feature list is derived from `train` and reused on val/test.
assert TARGET in train.columns, f"Target '{TARGET}' no está en train."
assert set(train.columns) == set(val.columns) == set(test.columns), "Columnas no coinciden entre splits."

# Drop every row with a missing value in any column (simple strategy for a baseline).
# A single dropna() already covers the target column, so a separate
# dropna(subset=[TARGET]) pass beforehand would be redundant.
train, val, test = (split.dropna() for split in (train, val, test))

print("Post dropna -> train:", train.shape, "val:", val.shape, "test:", test.shape)


train: (57789, 41) val: (12384, 41) test: (12384, 41)
Post dropna -> train: (57789, 41) val: (12384, 41) test: (12384, 41)


In [18]:
# %% Feature and target selection
# Every column except the target is used as a feature, in the column order of `train`.
# NOTE(review): this assumes no other column carries future/target-derived
# information (e.g. targets for other horizons) — confirm against the dataset schema.
feat_cols = train.columns.drop(TARGET).tolist()

# Split each frame into its (features, target) pair using the same feature list.
(Xtr, ytr), (Xva, yva), (Xte, yte) = (
    (split[feat_cols], split[TARGET]) for split in (train, val, test)
)

print("n_features:", len(feat_cols))


n_features: 40


In [19]:
# %% Metric helpers
def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    Thin wrapper around `sklearn.metrics.root_mean_squared_error`, called
    with its default options.
    """
    return root_mean_squared_error(y_true, y_pred)

def mae(y_true, y_pred):
    """Return the mean absolute error between `y_true` and `y_pred`.

    Thin wrapper around `sklearn.metrics.mean_absolute_error`, called with
    its default options.
    """
    return mean_absolute_error(y_true, y_pred)

def mape(y_true, y_pred, eps=1e-6):
    """Mean absolute percentage error, expressed in percent.

    Entries whose true value satisfies ``|y_true| <= eps`` are excluded so the
    relative error never divides by a (near-)zero target.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted values; both are converted with ``np.asarray``.
    eps : float, default 1e-6
        Threshold on ``|y_true|`` below (or at) which an entry is ignored.

    Returns
    -------
    float
        ``100 * mean(|y_true - y_pred| / |y_true|)`` over the retained entries,
        or ``np.nan`` when no entry is retained.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    keep = np.abs(actual) > eps
    if not keep.any():
        # Nothing left to average over.
        return np.nan

    actual_kept = actual[keep]
    relative_error = np.abs((actual_kept - predicted[keep]) / actual_kept)
    return relative_error.mean() * 100


In [20]:
# %% Baseline 1: median (constant prediction)
# The constant is the median of the TRAINING target; it is broadcast to one
# float prediction per row of the validation and test targets.
yhat_val_med, yhat_te_med = (
    np.full(target.shape, np.median(ytr), dtype=float) for target in (yva, yte)
)

print(
    "[Baseline: median]  val -> RMSE:", rmse(yva, yhat_val_med),
    "MAE:", mae(yva, yhat_val_med),
)
print(
    "[Baseline: median]  test -> RMSE:", rmse(yte, yhat_te_med),
    "MAE:", mae(yte, yhat_te_med),
)


[Baseline: median]  val -> RMSE: 326997.76206627977 MAE: 48908.462178305905
[Baseline: median]  test -> RMSE: 268729.7606465801 MAE: 42850.24269059426


In [21]:
# %% Baseline 2: persistence (only if 'k_ghi_lag1' is available)
# The persistence forecast reuses an existing lagged feature column as the prediction.
# NOTE(review): the target name suggests a 6-step horizon while this column is
# named lag1 — confirm this is the intended persistence reference.
persist_col = "k_ghi_lag1"
if persist_col not in feat_cols:
    print("No hay columna de persistencia ('k_ghi_lag1'); se omite baseline de persistencia.")
else:
    yhat_val_pers, yhat_te_pers = (X[persist_col].values for X in (Xva, Xte))
    print(
        "[Baseline: persistence] val -> RMSE:", rmse(yva, yhat_val_pers),
        "MAE:", mae(yva, yhat_val_pers),
    )
    print(
        "[Baseline: persistence] test -> RMSE:", rmse(yte, yhat_te_pers),
        "MAE:", mae(yte, yhat_te_pers),
    )


[Baseline: persistence] val -> RMSE: 449936.6021853133 MAE: 90276.15394551976
[Baseline: persistence] test -> RMSE: 369755.6499052737 MAE: 77893.67098874913


In [22]:
# %% Model 1: linear Ridge (fast, with feature scaling)
# with_mean=False scales without centering, in case there are many sparse
# features; it is safe with dense input as well.
# NOTE(review): in the recorded outputs, the validation MAE of this model is
# worse than that of the median baseline — worth checking the target for
# extreme values before trusting RMSE-based comparisons.
ridge = Pipeline(
    steps=[
        ("scaler", StandardScaler(with_mean=False)),
        ("model", Ridge(alpha=1.0, random_state=SEED)),
    ]
)
# fit() returns the fitted pipeline itself, so `ridge` stays usable afterwards.
yhat_val_r = ridge.fit(Xtr, ytr).predict(Xva)
print("[Ridge] val -> RMSE:", rmse(yva, yhat_val_r), "MAE:", mae(yva, yhat_val_r))


[Ridge] val -> RMSE: 321236.9037951847 MAE: 95974.60338721322


In [23]:
# # %% Modelo 2: HistGradientBoosting (árboles rápidos)
# # Mini-grid sobre (learning_rate, max_depth, max_leaf_nodes)
# grid = [
#     dict(learning_rate=0.05, max_depth=None
