# Fusion baseline Model (GOES + Ground)

## Libraries

In [1]:
import numpy as np, pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import warnings; warnings.filterwarnings("ignore")
np.random.seed(42)

# Deep learning
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
tf.random.set_seed(42)

2025-09-11 07:38:43.120238: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-11 07:38:43.125691: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757594323.131939  336635 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757594323.134004  336635 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-09-11 07:38:43.141095: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

## Config

In [2]:
ground_train = Path("../data_processed/ground_train_h6.parquet")
ground_val   = Path("../data_processed/ground_val_h6.parquet")
ground_test  = Path("../data_processed/ground_test_h6.parquet")

sat_features = Path("../data_interim/goes_dataset/goes_features.parquet")  # generado en tu pipeline
target_col   = "y_k_h6"
FREQ         = "10min"
SEQ_LEN      = 12 # 12*10min = 2 horas de contexto

## Data

In [3]:
g_tr = pd.read_parquet(ground_train)
g_va = pd.read_parquet(ground_val)
g_te = pd.read_parquet(ground_test)

for g in (g_tr, g_va, g_te):
    idx = pd.to_datetime(g.index)
    g.index = idx.tz_localize("UTC") if idx.tz is None else idx.tz_convert("UTC")

sat = pd.read_parquet(sat_features)
sat.index = pd.to_datetime(sat.index)
sat.index = sat.index.tz_localize("UTC") if sat.index.tz is None else sat.index.tz_convert("UTC")
sat = sat.select_dtypes(include=[np.number])  # sólo numérico
sat.index = sat.index.round(FREQ)
sat = sat.groupby(sat.index).mean().sort_index()

print("Ground:", g_tr.shape, g_va.shape, g_te.shape, " | Sat:", sat.shape)

Ground: (57789, 41) (12384, 41) (12384, 41)  | Sat: (300, 32)


In [None]:
# print("=== PreDiagnosis ===")
# print("Ground test shape:", g_te.shape)
# print("Sat features shape:", sat.shape)
# print("Ground test range:", g_te.index.min(), "->", g_te.index.max())
# print("Sat features range:", sat.index.min(), "->", sat.index.max())

# # Verifica si las columnas satelitales existen
# sat_cols = [c for c in sat.columns if 'mean' in c or 'std' in c]
# print(f"Columnas satelitales en sat: {len(sat_cols)}")
# print(f"Columnas satelitales en Xte: {len([c for c in Xte.columns if c in sat_cols])}")

# # Muestra las columnas faltantes
# missing_in_test = set(Xtr.columns) - set(Xte.columns)
# print(f"Columnas en train pero no en test: {len(missing_in_test)}")
# if missing_in_test:
#     print("Son:", list(missing_in_test))#[:5])

=== PreDiagnosis ===
Ground test shape: (12384, 41)
Sat features shape: (300, 32)
Ground test range: 2025-01-01 04:00:00+00:00 -> 2025-03-28 03:50:00+00:00
Sat features range: 2024-02-01 00:30:00+00:00 -> 2024-02-10 15:10:00+00:00
Columnas satelitales en sat: 32
Columnas satelitales en Xte: 0
Columnas en train pero no en test: 32
Son: ['C13_std', 'C15_std', 'C06_std', 'C07_mean', 'C01_mean', 'C01_std', 'C09_mean', 'C07_std', 'C12_mean', 'C12_std', 'C05_mean', 'C11_mean', 'C16_mean', 'C13_mean', 'C04_std', 'C14_mean', 'C09_std', 'C03_mean', 'C04_mean', 'C08_mean', 'C16_std', 'C14_std', 'C08_std', 'C10_std', 'C06_mean', 'C05_std', 'C11_std', 'C02_mean', 'C02_std', 'C10_mean', 'C15_mean', 'C03_std']


## Join & cleaning

In [4]:
def prepare_split(gdf, sat_df, target):
    df = gdf.join(sat_df, how="left")
    df = df.dropna(subset=[target])

    # features = todo lo numérico excepto targets
    X = (
        df.drop(columns=[c for c in df.columns if c.startswith("y_") or c == target], errors="ignore")
          .select_dtypes(include=[np.number])
          .astype("float32")
    )
    y = df[target].astype("float32")

    # ffill/bfill por día para reducir huecos de sat
    X = X.groupby(X.index.date).apply(lambda d: d.ffill().bfill()).reset_index(level=0, drop=True)
    # quita columnas vacías
    X = X.dropna(axis=1, how="all")
    # filtra filas con demasiados NaN
    keep = (X.isna().mean(axis=1) <= 0.3)
    X, y = X.loc[keep], y.loc[keep]
    # relleno final mínimo
    X = X.fillna(method="ffill").fillna(method="bfill")

    return X, y

Xtr, ytr = prepare_split(g_tr, sat, target_col)
Xva, yva = prepare_split(g_va, sat, target_col)
Xte, yte = prepare_split(g_te, sat, target_col)

print("Joined — train:", Xtr.shape, "val:", Xva.shape, "test:", Xte.shape)
if min(len(Xtr), len(Xva), len(Xte)) == 0:
    raise RuntimeError("❌ No quedaron filas tras el join/limpieza. Revisa cobertura temporal de sat/ground.")


Joined — train: (1440, 72) val: (12384, 40) test: (12384, 40)


## Baseline (RF)

In [6]:
def rmse(a,b): return mean_squared_error(a,b)

if "k_ghi_lag1" in Xte.columns:
    yhat_base = Xte["k_ghi_lag1"].clip(0, 2.0)
else:
    yhat_base = pd.Series(np.median(ytr), index=yte.index)

print("Baseline → RMSE:", rmse(yte, yhat_base), " MAE:", mean_absolute_error(yte, yhat_base))

rf = RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1)
rf.fit(Xtr, ytr)
yhat_te_rf = pd.Series(rf.predict(Xte), index=Xte.index)
print("RF Test → RMSE:", rmse(yte, yhat_te_rf), " MAE:", mean_absolute_error(yte, yhat_te_rf), " R2:", r2_score(yte, yhat_te_rf))


Baseline → RMSE: 72215650304.0  MAE: 42850.25390625


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- C01_mean
- C01_std
- C02_mean
- C02_std
- C03_mean
- ...


In [None]:
# sat.isna().sum()

## Sequences

In [None]:
scaler = StandardScaler()
Xtr_s = pd.DataFrame(scaler.fit_transform(Xtr), index=Xtr.index, columns=Xtr.columns)
Xva_s = pd.DataFrame(scaler.transform(Xva), index=Xva.index, columns=Xva.columns)
Xte_s = pd.DataFrame(scaler.transform(Xte), index=Xte.index, columns=Xte.columns)

def build_sequences(X_df, y_ser, seq_len):
    Xv, yv = X_df.values, y_ser.values
    n = len(X_df)
    xs, ys, idxs = [], [], []
    for i in range(seq_len-1, n):
        # asumimos grilla regular 10 min; si quieres, verifica gaps aquí
        block = Xv[i-seq_len+1:i+1]
        if np.isnan(block).any():
            continue
        xs.append(block)
        ys.append(yv[i])
        idxs.append(X_df.index[i])
    return np.array(xs, dtype="float32"), np.array(ys, dtype="float32"), pd.DatetimeIndex(idxs)

Xtr_seq, ytr_seq, i_tr = build_sequences(Xtr_s, ytr, SEQ_LEN)
Xva_seq, yva_seq, i_va = build_sequences(Xva_s, yva, SEQ_LEN)
Xte_seq, yte_seq, i_te = build_sequences(Xte_s, yte, SEQ_LEN)

n_features = Xtr_seq.shape[2]
print("Seq shapes →",
      "Xtr", Xtr_seq.shape, "Xva", Xva_seq.shape, "Xte", Xte_seq.shape, "| features:", n_features)

if min(len(Xtr_seq), len(Xva_seq), len(Xte_seq)) == 0:
    raise RuntimeError("❌ No hay suficientes ventanas para las secuencias. Reduce SEQ_LEN o revisa cobertura.")

# Alinear baseline a las muestras secuenciales del test
ybase_seq = yhat_base.reindex(i_te).to_numpy()

NameError: name 'sat' is not defined

## Models

In [None]:
es = callbacks.EarlyStopping(patience=10, restore_best_weights=True, monitor="val_loss")

def fit_and_eval(model, name, epochs=60, batch=256):
    model.compile(optimizer="adam", loss="mse", metrics=[tf.keras.metrics.MAE])
    hist = model.fit(Xtr_seq, ytr_seq,
                     validation_data=(Xva_seq, yva_seq),
                     epochs=epochs, batch_size=batch,
                     verbose=0, callbacks=[es])
    # predicciones
    yhat_va = model.predict(Xva_seq, verbose=0).squeeze()
    yhat_te = model.predict(Xte_seq, verbose=0).squeeze()
    # métricas
    def rmse_np(a,b): return np.sqrt(np.mean((a-b)**2))
    print(f"{name} → Val RMSE: {rmse_np(yva_seq, yhat_va):.4f} | Test RMSE: {rmse_np(yte_seq, yhat_te):.4f} "
          f"| Test MAE: {mean_absolute_error(yte_seq, yhat_te):.4f} | R2: {r2_score(yte_seq, yhat_te):.4f}")
    return hist, yhat_te

Joined shapes — train: (40344, 40) val: (12360, 40) test: (0, 40)


In [None]:
mdl_lstm = models.Sequential([
    layers.Input(shape=(SEQ_LEN, n_features)),
    layers.LSTM(64),
    layers.Dense(1)
])
hist_lstm, yhat_lstm = fit_and_eval(mdl_lstm, "LSTM")

In [None]:
mdl_bilstm = models.Sequential([
    layers.Input(shape=(SEQ_LEN, n_features)),
    layers.Bidirectional(layers.LSTM(64)),
    layers.Dense(1)
])
hist_bilstm, yhat_bilstm = fit_and_eval(mdl_bilstm, "BiLSTM")

In [None]:
mdl_cnnlstm = models.Sequential([
    layers.Input(shape=(SEQ_LEN, n_features)),
    layers.Conv1D(32, kernel_size=3, padding="causal", activation="relu"),
    layers.MaxPooling1D(pool_size=2),
    layers.LSTM(64),
    layers.Dense(1)
])
hist_cnnlstm, yhat_cnnlstm = fit_and_eval(mdl_cnnlstm, "CNN-LSTM")

## Comparison

In [None]:
def plot_series(y_true, preds_dict, n=500, title="Test — Truth vs Models (primeros puntos)"):
    sl = slice(0, min(n, len(y_true)))
    plt.figure(figsize=(10,3))
    plt.plot(y_true[sl], label="truth", lw=1.2)
    for name, yhat in preds_dict.items():
        plt.plot(pd.Series(yhat, index=i_te)[sl], label=name, lw=1.0)
    plt.title(title); plt.grid(True, ls="--", alpha=0.3); plt.legend(); plt.tight_layout(); plt.show()


In [None]:
from math import isfinite
plot_series(pd.Series(yte_seq, index=i_te),
            {"baseline": ybase_seq,
             "RF": yhat_te_rf.reindex(i_te).values,
             "LSTM": yhat_lstm,
             "BiLSTM": yhat_bilstm,
             "CNN-LSTM": yhat_cnnlstm},
            n=500)

In [None]:
def scatter_plot(y_true, y_pred, name):
    plt.figure(figsize=(4.2,4))
    plt.scatter(y_true, y_pred, s=8, alpha=0.4)
    lim = [0, max(2.0, np.nanmax(y_true), np.nanmax(y_pred))]
    plt.plot(lim, lim, "k--", lw=1)
    plt.xlim(lim); plt.ylim(lim)
    plt.xlabel("y_true (k)"); plt.ylabel("y_pred (k)")
    plt.title(f"Scatter Test — {name}")
    plt.grid(True, ls="--", alpha=0.3); plt.tight_layout(); plt.show()

scatter_plot(yte_seq, yhat_cnnlstm, "CNN-LSTM")

In [None]:
# Hist residual (CNN-LSTM)
res = yhat_cnnlstm - yte_seq
plt.figure(figsize=(6,3)); plt.hist(res, bins=40, alpha=0.85)
plt.title("Residuals (y_pred - y_true) — CNN-LSTM (Test)")
plt.xlabel("residual"); plt.ylabel("count")
plt.grid(True, ls="--", alpha=0.3); plt.tight_layout(); plt.show()


In [None]:
# Skill vs Baseline (secuencias)
def rmse_np(a,b): return np.sqrt(np.mean((a-b)**2))
rmse_base = rmse_np(yte_seq, ybase_seq)
for name, yhat in [("RF", yhat_te_rf.reindex(i_te).values),
                   ("LSTM", yhat_lstm),
                   ("BiLSTM", yhat_bilstm),
                   ("CNN-LSTM", yhat_cnnlstm)]:
    s = 1 - (rmse_np(yte_seq, yhat) / rmse_base)
    print(f"Skill (RMSE) vs baseline — {name}: {s:.3f}")