# Fusion baseline Model (GOES + Ground)

## Libraries

In [2]:
import numpy as np, pandas as pd
from pathlib import Path
from glob import glob
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Config

In [3]:
ground_train = Path("../data_processed/ground_train_h6.parquet")
ground_val   = Path("../data_processed/ground_val_h6.parquet")
ground_test  = Path("../data_processed/ground_test_h6.parquet")
sat_pickles  = sorted(glob("../data_interim/goes_demo/datos_sateli.pkl", recursive=True))  # p.ej. goes_demo/datos_sateli.pkl
target_col   = "y_k_h6"
resample_rule = "10min"
channels = list(range(1,17))

## Data

In [4]:
g_tr = pd.read_parquet(ground_train)
g_va = pd.read_parquet(ground_val)
g_te = pd.read_parquet(ground_test)

# Asegurar índice UTC tz-aware
for g in (g_tr, g_va, g_te):
    idx = pd.to_datetime(g.index)
    g.index = idx.tz_localize("UTC") if idx.tz is None else idx.tz_convert("UTC")

print("Ground shapes:", g_tr.shape, g_va.shape, g_te.shape)

Ground shapes: (57789, 41) (12384, 41) (12384, 41)


In [None]:
sat_raw = pd.read_pickle("../data_interim/goes_demo/datos_sateli.pkl")
chan_cols = [c for c in sat_raw.columns if c.isdigit()] 
sat_feat = pd.DataFrame(index=sat_raw.index)

2024-01-01 00:00:00+00:00
2024-01-01 00:10:00+00:00
2024-01-01 00:20:00+00:00
2024-01-01 00:30:00+00:00
2024-01-01 00:40:00+00:00


In [18]:
for c in chan_cols:
    sat_feat[f"C{int(c):02d}_mean"] = sat_raw[c].apply(lambda a: np.nanmean(a) if isinstance(a, np.ndarray) else np.nan)
    sat_feat[f"C{int(c):02d}_std"]  = sat_raw[c].apply(lambda a: np.nanstd(a)  if isinstance(a, np.ndarray) else np.nan)
sat_feat.head()

Unnamed: 0,C01_mean,C01_std,C02_mean,C02_std,C03_mean,C03_std,C04_mean,C04_std,C05_mean,C05_std,...,C12_mean,C12_std,C13_mean,C13_std,C14_mean,C14_std,C15_mean,C15_std,C16_mean,C16_std
2024-01-01 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2024-01-01 00:10:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2024-01-01 00:20:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2024-01-01 00:30:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2024-01-01 00:40:00+00:00,,,,,,,,,,,...,,,,,,,,,,


In [12]:
sat.isna().sum()

1     52704
2     52704
3     52704
4     52704
5     52704
6     52704
7     52704
8     52704
9     52704
10    52704
11    52704
12    52704
13    52704
14    52704
15    52704
16    52704
dtype: int64

# Processing

In [6]:
rows, times = [], []
for ts, row in sat.iterrows():
    out = {}
    for c in channels:
        key = f"{c}"
        if key in row and isinstance(row[key], (np.ndarray, list)):
            a = np.asarray(row[key], dtype="float32")
            if a.ndim != 2: 
                continue
            m = float(np.nanmean(a))
            s = float(np.nanstd(a))
            p10 = float(np.nanpercentile(a, 10))
            p50 = float(np.nanpercentile(a, 50))
            p90 = float(np.nanpercentile(a, 90))
            gx, gy = np.gradient(np.nan_to_num(a, nan=m))
            gmean = float(np.mean(np.hypot(gx, gy)))
            out.update({
                f"C{c:02d}_mean": m, f"C{c:02d}_std": s,
                f"C{c:02d}_p10": p10, f"C{c:02d}_p50": p50, f"C{c:02d}_p90": p90,
                f"C{c:02d}_gmean": gmean
            })
    rows.append(out); times.append(ts)

sat_feat = pd.DataFrame(rows, index=pd.DatetimeIndex(times, tz="UTC")).sort_index()
sat_feat = sat_feat.resample(resample_rule).mean()
print("Sat features shape:", sat_feat.shape)

Sat features shape: (52704, 0)


In [7]:
# Train
df_tr = g_tr.join(sat_feat, how="inner")
df_tr = df_tr.dropna(subset=[target_col])
ytr = df_tr[target_col].astype("float32")
Xtr = df_tr.drop(columns=[c for c in df_tr.columns if c.startswith("y_")]).select_dtypes(include=[np.number]).astype("float32")

# Val
df_va = g_va.join(sat_feat, how="inner")
df_va = df_va.dropna(subset=[target_col])
yva = df_va[target_col].astype("float32")
Xva = df_va.drop(columns=[c for c in df_va.columns if c.startswith("y_")]).select_dtypes(include=[np.number]).astype("float32")

# Test
df_te = g_te.join(sat_feat, how="inner")
df_te = df_te.dropna(subset=[target_col])
yte = df_te[target_col].astype("float32")
Xte = df_te.drop(columns=[c for c in df_te.columns if c.startswith("y_")]).select_dtypes(include=[np.number]).astype("float32")

print("Joined shapes —",
      "train:", Xtr.shape, "val:", Xva.shape, "test:", Xte.shape)

Joined shapes — train: (40344, 40) val: (12360, 40) test: (0, 40)


## Baseline

In [9]:
if "k_ghi_lag1" in Xte.columns:
    yhat_base = Xte["k_ghi_lag1"].clip(0, 2.0)
else:
    yhat_base = pd.Series(np.median(ytr), index=yte.index)

rmse = lambda a,b: mean_squared_error(a,b)
print("Baseline  RMSE:", rmse(yte, yhat_base), " MAE:", mean_absolute_error(yte, yhat_base))

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

In [None]:
# (Random Forest simple)
rf = RandomForestRegressor(
    n_estimators=400,
    random_state=42,
    n_jobs=-1
)
rf.fit(Xtr, ytr)

yhat_va = pd.Series(rf.predict(Xva), index=Xva.index)
yhat_te = pd.Series(rf.predict(Xte), index=Xte.index)

print("Val   RMSE:", rmse(yva, yhat_va), " MAE:", mean_absolute_error(yva, yhat_va), " R2:", r2_score(yva, yhat_va))
print("Test  RMSE:", rmse(yte, yhat_te), " MAE:", mean_absolute_error(yte, yhat_te), " R2:", r2_score(yte, yhat_te))

skill = 1 - (rmse(yte, yhat_te) / rmse(yte, yhat_base))
print(f"Skill vs baseline (RMSE): {skill:.3f}")

## Plots

In [None]:
win = slice(0, min(400, len(yte)))
plt.figure(figsize=(10,3))
plt.plot(yte.iloc[win], label="truth", lw=1.2)
plt.plot(yhat_te.iloc[win], label="model", lw=1.0)
plt.plot(yhat_base.iloc[win], label="baseline", lw=1.0, alpha=0.7)
plt.title("Test — truth vs model vs baseline")
plt.grid(True, ls="--", alpha=0.3); plt.legend(); plt.tight_layout(); plt.show()

np.random.seed(0)
feat_names = Xtr.columns.tolist()
base_rmse = rmse(yte, yhat_te)
imp = []
sub = slice(0, min(3000, len(Xte)))
for col in feat_names:
    Xp = Xte.copy()
    Xp[col] = np.random.permutation(Xp[col].values)
    imp.append(rmse(yte.iloc[sub], model.predict(Xp)[sub]) - base_rmse)
imp = np.array(imp)
top = np.argsort(imp)[::-1][:15]

plt.figure(figsize=(8,5))
plt.barh([feat_names[i] for i in top][::-1], imp[top][::-1])
plt.title("Permutation importance (ΔRMSE) — top 15")
plt.tight_layout(); plt.show()


### Importance

In [None]:
imp = pd.Series(rf.feature_importances_, index=Xtr.columns).sort_values(ascending=False).head(20)
imp.plot(kind="barh", figsize=(6,6)); plt.gca().invert_yaxis()
plt.title("Feature importance (RF) — top 20"); plt.tight_layout(); plt.show()

## Export

In [None]:
# out_dir = Path("../data_interim/fusion_ml")
# out_dir.mkdir(parents=True, exist_ok=True)
# pd.DataFrame({"y_true": yte, "y_model": yhat_te, "y_base": yhat_base}).to_parquet(out_dir/"test_preds.parquet")
# Xtr.columns.to_series().to_csv(out_dir/"feature_columns.csv", index=False)
# print("Saved outputs in:", out_dir)