# Ground Station — Data Cleaning & Feature Engineering

In [9]:
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
import numpy as np
import pandas as pd
import pvlib
import matplotlib.pyplot as plt

## Config

In [10]:
# Parquet file read by the data-loading cell, plus the interim and processed data folders.
PRQ_IN = Path("../data_interim/ground_features.parquet")
OUT_INT = Path("../data_interim")
OUT_PROC = Path("../data_processed")

# Create both folders (and any missing parents); an already existing folder is not an error.
for _out_dir in (OUT_INT, OUT_PROC):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Time-step label; not referenced by the cells visible in this notebook section.
FREQ = "10min"

# Site coordinates (presumably decimal degrees and metres — confirm against the data source).
LAT = 4.6043
LON = -74.0659
ALT_M = 2624.0

# "Daytime" threshold for k.
DAY_WM2 = 50.0

# TARGET_BASE = "ghi_qc"  (disabled)

# Forecast horizons, in time steps, iterated by the multi-horizon export loop.
H_LIST = [1, 3, 6, 9]

## Data

In [None]:
# Read the feature table and order its rows by index.
df = pd.read_parquet(PRQ_IN).sort_index()

# Put the index in UTC: a tz-naive index is labelled as UTC, a tz-aware one is converted.
df.index = (
    df.index.tz_localize("UTC")
    if df.index.tz is None
    else df.index.tz_convert("UTC")
)

# Quick summary of what was loaded.
print("Date range:", df.index.min(), "→", df.index.max())
print("Columns:", len(df.columns))

In [13]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0_level_0,Hr,p_hpa,wdir_deg,temp_c,wspd_ms,ghi_cs,hod_sin,hod_cos,doy_sin,doy_cos,...,y_ghi_h18,y_ghi_sg_h18,y_k_raw_h24,y_k_h24,y_ghi_h24,y_ghi_sg_h24,y_k_raw_h36,y_k_h36,y_ghi_h36,y_ghi_sg_h36
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-09-01 05:00:00+00:00,74.391998,745.236023,135.766006,12.505,1.95,0.0,0.0,1.0,-0.871706,-0.490029,...,0.0,0.0,,10371.428711,0.0,0.010371,,3.157978,13.117,13.100971
2023-09-01 05:10:00+00:00,76.579002,745.085999,112.135002,12.032,2.514,0.0,0.0,1.0,-0.871706,-0.490029,...,0.0,0.0,,29085.714844,0.038,0.029086,,0.846947,21.969999,21.708057
2023-09-01 05:20:00+00:00,75.459999,744.940002,111.295998,12.337,2.126,0.0,0.0,1.0,-0.871706,-0.490029,...,0.0,0.0,,28085.714844,0.031,0.028086,0.477264,0.477013,29.533001,29.517487
2023-09-01 05:30:00+00:00,77.004997,744.969971,102.699997,11.872,2.877,0.0,0.0,1.0,-0.871706,-0.490029,...,0.0,0.0,,-9857.142578,0.0,-0.009857,0.344864,0.355321,36.602001,37.71183
2023-09-01 05:40:00+00:00,77.724998,745.02002,106.753998,11.756,3.159,0.0,0.0,1.0,-0.871706,-0.490029,...,0.0,0.0,,63171.429688,0.0,0.063171,0.285757,0.26568,44.153999,41.051743


In [14]:
# Report the first and last index values and the number of columns in `df`.
first_ts, last_ts = df.index.min(), df.index.max()
print("Date range:", first_ts, "→", last_ts)
print("Columns:", df.shape[1])

Date range: 2023-09-01 05:00:00+00:00 → 2025-03-28 04:50:00+00:00
Columns: 65


## Multi-horizon export

In [None]:
def _select_target(y_columns: list[str], horizon: int) -> str | None:
    """Pick the target column for one forecast horizon.

    Parameters
    ----------
    y_columns : names of the candidate target columns (those starting with "y_").
    horizon : horizon index H used in the "_h{H}" column suffix.

    Returns
    -------
    The first existing column among y_ghi_h{H}, y_k_h{H}, y_ghi_sg_h{H}; failing
    that, the first column of `y_columns` ending in "_h{H}"; otherwise None.
    """
    for name in (f"y_ghi_h{horizon}", f"y_k_h{horizon}", f"y_ghi_sg_h{horizon}"):
        if name in y_columns:
            return name
    return next((c for c in y_columns if c.endswith(f"_h{horizon}")), None)


# Horizon-independent work, done once instead of on every iteration:
# the numeric view of `df` and the split of its columns into y_* targets and features.
all_numeric = df.select_dtypes(include=[np.number])
y_cols = [c for c in all_numeric.columns if c.startswith("y_")]
BASE_FEATS = [c for c in all_numeric.columns if c not in y_cols]

# Cumulative cut points of the chronological 70/15/15 split.
TRAIN_END_FRAC = 0.70
VAL_END_FRAC = 0.85

for H in H_LIST:
    print("\n==============================================")
    print(f"=== Procesando horizonte H={H} (=> +{H*10} min) ===")
    print("==============================================")

    # ---- Target for this horizon ----
    # Targets are searched among the numeric y_* columns only, because the
    # dataset below is built from the numeric view; a non-numeric y_* column
    # could not be selected from it.
    TARGET = _select_target(y_cols, H)
    if TARGET is None:
        print(f"  → No encontré ningún target y_* para h={H}, se omite.")
        continue

    print("  Usando TARGET:", TARGET)

    # ---- Features + target ----
    # No y_* column is used as a feature (avoids leaking other targets).
    # ±inf is treated as missing; dropping rows with any NaN also guarantees
    # the target is present, so a separate target-only dropna is unnecessary.
    FEATS = list(BASE_FEATS)
    Xy = (
        all_numeric[FEATS + [TARGET]]
        .replace([np.inf, -np.inf], np.nan)
        .dropna(how="any")
    )

    # Drop features that carry a single value after cleaning.
    const_cols = [c for c in FEATS if Xy[c].nunique(dropna=True) <= 1]
    if const_cols:
        Xy = Xy.drop(columns=const_cols)
        FEATS = [c for c in FEATS if c not in const_cols]
        print("  Quité columnas constantes:", const_cols)

    print(f"  Filas tras limpieza: {len(Xy)} | #Features: {len(FEATS)} | Target: {TARGET}")

    # ---- Chronological 70/15/15 split (rows are already sorted by index) ----
    # NOTE(review): there is no gap between segments. If y_*_h{H} holds the value
    # H steps ahead, the last training rows have targets inside the validation
    # period — confirm whether a purge gap is wanted.
    n = len(Xy)
    i1 = int(n * TRAIN_END_FRAC)
    i2 = int(n * VAL_END_FRAC)
    if not (0 < i1 < i2 < n):
        # At least one split would be empty: skip instead of exporting empty files.
        print(f"  → Muy pocas filas ({n}) para el split 70/15/15 con h={H}, se omite.")
        continue

    train = Xy.iloc[:i1].copy()
    val   = Xy.iloc[i1:i2].copy()
    test  = Xy.iloc[i2:].copy()

    # ---- Export one parquet file per split and horizon ----
    train_out = OUT_PROC / f"ground_train_h{H}.parquet"
    val_out   = OUT_PROC / f"ground_val_h{H}.parquet"
    test_out  = OUT_PROC / f"ground_test_h{H}.parquet"

    train.to_parquet(train_out, engine="pyarrow", compression="zstd")
    val.to_parquet(val_out, engine="pyarrow", compression="zstd")
    test.to_parquet(test_out, engine="pyarrow", compression="zstd")

    print(f"  Guardado splits para {TARGET} → h{H}")
    print("    train:", train.shape, "→", train_out)
    print("    val  :", val.shape,   "→", val_out)
    print("    test :", test.shape,  "→", test_out)

    # ---- Sanity check: Spearman correlation of each feature with the target ----
    # Computed on the training split only, ordered by absolute value.
    corrs = train[FEATS + [TARGET]].corr(
        method="spearman", numeric_only=True
    )[TARGET].drop(labels=[TARGET]).sort_values(key=np.abs, ascending=False)

    print(f"\n  Top 10 features correlacionadas con {TARGET} (Spearman):")
    print(corrs.head(10))

    # Horizontal bar chart of the strongest correlations (largest at the top).
    topN = min(20, len(corrs))
    fig, ax = plt.subplots(figsize=(8, max(4, 0.3 * topN)))
    ax.barh(corrs.index[:topN][::-1], corrs.values[:topN][::-1])
    ax.set_title(f"Top {topN} correlaciones (Spearman) con {TARGET} — h={H}")
    ax.set_xlabel("ρ_s")
    ax.grid(True, ls="--", alpha=0.3)
    fig.tight_layout()
    plt.show()
    # Release the figure so figures do not accumulate across horizons.
    plt.close(fig)

Usando TARGET: y_ghi_h6
