# Ground Station — Data Cleaning & Feature Engineering

In [12]:
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
import numpy as np
import pandas as pd
import pvlib
import matplotlib.pyplot as plt

## Config

In [13]:
# Input parquet read by the loading cell, plus the two output folders.
PRQ_IN = Path("../data_interim/ground_features.parquet")
OUT_INT = Path("../data_interim")
OUT_PROC = Path("../data_processed")

# Create both output folders up front; a no-op when they already exist.
for _out_dir in (OUT_INT, OUT_PROC):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [14]:
@dataclass
class CFG:
    """Tunable settings for this notebook: sampling step, site location and forecast horizons."""

    # Sampling step of the series.
    freq: str = "10min"

    # Site location (latitude, longitude, altitude).
    lat: float = 4.6043
    lon: float = -74.0659
    alt_m: float = 2624.0

    # Clear-sky GHI threshold above which a timestamp counts as "day".
    day_mask_wm2: float = 50.0

    # Short gaps to impute (3 steps * 10 min = 30 min).
    max_gap_steps: int = 3

    # Forecast horizons expressed in 10-minute steps (6 -> 1 h, 12 -> 2 h, ...).
    horizons: tuple[int, ...] = (6, 12, 18, 24, 36)

    # Main horizon (6 steps = 1 h).
    target_h: int = 6


cfg = CFG()

## Data

In [15]:
# Fail early with a clear message when the input parquet is missing.
assert PRQ_IN.exists(), f"No existe {PRQ_IN}. Asegura que el notebook 01 generó este archivo."
df = pd.read_parquet(PRQ_IN)

# Normalise the index to UTC: tag naive timestamps as UTC, convert tz-aware ones.
_raw_index = df.index
df.index = _raw_index.tz_localize("UTC") if _raw_index.tz is None else _raw_index.tz_convert("UTC")

# Chronological order; the row-based shift/rolling features below rely on it.
df = df.sort_index()

In [16]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0_level_0,temp_c,hod_sin,hod_cos,doy_sin,doy_cos,zenith,azimuth,u_ms,v_ms,dewpoint_c,...,v_ms_lag2,v_ms_lag3,v_ms_roll_mean_1h,v_ms_roll_std_1h,dewpoint_c_lag1,dewpoint_c_lag2,dewpoint_c_lag3,dewpoint_c_roll_mean_1h,dewpoint_c_roll_std_1h,y_k_h6
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-09-01 05:30:00+00:00,11.872,0.0,1.0,-0.871706,-0.490029,90.0,32.84833,-2.806613,0.632498,7.966626,...,0.947251,1.397169,0.937263,0.332547,8.118482,8.040044,8.070847,8.049,0.063694,0.0
2023-09-01 05:40:00+00:00,11.756,0.0,1.0,-0.871706,-0.490029,90.0,39.975157,-3.024904,0.910623,7.991068,...,0.772134,0.947251,0.931935,0.288241,7.966626,8.118482,8.040044,8.037413,0.060942,0.0
2023-09-01 05:50:00+00:00,11.7,0.0,1.0,-0.871706,-0.490029,90.0,45.881182,-3.201425,0.852551,8.000007,...,0.632498,0.772134,0.918704,0.259839,7.991068,7.966626,8.118482,8.031179,0.056607,0.0
2023-09-01 06:00:00+00:00,11.73,0.258819,0.965926,-0.871706,-0.490029,90.0,50.758573,-2.964916,0.837757,8.020274,...,0.910623,0.632498,0.825469,0.112293,8.000007,7.991068,7.966626,8.02275,0.053181,0.0
2023-09-01 06:10:00+00:00,11.708,0.258819,0.965926,-0.871706,-0.490029,90.0,54.800003,-2.781543,0.606983,8.042253,...,0.852551,0.910623,0.768758,0.12382,8.020274,8.000007,7.991068,8.023118,0.053332,0.0


In [17]:

# Quick overview: time span covered by the index and number of columns.
_first_ts, _last_ts = df.index.min(), df.index.max()
print("Date range:", _first_ts, "→", _last_ts)
print("Columns:", df.shape[1])

Date range: 2023-09-01 05:30:00+00:00 → 2025-03-28 03:50:00+00:00
Columns: 41


## Features

In [18]:
# 1) Cyclical time encoding. Hour of day is taken in local (Bogota) time so the
#    daily cycle follows the local clock; day of year comes from the UTC index.
hod = df.index.tz_convert("America/Bogota").hour
doy = df.index.dayofyear
hod_angle = 2*np.pi*hod/24
doy_angle = 2*np.pi*doy/365
df["hod_sin"] = np.sin(hod_angle)
df["hod_cos"] = np.cos(hod_angle)
df["doy_sin"] = np.sin(doy_angle)
df["doy_cos"] = np.cos(doy_angle)

# 2) Solar position for the configured site.
solpos = pvlib.solarposition.get_solarposition(
    df.index, cfg.lat, cfg.lon, altitude=cfg.alt_m
)
# Zenith is capped at 90, so every below-horizon sample reads exactly 90.
df["zenith"] = solpos["apparent_zenith"].clip(lower=0, upper=90)
df["azimuth"] = solpos["azimuth"]

# 3) Wind U/V components from speed and direction (only when both columns exist).
if all(col in df for col in ("wspd_ms", "wdir_deg")):
    rad = np.deg2rad(df["wdir_deg"])
    wind_speed = df["wspd_ms"]
    df["u_ms"] = -wind_speed*np.sin(rad)
    df["v_ms"] = -wind_speed*np.cos(rad)

# 4) Dew point and absolute humidity (approximate formulas).
if all(col in df for col in ("Hr", "temp_c")):
    T = df["temp_c"]
    RH = df["Hr"]

    # Dew point via a Magnus-type formula; RH is clipped to [1, 100] so the log stays finite.
    a, b = 17.62, 243.12
    gamma = np.log(RH.clip(1, 100)/100.0) + (a*T)/(b+T)
    df["dewpoint_c"] = (b*gamma)/(a-gamma)

    # Absolute humidity (g/m^3), approximate formula.
    # NOTE(review): this branch uses the raw (unclipped) RH, unlike the dew point above.
    es = 6.112*np.exp((17.67*T)/(T+243.5))          # hPa
    e = RH/100.0 * es                               # hPa
    df["abs_humidity_gm3"] = 2.1674*e*100/(273.15+T) # g/m^3

# 5) Causal lags & rolling statistics (current and past samples only)
def add_lags_roll(base_cols, lags=(1,2,3), roll_steps: int = 6, frame=None, roll_label: str = "1h") -> None:
    """Add lag and trailing rolling mean/std columns, in place, for each base column.

    For every name in ``base_cols`` that exists in the frame this creates:

    * ``<col>_lag<L>`` for each ``L`` in ``lags`` (row-based ``shift(L)``);
    * ``<col>_roll_mean_<roll_label>`` and ``<col>_roll_std_<roll_label>`` over a
      trailing window of ``roll_steps`` rows that ends at the current row.

    Shifts and windows are counted in rows, not in time, so the frame is expected
    to be sorted and regularly sampled.

    Parameters
    ----------
    base_cols : iterable of str
        Candidate column names; names missing from the frame are skipped silently.
    lags : iterable of int
        Row offsets for the lag columns.
    roll_steps : int
        Rolling window length in rows (must be >= 1).
    frame : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``df`` so existing calls
        keep working unchanged.
    roll_label : str
        Suffix used in the rolling column names. The default ``"1h"`` matches the
        default window of 6 steps of 10 minutes.

    Returns
    -------
    None
        The frame is modified in place. Nothing is returned, so a call placed last
        in a cell produces no output.

    Raises
    ------
    ValueError
        If ``roll_steps`` is smaller than 1.
    """
    if roll_steps < 1:
        raise ValueError(f"roll_steps must be >= 1, got {roll_steps}")

    target = df if frame is None else frame

    # At least 2 observations (needed for a std), but never more than the window
    # itself: pandas rejects min_periods > window, which used to break roll_steps=1.
    min_obs = min(roll_steps, max(2, roll_steps // 2))

    for col in base_cols:
        if col not in target:
            continue
        series = target[col]
        for lag in lags:
            target[f"{col}_lag{lag}"] = series.shift(lag)
        # Build the window once and reuse it for both statistics.
        window = series.rolling(roll_steps, min_periods=min_obs)
        target[f"{col}_roll_mean_{roll_label}"] = window.mean()
        target[f"{col}_roll_std_{roll_label}"] = window.std()

# Base signals that receive lag and rolling features.
base_for_lags = [
    "k_ghi",
    "ghi_qc",
    "temp_c",
    "u_ms",
    "v_ms",
    "dewpoint_c",
    "abs_humidity_gm3",
]
# Window of 6 steps * 10 min = 1 h.
add_lags_roll(base_for_lags, lags=(1, 2, 3), roll_steps=6)

In [19]:
# Multi-horizon targets: the value of each source signal h rows into the future.

# Daytime mask from clear-sky GHI; all False when the column is missing.
# NOTE(review): mask_day is not applied anywhere in the cells visible here.
if "ghi_cs" in df:
    _clear_sky = df["ghi_cs"]
else:
    _clear_sky = pd.Series(index=df.index, dtype=float)
mask_day = _clear_sky > cfg.day_mask_wm2

# The source columns do not change inside the loop, so check for them once.
_has_k = "k_ghi" in df
_has_ghi = "ghi_qc" in df
for h in cfg.horizons:
    if _has_k:
        df[f"y_k_h{h}"] = df["k_ghi"].shift(-h)
    if _has_ghi:
        df[f"y_ghi_h{h}"] = df["ghi_qc"].shift(-h)

## Selection

In [20]:
# Name fragments that mark a column as a model feature. Despite the variable
# name they are matched anywhere in the column name, not only at its start.
feat_prefixes = (
    "_lag", "_roll_", "hod_", "doy_", "zenith", "azimuth", "u_ms", "v_ms",
    "dewpoint_c", "abs_humidity_gm3", "temp_c", "Hr", "p_hpa", "wspd_ms", "wdir_deg", "ghi_cs",
)
feature_cols = [c for c in df.columns if any(p in c for p in feat_prefixes)]

# Prefer the GHI target at the main horizon; fall back to the k-index target.
_ghi_target = f"y_ghi_h{cfg.target_h}"
target_col = _ghi_target if _ghi_target in df else f"y_k_h{cfg.target_h}"
assert target_col in df, f"No existe {target_col}; revisa que existan 'ghi_qc' o 'k_ghi'."

# Keep only fully observed rows (every feature and the target present).
Xy = df[[*feature_cols, target_col]].dropna()

## Split & Export

In [21]:
# %% Chronological 70/15/15 split (train/val/test), no shuffling
n = len(Xy)
i1, i2 = int(n*0.70), int(n*0.85)
train = Xy.iloc[:i1].copy()
val = Xy.iloc[i1:i2].copy()
test = Xy.iloc[i2:].copy()

print("Filas:", len(Xy), "| Train:", len(train), "Val:", len(val), "Test:", len(test))
print("Target:", target_col)

# %% Export
# Full feature/target table.
OUT_ALL = OUT_INT / "ground_features_v2.parquet"
Xy.to_parquet(OUT_ALL, engine="pyarrow", compression="zstd")

# Per-split files (handy for direct modelling). Each path is built once and
# reused for both writing and the summary printed below.
_split_targets = (
    (train, OUT_PROC / f"ground_train_h{cfg.target_h}.parquet"),
    (val, OUT_PROC / f"ground_val_h{cfg.target_h}.parquet"),
    (test, OUT_PROC / f"ground_test_h{cfg.target_h}.parquet"),
)
for _part, _path in _split_targets:
    _part.dropna().to_parquet(_path, engine="pyarrow", compression="zstd")

print("✅ Guardado:")
for _path in (OUT_ALL, *(_p for _, _p in _split_targets)):
    print(" -", _path)

Filas: 82557 | Train: 57789 Val: 12384 Test: 12384
Target: y_k_h6
✅ Guardado:
 - ../data_interim/ground_features_v2.parquet
 - ../data_processed/ground_train_h6.parquet
 - ../data_processed/ground_val_h6.parquet
 - ../data_processed/ground_test_h6.parquet


## Sanity

In [22]:
# Correlation of every numeric training column with the target.
# NOTE(review): sorted by signed value, so strong negative correlations do not
# appear in this "top" list.
_corr_matrix = train.corr(numeric_only=True)
corrs = _corr_matrix[target_col].sort_values(ascending=False)

print("Top features correlacionadas con el target:")
print(corrs.drop(labels=[target_col]).head(10))

Top features correlacionadas con el target:
zenith                0.110085
k_ghi_roll_mean_1h    0.098631
hod_cos               0.093785
k_ghi_lag1            0.082287
k_ghi_roll_std_1h     0.077012
k_ghi_lag2            0.062642
k_ghi_lag3            0.045565
v_ms_roll_std_1h      0.023177
u_ms_roll_std_1h      0.011458
azimuth               0.011331
Name: y_k_h6, dtype: float64
