In [9]:
# MODIS L3b PAR (.x.nc) -> lat, lon, timestamp, light
# - Reads OBPG L3b data under /level-3_binned_data: BinList (compound), par (compound), BinIndex
# - Computes the per-bin mean: mean = sum / weights (bins whose weights are not > 0 are dropped)
# - Converts bin_num -> (lat, lon) using the ISIN grid (NROWS=4320 or 2160, inferred from attrs)
# - timestamp: midpoint between time_coverage_start and time_coverage_end
# - Output: incremental Parquet dataset partitioned by year/month
#
# Requires: pip install h5py numpy pandas pyarrow tqdm

from __future__ import annotations
from pathlib import Path
from datetime import datetime, timezone
import math
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import pyarrow as pa
import pyarrow.parquet as pq
import h5py

# ==========================
# Localización I/O
# ==========================
def find_light_sample_dir(start: Path | None = None) -> Path:
    start = (start or Path.cwd()).resolve()
    for parent in [start, *start.parents]:
        cand = parent / "downloads" / "light" / "sample"
        if cand.is_dir():
            return cand
    raise FileNotFoundError(f"Could not find 'downloads/light/sample' starting from {start}")

# Input lives in <repo>/downloads/light/sample, so the repository root is three
# levels above it; the Parquet output goes to <repo>/transform/light/sample.
SAMPLE_DIR = find_light_sample_dir()
REPO_ROOT  = SAMPLE_DIR.parent.parent.parent
OUT_PARQ   = REPO_ROOT.joinpath("transform", "light", "sample")
OUT_PARQ.mkdir(parents=True, exist_ok=True)

# Echo both locations so the notebook output records where data was read and written.
print(f"📂 Input:  {SAMPLE_DIR}")
print(f"🧺 Output Parquet (partitioned): {OUT_PARQ}")

# ==========================
# Configuración
# ==========================
# Compression codec passed to pq.write_to_dataset for every block written.
COMPRESSION = "snappy"
# Maximum number of rows handed to a single write_parquet_block call; larger
# per-file frames are written in consecutive slices of this size.
CHUNK_ROWS = 2_000_000

# (Optional) Bounding-box crop, unpacked by the main loop as (south, west, north, east).
# Leave it as None to disable filtering.
# BBOX = (12.00284, -81.99937, 14.98947, -79.83031)
BBOX = None

# ==========================
# Tiempo (corregido)
# ==========================
def parse_utc_iso8601(s: str) -> datetime:
    s = (s or "").strip().replace("Z", "+00:00")
    return datetime.fromisoformat(s).astimezone(timezone.utc)

def midpoint_time(attrs: dict) -> pd.Timestamp:
    t0 = parse_utc_iso8601(attrs.get("time_coverage_start", ""))
    t1 = parse_utc_iso8601(attrs.get("time_coverage_end", ""))
    if isinstance(t0, datetime) and isinstance(t1, datetime):
        return pd.Timestamp(t0 + (t1 - t0) / 2)  # no mezclar tz=
    if isinstance(t0, datetime):
        return pd.Timestamp(t0)
    return pd.NaT

def to_utc_series(s: pd.Series) -> pd.Series:
    """Return ``s`` as a datetime series expressed in UTC.

    Values are parsed with ``pd.to_datetime(..., errors="coerce")``, so entries
    that cannot be parsed become ``NaT``. A series without a time zone is
    labelled as UTC; a series that already has one is converted to UTC.
    """
    # utc=True is deliberately not passed: the input may already carry a tz.
    parsed = pd.to_datetime(s, errors="coerce")
    accessor = parsed.dt
    if accessor.tz is not None:
        return accessor.tz_convert("UTC")
    return accessor.tz_localize("UTC")

# ==========================
# Grilla ISIN
# ==========================
def detect_nrows_from_attrs(attrs: dict) -> int:
    """Guess the number of ISIN grid rows from the file's resolution attributes.

    The attributes ``geospatial_lat_resolution``, ``geospatial_lon_resolution``
    and ``spatialResolution`` are inspected in that order. The first usable
    number found is read as a resolution in kilometres: below 7 km maps to
    4320 rows (~4.64 km bins), anything else to 2160 rows (~9.27 km bins).

    A number may be followed by its unit with or without a space
    (``"9.28 km"``, ``"9.28km"``, ``"9.28 kilometers"``). Tokens that are not
    finite positive numbers are ignored.

    NOTE(review): values are assumed to be kilometres; if a producer stores
    these attributes in degrees every file would map to 4320 — confirm.

    Args:
        attrs: Mapping of global attribute names to values.

    Returns:
        4320 or 2160; 4320 when no resolution can be determined.
    """
    for key in ("geospatial_lat_resolution", "geospatial_lon_resolution", "spatialResolution"):
        value = attrs.get(key)
        if not value:
            continue
        text = str(value).lower().replace("kilometers", "km").replace("kilometres", "km")
        for token in text.split():
            # Accept a unit glued to the number ("9.28km"); a bare "km" token
            # becomes "" and is rejected by float() below.
            number = token[:-2] if token.endswith("km") else token
            try:
                res_km = float(number)
            except ValueError:
                continue
            if math.isfinite(res_km) and res_km > 0:
                return 4320 if res_km < 7 else 2160
    return 4320

_row_cache: dict[int, tuple[np.ndarray,np.ndarray,np.ndarray]] = {}

def build_row_index(NROWS: int):
    row_lat = np.empty(NROWS, dtype=np.float64)
    nbin_row = np.empty(NROWS, dtype=np.int64)
    basebin = np.empty(NROWS+1, dtype=np.int64)
    basebin[0] = 1
    for i in range(NROWS):
        lat = 90.0 - 180.0 * (i + 0.5) / NROWS
        row_lat[i] = lat
        nbin = int(round(2 * NROWS * math.cos(math.radians(lat))))
        if nbin < 1:
            nbin = 1
        nbin_row[i] = nbin
        basebin[i+1] = basebin[i] + nbin
    return row_lat, nbin_row, basebin

def bin_to_latlon(bin_nums: np.ndarray, NROWS: int) -> tuple[np.ndarray, np.ndarray]:
    if NROWS not in _row_cache:
        _row_cache[NROWS] = build_row_index(NROWS)
    row_lat, nbin_row, basebin = _row_cache[NROWS]
    idx = np.searchsorted(basebin, bin_nums, side="right") - 1
    idx = np.clip(idx, 0, NROWS-1)
    col = bin_nums - basebin[idx] - 1
    lat = row_lat[idx]
    lon = -180.0 + 360.0 * (col + 0.5) / nbin_row[idx]
    return lat.astype(np.float32), lon.astype(np.float32)

# ==========================
# Escritura Parquet
# ==========================
def write_parquet_block(df: pd.DataFrame):
    """Append one block of rows to the partitioned Parquet dataset at OUT_PARQ.

    The ``year`` and ``month`` partition columns are derived from the
    ``timestamp`` column after normalising it to UTC. The columns written are
    ``lat``, ``lon``, ``timestamp``, ``light``, ``year`` and ``month``; the
    input frame is not modified.

    Args:
        df: Frame with at least ``lat``, ``lon``, ``timestamp`` and ``light``.
    """
    utc_ts = to_utc_series(df["timestamp"])
    payload = df[["lat", "lon", "timestamp", "light"]].assign(
        year=utc_ts.dt.year.astype("int16"),
        month=utc_ts.dt.month.astype("int8"),
    )
    table = pa.Table.from_pandas(payload, preserve_index=False)
    pq.write_to_dataset(
        table,
        root_path=str(OUT_PARQ),
        partition_cols=["year", "month"],
        compression=COMPRESSION,
    )

# ==========================
# Utilidades HDF5
# ==========================
def attrs_collect(f: h5py.File) -> dict:
    """Collect root attributes, filling gaps from /processing_control/input_parameters.

    Root attributes always win. For a fixed set of resolution and
    time-coverage names that are absent at the root, the value is taken from
    the ``/processing_control/input_parameters`` group when that group exists
    and carries it. Values of type ``bytes`` are decoded to ``str``; all other
    values are returned unchanged.
    """
    collected = {name: f.attrs[name] for name in f.attrs.keys()}
    fallback_names = (
        "geospatial_lat_resolution",
        "geospatial_lon_resolution",
        "spatialResolution",
        "time_coverage_start",
        "time_coverage_end",
    )
    try:
        extra = f["/processing_control/input_parameters"].attrs
        for name in fallback_names:
            if name in collected or name not in extra:
                continue
            collected[name] = extra[name]
    except KeyError:
        # The group is optional; keep whatever the root provided.
        pass

    def _as_text(value):
        return value.decode() if isinstance(value, (bytes, np.bytes_)) else value

    return {name: _as_text(value) for name, value in collected.items()}

def get_group_level3(f: h5py.File) -> h5py.Group | None:
    """Return the group holding the level-3 binned data, or None if there is none.

    A short list of known group paths is tried first. If none of them is a
    group, the first top-level group whose name contains "binned"
    (case-insensitive) is returned instead.
    """
    known_paths = ("/level-3_binned_data", "/level-3 Binned Data", "/L3b", "/Level-3 Binned Data")
    for candidate in known_paths:
        if candidate not in f:
            continue
        node = f[candidate]
        if isinstance(node, h5py.Group):
            return node
    # Fallback: approximate match on the names of the top-level members.
    return next(
        (
            child
            for name, child in f.items()
            if isinstance(child, h5py.Group) and "binned" in name.lower()
        ),
        None,
    )

# ==========================
# Procesamiento principal
# ==========================
def _grid_rows(g: h5py.Group, attrs: dict) -> int:
    """Return the number of ISIN rows of the grid used by this file.

    The BinIndex dataset of an OBPG L3b file holds one record per grid row, so
    its length is used when available; otherwise the row count is guessed from
    the resolution attributes via detect_nrows_from_attrs.
    """
    bin_index = g.get("BinIndex")
    if isinstance(bin_index, h5py.Dataset) and bin_index.shape and bin_index.shape[0] > 0:
        return int(bin_index.shape[0])
    return detect_nrows_from_attrs(attrs)


def _process_l3b_file(path: Path) -> str | None:
    """Convert one L3b file into Parquet blocks.

    Returns:
        None when at least one block was written, otherwise a short
        human-readable reason why the file was skipped.

    Raises:
        Exception: anything raised while opening, reading or writing is
        propagated to the caller, which records it as a skip reason.
    """
    with h5py.File(path, "r") as f:
        attrs = attrs_collect(f)
        ts_mid = midpoint_time(attrs)
        if not isinstance(ts_mid, pd.Timestamp) or pd.isna(ts_mid):
            return "missing/invalid time_coverage_*"

        g = get_group_level3(f)
        if g is None:
            return "group '/level-3_binned_data' not found"

        # Expected variables
        if "BinList" not in g or "par" not in g:
            return "BinList/par not found in level-3_binned_data"

        binlist = g["BinList"]
        par = g["par"]

        # Both datasets must be compound; BinList is expected to carry
        # ('bin_num', 'nobs', 'nscenes', 'weights', 'time_rec').
        if binlist.dtype.names is None or par.dtype.names is None or "sum" not in par.dtype.names:
            return "Unexpected compound dtypes in BinList/par"

        bin_num = np.array(binlist["bin_num"][:], dtype=np.int64)
        # Prefer weights; fall back to nobs as an approximation, then to 1.
        if "weights" in binlist.dtype.names:
            weights = np.array(binlist["weights"][:], dtype=np.float64)
        elif "nobs" in binlist.dtype.names:
            weights = np.array(binlist["nobs"][:], dtype=np.float64)
        else:
            weights = np.ones(bin_num.shape[0], dtype=np.float64)

        par_sum = np.array(par["sum"][:], dtype=np.float64)
        grid_rows = _grid_rows(g, attrs)

    # Everything needed is in memory now, so the HDF5 file is already closed
    # while the (potentially slow) Parquet writes run.
    if par_sum.shape[0] != bin_num.shape[0] or weights.shape[0] != bin_num.shape[0]:
        return f"shape mismatch: par={par_sum.shape} weights={weights.shape} bin_num={bin_num.shape}"

    # Per-bin mean; bins without a positive weight become NaN and are dropped.
    with np.errstate(invalid="ignore", divide="ignore"):
        mean_par = par_sum / np.where(weights > 0, weights, np.nan)
    valid = np.isfinite(mean_par)
    if not np.any(valid):
        return "no valid PAR values after mean computation"

    par_sel = mean_par[valid].astype(np.float32)
    lat, lon = bin_to_latlon(bin_num[valid], grid_rows)

    if BBOX is not None:
        s, w, n, e = BBOX
        inside = (lat >= s) & (lat <= n) & (lon >= w) & (lon <= e)
        if not np.any(inside):
            return "no rows after BBOX filter"
        lat, lon, par_sel = lat[inside], lon[inside], par_sel[inside]

    nrows = lat.size
    df = pd.DataFrame({
        "lat": lat,
        "lon": lon,
        "timestamp": np.repeat(pd.Timestamp(ts_mid), nrows),  # tz-aware; to_utc_series handles the tz
        "light": par_sel,      # einstein m^-2 day^-1
    })

    # Write in slices of at most CHUNK_ROWS rows (a single slice when chunking is off).
    step = CHUNK_ROWS if CHUNK_ROWS and nrows > CHUNK_ROWS else nrows
    for i0 in range(0, nrows, step):
        write_parquet_block(df.iloc[i0:i0 + step])
    return None


files = sorted(SAMPLE_DIR.glob("*.nc"))
if not files:
    # Explicit check instead of assert, which is stripped under `python -O`.
    raise FileNotFoundError(f"No .nc files found in {SAMPLE_DIR}")

skipped = []
written = 0

for path in tqdm(files, desc="Files", unit="file"):
    try:
        reason = _process_l3b_file(path)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole batch.
        reason = f"open/process failed: {type(e).__name__}: {e}"
    if reason is None:
        written += 1
    else:
        skipped.append((path.name, reason))

# Only claim success when at least one file actually produced output.
if written:
    print(f"\n✅ Parquet dataset written to: {OUT_PARQ}")
else:
    print(f"\n⚠️ No files were written to: {OUT_PARQ}")
if skipped:
    print("\n⚠️ Skipped:")
    for name, reason in skipped:
        print(f"  - {name}: {reason}")


📂 Input:  C:\Users\Crist\Desktop\NASA\tag-and-satellite-data-model\downloads\light\sample
🧺 Output Parquet (partitioned): C:\Users\Crist\Desktop\NASA\tag-and-satellite-data-model\transform\light\sample


Files:   0%|          | 0/14 [00:00<?, ?file/s]


✅ Parquet dataset written to: C:\Users\Crist\Desktop\NASA\tag-and-satellite-data-model\transform\light\sample
