In [2]:
# MODIS L2 OC (group-aware) → lat, lon, timestamp, chl_level
# Escritura incremental a Parquet particionado (year/month)

from pathlib import Path
import re
from datetime import datetime, timezone
import importlib
import numpy as np
import pandas as pd
import xarray as xr
from tqdm.auto import tqdm
import pyarrow as pa
import pyarrow.parquet as pq

# --------------------------
# Ubicar carpetas (origen/destino)
# --------------------------
def find_clorophyll_sample_dir(start: Path | None = None) -> Path:
    start = (start or Path.cwd()).resolve()
    for parent in [start, *start.parents]:
        cand = parent / "downloads" / "clorophyll" / "sample"
        if cand.is_dir(): return cand
    raise FileNotFoundError(f"Could not find 'downloads/clorophyll/sample' from {start}.")

# Input folder holding the downloaded .nc sample files.
SAMPLE_DIR = find_clorophyll_sample_dir()
# SAMPLE_DIR is <root>/downloads/clorophyll/sample, so the root is three levels up.
REPO_ROOT = SAMPLE_DIR.parent.parent.parent
# Output folder for the Parquet dataset; created on first use.
OUT_PARQ = REPO_ROOT.joinpath("transform", "clorophyll", "sample")
OUT_PARQ.mkdir(parents=True, exist_ok=True)

# --------------------------
# Config
# --------------------------
# Share of each file's finite points to keep; the processing loop only draws a
# random subsample when this is below 1.0.
FRACTION = 1.0
# Seed handed to np.random.default_rng so subsampling is repeatable.
RANDOM_SEED = 42
# Optional hard cap on points kept per file (None disables the cap).
MAX_POINTS_PER_FILE = None  # e.g., 200_000
# Passed as the `compression` argument of pq.write_to_dataset.
COMPRESSION = "snappy"      # snappy|zstd|gzip

# --------------------------
# Utilidades
# --------------------------
def pick_engine():
    """Return the xarray engine name to use, verifying netCDF4 is installed.

    Returns:
        The string ``"netcdf4"``.

    Raises:
        RuntimeError: if the ``netCDF4`` package cannot be found.
    """
    # `import importlib` alone does not guarantee that the `util` submodule is
    # loaded; import it explicitly so `find_spec` is always reachable.
    import importlib.util

    if importlib.util.find_spec("netCDF4") is None:
        raise RuntimeError("Install netCDF4:  python -m pip install netCDF4")
    return "netcdf4"

def midpoint_iso(t0, t1):
    """Return the UTC midpoint of two ISO-8601 timestamps as a ``...Z`` string.

    Args:
        t0: Start timestamp (anything whose ``str()`` is ISO-8601), or a
            falsy value when unknown.
        t1: End timestamp, same conventions as ``t0``.

    Returns:
        The midpoint when both inputs parse, the single parsable input when
        only one does, or ``None`` when neither does. The result is always
        expressed in UTC with a trailing ``Z``.
    """
    def parse_iso(value):
        if not value:
            return None
        text = str(value).strip().replace("Z", "+00:00")
        try:
            parsed = datetime.fromisoformat(text)
            if parsed.tzinfo is None:
                # A naive value would be interpreted as machine-local time by
                # astimezone(); treat timestamps without an offset as UTC so
                # the result does not depend on where the code runs.
                parsed = parsed.replace(tzinfo=timezone.utc)
            return parsed.astimezone(timezone.utc)
        except (ValueError, OverflowError):
            return None

    def to_z(moment):
        return moment.isoformat().replace("+00:00", "Z")

    start, end = parse_iso(t0), parse_iso(t1)
    if start and end:
        return to_z(start + (end - start) / 2)
    if start:
        return to_z(start)
    if end:
        return to_z(end)
    return None

def timestamp_from_name(fname):
    """Build an ISO-8601 UTC timestamp from a ``YYYYMMDDTHHMMSS`` token in ``fname``.

    Returns ``None`` when the name contains no such token. The digits are
    only reformatted, not validated as a real calendar date.
    """
    found = re.search(r"(\d{8}T\d{6})", fname)
    if found is None:
        return None
    stamp = found.group(1)
    # Index 8 is the literal "T" separating the date and time digits.
    date_part, time_part = stamp[:8], stamp[9:]
    return "{}-{}-{}T{}:{}:{}Z".format(
        date_part[:4], date_part[4:6], date_part[6:],
        time_part[:2], time_part[2:4], time_part[4:],
    )

def list_variable_paths(nc_path: Path):
    """List every variable in a netCDF file as a ``group/sub/var`` path.

    Root-level variables are returned by bare name. Within each group the
    group's own variables come first, followed by those of its subgroups in
    depth-first order.
    """
    import netCDF4 as nc

    def iter_paths(group, prefix):
        for var_name in group.variables:
            yield prefix + var_name
        for child_name, child in group.groups.items():
            yield from iter_paths(child, f"{prefix}{child_name}/")

    with nc.Dataset(nc_path, "r") as dataset:
        # Materialise the list while the file is still open.
        return list(iter_paths(dataset, ""))

def split_group_var(path_str: str):
    """Split ``group/sub/var`` into ``("group/sub", "var")``.

    A path without any ``/`` belongs to the root group and yields
    ``(None, path_str)``.
    """
    group, separator, name = path_str.rpartition("/")
    return (group if separator else None), name

def choose_chl_path(var_paths):
    """Pick the variable path that most likely holds chlorophyll data.

    Args:
        var_paths: Iterable of ``group/var`` style paths.

    Returns:
        The first path whose base name is exactly one of the preferred names
        (``chlor_a``, ``chlorophyll_a``, ``chlorophyll``); otherwise the first
        path whose lower-cased base name contains ``chlor`` or a standalone
        ``chl`` token; otherwise ``None``.
    """
    preferred = {"chlor_a", "chlorophyll_a", "chlorophyll"}
    # Materialise once so generators survive the two passes below.
    named = [(p, p.split("/")[-1]) for p in var_paths]

    for path, base in named:
        if base in preferred:
            return path

    # `\bchl\b` cannot match names such as "chl_ocx" because "_" is a word
    # character; delimit the token by alphanumerics instead so underscore-
    # separated names are recognised too.
    chl_token = re.compile(r"(?<![a-z0-9])chl(?![a-z0-9])")
    for path, base in named:
        lowered = base.lower()
        if "chlor" in lowered or chl_token.search(lowered):
            return path
    return None

def choose_lat_lon_paths(var_paths, chl_shape, open_group_func):
    """Find latitude/longitude variables compatible with the chlorophyll grid.

    Args:
        var_paths: Iterable of ``group/var`` style variable paths.
        chl_shape: Shape of the chlorophyll array.
        open_group_func: Callable taking a group name (``None`` for a path
            with no group part) and returning a mapping of variable name to
            an object exposing ``.values``.

    Returns:
        ``(lat_path, lon_path, kind)`` where ``kind`` is ``"2D"`` when both
        arrays have exactly ``chl_shape``, or ``"1D"`` when ``chl_shape`` is
        2-D and the arrays are 1-D axes of length ``ny`` / ``nx``. Returns
        ``(None, None, None)`` when no compatible pair exists. Candidates are
        considered in ``var_paths`` order and unreadable ones are ignored.
    """
    lat_names = {"latitude", "lat"}
    lon_names = {"longitude", "lon"}
    var_paths = list(var_paths)
    chl_shape = tuple(chl_shape)

    def read_shape(var_path):
        """Return the variable's array shape, or None if it cannot be read."""
        group_name, var_name = split_group_var(var_path)
        try:
            dataset = open_group_func(group_name)
        except Exception:
            return None
        try:
            return np.asarray(dataset[var_name].values).shape
        except Exception:
            return None
        finally:
            # Release the handle right away; previously every probe leaked
            # an open dataset.
            close = getattr(dataset, "close", None)
            if callable(close):
                try:
                    close()
                except Exception:
                    # Best-effort cleanup: a failing close must not change
                    # which variables get selected.
                    pass

    def candidates(names):
        """Read each matching variable once: [(path, shape), ...]."""
        found = []
        for var_path in var_paths:
            if var_path.split("/")[-1].lower() in names:
                shape = read_shape(var_path)
                if shape is not None:
                    found.append((var_path, shape))
        return found

    def first_with_shape(pairs, wanted):
        return next((var_path for var_path, shape in pairs if shape == wanted), None)

    lat_candidates = candidates(lat_names)
    lon_candidates = candidates(lon_names)

    # Exact match: both arrays share the chlorophyll shape.
    lat_exact = first_with_shape(lat_candidates, chl_shape)
    lon_exact = first_with_shape(lon_candidates, chl_shape)
    if lat_exact is not None and lon_exact is not None:
        return lat_exact, lon_exact, "2D"

    # 1-D axes that the caller can expand with a meshgrid.
    if len(chl_shape) == 2:
        ny, nx = chl_shape
        lat_axis = first_with_shape(lat_candidates, (ny,))
        lon_axis = first_with_shape(lon_candidates, (nx,))
        if lat_axis is not None and lon_axis is not None:
            return lat_axis, lon_axis, "1D"

    return None, None, None

def flatten_points(lat_arr, lon_arr, chl_arr):
    """Flatten the three arrays and keep only jointly finite positions.

    Returns a ``(lat, lon, chl)`` tuple of 1-D arrays containing just the
    elements where latitude, longitude and chlorophyll are all finite.
    """
    flattened = [arr.ravel() for arr in (lat_arr, lon_arr, chl_arr)]
    keep = np.isfinite(flattened[0])
    for values in flattened[1:]:
        keep = keep & np.isfinite(values)
    return tuple(values[keep] for values in flattened)

# --------------------------
# Proceso (escritura incremental)
# --------------------------
nc_files = sorted(SAMPLE_DIR.glob("*.nc"))
# Raise explicitly rather than assert: asserts are stripped under `python -O`.
if not nc_files:
    raise FileNotFoundError(f"No .nc files found in {SAMPLE_DIR}")
engine = pick_engine()
rng = np.random.default_rng(RANDOM_SEED)

skipped = []  # (file name, reason) for every file that produced no rows
print(f"📂 Origen:  {SAMPLE_DIR}")
print(f"🧺 Parquet: {OUT_PARQ} (partitioned by year/month)")
print(f"🔌 Engine:  {engine}")
print(f"🗂️ Files:   {len(nc_files)}")


def _open_group(nc_path, group_name):
    """Open one group of ``nc_path`` with xarray (root for None, "" or "/")."""
    kwargs = {"engine": engine, "decode_cf": True, "mask_and_scale": True}
    if group_name not in (None, "", "/"):
        kwargs["group"] = group_name
    return xr.open_dataset(nc_path, **kwargs)


def _read_values(nc_path, var_path):
    """Load one variable fully into memory and close its dataset afterwards."""
    group_name, var_name = split_group_var(var_path)
    with _open_group(nc_path, group_name) as ds:
        return np.asarray(ds[var_name].values)


for path in tqdm(nc_files, desc="Processing", unit="file"):
    # 1) Discover the variables in every group
    try:
        var_paths = list_variable_paths(path)
    except Exception as e:
        skipped.append((path.name, f"list vars failed: {e}"))
        continue

    def open_group(group_name, _nc_path=path):
        """Open a group of the file currently being processed."""
        return _open_group(_nc_path, group_name)

    # 2) File timestamp: coverage midpoint from the root attributes, falling
    #    back to the timestamp embedded in the file name.
    ts_iso = None
    try:
        with open_group(None) as root_ds:
            ts_iso = midpoint_iso(root_ds.attrs.get("time_coverage_start"),
                                  root_ds.attrs.get("time_coverage_end"))
    except Exception:
        # Unreadable root group/attributes: rely on the file name below.
        pass
    ts_iso = ts_iso or timestamp_from_name(path.name)
    if ts_iso is None:
        skipped.append((path.name, "no timestamp available"))
        continue

    # Parse once up front so an unparsable timestamp skips this file instead
    # of crashing the whole run when year/month are cast to integers.
    ts = pd.to_datetime(ts_iso, utc=True, errors="coerce")
    if pd.isna(ts):
        skipped.append((path.name, f"invalid timestamp: {ts_iso}"))
        continue

    # 3) Chlorophyll
    chl_path = choose_chl_path(var_paths)
    if chl_path is None:
        skipped.append((path.name, "no chlorophyll-like var"))
        continue

    try:
        chl_arr = _read_values(path, chl_path)
    except Exception as e:
        skipped.append((path.name, f"read chl failed ({chl_path}): {e}"))
        continue
    chl_shape = chl_arr.shape

    # 4) Geolocation
    lat_path, lon_path, geo_kind = choose_lat_lon_paths(var_paths, chl_shape, open_group)
    if lat_path is None or lon_path is None:
        skipped.append((path.name, "no lat/lon compatible"))
        continue

    try:
        lat_val = _read_values(path, lat_path)
        lon_val = _read_values(path, lon_path)
        if geo_kind == "1D":
            # Expand the 1-D axes to the full (ny, nx) grid.
            lon_val, lat_val = np.meshgrid(lon_val, lat_val)
    except Exception as e:
        skipped.append((path.name, f"read lat/lon failed: {e}"))
        continue

    # 5) Flatten, filter, subsample
    lat_f, lon_f, chl_f = flatten_points(lat_val, lon_val, chl_arr)
    if lat_f.size == 0:
        skipped.append((path.name, "no finite lat/lon/chl"))
        continue

    if FRACTION < 1.0:
        k = int(np.ceil(lat_f.size * FRACTION))
        idx = rng.choice(lat_f.size, size=k, replace=False)
        lat_f, lon_f, chl_f = lat_f[idx], lon_f[idx], chl_f[idx]

    if (MAX_POINTS_PER_FILE is not None) and (lat_f.size > MAX_POINTS_PER_FILE):
        idx = rng.choice(lat_f.size, size=MAX_POINTS_PER_FILE, replace=False)
        lat_f, lon_f, chl_f = lat_f[idx], lon_f[idx], chl_f[idx]

    # 6) ==== INCREMENTAL PARQUET WRITE ====
    # Every row of a file shares one timestamp, so year/month are constants.
    n_points = lat_f.size
    block = pd.DataFrame({
        "lat": lat_f,
        "lon": lon_f,
        "timestamp": ts_iso,
        "chl_level": chl_f,
        "year": np.full(n_points, ts.year, dtype="int16"),
        "month": np.full(n_points, ts.month, dtype="int8"),
    })

    table = pa.Table.from_pandas(block, preserve_index=False)
    # NOTE(review): write_to_dataset appears to add new files on every call,
    # so re-running this cell against the same OUT_PARQ would duplicate rows —
    # confirm and clear the output folder first if that is not intended.
    pq.write_to_dataset(table,
                        root_path=OUT_PARQ,
                        partition_cols=["year","month"],
                        compression=COMPRESSION)

# Report
print(f"\n✅ Parquet dataset listo en: {OUT_PARQ}")
if skipped:
    print("\n⚠️ Skipped files:")
    for name, reason in skipped:
        print(f"  - {name}: {reason}")


📂 Origen:  C:\Users\Crist\Desktop\NASA\tag-and-satellite-data-model\downloads\clorophyll\sample
🧺 Parquet: C:\Users\Crist\Desktop\NASA\tag-and-satellite-data-model\transform\clorophyll\sample (partitioned by year/month)
🔌 Engine:  netcdf4
🗂️ Files:   17


Processing:   0%|          | 0/17 [00:00<?, ?file/s]


✅ Parquet dataset listo en: C:\Users\Crist\Desktop\NASA\tag-and-satellite-data-model\transform\clorophyll\sample
