In [2]:
# ASTER SKT GeoTIFF (.tif) -> lat, lon, timestamp, sst
# Escritura incremental a Parquet particionado (year/month)

from pathlib import Path
import re
import importlib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import pyarrow as pa
import pyarrow.parquet as pq

# =========================
# Localización carpetas
# =========================
def find_sst_sample_dir(start: Path | None = None) -> Path:
    start = (start or Path.cwd()).resolve()
    for parent in [start, *start.parents]:
        cand = parent / "downloads" / "sst" / "sample"
        if cand.is_dir():
            return cand
    raise FileNotFoundError(f"No se encontró 'downloads/sst/sample' desde {start}")

# Input folder holding the *_SKT.tif samples.
SAMPLE_DIR = find_sst_sample_dir()
# SAMPLE_DIR is <root>/downloads/sst/sample, so the repo root is three levels up.
REPO_ROOT = SAMPLE_DIR.parent.parent.parent
# Output folder for the partitioned Parquet dataset; created if missing.
OUT_PARQ = REPO_ROOT.joinpath("transform", "sst", "sample")
OUT_PARQ.mkdir(parents=True, exist_ok=True)

print(f"📂 Origen:  {SAMPLE_DIR}")
print(f"🧺 Parquet: {OUT_PARQ} (partitioned by year/month)")

# =========================
# Configuration
# =========================
# Subsampling and limits (to bound memory use / output volume)
FRACTION = 1.0            # fraction of the finite pixels kept per file, 0 < frac <= 1 (e.g. 0.2 for 20%)
RANDOM_SEED = 42
MAX_POINTS_PER_FILE = None  # e.g. 2_000_000 to cap the number of points per file
CHUNK_ROWS = 2_000_000      # write in blocks of this many rows when a file exceeds it

# SST units/scale (nothing is assumed by default)
SST_UNITS_MODE = "raw"      # "raw" | "kelvin_from_tags" | "celsius_from_tags"
# If the GeoTIFF carries 'scale_factor'/'add_offset' tags, the *_from_tags modes use them

# Optional QA mask (used if *_QA_DataPlane*.tif files exist); not applied by default
USE_QA_MASK = False
QA_GOOD_VALUE = 1           # for a simple binary mask, 1 = good (adjust if it differs)

# Single seeded generator shared by all random subsampling in this notebook
rng = np.random.default_rng(RANDOM_SEED)

# =========================
# Dependencias
# =========================
# `import importlib` alone does not guarantee that the `importlib.util`
# submodule is loaded; import it explicitly so `find_spec` is always reachable.
import importlib.util

# Fail early, with an actionable install hint, when an optional geo dependency
# is missing (find_spec checks availability without importing the package).
if importlib.util.find_spec("rasterio") is None:
    raise RuntimeError("Instala rasterio:  python -m pip install rasterio")
if importlib.util.find_spec("pyproj") is None:
    raise RuntimeError("Instala pyproj:    python -m pip install pyproj")

import rasterio
from rasterio.transform import Affine
from pyproj import Transformer

# =========================
# Utilidades
# =========================
def parse_ts_from_name(name: str):
    """Extract the acquisition timestamp embedded in a file name.

    Looks for a 14-digit ``YYYYMMDDhhmmss`` group that is preceded by ``_``
    and followed by ``_``, ``.`` or the end of the string
    (e.g. ``..._20250702031314_SKT.tif``).

    Returns:
        A timezone-aware ``pandas.Timestamp`` in UTC, or ``None`` when the
        name has no such group or the digits do not form a valid date/time
        (e.g. month 13 or February 30).
    """
    m = re.search(r"_(\d{14})(?:_|\.|$)", name)
    if not m:
        return None
    s = m.group(1)
    try:
        return pd.Timestamp(f"{s[:4]}-{s[4:6]}-{s[6:8]} {s[8:10]}:{s[10:12]}:{s[12:]}", tz="UTC")
    except ValueError:
        # Fourteen digits that are not a real calendar date/time: honour the
        # documented "or None" contract instead of raising.
        return None

def sst_from_array(arr: np.ndarray, tags: dict, mode: str):
    """Convert raw digital numbers to SST according to ``mode``.

    Args:
        arr: Raster values; always converted to a float32 array.
        tags: Band metadata tags. In the ``*_from_tags`` modes the first
            parseable scale tag (``scale_factor``, ``SCALE``, ``Scale``,
            ``ScaleFactor``) and offset tag (``add_offset``, ``OFFSET``,
            ``Offset``) are used; missing or unparseable tags fall back to
            scale 1.0 and offset 0.0.
        mode: ``"raw"`` returns the float32 values unchanged;
            ``"kelvin_from_tags"`` returns ``arr * scale + offset``;
            ``"celsius_from_tags"`` returns that value minus 273.15
            (i.e. the scaled value is treated as Kelvin — confirm against
            the product documentation). Any other mode returns the
            unscaled float32 values.

    Returns:
        A float32 ``numpy.ndarray`` with the same shape as ``arr``.
    """

    def _first_float(keys, default):
        # First tag among `keys` whose value parses as a float, else `default`.
        for key in keys:
            if key in tags:
                try:
                    return float(tags[key])
                except (TypeError, ValueError):
                    # Unparseable tag value: try the next alias rather than
                    # swallowing every exception with a bare `except`.
                    continue
        return default

    arr = np.asarray(arr, dtype="float32")
    if mode == "raw":
        return arr

    sf = _first_float(("scale_factor", "SCALE", "Scale", "ScaleFactor"), 1.0)
    off = _first_float(("add_offset", "OFFSET", "Offset"), 0.0)

    kelvin = arr * sf + off
    if mode == "kelvin_from_tags":
        return kelvin
    if mode == "celsius_from_tags":
        return kelvin - 273.15
    return arr  # unknown mode: fall back to the unscaled values

def qa_mask_for(path: Path):
    """Build a boolean "good pixel" mask from a sibling QA GeoTIFF.

    The QA file is searched next to ``path`` as
    ``<name without the trailing "_SKT.tif">_QA_DataPlane*.tif``. When several
    files match, the first one in sorted order is used so the choice is
    reproducible between runs.

    Returns:
        A boolean array that is True where band 1 of the QA raster equals
        ``QA_GOOD_VALUE``, or ``None`` when no QA file exists. The QA raster
        is assumed to share the SKT raster's grid; callers should verify the
        shape before applying the mask.
    """
    # Strip only the trailing suffix (str.replace would also alter a match
    # occurring in the middle of the name).
    stem = path.name.removesuffix("_SKT.tif")
    candidates = sorted(path.parent.glob(stem + "_QA_DataPlane*.tif"))
    if not candidates:
        return None
    qa_path = candidates[0]
    with rasterio.open(qa_path) as qsrc:
        q = qsrc.read(1)
    # Simple heuristic: binary mask (adjust once the QA flags are documented).
    return q == QA_GOOD_VALUE

def write_parquet_block(df: pd.DataFrame):
    """Write one block of rows into the Parquet dataset rooted at OUT_PARQ.

    ``df`` must provide the columns ``lat``, ``lon``, ``timestamp`` and
    ``sst``. ``year`` (int16) and ``month`` (int8) partition columns are
    derived from ``timestamp`` interpreted as UTC. The caller's frame is
    not modified. Files are snappy-compressed and partitioned by year/month.
    """
    stamps = pd.to_datetime(df["timestamp"], utc=True, errors="coerce")
    payload = df[["lat", "lon", "timestamp", "sst"]].assign(
        year=stamps.dt.year.astype("int16"),
        month=stamps.dt.month.astype("int8"),
    )
    pq.write_to_dataset(
        pa.Table.from_pandas(payload, preserve_index=False),
        root_path=OUT_PARQ,
        partition_cols=["year", "month"],
        compression="snappy",
    )

# =========================
# Proceso
# =========================
tifs = sorted(SAMPLE_DIR.glob("*_SKT.tif"))
if not tifs:
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    raise FileNotFoundError(f"No hay *_SKT.tif en {SAMPLE_DIR}")

skipped = []     # (file name, reason) for every file that produced no rows
total_rows = 0   # rows handed to write_parquet_block across all files

print(f"🗂️  Archivos: {len(tifs)}")
for tif in tqdm(tifs, desc="Procesando", unit="file"):
    try:
        ts = parse_ts_from_name(tif.name)
        if ts is None:
            skipped.append((tif.name, "sin timestamp en nombre"))
            continue

        # Read everything needed from the raster, then release the file handle.
        with rasterio.open(tif) as src:
            arr = src.read(1, masked=False)  # band 1 as stored (typically uint16)
            tags = src.tags(1)               # band tags, used for scale/offset if present
            nodata = src.nodata              # declared fill value, or None
            crs = src.crs
            T: Affine = src.transform
            h, w = src.height, src.width

        # Pixels to keep: drop the declared nodata value so fill pixels are
        # not exported as SST, then apply the optional QA mask.
        valid = np.ones(arr.shape, dtype=bool)
        if nodata is not None and not np.isnan(nodata):
            valid &= arr != nodata
        if USE_QA_MASK:
            m = qa_mask_for(tif)
            # A QA raster with a different shape is ignored rather than misapplied.
            if m is not None and m.shape == arr.shape:
                valid &= m

        # Pixel-centre coordinates in the raster CRS. Computed in float64:
        # float32 loses precision for large projected coordinates.
        cols = (np.arange(w, dtype="float64") + 0.5)[None, :]
        rows = (np.arange(h, dtype="float64") + 0.5)[:, None]
        # x = a*col + b*row + c ; y = d*col + e*row + f
        x = T.a * cols + T.b * rows + T.c
        y = T.d * cols + T.e * rows + T.f

        # Reproject to lon/lat (EPSG:4326); always_xy fixes the (lon, lat) order.
        transformer = Transformer.from_crs(crs, "EPSG:4326", always_xy=True)
        lon, lat = transformer.transform(x, y)

        # Convert the raw values according to the configured units mode.
        sst = sst_from_array(arr, tags, SST_UNITS_MODE)

        # Flatten (row-major for every array, so indices stay aligned) and
        # keep only valid, finite samples. Output columns stay float32.
        lat_f = lat.ravel().astype("float32")
        lon_f = lon.ravel().astype("float32")
        sst_f = sst.ravel().astype("float32")

        finite = valid.ravel() & np.isfinite(lat_f) & np.isfinite(lon_f) & np.isfinite(sst_f)
        if not finite.any():
            skipped.append((tif.name, "sin valores finitos tras máscara/escala"))
            continue

        lat_f, lon_f, sst_f = lat_f[finite], lon_f[finite], sst_f[finite]

        # Random subsampling, then the optional hard cap per file.
        n = lat_f.size
        if FRACTION < 1.0:
            k = max(1, int(np.ceil(n * FRACTION)))
            idx = rng.choice(n, size=k, replace=False)
            lat_f, lon_f, sst_f = lat_f[idx], lon_f[idx], sst_f[idx]
            n = k
        if (MAX_POINTS_PER_FILE is not None) and (n > MAX_POINTS_PER_FILE):
            idx = rng.choice(n, size=MAX_POINTS_PER_FILE, replace=False)
            lat_f, lon_f, sst_f = lat_f[idx], lon_f[idx], sst_f[idx]
            n = MAX_POINTS_PER_FILE

        # Write in blocks of at most CHUNK_ROWS rows; a single block when
        # chunking is disabled or the file is small enough.
        step = CHUNK_ROWS if CHUNK_ROWS else n
        for i0 in range(0, n, step):
            i1 = min(i0 + step, n)
            df = pd.DataFrame({
                "lat": lat_f[i0:i1],
                "lon": lon_f[i0:i1],
                "timestamp": ts,
                "sst": sst_f[i0:i1],
            })
            write_parquet_block(df)
            total_rows += i1 - i0

    except Exception as e:
        # Per-file boundary: record the failure and continue with the next file.
        skipped.append((tif.name, f"falló apertura/proceso: {e!s}"))

print(f"\n✅ Parquet listo en: {OUT_PARQ}")
print(f"📈 Filas exportadas: {total_rows:,}")

if skipped:
    print("\n⚠️ Skipped:")
    for name, reason in skipped:
        print(f"  - {name}: {reason}")


📂 Origen:  C:\Users\Crist\Desktop\NASA\tag-and-satellite-data-model\downloads\sst\sample
🧺 Parquet: C:\Users\Crist\Desktop\NASA\tag-and-satellite-data-model\transform\sst\sample (partitioned by year/month)
🗂️  Archivos: 12


Procesando:   0%|          | 0/12 [00:00<?, ?file/s]


✅ Parquet listo en: C:\Users\Crist\Desktop\NASA\tag-and-satellite-data-model\transform\sst\sample
📈 Filas exportadas: 6,972,000
