In [8]:
# EKE inspector (fixed temp handling): works with ZIP/GZIP ".nc" without renaming

from pathlib import Path
import os, zipfile, gzip, tempfile, shutil, importlib, textwrap
from tqdm.auto import tqdm

N_TO_INSPECT = 3
SHOW_XARRAY_DATASET = True  # True: muestra el repr completo de xarray (scrollable)

def find_eke_sample_dir(start: Path | None = None) -> Path:
    start = (start or Path.cwd()).resolve()
    for parent in [start, *start.parents]:
        cand = parent / "downloads" / "eke" / "sample"
        if cand.is_dir():
            return cand
    raise FileNotFoundError(f"Could not find 'downloads/eke/sample' starting from {start}")

def sniff_magic(path: Path) -> str:
    with path.open("rb") as f:
        head = f.read(16)
    if head.startswith(b"\x89HDF\r\n\x1a\n"):
        return "HDF5"      # NetCDF4/HDF5
    if head.startswith(b"CDF"):
        return "NETCDF3"   # NetCDF3
    if head.startswith(b"PK"):
        return "ZIP"
    if head.startswith(b"\x1f\x8b"):
        return "GZIP"
    return "UNKNOWN"

def make_temp_nc_from_archive(src_path: Path, kind: str) -> tuple[Path, tempfile.TemporaryDirectory]:
    """Extract inner .nc to a TemporaryDirectory; return (nc_path, tmpdir). Caller must cleanup tmpdir."""
    tmpdir = tempfile.TemporaryDirectory()
    out_nc = Path(tmpdir.name) / (src_path.stem + "__real.nc")

    if kind == "ZIP":
        with zipfile.ZipFile(src_path) as zf:
            members = zf.namelist()
            nc_members = [m for m in members if m.lower().endswith(".nc")]
            if not nc_members:
                # si no hay .nc explícito, volcamos el primer miembro (raro, pero evita bloqueo)
                with zf.open(members[0]) as src, out_nc.open("wb") as dst:
                    shutil.copyfileobj(src, dst)
            else:
                with zf.open(nc_members[0]) as src, out_nc.open("wb") as dst:
                    shutil.copyfileobj(src, dst)
    elif kind == "GZIP":
        with gzip.open(src_path, "rb") as src, out_nc.open("wb") as dst:
            shutil.copyfileobj(src, dst)
    else:
        tmpdir.cleanup()
        raise ValueError(f"Unsupported kind for temp extraction: {kind}")

    return out_nc, tmpdir

def try_open_xarray(nc_path: Path):
    import xarray as xr
    tried, last_err = [], None
    for eng in ("netcdf4", "h5netcdf", "scipy"):
        if eng == "netcdf4" and importlib.util.find_spec("netCDF4") is None:
            continue
        if eng == "h5netcdf" and importlib.util.find_spec("h5netcdf") is None:
            continue
        try:
            ds = xr.open_dataset(nc_path, engine=eng, decode_cf=True, mask_and_scale=True)
            return ds, eng
        except Exception as e:
            tried.append(eng); last_err = e
    raise RuntimeError(f"xarray failed with engines {tried}. Last error: {last_err}")

def summarize_xarray(ds):
    if SHOW_XARRAY_DATASET:
        display(ds)
    print("\n— Compact summary —")
    try:
        print("dims:", dict(ds.dims))
        print("coords:", list(ds.coords))
        print("data_vars:", list(ds.data_vars))
        if ds.attrs:
            print("global attrs (keys):", list(ds.attrs.keys()))
    except Exception as e:
        print("  (summary failed)", e)

def list_vars_shapes(ds):
    # inventario compacto de variables con dims/shape
    print("\n— Variables inventory —")
    try:
        for name, v in ds.data_vars.items():
            try:
                print(f"  {name:35s} dims={tuple(v.dims)} shape={tuple(v.shape)} dtype={str(v.dtype)}")
            except Exception:
                print(f"  {name}")
    except Exception as e:
        print("  (inventory failed)", e)

def summarize_netcdf4(nc_path: Path):
    try:
        from netCDF4 import Dataset
        with Dataset(nc_path, "r") as ds:
            print("🔎 netCDF4 summary")
            print("dims:", {k: len(v) for k, v in ds.dimensions.items()})
            print("vars:", list(ds.variables.keys())[:80])
            if ds.ncattrs():
                print("global attrs:", list(ds.ncattrs())[:40])
    except Exception as e:
        print("⚠️ netCDF4 summary failed:", e)

def summarize_h5py(nc_path: Path):
    try:
        import h5py, re
        with h5py.File(nc_path, "r") as f:
            print("🔎 HDF5 groups/datasets (first 120):")
            count = 0
            for name, obj in f.items():
                print(f"[G] {name}")
            def visit(name, obj):
                nonlocal count
                if count >= 120: return
                if isinstance(obj, h5py.Dataset):
                    print(f"[D] {name} shape={obj.shape} dtype={obj.dtype}")
                    count += 1
            f.visititems(lambda n,o: visit(n,o))
            print("\n— Name hints (eke/vel/ssh/sla/lat/lon/time) —")
            pat = re.compile(r"(eke|u|v|vel|ssh|sla|adt|lat|lon|time)", re.I)
            hints = []
            def collect(n, o):
                if isinstance(o, h5py.Dataset) and pat.search(n):
                    hints.append((n, o.shape, str(o.dtype)))
            f.visititems(collect)
            for n, sh, dt in hints[:40]:
                print(f"  ~ {n}  shape={sh} dtype={dt}")
    except Exception as e:
        print("⚠️ h5py summary failed:", e)

def fmt_size(b):
    for unit in ("B","KB","MB","GB","TB"):
        if b < 1024 or unit == "TB":
            return f"{b:,.1f} {unit}"
        b /= 1024.0

# ---------- Main ----------
sample_dir = find_eke_sample_dir()
nc_files = sorted(sample_dir.glob("*.nc"))
assert nc_files, f"No .nc files in {sample_dir}"

print(f"📂 Folder: {sample_dir}")
print(f"🗂️  Found: {len(nc_files)} files (showing up to {N_TO_INSPECT})\n")

inspected = 0
tmp_dirs_to_cleanup = []

for p in tqdm(nc_files[:N_TO_INSPECT], desc="Inspecting", unit="file"):
    kind = sniff_magic(p)
    size = fmt_size(os.path.getsize(p))
    print("\n" + "─" * 70)
    print(f"📄 {p.name}  ({size})  | magic={kind}")

    work_path = p
    tmpdir = None
    if kind in ("ZIP", "GZIP"):
        try:
            work_path, tmpdir = make_temp_nc_from_archive(p, kind)
            inner_kind = sniff_magic(work_path)
            print(f"📦 Extracted to temp: {work_path.name}  | inner_kind={inner_kind}")
            tmp_dirs_to_cleanup.append(tmpdir)
        except Exception as e:
            print(f"❌ Failed to extract from {kind}: {e}")
            continue

    # Try xarray → netCDF4 → h5py
    try:
        ds, eng = try_open_xarray(work_path)
        print(f"✅ xarray.open_dataset OK (engine='{eng}')")
        summarize_xarray(ds)
        list_vars_shapes(ds)
        try: ds.close()
        except: pass
    except Exception as e:
        print("⚠️ xarray failed:", textwrap.shorten(str(e), width=140))
        if importlib.util.find_spec("netCDF4"):
            summarize_netcdf4(work_path)
        elif importlib.util.find_spec("h5py"):
            summarize_h5py(work_path)
        else:
            print("ℹ️ Install one of: netCDF4 or h5py for detailed summary.")

    inspected += 1

# cleanup temps
for td in tmp_dirs_to_cleanup:
    try: td.cleanup()
    except: pass

print(f"\n✅ Done. Inspected {inspected} file(s).")


📂 Folder: C:\Users\Crist\Desktop\NASA\tag-and-satellite-data-model\downloads\eke\sample
🗂️  Found: 30 files (showing up to 3)



Inspecting:   0%|          | 0/3 [00:00<?, ?file/s]


──────────────────────────────────────────────────────────────────────
📄 satellite-sea-level-global_20140320.nc  (7.4 MB)  | magic=ZIP
📦 Extracted to temp: satellite-sea-level-global_20140320__real.nc  | inner_kind=HDF5
✅ xarray.open_dataset OK (engine='netcdf4')



— Compact summary —
dims: {'latitude': 720, 'nv': 2, 'longitude': 1440, 'time': 1}
coords: ['time', 'latitude', 'longitude', 'nv']
data_vars: ['crs', 'lat_bnds', 'lon_bnds', 'err', 'adt', 'ugos', 'vgos', 'sla', 'ugosa', 'vgosa']
global attrs (keys): ['Conventions', 'Metadata_Conventions', 'cdm_data_type', 'comment', 'contact', 'creator_email', 'creator_name', 'creator_url', 'date_created', 'date_issued', 'date_modified', 'geospatial_lat_max', 'geospatial_lat_min', 'geospatial_lat_resolution', 'geospatial_lat_units', 'geospatial_lon_max', 'geospatial_lon_min', 'geospatial_lon_resolution', 'geospatial_lon_units', 'geospatial_vertical_max', 'geospatial_vertical_min', 'geospatial_vertical_positive', 'geospatial_vertical_resolution', 'geospatial_vertical_units', 'history', 'institution', 'keywords', 'keywords_vocabulary', 'license', 'platform', 'processing_level', 'product_version', 'project', 'references', 'software_version', 'source', 'ssalto_duacs_comment', 'standard_name_vocabulary', '

  print("dims:", dict(ds.dims))



— Compact summary —
dims: {'latitude': 720, 'nv': 2, 'longitude': 1440, 'time': 1}
coords: ['time', 'latitude', 'longitude', 'nv']
data_vars: ['crs', 'lat_bnds', 'lon_bnds', 'err', 'adt', 'ugos', 'vgos', 'sla', 'ugosa', 'vgosa']
global attrs (keys): ['Conventions', 'Metadata_Conventions', 'cdm_data_type', 'comment', 'contact', 'creator_email', 'creator_name', 'creator_url', 'date_created', 'date_issued', 'date_modified', 'geospatial_lat_max', 'geospatial_lat_min', 'geospatial_lat_resolution', 'geospatial_lat_units', 'geospatial_lon_max', 'geospatial_lon_min', 'geospatial_lon_resolution', 'geospatial_lon_units', 'geospatial_vertical_max', 'geospatial_vertical_min', 'geospatial_vertical_positive', 'geospatial_vertical_resolution', 'geospatial_vertical_units', 'history', 'institution', 'keywords', 'keywords_vocabulary', 'license', 'platform', 'processing_level', 'product_version', 'project', 'references', 'software_version', 'source', 'ssalto_duacs_comment', 'standard_name_vocabulary', '

  print("dims:", dict(ds.dims))



— Compact summary —
dims: {'latitude': 720, 'nv': 2, 'longitude': 1440, 'time': 1}
coords: ['time', 'latitude', 'longitude', 'nv']
data_vars: ['crs', 'lat_bnds', 'lon_bnds', 'err', 'adt', 'ugos', 'vgos', 'sla', 'ugosa', 'vgosa']
global attrs (keys): ['Conventions', 'Metadata_Conventions', 'cdm_data_type', 'comment', 'contact', 'creator_email', 'creator_name', 'creator_url', 'date_created', 'date_issued', 'date_modified', 'geospatial_lat_max', 'geospatial_lat_min', 'geospatial_lat_resolution', 'geospatial_lat_units', 'geospatial_lon_max', 'geospatial_lon_min', 'geospatial_lon_resolution', 'geospatial_lon_units', 'geospatial_vertical_max', 'geospatial_vertical_min', 'geospatial_vertical_positive', 'geospatial_vertical_resolution', 'geospatial_vertical_units', 'history', 'institution', 'keywords', 'keywords_vocabulary', 'license', 'platform', 'processing_level', 'product_version', 'project', 'references', 'software_version', 'source', 'ssalto_duacs_comment', 'standard_name_vocabulary', '

  print("dims:", dict(ds.dims))
