In [None]:
# ICESat-2 ATL24 (.h5) -> lat, lon, timestamp, depth  (incremental Parquet, partitioned by year/month)

from pathlib import Path
import os
from datetime import datetime, timezone, timedelta
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import pyarrow as pa
import pyarrow.parquet as pq

# --------------------------
# Locate input (downloads/depth/sample) and output (<repo_root>/transform/depth/sample)
# --------------------------
def find_depth_sample_dir(start: Path | None = None) -> Path:
    start = (start or Path.cwd()).resolve()
    for parent in [start, *start.parents]:
        cand = parent / "downloads" / "depth" / "sample"
        if cand.is_dir():
            return cand
    raise FileNotFoundError(f"Could not find 'downloads/depth/sample' starting from {start}")

# Resolve the input directory, derive the repository root from it, and make
# sure the output directory exists before anything is written.
SAMPLE_DIR = find_depth_sample_dir()
REPO_ROOT  = SAMPLE_DIR.parent.parent.parent  # sample -> depth -> downloads -> repo root
OUT_PARQ   = REPO_ROOT.joinpath("transform", "depth", "sample")
OUT_PARQ.mkdir(parents=True, exist_ok=True)

print(f"📂 Input:  {SAMPLE_DIR}")
print(f"🧺 Output Parquet (partitioned): {OUT_PARQ}")

# --------------------------
# Config
# --------------------------
# HDF5 group names of the beams to process; groups missing from a file are skipped.
BEAMS = ("gt1l","gt1r","gt2l","gt2r","gt3l","gt3r")
# Per-beam dataset names for latitude, longitude and the per-point time offset.
LAT_NAME = "lat_ph"
LON_NAME = "lon_ph"
DELTA_NAME = "delta_time"

# Choose which height to treat as "depth". We export depth = DEPTH_SIGN * height_var.
# Candidates are tried in order and the first one present in the beam group wins.
# NOTE(review): in the ATL24 product `surface_h` appears to be the sea-surface height
# at the photon rather than the photon/seafloor height, so listing it first may export
# the negated sea-surface height instead of a bathymetric depth — confirm against the
# ATL24 data dictionary before relying on this order.
DEPTH_VAR_CANDIDATES = ("surface_h", "ortho_h", "ellipse_h")
DEPTH_SIGN = -1.0  # multiplier applied to the chosen height; -1 turns positive-up heights into positive-down depths

# Optional quality filters; each one is applied only when its dataset exists in the beam group
MIN_CONFIDENCE = None         # e.g., 0.5 keeps rows with beam/confidence >= 0.5; None disables the filter
FILTER_LOW_CONF_FLAG = False  # if True and beam/low_confidence_flag exists, keep only rows where it is 0
FILTER_SENSOR_FLAG = True     # if True and beam/sensor_depth_exceeded exists, keep only rows where it is 0

# Downsampling / memory control
FRACTION = 1.0                # 0<frac<=1 to randomly keep a fraction of points (1.0 keeps everything)
RANDOM_SEED = 42              # seed for the shared generator below, so sampling is repeatable
MAX_POINTS_PER_BEAM = None    # e.g., 1_000_000 to cap per-beam rows (random subset); None disables the cap
CHUNK_ROWS = 2_000_000        # if a beam has > CHUNK_ROWS, write in chunks of this many rows

COMPRESSION = "snappy"        # Parquet codec passed to the writer: snappy|zstd|gzip

# One generator shared by all beams and files: draws are consumed sequentially across the run.
rng = np.random.default_rng(RANDOM_SEED)

# --------------------------
# Helpers
# --------------------------
def parse_utc(s: str | bytes | np.ndarray):
    """Parse ISO-like UTC string(s) such as '2019-02-12T03:04:05.123Z' into timezone-aware datetime."""
    if isinstance(s, (bytes, np.bytes_)):
        s = s.decode("utf-8", errors="ignore")
    if isinstance(s, np.ndarray) and s.shape == ():
        s = s.item()
        if isinstance(s, (bytes, np.bytes_)):
            s = s.decode("utf-8", errors="ignore")
    s = str(s).strip().replace("Z", "+00:00")
    return datetime.fromisoformat(s).astimezone(timezone.utc)

def read_scalar(f, path):
    """Return the value stored at ``f[path]`` read with ``[()]``, or ``None``.

    This is a best-effort lookup: any failure while locating or reading the
    dataset (missing key, unreadable object, ...) yields ``None`` instead of
    an exception, so callers can probe optional datasets.
    """
    try:
        return f[path][()]
    except Exception:
        # Deliberately broad: absence and read errors are treated the same.
        return None

def choose_depth_var(f, beam):
    """Return the first name in ``DEPTH_VAR_CANDIDATES`` found under *beam*.

    Membership is tested as ``f"{beam}/{name}" in f``; candidates are tried
    in their configured order. Returns ``None`` when none of them is present.
    """
    present = (name for name in DEPTH_VAR_CANDIDATES if f"{beam}/{name}" in f)
    return next(present, None)

def write_parquet_block(df: pd.DataFrame):
    """Append one block of rows to the Parquet dataset, partitioned by year/month.

    Args:
        df: frame with ``lat``, ``lon``, ``timestamp`` and ``depth`` columns.
            It is not modified; the partition columns are added to a copy.

    The ``year``/``month`` partition keys are derived from ``timestamp``
    interpreted as UTC. Rows whose timestamp cannot be parsed are dropped
    with a warning, because they have no partition to go to (and would
    otherwise make the integer cast of the partition keys fail). An empty
    block is a no-op.
    """
    import warnings

    if df.empty:
        return

    stamps = pd.to_datetime(df["timestamp"], utc=True, errors="coerce")
    usable = stamps.notna()
    if not usable.all():
        warnings.warn(
            f"write_parquet_block: dropping {int((~usable).sum())} row(s) with unparseable timestamps",
            RuntimeWarning,
            stacklevel=2,
        )
        df = df.loc[usable]
        stamps = stamps.loc[usable]
        if df.empty:
            return

    # Build the output on a fresh frame so the caller's DataFrame is untouched.
    # to_numpy() sidesteps index alignment when attaching the partition keys.
    block = df[["lat", "lon", "timestamp", "depth"]].assign(
        year=stamps.dt.year.to_numpy(dtype="int16"),
        month=stamps.dt.month.to_numpy(dtype="int8"),
    )
    table = pa.Table.from_pandas(block, preserve_index=False)
    pq.write_to_dataset(table, root_path=OUT_PARQ, partition_cols=["year","month"], compression=COMPRESSION)

# --------------------------
# Processing
# --------------------------
import h5py  # use h5py for robust HDF5 access

# Collect every HDF5 granule in the sample directory, in a stable order.
files = sorted([*SAMPLE_DIR.glob("*.h5"), *SAMPLE_DIR.glob("*.hdf5")])
if not files:
    # Explicit raise rather than `assert`, which is stripped under `python -O`.
    raise FileNotFoundError(f"No .h5/.hdf5 files found in {SAMPLE_DIR}")

skipped = []  # (file name, reason) pairs, reported after the run

for path in tqdm(files, desc="Files", unit="file"):
    try:
        with h5py.File(path, "r") as f:
            # --- Granule timing anchors (per-point UTC via delta_time offset) ---
            # We use: timestamp = granule_start_utc + (delta_time - start_delta_time) seconds
            # This avoids knowing the absolute GPS epoch/leap seconds.
            start_utc_raw = read_scalar(f, "ancillary_data/granule_start_utc") or read_scalar(f, "ancillary_data/data_start_utc")
            start_delta   = read_scalar(f, "ancillary_data/start_delta_time")

            if start_utc_raw is None or start_delta is None:
                skipped.append((path.name, "missing ancillary_data/granule_start_utc or start_delta_time"))
                continue

            t0_pd = pd.Timestamp(parse_utc(start_utc_raw))  # pandas Timestamp with tz UTC
            # The anchor may arrive as a 0-d or 1-element array; take its single
            # value explicitly instead of relying on float(ndarray).
            start_delta_s = float(np.asarray(start_delta, dtype="float64").reshape(-1)[0])

            # --- Iterate over beams present in file ---
            for beam in BEAMS:
                if beam not in f:
                    continue
                g = f[beam]

                # Required datasets
                if any(k not in g for k in (LAT_NAME, LON_NAME, DELTA_NAME)):
                    skipped.append((path.name, f"{beam}: missing {LAT_NAME}/{LON_NAME}/{DELTA_NAME}"))
                    continue

                # Depth source
                depth_var = choose_depth_var(f, beam)
                if depth_var is None:
                    skipped.append((path.name, f"{beam}: no depth-like var among {DEPTH_VAR_CANDIDATES}"))
                    continue

                lat = np.asarray(g[LAT_NAME][...])
                lon = np.asarray(g[LON_NAME][...])
                dlt = np.asarray(g[DELTA_NAME][...])  # seconds on the same clock as start_delta_time
                depth_arr = np.asarray(g[depth_var][...])

                # Keep only rows where every exported value is finite. delta_time is
                # included so that no NaT timestamp reaches the Parquet writer, which
                # cannot derive a year/month partition for it.
                mask = np.isfinite(lat) & np.isfinite(lon) & np.isfinite(depth_arr) & np.isfinite(dlt)
                # Optional quality masks
                if FILTER_SENSOR_FLAG and "sensor_depth_exceeded" in g:
                    mask &= (np.asarray(g["sensor_depth_exceeded"][...]) == 0)
                if FILTER_LOW_CONF_FLAG and "low_confidence_flag" in g:
                    mask &= (np.asarray(g["low_confidence_flag"][...]) == 0)
                if MIN_CONFIDENCE is not None and "confidence" in g:
                    mask &= (np.asarray(g["confidence"][...]) >= float(MIN_CONFIDENCE))

                lat = lat[mask]
                lon = lon[mask]
                # Sign convention -> depth (meters, positive-down if DEPTH_SIGN = -1 and heights are positive-up)
                depth = DEPTH_SIGN * depth_arr[mask]
                # Per-point UTC: start + (delta - start_delta) seconds
                ts = t0_pd + pd.to_timedelta(dlt[mask] - start_delta_s, unit="s")

                n = lat.size
                if n == 0:
                    continue

                # Random downsampling
                if FRACTION < 1.0:
                    k = max(1, int(np.ceil(n * FRACTION)))
                    idx = rng.choice(n, size=k, replace=False)
                    lat, lon, depth, ts = lat[idx], lon[idx], depth[idx], ts[idx]
                    n = k

                # Cap rows per beam
                if (MAX_POINTS_PER_BEAM is not None) and (n > MAX_POINTS_PER_BEAM):
                    idx = rng.choice(n, size=MAX_POINTS_PER_BEAM, replace=False)
                    lat, lon, depth, ts = lat[idx], lon[idx], depth[idx], ts[idx]
                    n = MAX_POINTS_PER_BEAM

                # Chunked write: beams larger than CHUNK_ROWS are written in
                # CHUNK_ROWS-sized slices; anything else goes out as one block.
                step = CHUNK_ROWS if (CHUNK_ROWS is not None and n > CHUNK_ROWS) else n
                for i0 in range(0, n, step):
                    i1 = min(i0 + step, n)
                    write_parquet_block(pd.DataFrame({
                        "lat": lat[i0:i1],
                        "lon": lon[i0:i1],
                        "timestamp": ts[i0:i1],
                        "depth": depth[i0:i1],
                    }))

    except Exception as e:
        # Per-file boundary: record the failure and carry on with the next file.
        skipped.append((path.name, f"open/process failed: {e}"))

print(f"\n✅ Parquet dataset written to: {OUT_PARQ}")
if skipped:
    print("\n⚠️ Skipped:")
    for name, reason in skipped:
        print(f"  - {name}: {reason}")


📂 Input:  C:\Users\Crist\Desktop\NASA\tag-and-satellite-data-model\downloads\depth\sample
🧺 Output Parquet (partitioned): C:\Users\Crist\Desktop\NASA\tag-and-satellite-data-model\transform\depth\parquet


Files:   0%|          | 0/1 [00:00<?, ?file/s]


✅ Parquet dataset written to: C:\Users\Crist\Desktop\NASA\tag-and-satellite-data-model\transform\depth\parquet
