In [1]:
import os
import re
import xarray as xr
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict

In [2]:
# ========= User settings =========
# Folder that holds the monthly input sub-folders (names such as 201808, 2018_08 or 2018-08,
# optionally nested under a year folder). Change this to your own path.
input_root = "/mnt/cephfs-mount/chenchen/CygnssData/"
# Folder under which the per-day CSV files are written (one YYYY_MM sub-folder per month).
# Change this to your own path.
output_root = "/mnt/cephfs-mount/chenchen/CygnssDataCsv"
# Years to convert, e.g. [2023, 2024]; use None to process every year that is found.
years_to_process = [2025]
 
# ROI (fast approx): full lengths of the rectangle's major/minor axes in km.
# roi_corners() halves these values to obtain the half-extents.
ROI_MAJOR_KM = 3.5
ROI_MINOR_KM = 0.5

In [3]:
# ========= Variables expected in each NetCDF =========
# sp_theta_orbit is optional and therefore not listed; dataset_to_rows() falls back
# to NaN for it when the variable is absent.
# NOTE(review): this list is not referenced by any code visible in this notebook --
# dataset_to_rows() spells out the same variable names itself. Keep the two in sync,
# or confirm whether the list is still needed.
required_vars = [
    "sample", "ddm", "sp_lon", "sp_lat", "ddm_snr",
    "gps_tx_power_db_w", "tx_to_sp_range", "rx_to_sp_range",
    "sp_inc_angle", "gps_ant_gain_db_i"
]

In [4]:
# ========= Helpers =========
import os, re
from collections import defaultdict

def parse_month_folder_name(name):
    """
    Split a month-folder name into integer (year, month).

    Accepted spellings are 'YYYYMM', 'YYYY_MM' and 'YYYY-MM'
    (e.g. 201808 / 2018_08 / 2018-08). Returns (None, None) when the name
    has any other form or when the month is outside 1..12.
    """
    hit = re.match(r'^(\d{4})[-_]?(\d{2})$', name)
    if hit is None:
        return None, None

    year, month = (int(part) for part in hit.groups())
    if month < 1 or month > 12:
        return None, None
    return year, month

def normalize_years(years_to_process):
    """
    Turn the configured year selection into a set of ints.

    None (meaning "no year filter") is passed through unchanged.
    """
    if years_to_process is None:
        return None
    return set(map(int, years_to_process))

def find_month_dirs(input_root, years_to_process=None):
    """
    Locate the month folders below ``input_root``.

    Two layouts are recognised:
      A. month folders directly under ``input_root`` (201808 / 2018_08 / 2018-08)
      B. four-digit year folders containing month folders
         (2018/201808, 2018/2018_08, 2018/2018-08 or 2018/08)
    Layout B is only scanned when layout A yields nothing for the requested years.

    Parameters
    ----------
    input_root : str
        Directory to scan. Errors from ``os.listdir`` (e.g. a missing
        directory) are not caught here.
    years_to_process : iterable of int-like, or None
        Years to keep; None keeps every year.

    Returns
    -------
    list of tuples ``(name, year, month, full_path)`` in sorted directory order.
    For a bare two-digit month folder in layout B, ``name`` is synthesised as
    ``YYYY_MM``; otherwise it is the folder's own name.
    """
    yfilter = normalize_years(years_to_process)

    def year_wanted(year):
        return yfilter is None or year in yfilter

    # Sub-directories of input_root, listed once and reused by both layouts.
    top_level = []
    for entry in sorted(os.listdir(input_root)):
        entry_path = os.path.join(input_root, entry)
        if os.path.isdir(entry_path):
            top_level.append((entry, entry_path))

    out = []

    # Case A: month folders directly under input_root (e.g., 201808)
    for name, path in top_level:
        y, m = parse_month_folder_name(name)
        if y is not None and year_wanted(y):
            out.append((name, y, m, path))

    if out:
        return out  # already found direct month folders

    # Case B: year subfolders (e.g., input_root/2018/201808 or /08)
    for yname, ypath in top_level:
        my = re.match(r'^(\d{4})$', yname)  # year folder like 2018
        if not my:
            continue
        y = int(my.group(1))
        if not year_wanted(y):
            continue

        for mname in sorted(os.listdir(ypath)):
            mpath = os.path.join(ypath, mname)
            if not os.path.isdir(mpath):
                continue

            # month dir might be 201808 / 2018_08 / 2018-08 / 08
            y2, m2 = parse_month_folder_name(mname)
            if y2 is not None:
                # The folder name carries its own year; apply the year filter to
                # that year as well, so e.g. 2025/2024_12 is not returned when
                # only 2025 was requested.
                if year_wanted(y2):
                    out.append((mname, y2, m2, mpath))
                continue

            mm = re.match(r'^(\d{2})$', mname)  # just "08"
            if mm:
                m = int(mm.group(1))
                if 1 <= m <= 12:
                    out.append((f"{y:04d}_{m:02d}", y, m, mpath))

    return out


def ensure_dir(p: str):
    """Create directory ``p`` together with any missing parents; an existing directory is left as is."""
    os.makedirs(p, exist_ok=True)



def extract_date_from_filename(fname: str):
    """
    Pull the 'sYYYYMMDD' date stamp out of a file name.

    The first occurrence of the letter 's' immediately followed by eight
    digits is used, wherever it sits in the name. For example
    cyg08.ddmi.s20240403-000000-e20240403-235959.l1.power-brcs.a31.d32.nc
    gives '20240403'. The digits are not checked for being a valid calendar date.

    Returns the eight digits as a string, or None when no such token exists.
    """
    hit = re.search(r's(\d{8})', fname)
    if hit is None:
        return None
    return hit.group(1)


def open_nc_safely(path: str):
    """
    Open the NetCDF file at ``path`` with xarray's default settings.

    The dataset is returned still open; the caller is responsible for
    calling close() on it. If required, engine='netcdf4' and
    mask_and_scale=True can be passed to the underlying call.
    """
    dataset = xr.open_dataset(path)  # add engine='netcdf4', mask_and_scale=True if desired
    return dataset


def roi_corners(sp_lon, sp_lat, roi_major_km, roi_minor_km, sp_theta_orbit):
    """
    Return the four (lon, lat) corners of a rotated ROI rectangle.

    The rectangle is centred on (sp_lon, sp_lat), is roi_major_km long and
    roi_minor_km wide, and is rotated by sp_theta_orbit degrees.
    Angle convention: 0 = east, positive counter-clockwise.

    Corner order before rotation is (+major, +minor), (-major, +minor),
    (-major, -minor), (+major, -minor). Metre offsets are turned into degree
    offsets with latitude-dependent metres-per-degree factors evaluated at
    sp_lat.
    """
    # Half-extents of the rectangle in metres
    half_major = roi_major_km * 1000 / 2.0
    half_minor = roi_minor_km * 1000 / 2.0

    # Local metres-per-degree factors at the centre latitude
    phi = np.deg2rad(sp_lat)
    metres_per_deg_lat = 111132.92 - 559.82*np.cos(2*phi) + 1.175*np.cos(4*phi)
    metres_per_deg_lon = 111412.84*np.cos(phi) - 93.5*np.cos(3*phi)

    # Unrotated corners in local east/north metres
    signs = ((1, 1), (-1, 1), (-1, -1), (1, -1))
    local_xy = np.array([[sx * half_major, sy * half_minor] for sx, sy in signs])

    # Counter-clockwise rotation measured from east
    angle_rad = np.deg2rad(sp_theta_orbit)
    cos_t, sin_t = np.cos(angle_rad), np.sin(angle_rad)
    rotation = np.array([
        [cos_t, -sin_t],
        [sin_t,  cos_t]
    ])
    rotated_xy = local_xy @ rotation.T

    # Metre offsets -> degree offsets, then shift to the centre point
    lon_offsets = rotated_xy[:, 0] / metres_per_deg_lon
    lat_offsets = rotated_xy[:, 1] / metres_per_deg_lat

    return [
        (sp_lon + d_lon, sp_lat + d_lat)
        for d_lon, d_lat in zip(lon_offsets, lat_offsets)
    ]
 
def dataset_to_rows(ds):
    """
    Flatten one opened dataset into a list of per-(sample, ddm) rows.

    Each row holds, in this order:
        sample, ddm, sp_lon, sp_lat, ddm_snr, gps_tx_power_db_w,
        tx_to_sp_range, rx_to_sp_range, sp_inc_angle, gps_ant_gain_db_i,
        sp_theta_orbit, ROI_MAJOR_KM, ROI_MINOR_KM,
        c1_lon, c1_lat, c2_lon, c2_lat, c3_lon, c3_lat, c4_lon, c4_lat
    where c1..c4 are the ROI rectangle corners returned by roi_corners() for
    (sp_lon, sp_lat, sp_theta_orbit). Longitudes above 180 are shifted by -360.
    When sp_theta_orbit is absent from the dataset, or NaN for a point, the
    angle and all eight corner values of that row are NaN.

    Parameters
    ----------
    ds : mapping-like dataset
        Must support ``ds[name].values`` and ``name in ds.variables``.

    Returns
    -------
    (rows, ok) : (list[list], bool)
        ``ok`` is False (and ``rows`` is empty) when a required variable is
        missing, or when any per-point array does not have the shape
        ``(len(sample), len(ddm))``. A warning is printed in both cases.
    """
    try:
        samples = ds['sample'].values          # (Nsamples,)
        ddms = ds['ddm'].values                # (Nddm,)
        sp_lon = ds['sp_lon'].values           # (Nsamples, Nddm)
        sp_lat = ds['sp_lat'].values           # (Nsamples, Nddm)
        ddm_snr = ds['ddm_snr'].values         # (Nsamples, Nddm)
        gps_tx_power = ds['gps_tx_power_db_w'].values     # (Nsamples, Nddm)
        tx_to_sp = ds['tx_to_sp_range'].values            # (Nsamples, Nddm)
        rx_to_sp = ds['rx_to_sp_range'].values            # (Nsamples, Nddm)
        sp_inc_angle = ds['sp_inc_angle'].values          # (Nsamples, Nddm)
        gps_ant_gain = ds['gps_ant_gain_db_i'].values     # (Nsamples, Nddm)

        # ROI orientation (optional)
        if "sp_theta_orbit" in ds.variables:
            sp_theta_orbit = ds['sp_theta_orbit'].values  # (Nsamples, Nddm)
        else:
            sp_theta_orbit = np.full_like(sp_lon, np.nan)  # fallback if not present

    except KeyError as e:
        print(f"  [WARN] Missing variable: {e}")
        return [], False

    # Shape checks: every per-point array is indexed with [i, j] below, so each
    # one (not only sp_lon) must be exactly (Nsamples, Nddm). Comparing whole
    # shape tuples also rejects arrays of the wrong rank instead of raising
    # IndexError on shape[1].
    n_samples, n_ddms = len(samples), len(ddms)
    per_point_arrays = (
        sp_lon, sp_lat, ddm_snr, gps_tx_power, tx_to_sp,
        rx_to_sp, sp_inc_angle, gps_ant_gain, sp_theta_orbit,
    )
    if any(arr.shape != (n_samples, n_ddms) for arr in per_point_arrays):
        print("  [WARN] Dimension mismatch among variables.")
        return [], False

    # Normalize lon to [-180, 180]
    sp_lon = np.where(sp_lon > 180, sp_lon - 360, sp_lon)

    nan_corner = (np.nan, np.nan)
    rows = []
    for i in range(n_samples):
        for j in range(n_ddms):
            lon = float(sp_lon[i, j])
            lat = float(sp_lat[i, j])
            angle = float(sp_theta_orbit[i, j])  # NaN stays NaN

            # 4 ROI corners, or NaNs when the orientation angle is missing
            if np.isnan(angle):
                c1 = c2 = c3 = c4 = nan_corner
            else:
                c1, c2, c3, c4 = roi_corners(lon, lat, ROI_MAJOR_KM, ROI_MINOR_KM, angle)  # each is (lon, lat)

            rows.append([
                samples[i], ddms[j], lon, lat, ddm_snr[i, j],
                gps_tx_power[i, j], tx_to_sp[i, j], rx_to_sp[i, j],
                sp_inc_angle[i, j], gps_ant_gain[i, j],
                angle, ROI_MAJOR_KM, ROI_MINOR_KM,
                c1[0], c1[1], c2[0], c2[1], c3[0], c3[1], c4[0], c4[1]
            ])

    return rows, True
 
 
# ensure_dir() is defined once, earlier in this cell, next to the other path helpers;
# an identical second definition here would only shadow it, so none is given.
 
 
# ========= Main =========
def main():
    """
    Convert the configured month folders of NetCDF files into one CSV per day.

    Uses the module-level settings ``input_root``, ``output_root`` and
    ``years_to_process``. For every month folder returned by find_month_dirs():
      1. files whose name ends in '.nc' and contains 'cyg' are grouped by the
         date found by extract_date_from_filename();
      2. each file of a day is opened, converted with dataset_to_rows() and closed;
      3. the day's rows are written to ``<output_root>/<YYYY>_<MM>/<YYYYMMDD>.csv``,
         where YYYY/MM are the year and month of the month folder.

    Error policy: a file that cannot be opened, read or closed produces a
    printed warning and is skipped, so a single bad file does not abort the
    whole run. A CSV that cannot be written produces an [ERR] message.
    Returns None.
    """
    columns = [
        'sample', 'ddm', 'sp_lon', 'sp_lat', 'ddm_snr',
        'gps_tx_power_db_w', 'tx_to_sp_range', 'rx_to_sp_range',
        'sp_inc_angle', 'gps_ant_gain_db_i',
        'sp_theta_orbit', 'roi_major_km', 'roi_minor_km',
        'c1_lon', 'c1_lat', 'c2_lon', 'c2_lat', 'c3_lon', 'c3_lat', 'c4_lon', 'c4_lat'
    ]

    ensure_dir(output_root)

    month_folders = find_month_dirs(input_root, years_to_process)

    if not month_folders:
        print("[INFO] No month folders matched your year filter.")
        existing = [d for d in sorted(os.listdir(input_root)) if os.path.isdir(os.path.join(input_root, d))]
        print("[HINT] Top-level dirs under input_root:", existing)
        print("[HINT] Expect either month dirs like 201808, or year/mon like 2018/201808 or 2018/08.")
        # Test against None (not truthiness): an empty list is a real filter that
        # rejects every year and must not be reported as "no filter".
        print(f"[HINT] Current year filter = {sorted(normalize_years(years_to_process)) if years_to_process is not None else 'None'}")
        return

    for name, y, m, full_month_path in month_folders:
        print(f"\n=== Processing month: {name} ===")

        # Group files by date (YYYYMMDD)
        daily_files = defaultdict(list)
        for fname in sorted(os.listdir(full_month_path)):
            if not (fname.endswith(".nc") and "cyg" in fname):
                continue
            date_str = extract_date_from_filename(fname)
            if date_str:
                daily_files[date_str].append(os.path.join(full_month_path, fname))
            else:
                print(f"  [WARN] Could not find sYYYYMMDD in filename: {fname}")

        if not daily_files:
            print("  [INFO] No NetCDF files found with recognizable dates in this month.")
            continue

        # Output folder for this month; it is only created once a day has rows to save.
        out_month_folder = os.path.join(output_root, f"{y:04d}_{m:02d}")

        # Process each day within this month
        for date_str, file_list in tqdm(daily_files.items(), desc=f"{name}: per-day"):
            daily_rows = []

            for nc_file in file_list:
                try:
                    ds = open_nc_safely(nc_file)
                except Exception as e:
                    print(f"  [WARN] Failed to open {nc_file}: {e}")
                    continue

                try:
                    rows, ok = dataset_to_rows(ds)
                    if ok:
                        daily_rows.extend(rows)
                except Exception as e:
                    # Same warn-and-skip policy as for files that fail to open.
                    print(f"  [WARN] Failed to read {nc_file}: {e}")
                finally:
                    try:
                        ds.close()
                    except Exception as e:
                        # Best effort: report the problem but keep going.
                        print(f"  [WARN] Failed to close {nc_file}: {e}")

            if not daily_rows:
                print(f"  [INFO] No valid rows for {date_str}")
                continue

            # Build DataFrame
            df = pd.DataFrame(daily_rows, columns=columns)

            # Drop full-NaN rows (defensive; usually none after above checks)
            df = df.dropna(how="all")

            # Prepare output path (month-by-month)
            ensure_dir(out_month_folder)
            out_csv = os.path.join(out_month_folder, f"{date_str}.csv")

            try:
                df.to_csv(out_csv, index=False)
            except Exception as e:
                print(f"  [ERR] Failed to save {out_csv}: {e}")

    print("\n[DONE] All requested years processed.")


# Entry point: start the full conversion when this code is executed as the main module.
if __name__ == "__main__":
    main()


=== Processing month: 202501 ===


202501: per-day: 100%|████████████████████████| 31/31 [1:51:45<00:00, 216.30s/it]



=== Processing month: 202502 ===


202502: per-day: 100%|████████████████████████| 28/28 [1:37:50<00:00, 209.67s/it]



=== Processing month: 202503 ===


202503: per-day: 100%|████████████████████████| 31/31 [1:48:48<00:00, 210.60s/it]



=== Processing month: 202504 ===


202504: per-day: 100%|████████████████████████| 30/30 [1:43:55<00:00, 207.84s/it]



=== Processing month: 202505 ===


202505: per-day: 100%|████████████████████████| 31/31 [1:47:01<00:00, 207.15s/it]



=== Processing month: 202506 ===


202506: per-day: 100%|████████████████████████| 30/30 [1:44:27<00:00, 208.91s/it]



=== Processing month: 202507 ===


202507: per-day: 100%|████████████████████████| 31/31 [1:50:24<00:00, 213.71s/it]



[DONE] All requested years processed.
