In [2]:
#!/usr/bin/env python3
import os
import re
import math
import pandas as pd
import numpy as np
import ee
from tqdm import tqdm

# ---------------- EE auth ----------------
# NOTE: these two calls run as soon as the module is executed/imported.
ee.Authenticate()   # comment this out if creds are already cached in your env
ee.Initialize()

# ---------------- Month folders ----------
# main() processes every YYYYMMDD.csv found in INPUT_MONTH_DIR and writes a file
# with the same name into OUTPUT_MONTH_DIR (created here if missing).
INPUT_MONTH_DIR  = r"/mnt/cephfs-mount/chenchen/CygnssDataCsvLand/2021_02"
OUTPUT_MONTH_DIR = r"/mnt/cephfs-mount/chenchen/water_body_fraction/2021_02_testmonth"
os.makedirs(OUTPUT_MONTH_DIR, exist_ok=True)

# ---------------- Sampling ----------------
# Rows are subsampled only when 0 < SAMPLE_FRAC < 1; any other value, including
# the current 1, means every row is processed.
SAMPLE_FRAC = 1   # 1 = no subsampling (use e.g. 0.01 for a 1% sample)
SAMPLE_SEED = 42     # keep this fixed for reproducibility

# ---------------- Debug toggles ----------
DEBUG_MODE     = False  # emit extra diagnostics through tqdm.write
PRINT_PER_ROW  = False  # print one line per successfully processed row
SKIP_IF_EXISTS = True  # True: skip days that already have an output file; False: reprocess them

# ---------------- S1 search settings -----
S1_COLLECTION        = "COPERNICUS/S1_GRD"
S1_ACCEPT_MODES      = ("IW", "EW")                 # accept both (skip WV)
S1_POLS_PREFERENCE   = ("VV", "VH", "HH", "HV")     # try in this order
S1_DAY_WINDOWS_TRY   = [0]                    # half-window sizes in days, tried in order; [0] = exact day only ([0, 3, 6] would add ±3d, ±6d)
S1_SCALE             = 10                           # meters

# ---------------- Threshold + histogram ---
# The VV_* names are historical: the same limits are applied to whichever
# polarisation ends up in the 'S1' band.
VV_MIN_DB        = -30  # lower clamp applied before the histogram is built
VV_MAX_DB        = 5  # upper clamp applied before the histogram is built
VV_BUCKETS       = 256  # argument passed to ee.Reducer.histogram
DEFAULT_THRESHOLD_DB = -18.5  # threshold used whenever Otsu is not applied
ALL_WATER_DB     = -19.0  # a single histogram peak at or below this => "all_water"
ALL_LAND_DB      = -15.0  # a single histogram peak at or above this => "all_land"
OTSU_CLAMP       = (-23.0, -15.0)  # (low, high) limits the Otsu threshold is clamped into

# ---------------- Slope mask -------------
SLOPE_DEG_MAX = 5.0  # only pixels with slope strictly below this value stay unmasked
_SLOPE_MASK = None  # lazy cache

# ------------- Helpers -------------------
# Eight digits followed by ".csv" at the end of the name; callers use .match(),
# so the digits must also start at the beginning of the string.
_YMD_RE = re.compile(r"(\d{8})\.csv$")

def date_from_csv_path(path):
    """
    Turn a daily CSV path into an ISO date string.

    The file name (directory part ignored) must be eight digits followed by
    '.csv', e.g. '/some/dir/20180801.csv' -> '2018-08-01'. The digits are only
    re-grouped, not validated as a calendar date.

    Raises:
        ValueError: if the file name does not have that shape.
    """
    base = os.path.basename(path)
    found = _YMD_RE.match(base)
    if found is None:
        raise ValueError(f"Cannot parse date from filename: {base}")
    digits = found.group(1)
    year, month, day = digits[:4], digits[4:6], digits[6:8]
    return "-".join((year, month, day))

def _load_s1_image(date_ymd, region):
    """
    Find Sentinel-1 imagery over `region` around `date_ymd`.

    For every half-window in S1_DAY_WINDOWS_TRY (in order) and every
    polarisation in S1_POLS_PREFERENCE (in order), the collection is filtered
    by bounds, by [date - window, date + window + 1 day), by the accepted
    instrument modes and by the polarisation. The first non-empty combination
    wins.

    Args:
        date_ymd: date accepted by ee.Date (the callers pass 'YYYY-MM-DD').
        region: ee.Geometry used for the bounds filter.

    Returns:
        A single-band ee.Image named 'S1' (mosaic of the matching images, with
        'used_pol' and 'used_window_days' properties set), or None when no
        combination returned any image.

    Note:
        Each combination costs one blocking `getInfo()` call. A failing call
        is treated as "no images" so one bad request cannot abort the run;
        the last such error is reported when DEBUG_MODE is on, otherwise
        failures would be indistinguishable from a genuine lack of imagery.
    """
    d0 = ee.Date(date_ymd)
    # Loop-invariant filter, built once.
    mode_filter = ee.Filter.inList('instrumentMode', list(S1_ACCEPT_MODES))
    last_err = None

    for wnd in S1_DAY_WINDOWS_TRY:
        start = d0.advance(-wnd, 'day')
        end = d0.advance(wnd + 1, 'day')  # filterDate's end is exclusive
        # Everything except the polarisation filter is shared by the inner loop.
        in_window = (ee.ImageCollection(S1_COLLECTION)
                     .filterBounds(region)
                     .filterDate(start, end)
                     .filter(mode_filter))
        for pol in S1_POLS_PREFERENCE:
            col = in_window.filter(
                ee.Filter.listContains('transmitterReceiverPolarisation', pol))
            try:
                n = col.size().getInfo()
            except Exception as e:
                # Best effort: remember the failure and try the next combination.
                last_err = e
                n = 0
            if n and n > 0:
                img = col.mosaic().select(pol).rename('S1').set({
                    'used_pol': pol, 'used_window_days': wnd
                })
                if DEBUG_MODE:
                    tqdm.write(f"[S1] {date_ymd}: {n} img(s), pol={pol}, ±{wnd}d")
                return img

    if DEBUG_MODE:
        tqdm.write(f"[S1] {date_ymd}: no images found"
                   + (f" (last err: {last_err})" if last_err else ""))
    return None

def _build_slope_mask():
    """
    Build the terrain mask image used to exclude steep pixels.

    Slope is derived from the SRTM image "USGS/SRTMGL1_003"; the result is 1
    where slope is strictly below SLOPE_DEG_MAX and 0 elsewhere. Pixels that
    are masked in that comparison are filled with 1 via `unmask(1)`, so they
    are kept rather than dropped.
    """
    elevation = ee.Image("USGS/SRTMGL1_003")
    is_gentle = ee.Terrain.slope(elevation).lt(SLOPE_DEG_MAX)
    return is_gentle.unmask(1)

def _get_histogram(img_s1, region):
    """
    Fetch the histogram of the 'S1' band over `region`.

    The image is first clamped to [VV_MIN_DB, VV_MAX_DB], then reduced with
    ee.Reducer.histogram(VV_BUCKETS) at S1_SCALE (bestEffort enabled).

    Returns:
        The dictionary returned by Earth Engine when it is non-empty and has
        both 'bucketMeans' and 'histogram' keys; otherwise None. None is also
        returned when the `getInfo()` request raises.
    """
    reduction = img_s1.clamp(VV_MIN_DB, VV_MAX_DB).reduceRegion(
        reducer=ee.Reducer.histogram(VV_BUCKETS),
        geometry=region,
        scale=S1_SCALE,
        bestEffort=True,
        maxPixels=1e13,
    )
    band_hist = reduction.get('S1')

    try:
        payload = band_hist.getInfo()
    except Exception:
        # Any request failure is reported to the caller as "no histogram".
        return None

    if payload and "bucketMeans" in payload and "histogram" in payload:
        return payload
    return None

def _count_peaks(bucket_means, counts):
    bm = np.asarray(bucket_means, dtype=float)
    ct = np.asarray(counts, dtype=float)
    if len(ct) < 3:
        return 0, []
    from numpy import convolve
    kernel = np.array([1,2,3,2,1], dtype=float); kernel /= kernel.sum()
    ct_s = convolve(ct, kernel, mode='same')
    thresh = 0.05 * ct_s.max() if ct_s.max() > 0 else np.inf
    peaks = []
    for i in range(1, len(ct_s)-1):
        if ct_s[i] > ct_s[i-1] and ct_s[i] > ct_s[i+1] and ct_s[i] >= thresh:
            peaks.append(i)
    return len(peaks), peaks

def _otsu_threshold(bucket_means, counts):
    bm = np.asarray(bucket_means, dtype=float)
    ct = np.asarray(counts, dtype=float)
    ct = np.maximum(ct, 0)
    if ct.sum() == 0:
        return None
    prob = ct / ct.sum()
    omega = np.cumsum(prob)
    mu = np.cumsum(prob * bm)
    mu_t = mu[-1]
    sigma_b2 = (mu_t * omega - mu)**2 / (omega * (1 - omega) + 1e-12)
    idx = np.nanargmax(sigma_b2)
    thr = bm[idx]
    return float(thr)

def _choose_threshold_from_hist(hist):
    """
    Decide how to classify water from a histogram dictionary.

    Args:
        hist: mapping with 'bucketMeans' and 'histogram' entries.

    Returns:
        A (method, threshold_db) tuple:
          * one peak at or below ALL_WATER_DB -> ("all_water", DEFAULT_THRESHOLD_DB)
          * one peak at or above ALL_LAND_DB  -> ("all_land", DEFAULT_THRESHOLD_DB)
          * exactly two peaks with a usable Otsu value -> ("otsu", value
            clamped into OTSU_CLAMP)
          * every other case -> ("default", DEFAULT_THRESHOLD_DB)
    """
    means = hist["bucketMeans"]
    counts = hist["histogram"]
    n_peaks, peak_idx = _count_peaks(means, counts)
    fallback = ("default", DEFAULT_THRESHOLD_DB)

    if n_peaks == 1:
        # Unimodal scene: decide from where the single peak sits.
        peak_db = means[peak_idx[0]]
        if peak_db <= ALL_WATER_DB:
            return ("all_water", DEFAULT_THRESHOLD_DB)
        if peak_db >= ALL_LAND_DB:
            return ("all_land", DEFAULT_THRESHOLD_DB)
        return fallback

    if n_peaks != 2:
        # No peaks, or more than two: Otsu is not attempted.
        return fallback

    otsu = _otsu_threshold(means, counts)
    if otsu is None or math.isnan(otsu):
        return fallback

    low, high = OTSU_CLAMP[0], OTSU_CLAMP[1]
    return ("otsu", float(max(low, min(high, otsu))))

# --------- ADVANCED water fraction (returns (frac, method, thr)) ----------
def calculate_water_fraction(region, date_ee):
    """
    Estimate the water fraction of `region` from Sentinel-1 imagery.

    Steps:
      1. find an 'S1' image for the date (_load_s1_image);
      2. fetch its histogram and pick a classification method/threshold
         (_choose_threshold_from_hist);
      3. build a 0/1 water image: constant 1 for "all_water", constant 0 for
         "all_land", otherwise `S1 < threshold`;
      4. mask it with the cached slope mask (_build_slope_mask, built once
         and stored in the module-level _SLOPE_MASK);
      5. average the remaining pixels over `region` at S1_SCALE.

    Args:
        region: ee.Geometry to analyse.
        date_ee: the day to analyse. Either anything ee.Date accepts (as
            before), or a plain, valid 'YYYY-MM-DD' string. The plain string
            is used directly, which avoids one blocking Earth Engine round
            trip per call.

    Returns:
        (fraction, method, thr_db) with fraction clipped to [0, 1] on success.
        (None, None, None) when no image or no histogram is available.
        (None, method, thr_db) when the final averaging request fails or
        yields no value. For "all_water"/"all_land" thr_db is
        DEFAULT_THRESHOLD_DB although no threshold is applied.
    """
    global _SLOPE_MASK
    import datetime  # local import: only needed to validate plain date strings

    date_str = None
    if isinstance(date_ee, str) and re.fullmatch(r"\d{4}-\d{2}-\d{2}", date_ee):
        try:
            datetime.date.fromisoformat(date_ee)
        except ValueError:
            # Not a real calendar date: fall through so Earth Engine reports
            # the problem exactly as it did before.
            date_str = None
        else:
            date_str = date_ee
    if date_str is None:
        date_str = ee.Date(date_ee).format('YYYY-MM-dd').getInfo()

    img_s1 = _load_s1_image(date_str, region)
    if img_s1 is None:
        return None, None, None

    if _SLOPE_MASK is None:
        _SLOPE_MASK = _build_slope_mask()

    hist = _get_histogram(img_s1, region)
    if hist is None:
        return None, None, None

    method, thr = _choose_threshold_from_hist(hist)

    if method == "all_water":
        water = ee.Image.constant(1).rename("water")
    elif method == "all_land":
        water = ee.Image.constant(0).rename("water")
    else:
        # Values below the threshold are classified as water.
        water = img_s1.lt(thr).rename("water")

    # Pixels where the slope mask is 0 do not contribute to the mean.
    water_m = water.updateMask(_SLOPE_MASK)

    frac = (water_m.reduceRegion(
        reducer=ee.Reducer.mean(),
        geometry=region,
        scale=S1_SCALE,
        bestEffort=True,
        maxPixels=1e13
    ).get("water"))

    try:
        frac_val = frac.getInfo()
    except Exception:
        # Best effort: a failed request is reported as "no fraction".
        return None, method, thr

    if frac_val is None:
        return None, method, thr

    frac_val = max(0.0, min(1.0, float(frac_val)))

    if DEBUG_MODE and PRINT_PER_ROW:
        thr_str = f"{thr:.2f}" if (thr is not None and np.isfinite(thr)) else "NA"
        tqdm.write(f"[DEBUG] {date_str} frac={frac_val:.4f} method={method} thr_db={thr_str}")

    return frac_val, method, thr

def process_one_csv(input_csv_path, output_csv_path):
    """
    Process one daily CSV and write the rows for which a water fraction exists.

    The date is taken from the input file name. When 0 < SAMPLE_FRAC < 1 a
    reproducible random subset of rows (SAMPLE_SEED) is processed instead of
    all rows. Each row must provide c1..c4 '_lon'/'_lat' columns describing
    the footprint corners; rows where these are missing or not numeric are
    skipped.

    The output has the input columns plus 'water_fraction', 'thr_method' and
    'thr_db'. If no row produced a result, a header-only CSV is written.

    Args:
        input_csv_path: path of the daily CSV (name like YYYYMMDD.csv).
        output_csv_path: destination CSV path.

    Raises:
        ValueError: if the date cannot be parsed from the input file name.
        Exception: anything raised while reading the input or writing the
            output propagates to the caller.
    """
    date_str = date_from_csv_path(input_csv_path)
    if DEBUG_MODE:
        tqdm.write(f"\n[DAY] {date_str} | modes={S1_ACCEPT_MODES}, pols={S1_POLS_PREFERENCE}, windows={S1_DAY_WINDOWS_TRY}")

    # read
    data = pd.read_csv(input_csv_path)
    base_cols = data.columns.tolist()

    # sample
    total_rows = len(data)
    if total_rows > 0 and 0 < SAMPLE_FRAC < 1:
        n = max(1, int(round(SAMPLE_FRAC * total_rows)))
        data = data.sample(n=n, random_state=SAMPLE_SEED, replace=False).sort_index()

    results = []
    skipped_bad_coords = 0

    for _, row in tqdm(data.iterrows(), total=len(data), desc=f"Rows {os.path.basename(input_csv_path)}", leave=False):
        try:
            region_coords = [
                [float(row['c1_lon']), float(row['c1_lat'])],
                [float(row['c2_lon']), float(row['c2_lat'])],
                [float(row['c3_lon']), float(row['c3_lat'])],
                [float(row['c4_lon']), float(row['c4_lat'])],
                [float(row['c1_lon']), float(row['c1_lat'])],  # close polygon
            ]
        except (KeyError, TypeError, ValueError):
            # Unusable footprint: skip the row but keep count for diagnostics.
            skipped_bad_coords += 1
            continue

        region = ee.Geometry.Polygon([region_coords], None, False)

        # The date is the same for every row, so the plain 'YYYY-MM-DD' string
        # is passed instead of building an ee.Date per file and resolving it
        # remotely per row; ee.Date() accepts such a string.
        frac, method, thr = calculate_water_fraction(region, date_str)
        if frac is not None and np.isfinite(frac):
            if PRINT_PER_ROW:
                thr_str = f"{thr:.2f}" if (thr is not None and np.isfinite(thr)) else "NA"
                tqdm.write(f"{date_str} frac={frac:.4f}, method={method}, thr_db={thr_str} @ {region_coords[0]}")

            out_row = row.to_dict()
            out_row["water_fraction"] = float(frac)
            out_row["thr_method"]     = method
            out_row["thr_db"]         = float(thr) if (thr is not None and np.isfinite(thr)) else np.nan
            results.append(out_row)

    if DEBUG_MODE and skipped_bad_coords:
        tqdm.write(f"[DAY] {date_str}: skipped {skipped_bad_coords} row(s) with missing/invalid corner coordinates")

    # save (only rows with results). If none, write empty file with headers.
    out_cols = base_cols + ["water_fraction", "thr_method", "thr_db"]
    out_dir = os.path.dirname(output_csv_path)
    if out_dir:
        # os.makedirs('') raises, so only create a directory when one is named.
        os.makedirs(out_dir, exist_ok=True)

    # Write to a sibling temp file and rename: an interrupted write must not
    # leave a truncated CSV that a later run would treat as already done.
    tmp_path = f"{output_csv_path}.tmp"
    pd.DataFrame(results, columns=out_cols).to_csv(tmp_path, index=False)
    os.replace(tmp_path, output_csv_path)

    if len(results) == 0:
        tqdm.write(f"No valid rows for {date_str}; wrote empty CSV: {output_csv_path}")
    else:
        tqdm.write(f"Saved {len(results)} rows -> {output_csv_path}")

def main():
    """
    Process every daily CSV of the configured month folder.

    Files in INPUT_MONTH_DIR whose names match the YYYYMMDD.csv pattern are
    handled in sorted (chronological) order and written under the same name
    to OUTPUT_MONTH_DIR. Days whose output already exists are skipped when
    SKIP_IF_EXISTS is set. A failure on one day is logged and does not stop
    the remaining days.
    """
    # gather all yyyyMMdd.csv in the month folder, chronologically
    daily_files = sorted(
        name for name in os.listdir(INPUT_MONTH_DIR) if _YMD_RE.match(name)
    )
    if not daily_files:
        print(f"No daily CSVs found in {INPUT_MONTH_DIR}")
        return

    for fname in daily_files:
        in_path = os.path.join(INPUT_MONTH_DIR, fname)
        out_path = os.path.join(OUTPUT_MONTH_DIR, fname)

        if not (SKIP_IF_EXISTS and os.path.exists(out_path)):
            try:
                process_one_csv(in_path, out_path)
            except Exception as e:
                # Keep going with the next day; nothing is written for this one here.
                tqdm.write(f"Failed to process {in_path}: {e}")
        else:
            tqdm.write(f"[SKIP] {fname} exists")

# Run the month batch only when this file is executed as a script.
if __name__ == "__main__":
    main()

*** Earth Engine *** Share your feedback by taking our Annual Developer Satisfaction Survey: https://google.qualtrics.com/jfe/form/SV_7TDKVSyKvBdmMqW?ref=4i2o6
                                                                     

KeyboardInterrupt: 