In [1]:
#!/usr/bin/env python3
# Filter CYGNSS daily CSVs to land-only using Natural Earth land polygons.

import os, re, sys
import pandas as pd
import geopandas as gpd
from tqdm import tqdm

# ----------------- User paths -----------------
# INPUT_ROOT is scanned for month folders named YYYY_MM; every *.csv inside a
# selected folder is filtered and written to the same relative path under
# OUTPUT_ROOT.
INPUT_ROOT  = r"/mnt/cephfs-mount/chenchen/CygnssDataCsv"
OUTPUT_ROOT = r"/mnt/cephfs-mount/chenchen/CygnssDataCsvLand"

# Years to process, compared against the YYYY prefix of each month folder name.
# Set to None to process all years.
YEARS_TO_PROCESS = {2018}   # e.g. {2018} or set(range(2019, 2022)) or None

# Skip Antarctica to avoid coastal shelf artifacts (set False to keep)
DROP_ANTARCTICA = True

# If True, write a header-only CSV when a file has no valid coordinates or no
# land points; if False, write nothing for such files. Files that lack the
# sp_lon/sp_lat columns always get a header-only CSV, regardless of this flag.
WRITE_EMPTY_FILES = True

# Natural Earth Admin-0 countries shapefile (50m = better coasts than 110m).
# Override with the NE_ADMIN0_SOURCE environment variable, e.g. to point at a
# local .zip on disk instead of the default URL.
NE_ADMIN0_SOURCE = os.environ.get(
    "NE_ADMIN0_SOURCE",
    "https://naturalearth.s3.amazonaws.com/50m_cultural/ne_50m_admin_0_countries.zip"
)

# ------------------------------------------------
def safe_read_csv(path):
    """Read a CSV, retrying with progressively more tolerant parser settings.

    Attempts, in order:
      1. the C engine with ``low_memory=False``;
      2. the Python engine with delimiter sniffing (``sep=None``);
      3. the same, additionally skipping malformed lines;
      4. the Python engine with sniffing under each fallback encoding.

    Args:
        path: Path of the CSV file to read.

    Returns:
        The DataFrame produced by the first attempt that succeeds.

    Raises:
        RuntimeError: If every attempt fails; the last underlying error is
            chained as ``__cause__``.
    """
    # NOTE: ``low_memory`` is a C-engine-only option. Combining it with
    # engine="python" makes pandas raise ValueError, which would turn every
    # fallback below into a guaranteed failure, so it is only passed to the
    # C engine.
    attempts = [
        dict(engine="c", low_memory=False),
        dict(engine="python", sep=None),
        dict(engine="python", sep=None, on_bad_lines="skip"),
    ]
    # Last-resort encodings. latin1 can decode any byte sequence, so the
    # entries after it are only reached when parsing (not decoding) fails.
    attempts.extend(
        dict(engine="python", sep=None, encoding=enc)
        for enc in ("utf-8", "latin1", "utf-16", "utf-8-sig")
    )

    last_error = None
    for kwargs in attempts:
        try:
            return pd.read_csv(path, **kwargs)
        except Exception as exc:  # deliberate best-effort: try the next strategy
            last_error = exc
    raise RuntimeError(f"Unable to read CSV: {path}") from last_error

def ensure_dir(p):
    """Create directory *p* (and any missing parents) if it does not exist.

    An empty path is treated as "the current directory" and is a no-op;
    ``os.path.dirname`` returns ``""`` for a bare filename, and
    ``os.makedirs("")`` would raise FileNotFoundError.
    """
    if p:
        os.makedirs(p, exist_ok=True)

def load_land_gdf():
    """
    Build the land-polygon layer from the Natural Earth Admin-0 countries file.

    Reads NE_ADMIN0_SOURCE, optionally removes Antarctica (DROP_ANTARCTICA),
    reprojects to EPSG:4326 and returns a geometry-only GeoDataFrame with a
    fresh 0..n-1 index. Does not rely on the geopandas datasets API.
    """
    gdf = gpd.read_file(NE_ADMIN0_SOURCE)

    if DROP_ANTARCTICA:
        # The name column differs between releases/formats; use the first one present.
        candidates = ("NAME_EN", "NAME", "ADMIN", "name_en", "name", "NAME_LONG", "name_long")
        present = [col for col in candidates if col in gdf.columns]
        if present:
            not_antarctica = gdf[present[0]] != "Antarctica"
            gdf = gdf[not_antarctica]
        else:
            # No usable name column: drop every feature whose bounding box
            # reaches 60°S or further south.
            south_edge = gdf.bounds["miny"]
            gdf = gdf[south_edge > -60]

    # One row per country polygon is fine; the spatial join copes with many polygons.
    reprojected = gdf.to_crs("EPSG:4326")
    land = reprojected[["geometry"]].reset_index(drop=True)

    # Touch the spatial index now so it is built before the first join.
    _sindex = land.sindex
    return land


def sjoin_points_on_land(points_gdf, land_gdf):
    """Return the unique index labels of points lying within any land polygon.

    Uses the ``predicate`` keyword of ``gpd.sjoin``; if that call raises
    TypeError (older geopandas), retries with the legacy ``op`` keyword.
    """
    common = dict(how="inner")
    try:
        joined = gpd.sjoin(points_gdf, land_gdf, predicate="within", **common)
    except TypeError:
        joined = gpd.sjoin(points_gdf, land_gdf, op="within", **common)
    # A point can match more than one polygon; callers only need each label once.
    matched_labels = joined.index
    return matched_labels.unique()

def _write_header_only(df, out_csv):
    """Write only the column header of *df* to *out_csv*, creating its folder first."""
    ensure_dir(os.path.dirname(out_csv))
    df.head(0).to_csv(out_csv, index=False)


def process_csv(in_csv, out_csv, land_gdf):
    """Filter one CSV down to the rows whose (sp_lon, sp_lat) point lies on land.

    Args:
        in_csv: Path of the input CSV, read with ``safe_read_csv``.
        out_csv: Path of the CSV to write; its parent folder is created if needed.
        land_gdf: Land polygons, passed through to ``sjoin_points_on_land``.

    Returns:
        Number of rows written. 0 when the coordinate columns are missing
        (a header-only file is always written in that case), or when no row
        has valid coordinates / falls on land (a header-only file is written
        only if WRITE_EMPTY_FILES is true).
    """
    df = safe_read_csv(in_csv)

    # Require columns
    if not {'sp_lon', 'sp_lat'}.issubset(df.columns):
        print(f"[WARN] Missing sp_lon/sp_lat in {in_csv}; copying header only.")
        _write_header_only(df, out_csv)
        return 0

    # Coerce coordinates to numeric in place (unparseable values become NaN);
    # the coerced values are what ends up in the output for these two columns.
    df['sp_lon'] = pd.to_numeric(df['sp_lon'], errors='coerce')
    df['sp_lat'] = pd.to_numeric(df['sp_lat'], errors='coerce')
    valid = df.dropna(subset=['sp_lon', 'sp_lat'])

    if valid.empty:
        if WRITE_EMPTY_FILES:
            _write_header_only(df, out_csv)
        return 0

    # Build point GeoDataFrame in WGS84; it keeps df's index labels so the
    # join result can be mapped back to the full rows.
    points = gpd.GeoDataFrame(
        valid[['sp_lon', 'sp_lat']],
        geometry=gpd.points_from_xy(valid['sp_lon'], valid['sp_lat']),
        crs="EPSG:4326"
    )

    keep_idx = sjoin_points_on_land(points, land_gdf)

    if len(keep_idx) == 0:
        if WRITE_EMPTY_FILES:
            _write_header_only(df, out_csv)
        return 0

    # Select with a boolean mask rather than df.loc[keep_idx]: keep_idx arrives
    # in whatever order the spatial join emitted its matches, while a mask
    # always yields the kept rows in input-file order.
    df_out = df[df.index.isin(keep_idx)]
    ensure_dir(os.path.dirname(out_csv))
    df_out.to_csv(out_csv, index=False)
    return len(df_out)

def _count_data_rows(csv_path):
    """Return the number of lines after the header in *csv_path* (0 if unreadable)."""
    try:
        # The context manager closes the handle right away; a bare open()
        # here would leave one unclosed file per input CSV.
        with open(csv_path, 'r', encoding='utf-8', errors='ignore') as fh:
            n_lines = sum(1 for _ in fh)
    except Exception:  # best-effort statistic only; never abort the run for it
        return 0
    return max(n_lines - 1, 0)


def main():
    """Filter every CSV in the selected YYYY_MM folders of INPUT_ROOT to land-only rows.

    Outputs mirror the input layout under OUTPUT_ROOT. A failure on one file is
    reported on stderr and does not stop the run; a summary of estimated input
    rows and kept rows is printed at the end.
    """
    land_gdf = load_land_gdf()

    # Find folders like YYYY_MM, optionally filtered by year
    month_dirs = []
    for d in sorted(os.listdir(INPUT_ROOT)):
        p = os.path.join(INPUT_ROOT, d)
        if not (os.path.isdir(p) and re.fullmatch(r"\d{4}_\d{2}", d)):
            continue
        year = int(d[:4])
        if YEARS_TO_PROCESS is not None and year not in YEARS_TO_PROCESS:
            continue
        month_dirs.append(d)

    total_in, total_kept = 0, 0
    n_failed = 0

    for month in month_dirs:
        in_month = os.path.join(INPUT_ROOT, month)
        out_month = os.path.join(OUTPUT_ROOT, month)
        ensure_dir(out_month)

        csvs = [f for f in sorted(os.listdir(in_month)) if f.lower().endswith(".csv")]
        if not csvs:
            continue

        print(f"\nProcessing {month}: {len(csvs)} files")
        for fname in tqdm(csvs, unit="file"):
            in_csv = os.path.join(in_month, fname)
            out_csv = os.path.join(out_month, fname)
            try:
                # Count rows before filtering (for a quick summary)
                n_before = _count_data_rows(in_csv)
                kept = process_csv(in_csv, out_csv, land_gdf)
            except Exception as e:
                n_failed += 1
                print(f"[ERROR] {in_csv}: {e}", file=sys.stderr)
                continue
            # Best-effort accounting: the line count can under-report (it is 0
            # when counting failed), so never report fewer inputs than kept rows.
            total_in += max(n_before, kept)
            total_kept += kept

    print(f"\nDone. Estimated input rows: {total_in:,}; kept (land) rows: {total_kept:,}.")
    print(f"Outputs saved under: {OUTPUT_ROOT}")
    if n_failed:
        # Per-file errors scroll past quickly in a long run; repeat the tally here.
        print(f"[WARN] {n_failed} file(s) failed; see [ERROR] lines above.", file=sys.stderr)

if __name__ == "__main__":
    main()



Processing 2018_08: 31 files


100%|██████████████████████████████████████████| 31/31 [17:32<00:00, 33.96s/file]



Processing 2018_09: 30 files


100%|██████████████████████████████████████████| 30/30 [17:50<00:00, 35.70s/file]



Processing 2018_10: 31 files


100%|██████████████████████████████████████████| 31/31 [19:30<00:00, 37.75s/file]



Processing 2018_11: 30 files


100%|██████████████████████████████████████████| 30/30 [18:45<00:00, 37.50s/file]



Processing 2018_12: 31 files


100%|██████████████████████████████████████████| 31/31 [20:22<00:00, 39.42s/file]


Done. Estimated input rows: 403,164,992; kept (land) rows: 108,421,130.
Outputs saved under: /mnt/cephfs-mount/chenchen/CygnssDataCsvLand





In [3]:
#!/usr/bin/env python3
# Filter CYGNSS daily CSVs to land-only using Natural Earth land polygons.

import os, re, sys
import pandas as pd
import geopandas as gpd
from tqdm import tqdm

# ----------------- User paths -----------------
# INPUT_ROOT is scanned for month folders named YYYY_MM; every *.csv inside a
# selected folder is filtered and written to the same relative path under
# OUTPUT_ROOT.
INPUT_ROOT  = r"/mnt/cephfs-mount/chenchen/CygnssDataCsv"
OUTPUT_ROOT = r"/mnt/cephfs-mount/chenchen/CygnssDataCsvLand"

# Years to process, compared against the YYYY prefix of each month folder name.
# Set to None to process all years.
YEARS_TO_PROCESS = {2019,2020,2021,2022,2023}   # e.g. {2019,2020,2021,2022,2023} or set(range(2019, 2022)) or None

# Skip Antarctica to avoid coastal shelf artifacts (set False to keep)
DROP_ANTARCTICA = True

# If True, write a header-only CSV when a file has no valid coordinates or no
# land points; if False, write nothing for such files. Files that lack the
# sp_lon/sp_lat columns always get a header-only CSV, regardless of this flag.
WRITE_EMPTY_FILES = True

# Natural Earth Admin-0 countries shapefile (50m = better coasts than 110m).
# Override with the NE_ADMIN0_SOURCE environment variable, e.g. to point at a
# local .zip on disk instead of the default URL.
NE_ADMIN0_SOURCE = os.environ.get(
    "NE_ADMIN0_SOURCE",
    "https://naturalearth.s3.amazonaws.com/50m_cultural/ne_50m_admin_0_countries.zip"
)

# ------------------------------------------------
def safe_read_csv(path):
    """Read a CSV, retrying with progressively more tolerant parser settings.

    Attempts, in order:
      1. the C engine with ``low_memory=False``;
      2. the Python engine with delimiter sniffing (``sep=None``);
      3. the same, additionally skipping malformed lines;
      4. the Python engine with sniffing under each fallback encoding.

    Args:
        path: Path of the CSV file to read.

    Returns:
        The DataFrame produced by the first attempt that succeeds.

    Raises:
        RuntimeError: If every attempt fails; the last underlying error is
            chained as ``__cause__``.
    """
    # NOTE: ``low_memory`` is a C-engine-only option. Combining it with
    # engine="python" makes pandas raise ValueError, which would turn every
    # fallback below into a guaranteed failure, so it is only passed to the
    # C engine.
    attempts = [
        dict(engine="c", low_memory=False),
        dict(engine="python", sep=None),
        dict(engine="python", sep=None, on_bad_lines="skip"),
    ]
    # Last-resort encodings. latin1 can decode any byte sequence, so the
    # entries after it are only reached when parsing (not decoding) fails.
    attempts.extend(
        dict(engine="python", sep=None, encoding=enc)
        for enc in ("utf-8", "latin1", "utf-16", "utf-8-sig")
    )

    last_error = None
    for kwargs in attempts:
        try:
            return pd.read_csv(path, **kwargs)
        except Exception as exc:  # deliberate best-effort: try the next strategy
            last_error = exc
    raise RuntimeError(f"Unable to read CSV: {path}") from last_error

def ensure_dir(p):
    """Create directory *p* (and any missing parents) if it does not exist.

    An empty path is treated as "the current directory" and is a no-op;
    ``os.path.dirname`` returns ``""`` for a bare filename, and
    ``os.makedirs("")`` would raise FileNotFoundError.
    """
    if p:
        os.makedirs(p, exist_ok=True)

def load_land_gdf():
    """
    Build the land-polygon layer from the Natural Earth Admin-0 countries file.

    Reads NE_ADMIN0_SOURCE, optionally removes Antarctica (DROP_ANTARCTICA),
    reprojects to EPSG:4326 and returns a geometry-only GeoDataFrame with a
    fresh 0..n-1 index. Does not rely on the geopandas datasets API.
    """
    gdf = gpd.read_file(NE_ADMIN0_SOURCE)

    if DROP_ANTARCTICA:
        # The name column differs between releases/formats; use the first one present.
        candidates = ("NAME_EN", "NAME", "ADMIN", "name_en", "name", "NAME_LONG", "name_long")
        present = [col for col in candidates if col in gdf.columns]
        if present:
            not_antarctica = gdf[present[0]] != "Antarctica"
            gdf = gdf[not_antarctica]
        else:
            # No usable name column: drop every feature whose bounding box
            # reaches 60°S or further south.
            south_edge = gdf.bounds["miny"]
            gdf = gdf[south_edge > -60]

    # One row per country polygon is fine; the spatial join copes with many polygons.
    reprojected = gdf.to_crs("EPSG:4326")
    land = reprojected[["geometry"]].reset_index(drop=True)

    # Touch the spatial index now so it is built before the first join.
    _sindex = land.sindex
    return land


def sjoin_points_on_land(points_gdf, land_gdf):
    """Return the unique index labels of points lying within any land polygon.

    Uses the ``predicate`` keyword of ``gpd.sjoin``; if that call raises
    TypeError (older geopandas), retries with the legacy ``op`` keyword.
    """
    common = dict(how="inner")
    try:
        joined = gpd.sjoin(points_gdf, land_gdf, predicate="within", **common)
    except TypeError:
        joined = gpd.sjoin(points_gdf, land_gdf, op="within", **common)
    # A point can match more than one polygon; callers only need each label once.
    matched_labels = joined.index
    return matched_labels.unique()

def _write_header_only(df, out_csv):
    """Write only the column header of *df* to *out_csv*, creating its folder first."""
    ensure_dir(os.path.dirname(out_csv))
    df.head(0).to_csv(out_csv, index=False)


def process_csv(in_csv, out_csv, land_gdf):
    """Filter one CSV down to the rows whose (sp_lon, sp_lat) point lies on land.

    Args:
        in_csv: Path of the input CSV, read with ``safe_read_csv``.
        out_csv: Path of the CSV to write; its parent folder is created if needed.
        land_gdf: Land polygons, passed through to ``sjoin_points_on_land``.

    Returns:
        Number of rows written. 0 when the coordinate columns are missing
        (a header-only file is always written in that case), or when no row
        has valid coordinates / falls on land (a header-only file is written
        only if WRITE_EMPTY_FILES is true).
    """
    df = safe_read_csv(in_csv)

    # Require columns
    if not {'sp_lon', 'sp_lat'}.issubset(df.columns):
        print(f"[WARN] Missing sp_lon/sp_lat in {in_csv}; copying header only.")
        _write_header_only(df, out_csv)
        return 0

    # Coerce coordinates to numeric in place (unparseable values become NaN);
    # the coerced values are what ends up in the output for these two columns.
    df['sp_lon'] = pd.to_numeric(df['sp_lon'], errors='coerce')
    df['sp_lat'] = pd.to_numeric(df['sp_lat'], errors='coerce')
    valid = df.dropna(subset=['sp_lon', 'sp_lat'])

    if valid.empty:
        if WRITE_EMPTY_FILES:
            _write_header_only(df, out_csv)
        return 0

    # Build point GeoDataFrame in WGS84; it keeps df's index labels so the
    # join result can be mapped back to the full rows.
    points = gpd.GeoDataFrame(
        valid[['sp_lon', 'sp_lat']],
        geometry=gpd.points_from_xy(valid['sp_lon'], valid['sp_lat']),
        crs="EPSG:4326"
    )

    keep_idx = sjoin_points_on_land(points, land_gdf)

    if len(keep_idx) == 0:
        if WRITE_EMPTY_FILES:
            _write_header_only(df, out_csv)
        return 0

    # Select with a boolean mask rather than df.loc[keep_idx]: keep_idx arrives
    # in whatever order the spatial join emitted its matches, while a mask
    # always yields the kept rows in input-file order.
    df_out = df[df.index.isin(keep_idx)]
    ensure_dir(os.path.dirname(out_csv))
    df_out.to_csv(out_csv, index=False)
    return len(df_out)

def _count_data_rows(csv_path):
    """Return the number of lines after the header in *csv_path* (0 if unreadable)."""
    try:
        # The context manager closes the handle right away; a bare open()
        # here would leave one unclosed file per input CSV.
        with open(csv_path, 'r', encoding='utf-8', errors='ignore') as fh:
            n_lines = sum(1 for _ in fh)
    except Exception:  # best-effort statistic only; never abort the run for it
        return 0
    return max(n_lines - 1, 0)


def main():
    """Filter every CSV in the selected YYYY_MM folders of INPUT_ROOT to land-only rows.

    Outputs mirror the input layout under OUTPUT_ROOT. A failure on one file is
    reported on stderr and does not stop the run; a summary of estimated input
    rows and kept rows is printed at the end.
    """
    land_gdf = load_land_gdf()

    # Find folders like YYYY_MM, optionally filtered by year
    month_dirs = []
    for d in sorted(os.listdir(INPUT_ROOT)):
        p = os.path.join(INPUT_ROOT, d)
        if not (os.path.isdir(p) and re.fullmatch(r"\d{4}_\d{2}", d)):
            continue
        year = int(d[:4])
        if YEARS_TO_PROCESS is not None and year not in YEARS_TO_PROCESS:
            continue
        month_dirs.append(d)

    total_in, total_kept = 0, 0
    n_failed = 0

    for month in month_dirs:
        in_month = os.path.join(INPUT_ROOT, month)
        out_month = os.path.join(OUTPUT_ROOT, month)
        ensure_dir(out_month)

        csvs = [f for f in sorted(os.listdir(in_month)) if f.lower().endswith(".csv")]
        if not csvs:
            continue

        print(f"\nProcessing {month}: {len(csvs)} files")
        for fname in tqdm(csvs, unit="file"):
            in_csv = os.path.join(in_month, fname)
            out_csv = os.path.join(out_month, fname)
            try:
                # Count rows before filtering (for a quick summary)
                n_before = _count_data_rows(in_csv)
                kept = process_csv(in_csv, out_csv, land_gdf)
            except Exception as e:
                n_failed += 1
                print(f"[ERROR] {in_csv}: {e}", file=sys.stderr)
                continue
            # Best-effort accounting: the line count can under-report (it is 0
            # when counting failed), so never report fewer inputs than kept rows.
            total_in += max(n_before, kept)
            total_kept += kept

    print(f"\nDone. Estimated input rows: {total_in:,}; kept (land) rows: {total_kept:,}.")
    print(f"Outputs saved under: {OUTPUT_ROOT}")
    if n_failed:
        # Per-file errors scroll past quickly in a long run; repeat the tally here.
        print(f"[WARN] {n_failed} file(s) failed; see [ERROR] lines above.", file=sys.stderr)

if __name__ == "__main__":
    main()



Processing 2019_01: 31 files


100%|██████████████████████████████████████████| 31/31 [18:44<00:00, 36.26s/file]



Processing 2019_02: 28 files


100%|██████████████████████████████████████████| 28/28 [16:42<00:00, 35.79s/file]



Processing 2019_03: 31 files


100%|██████████████████████████████████████████| 31/31 [18:39<00:00, 36.13s/file]



Processing 2019_04: 30 files


100%|██████████████████████████████████████████| 30/30 [18:20<00:00, 36.67s/file]



Processing 2019_05: 31 files


100%|██████████████████████████████████████████| 31/31 [19:29<00:00, 37.71s/file]



Processing 2019_06: 30 files


100%|██████████████████████████████████████████| 30/30 [19:47<00:00, 39.59s/file]



Processing 2019_07: 31 files


100%|██████████████████████████████████████████| 31/31 [43:01<00:00, 83.27s/file]



Processing 2019_08: 31 files


100%|██████████████████████████████████████████| 31/31 [44:12<00:00, 85.55s/file]



Processing 2019_09: 30 files


100%|██████████████████████████████████████████| 30/30 [41:56<00:00, 83.87s/file]



Processing 2019_10: 31 files


100%|██████████████████████████████████████████| 31/31 [42:38<00:00, 82.53s/file]



Processing 2019_11: 30 files


100%|██████████████████████████████████████████| 30/30 [42:17<00:00, 84.59s/file]



Processing 2019_12: 31 files


100%|██████████████████████████████████████████| 31/31 [41:58<00:00, 81.23s/file]



Processing 2020_01: 31 files


100%|██████████████████████████████████████████| 31/31 [38:42<00:00, 74.93s/file]



Processing 2020_02: 29 files


100%|██████████████████████████████████████████| 29/29 [34:48<00:00, 72.01s/file]



Processing 2020_03: 31 files


100%|██████████████████████████████████████████| 31/31 [38:49<00:00, 75.15s/file]



Processing 2020_04: 30 files


100%|██████████████████████████████████████████| 30/30 [40:41<00:00, 81.37s/file]



Processing 2020_05: 31 files


100%|██████████████████████████████████████████| 31/31 [42:53<00:00, 83.01s/file]



Processing 2020_06: 30 files


100%|██████████████████████████████████████████| 30/30 [40:40<00:00, 81.36s/file]



Processing 2020_07: 31 files


100%|██████████████████████████████████████████| 31/31 [39:01<00:00, 75.54s/file]



Processing 2020_08: 31 files


100%|██████████████████████████████████████████| 31/31 [39:54<00:00, 77.26s/file]



Processing 2020_09: 30 files


100%|██████████████████████████████████████████| 30/30 [40:59<00:00, 81.97s/file]



Processing 2020_10: 31 files


100%|██████████████████████████████████████████| 31/31 [40:45<00:00, 78.89s/file]



Processing 2020_11: 30 files


100%|██████████████████████████████████████████| 30/30 [39:39<00:00, 79.32s/file]



Processing 2020_12: 31 files


100%|██████████████████████████████████████████| 31/31 [38:11<00:00, 73.91s/file]



Processing 2021_01: 31 files


100%|██████████████████████████████████████████| 31/31 [39:04<00:00, 75.63s/file]



Processing 2021_02: 28 files


100%|██████████████████████████████████████████| 28/28 [36:45<00:00, 78.78s/file]



Processing 2021_03: 31 files


100%|██████████████████████████████████████████| 31/31 [40:13<00:00, 77.86s/file]



Processing 2021_04: 30 files


100%|██████████████████████████████████████████| 30/30 [36:36<00:00, 73.21s/file]



Processing 2021_05: 31 files


100%|██████████████████████████████████████████| 31/31 [37:28<00:00, 72.52s/file]



Processing 2021_06: 30 files


100%|██████████████████████████████████████████| 30/30 [36:45<00:00, 73.53s/file]



Processing 2021_07: 31 files


100%|██████████████████████████████████████████| 31/31 [38:48<00:00, 75.10s/file]



Processing 2021_08: 31 files


100%|██████████████████████████████████████████| 31/31 [39:10<00:00, 75.81s/file]



Processing 2021_09: 30 files


100%|██████████████████████████████████████████| 30/30 [44:36<00:00, 89.23s/file]



Processing 2021_10: 31 files


100%|██████████████████████████████████████████| 31/31 [51:38<00:00, 99.95s/file]



Processing 2021_11: 30 files


100%|██████████████████████████████████████████| 30/30 [36:35<00:00, 73.18s/file]



Processing 2021_12: 31 files


100%|██████████████████████████████████████████| 31/31 [40:13<00:00, 77.85s/file]



Processing 2022_01: 31 files


100%|██████████████████████████████████████████| 31/31 [37:11<00:00, 72.00s/file]



Processing 2022_02: 28 files


100%|██████████████████████████████████████████| 28/28 [36:37<00:00, 78.49s/file]



Processing 2022_03: 31 files


100%|██████████████████████████████████████████| 31/31 [39:13<00:00, 75.93s/file]



Processing 2022_04: 30 files


100%|██████████████████████████████████████████| 30/30 [36:48<00:00, 73.60s/file]



Processing 2022_05: 31 files


100%|██████████████████████████████████████████| 31/31 [37:57<00:00, 73.47s/file]



Processing 2022_06: 30 files


100%|██████████████████████████████████████████| 30/30 [38:14<00:00, 76.47s/file]



Processing 2022_07: 31 files


100%|██████████████████████████████████████████| 31/31 [39:27<00:00, 76.37s/file]



Processing 2022_08: 31 files


100%|██████████████████████████████████████████| 31/31 [42:01<00:00, 81.33s/file]



Processing 2022_09: 30 files


100%|██████████████████████████████████████████| 30/30 [43:36<00:00, 87.23s/file]



Processing 2022_10: 31 files


100%|██████████████████████████████████████████| 31/31 [43:36<00:00, 84.39s/file]



Processing 2022_11: 30 files


100%|██████████████████████████████████████████| 30/30 [39:46<00:00, 79.56s/file]



Processing 2022_12: 31 files


100%|██████████████████████████████████████████| 31/31 [34:45<00:00, 67.27s/file]



Processing 2023_01: 31 files


100%|██████████████████████████████████████████| 31/31 [34:09<00:00, 66.10s/file]



Processing 2023_02: 28 files


100%|██████████████████████████████████████████| 28/28 [29:57<00:00, 64.19s/file]



Processing 2023_03: 31 files


100%|██████████████████████████████████████████| 31/31 [35:02<00:00, 67.83s/file]



Processing 2023_04: 30 files


100%|██████████████████████████████████████████| 30/30 [33:23<00:00, 66.78s/file]



Processing 2023_05: 31 files


100%|██████████████████████████████████████████| 31/31 [35:21<00:00, 68.45s/file]



Processing 2023_06: 30 files


100%|██████████████████████████████████████████| 30/30 [35:19<00:00, 70.66s/file]



Processing 2023_07: 31 files


100%|██████████████████████████████████████████| 31/31 [35:22<00:00, 68.48s/file]



Processing 2023_08: 31 files


100%|██████████████████████████████████████████| 31/31 [38:59<00:00, 75.46s/file]



Processing 2023_09: 30 files


100%|██████████████████████████████████████████| 30/30 [36:03<00:00, 72.13s/file]



Processing 2023_10: 31 files


100%|██████████████████████████████████████████| 31/31 [34:22<00:00, 66.53s/file]



Processing 2023_11: 30 files


100%|██████████████████████████████████████████| 30/30 [35:17<00:00, 70.60s/file]



Processing 2023_12: 31 files


100%|██████████████████████████████████████████| 31/31 [35:29<00:00, 68.70s/file]


Done. Estimated input rows: 9,213,776,708; kept (land) rows: 2,482,978,703.
Outputs saved under: /mnt/cephfs-mount/chenchen/CygnssDataCsvLand





In [1]:
#!/usr/bin/env python3
# Filter CYGNSS daily CSVs to land-only using Natural Earth land polygons.

import os, re, sys
import pandas as pd
import geopandas as gpd
from tqdm import tqdm

# ----------------- User paths -----------------
# INPUT_ROOT is scanned for month folders named YYYY_MM; every *.csv inside a
# selected folder is filtered and written to the same relative path under
# OUTPUT_ROOT.
INPUT_ROOT  = r"/mnt/cephfs-mount/chenchen/CygnssDataCsv"
OUTPUT_ROOT = r"/mnt/cephfs-mount/chenchen/CygnssDataCsvLand"

# Years to process, compared against the YYYY prefix of each month folder name.
# Set to None to process all years.
YEARS_TO_PROCESS = {2025}   # e.g. {2019,2020,2021,2022,2023} or set(range(2019, 2022)) or None

# Skip Antarctica to avoid coastal shelf artifacts (set False to keep)
DROP_ANTARCTICA = True

# If True, write a header-only CSV when a file has no valid coordinates or no
# land points; if False, write nothing for such files. Files that lack the
# sp_lon/sp_lat columns always get a header-only CSV, regardless of this flag.
WRITE_EMPTY_FILES = True

# Natural Earth Admin-0 countries shapefile (50m = better coasts than 110m).
# Override with the NE_ADMIN0_SOURCE environment variable, e.g. to point at a
# local .zip on disk instead of the default URL.
NE_ADMIN0_SOURCE = os.environ.get(
    "NE_ADMIN0_SOURCE",
    "https://naturalearth.s3.amazonaws.com/50m_cultural/ne_50m_admin_0_countries.zip"
)

# ------------------------------------------------
def safe_read_csv(path):
    """Read a CSV, retrying with progressively more tolerant parser settings.

    Attempts, in order:
      1. the C engine with ``low_memory=False``;
      2. the Python engine with delimiter sniffing (``sep=None``);
      3. the same, additionally skipping malformed lines;
      4. the Python engine with sniffing under each fallback encoding.

    Args:
        path: Path of the CSV file to read.

    Returns:
        The DataFrame produced by the first attempt that succeeds.

    Raises:
        RuntimeError: If every attempt fails; the last underlying error is
            chained as ``__cause__``.
    """
    # NOTE: ``low_memory`` is a C-engine-only option. Combining it with
    # engine="python" makes pandas raise ValueError, which would turn every
    # fallback below into a guaranteed failure, so it is only passed to the
    # C engine.
    attempts = [
        dict(engine="c", low_memory=False),
        dict(engine="python", sep=None),
        dict(engine="python", sep=None, on_bad_lines="skip"),
    ]
    # Last-resort encodings. latin1 can decode any byte sequence, so the
    # entries after it are only reached when parsing (not decoding) fails.
    attempts.extend(
        dict(engine="python", sep=None, encoding=enc)
        for enc in ("utf-8", "latin1", "utf-16", "utf-8-sig")
    )

    last_error = None
    for kwargs in attempts:
        try:
            return pd.read_csv(path, **kwargs)
        except Exception as exc:  # deliberate best-effort: try the next strategy
            last_error = exc
    raise RuntimeError(f"Unable to read CSV: {path}") from last_error

def ensure_dir(p):
    """Create directory *p* (and any missing parents) if it does not exist.

    An empty path is treated as "the current directory" and is a no-op;
    ``os.path.dirname`` returns ``""`` for a bare filename, and
    ``os.makedirs("")`` would raise FileNotFoundError.
    """
    if p:
        os.makedirs(p, exist_ok=True)

def load_land_gdf():
    """
    Build the land-polygon layer from the Natural Earth Admin-0 countries file.

    Reads NE_ADMIN0_SOURCE, optionally removes Antarctica (DROP_ANTARCTICA),
    reprojects to EPSG:4326 and returns a geometry-only GeoDataFrame with a
    fresh 0..n-1 index. Does not rely on the geopandas datasets API.
    """
    gdf = gpd.read_file(NE_ADMIN0_SOURCE)

    if DROP_ANTARCTICA:
        # The name column differs between releases/formats; use the first one present.
        candidates = ("NAME_EN", "NAME", "ADMIN", "name_en", "name", "NAME_LONG", "name_long")
        present = [col for col in candidates if col in gdf.columns]
        if present:
            not_antarctica = gdf[present[0]] != "Antarctica"
            gdf = gdf[not_antarctica]
        else:
            # No usable name column: drop every feature whose bounding box
            # reaches 60°S or further south.
            south_edge = gdf.bounds["miny"]
            gdf = gdf[south_edge > -60]

    # One row per country polygon is fine; the spatial join copes with many polygons.
    reprojected = gdf.to_crs("EPSG:4326")
    land = reprojected[["geometry"]].reset_index(drop=True)

    # Touch the spatial index now so it is built before the first join.
    _sindex = land.sindex
    return land


def sjoin_points_on_land(points_gdf, land_gdf):
    """Return the unique index labels of points lying within any land polygon.

    Uses the ``predicate`` keyword of ``gpd.sjoin``; if that call raises
    TypeError (older geopandas), retries with the legacy ``op`` keyword.
    """
    common = dict(how="inner")
    try:
        joined = gpd.sjoin(points_gdf, land_gdf, predicate="within", **common)
    except TypeError:
        joined = gpd.sjoin(points_gdf, land_gdf, op="within", **common)
    # A point can match more than one polygon; callers only need each label once.
    matched_labels = joined.index
    return matched_labels.unique()

def _write_header_only(df, out_csv):
    """Write only the column header of *df* to *out_csv*, creating its folder first."""
    ensure_dir(os.path.dirname(out_csv))
    df.head(0).to_csv(out_csv, index=False)


def process_csv(in_csv, out_csv, land_gdf):
    """Filter one CSV down to the rows whose (sp_lon, sp_lat) point lies on land.

    Args:
        in_csv: Path of the input CSV, read with ``safe_read_csv``.
        out_csv: Path of the CSV to write; its parent folder is created if needed.
        land_gdf: Land polygons, passed through to ``sjoin_points_on_land``.

    Returns:
        Number of rows written. 0 when the coordinate columns are missing
        (a header-only file is always written in that case), or when no row
        has valid coordinates / falls on land (a header-only file is written
        only if WRITE_EMPTY_FILES is true).
    """
    df = safe_read_csv(in_csv)

    # Require columns
    if not {'sp_lon', 'sp_lat'}.issubset(df.columns):
        print(f"[WARN] Missing sp_lon/sp_lat in {in_csv}; copying header only.")
        _write_header_only(df, out_csv)
        return 0

    # Coerce coordinates to numeric in place (unparseable values become NaN);
    # the coerced values are what ends up in the output for these two columns.
    df['sp_lon'] = pd.to_numeric(df['sp_lon'], errors='coerce')
    df['sp_lat'] = pd.to_numeric(df['sp_lat'], errors='coerce')
    valid = df.dropna(subset=['sp_lon', 'sp_lat'])

    if valid.empty:
        if WRITE_EMPTY_FILES:
            _write_header_only(df, out_csv)
        return 0

    # Build point GeoDataFrame in WGS84; it keeps df's index labels so the
    # join result can be mapped back to the full rows.
    points = gpd.GeoDataFrame(
        valid[['sp_lon', 'sp_lat']],
        geometry=gpd.points_from_xy(valid['sp_lon'], valid['sp_lat']),
        crs="EPSG:4326"
    )

    keep_idx = sjoin_points_on_land(points, land_gdf)

    if len(keep_idx) == 0:
        if WRITE_EMPTY_FILES:
            _write_header_only(df, out_csv)
        return 0

    # Select with a boolean mask rather than df.loc[keep_idx]: keep_idx arrives
    # in whatever order the spatial join emitted its matches, while a mask
    # always yields the kept rows in input-file order.
    df_out = df[df.index.isin(keep_idx)]
    ensure_dir(os.path.dirname(out_csv))
    df_out.to_csv(out_csv, index=False)
    return len(df_out)

def _count_data_rows(csv_path):
    """Return the number of lines after the header in *csv_path* (0 if unreadable)."""
    try:
        # The context manager closes the handle right away; a bare open()
        # here would leave one unclosed file per input CSV.
        with open(csv_path, 'r', encoding='utf-8', errors='ignore') as fh:
            n_lines = sum(1 for _ in fh)
    except Exception:  # best-effort statistic only; never abort the run for it
        return 0
    return max(n_lines - 1, 0)


def main():
    """Filter every CSV in the selected YYYY_MM folders of INPUT_ROOT to land-only rows.

    Outputs mirror the input layout under OUTPUT_ROOT. A failure on one file is
    reported on stderr and does not stop the run; a summary of estimated input
    rows and kept rows is printed at the end.
    """
    land_gdf = load_land_gdf()

    # Find folders like YYYY_MM, optionally filtered by year
    month_dirs = []
    for d in sorted(os.listdir(INPUT_ROOT)):
        p = os.path.join(INPUT_ROOT, d)
        if not (os.path.isdir(p) and re.fullmatch(r"\d{4}_\d{2}", d)):
            continue
        year = int(d[:4])
        if YEARS_TO_PROCESS is not None and year not in YEARS_TO_PROCESS:
            continue
        month_dirs.append(d)

    total_in, total_kept = 0, 0
    n_failed = 0

    for month in month_dirs:
        in_month = os.path.join(INPUT_ROOT, month)
        out_month = os.path.join(OUTPUT_ROOT, month)
        ensure_dir(out_month)

        csvs = [f for f in sorted(os.listdir(in_month)) if f.lower().endswith(".csv")]
        if not csvs:
            continue

        print(f"\nProcessing {month}: {len(csvs)} files")
        for fname in tqdm(csvs, unit="file"):
            in_csv = os.path.join(in_month, fname)
            out_csv = os.path.join(out_month, fname)
            try:
                # Count rows before filtering (for a quick summary)
                n_before = _count_data_rows(in_csv)
                kept = process_csv(in_csv, out_csv, land_gdf)
            except Exception as e:
                n_failed += 1
                print(f"[ERROR] {in_csv}: {e}", file=sys.stderr)
                continue
            # Best-effort accounting: the line count can under-report (it is 0
            # when counting failed), so never report fewer inputs than kept rows.
            total_in += max(n_before, kept)
            total_kept += kept

    print(f"\nDone. Estimated input rows: {total_in:,}; kept (land) rows: {total_kept:,}.")
    print(f"Outputs saved under: {OUTPUT_ROOT}")
    if n_failed:
        # Per-file errors scroll past quickly in a long run; repeat the tally here.
        print(f"[WARN] {n_failed} file(s) failed; see [ERROR] lines above.", file=sys.stderr)

if __name__ == "__main__":
    main()



Processing 2025_01: 31 files


100%|██████████████████████████████████████████| 31/31 [31:09<00:00, 60.30s/file]



Processing 2025_02: 28 files


100%|██████████████████████████████████████████| 28/28 [32:31<00:00, 69.71s/file]



Processing 2025_03: 31 files


100%|██████████████████████████████████████████| 31/31 [37:40<00:00, 72.93s/file]



Processing 2025_04: 30 files


100%|██████████████████████████████████████████| 30/30 [29:17<00:00, 58.58s/file]



Processing 2025_05: 31 files


100%|██████████████████████████████████████████| 31/31 [30:31<00:00, 59.07s/file]



Processing 2025_06: 30 files


100%|██████████████████████████████████████████| 30/30 [28:16<00:00, 56.55s/file]



Processing 2025_07: 31 files


100%|██████████████████████████████████████████| 31/31 [29:07<00:00, 56.39s/file]


Done. Estimated input rows: 998,920,888; kept (land) rows: 266,596,848.
Outputs saved under: /mnt/cephfs-mount/chenchen/CygnssDataCsvLand



