In [2]:
import re
from pathlib import Path
import pandas as pd

# ========= CONFIG (edit if needed) =========
# Month folder to merge, named YYYY_MM. This is the only value to edit per run:
# both the input folder and the output file name are derived from it, so they
# can never point at different months.
YEAR_MONTH = "2019_08"
BASE_DIR = Path("/mnt/cephfs-mount/chenchen/water_body_fraction_with_landcover&slope")
# Folder holding the daily CSVs (YYYYMMDD.csv) for the selected month.
IN_DIR = BASE_DIR / YEAR_MONTH
# Merged monthly CSV; kept as a plain string, as before.
OUT_CSV = str(BASE_DIR / "monthly" / f"{YEAR_MONTH}.csv")
# ===========================================

def parse_from_filename(name: str):
    """
    Derive date fields from the first run of 8 digits (YYYYMMDD) in a filename.

    Returns a 3-tuple:
      date_iso    -- "YYYY-MM-DD" string
      julian_day  -- day of the year as int (1..366)
      julian_date -- astronomical Julian Date as float
    All three are None when the name has no 8-digit run or the digits do not
    form a valid calendar date.
    """
    digits = re.search(r"(\d{8})", name)
    # errors="coerce" turns an impossible date (e.g. month 13) into NaT instead of raising.
    stamp = (
        pd.to_datetime(digits.group(1), format="%Y%m%d", errors="coerce")
        if digits
        else pd.NaT
    )
    if pd.isna(stamp):
        return None, None, None

    date_iso = stamp.date().isoformat()
    julian_day = int(stamp.dayofyear)
    julian_date = float(stamp.to_julian_date())
    return date_iso, julian_day, julian_date

# Gather daily CSVs only (YYYYMMDD.csv); ignore merge_summary.csv and others
all_csvs = sorted(IN_DIR.glob("*.csv"))
daily_csvs = [p for p in all_csvs if re.fullmatch(r"\d{8}\.csv", p.name)]

if not daily_csvs:
    raise SystemExit(f"[ABORT] No daily CSVs found in: {IN_DIR}")

frames = []
for p in daily_csvs:
    # Validate the filename date first so a file that will be rejected is never read.
    date_iso, jday, jdate = parse_from_filename(p.name)
    if date_iso is None:
        print(f"[WARN] Could not parse date from filename: {p.name}")
        continue

    # Best-effort load: one unreadable daily file is reported and skipped.
    try:
        df = pd.read_csv(p, low_memory=False)
    except Exception as e:
        print(f"[SKIP] Failed to read {p.name}: {e}")
        continue

    # Add time columns at the front for convenience
    df.insert(0, "date", date_iso)
    df.insert(1, "julian_day", jday)
    df.insert(2, "julian_date", jdate)

    frames.append(df)
    print(f"[OK] {p.name} → date={date_iso}, julian_day={jday}, rows={len(df)}")

# pd.concat raises ValueError on an empty list, so stop with a clear message instead.
if not frames:
    raise SystemExit(f"[ABORT] None of the {len(daily_csvs)} daily CSVs could be loaded from: {IN_DIR}")

# Concatenate
merged = pd.concat(frames, ignore_index=True, sort=False)

# Sort by common keys if present (mergesort is stable, so ties keep file order)
sort_cols = [c for c in ["date", "sample", "ddm"] if c in merged.columns]
if sort_cols:
    merged = merged.sort_values(sort_cols, kind="mergesort").reset_index(drop=True)

# Write output; create the destination folder first so a fresh tree does not fail here
Path(OUT_CSV).parent.mkdir(parents=True, exist_ok=True)
merged.to_csv(OUT_CSV, index=False)

# Report the number of files actually merged, not the number discovered.
skipped = len(daily_csvs) - len(frames)
if skipped:
    print(f"[WARN] Skipped {skipped} of {len(daily_csvs)} daily CSVs (see messages above)")
print(f"\n✅ Merged {len(frames)} files → {OUT_CSV}")
print(f"Total rows: {len(merged):,} | Columns: {len(merged.columns)}")
print("First 6 columns:", merged.columns[:6].tolist())

[OK] 20190801.csv → date=2019-08-01, julian_day=213, rows=1990
[OK] 20190802.csv → date=2019-08-02, julian_day=214, rows=2173
[OK] 20190803.csv → date=2019-08-03, julian_day=215, rows=2469
[OK] 20190804.csv → date=2019-08-04, julian_day=216, rows=2229
[OK] 20190805.csv → date=2019-08-05, julian_day=217, rows=2403
[OK] 20190806.csv → date=2019-08-06, julian_day=218, rows=2050
[OK] 20190807.csv → date=2019-08-07, julian_day=219, rows=2329
[OK] 20190808.csv → date=2019-08-08, julian_day=220, rows=2531
[OK] 20190809.csv → date=2019-08-09, julian_day=221, rows=2352
[OK] 20190810.csv → date=2019-08-10, julian_day=222, rows=2589
[OK] 20190811.csv → date=2019-08-11, julian_day=223, rows=2030
[OK] 20190812.csv → date=2019-08-12, julian_day=224, rows=2361
[OK] 20190813.csv → date=2019-08-13, julian_day=225, rows=2347
[OK] 20190814.csv → date=2019-08-14, julian_day=226, rows=2206
[OK] 20190815.csv → date=2019-08-15, julian_day=227, rows=2506
[OK] 20190816.csv → date=2019-08-16, julian_day=228, ro

In [7]:
# -*- coding: utf-8 -*-
import re
from pathlib import Path
import pandas as pd

# ========= CONFIG =========
# Root folder; the loop below looks for one sub-folder per month named YYYY_MM.
ROOT_DIR = Path("/mnt/cephfs-mount/chenchen/water_body_fraction_with_landcover&slope/")
# Destination folder for the per-month merged CSVs (one YYYY_MM.csv each).
OUT_DIR  = ROOT_DIR / "monthly"
# Create the destination if it is missing. No parents=True, so ROOT_DIR itself
# must already exist or this raises.
OUT_DIR.mkdir(exist_ok=True)
# ===========================

def parse_from_filename(name: str):
    """
    Derive date fields from the first run of 8 digits (YYYYMMDD) in a filename.

    Returns a 3-tuple:
      date_iso    -- "YYYY-MM-DD" string
      julian_day  -- day of the year as int (1..366)
      julian_date -- astronomical Julian Date as float
    All three are None when the name has no 8-digit run or the digits do not
    form a valid calendar date.
    """
    digits = re.search(r"(\d{8})", name)
    # errors="coerce" turns an impossible date (e.g. month 13) into NaT instead of raising.
    stamp = (
        pd.to_datetime(digits.group(1), format="%Y%m%d", errors="coerce")
        if digits
        else pd.NaT
    )
    if pd.isna(stamp):
        return None, None, None

    date_iso = stamp.date().isoformat()
    julian_day = int(stamp.dayofyear)
    julian_date = float(stamp.to_julian_date())
    return date_iso, julian_day, julian_date

# ============ LOOP ============
# For every month folder ROOT_DIR/YYYY_MM, merge its daily CSVs into OUT_DIR/YYYY_MM.csv.
for year in range(2018, 2026):         # 2018 → 2025 inclusive
    for month in range(1, 13):
        ym = f"{year}_{month:02d}"
        in_dir = ROOT_DIR / ym
        if not in_dir.exists():
            print(f"[SKIP] {ym}: folder does not exist")
            continue

        # Gather daily CSVs (strictly YYYYMMDD.csv; any other CSV in the folder is ignored)
        all_csvs   = sorted(in_dir.glob("*.csv"))
        daily_csvs = [p for p in all_csvs if re.fullmatch(r"\d{8}\.csv", p.name)]

        if not daily_csvs:
            print(f"[SKIP] {ym}: no daily CSVs")
            continue

        frames = []
        for p in daily_csvs:
            # Validate the filename date first so a file that will be rejected is never read.
            date_iso, jday, jdate = parse_from_filename(p.name)
            if date_iso is None:
                print(f"[WARN] Could not parse date from filename: {p.name}")
                continue

            # Best-effort load: one unreadable daily file is reported and skipped.
            try:
                df = pd.read_csv(p, low_memory=False)
            except Exception as e:
                print(f"[SKIP] Failed to read {p.name}: {e}")
                continue

            # Add time columns
            df.insert(0, "date", date_iso)
            df.insert(1, "julian_day", jday)
            df.insert(2, "julian_date", jdate)

            frames.append(df)
            print(f"[OK] {ym}/{p.name} → date={date_iso}, rows={len(df)}")

        # pd.concat raises on an empty list, so skip months where nothing loaded.
        if not frames:
            print(f"[WARN] {ym}: no valid data loaded")
            continue

        merged = pd.concat(frames, ignore_index=True, sort=False)

        # Sort if possible (mergesort is stable, so ties keep file order)
        sort_cols = [c for c in ["date", "sample", "ddm"] if c in merged.columns]
        if sort_cols:
            merged = merged.sort_values(sort_cols, kind="mergesort").reset_index(drop=True)

        out_csv = OUT_DIR / f"{ym}.csv"
        merged.to_csv(out_csv, index=False)

        # Report the number of files actually merged, not the number discovered.
        skipped = len(daily_csvs) - len(frames)
        if skipped:
            print(f"[WARN] {ym}: skipped {skipped} of {len(daily_csvs)} daily CSVs (see messages above)")
        print(f"\n✅ Merged {len(frames)} files → {out_csv}")
        print(f"Total rows: {len(merged):,} | Columns: {len(merged.columns)}")
        print("First 6 columns:", merged.columns[:6].tolist(), "\n")

[SKIP] 2018_01: folder does not exist
[SKIP] 2018_02: folder does not exist
[SKIP] 2018_03: folder does not exist
[SKIP] 2018_04: folder does not exist
[SKIP] 2018_05: folder does not exist
[SKIP] 2018_06: folder does not exist
[SKIP] 2018_07: folder does not exist
[OK] 2018_08/20180801.csv → date=2018-08-01, rows=930
[OK] 2018_08/20180802.csv → date=2018-08-02, rows=892
[OK] 2018_08/20180803.csv → date=2018-08-03, rows=894
[OK] 2018_08/20180804.csv → date=2018-08-04, rows=760
[OK] 2018_08/20180805.csv → date=2018-08-05, rows=808
[OK] 2018_08/20180806.csv → date=2018-08-06, rows=813
[OK] 2018_08/20180807.csv → date=2018-08-07, rows=788
[OK] 2018_08/20180808.csv → date=2018-08-08, rows=915
[OK] 2018_08/20180809.csv → date=2018-08-09, rows=890
[OK] 2018_08/20180810.csv → date=2018-08-10, rows=859
[OK] 2018_08/20180811.csv → date=2018-08-11, rows=927
[OK] 2018_08/20180812.csv → date=2018-08-12, rows=864
[OK] 2018_08/20180813.csv → date=2018-08-13, rows=832
[OK] 2018_08/20180814.csv → date

In [8]:
import re
from pathlib import Path
import pandas as pd

# ============ CONFIG ============
# Folder scanned for per-month CSVs; only files named YYYY_MM.csv are picked up.
IN_DIR = Path("/mnt/cephfs-mount/chenchen/water_body_fraction_with_landcover&slope/monthly")
# Path of the single combined CSV written at the end of this cell.
OUT_CSV = "/mnt/cephfs-mount/chenchen/water_body_fraction_with_landcover&slope/2018to2025_water_fraction.csv"
# =================================

def parse_from_filename(name: str):
    """
    Parse a monthly filename of the exact form YYYY_MM.csv.

    Returns:
      (year_month, datetime) where year_month is the string "YYYY-MM" and
      datetime is a pd.Timestamp for the first day of that month.
      (None, None) when the name does not match the pattern, or when the
      digits do not form a valid year/month (e.g. "2019_13.csv", "2019_00.csv").
    """
    m = re.fullmatch(r"(\d{4})_(\d{2})\.csv", name)
    if not m:
        return None, None
    year, month = int(m.group(1)), int(m.group(2))
    try:
        ts = pd.Timestamp(year=year, month=month, day=1)
    except ValueError:
        # Pattern matched but the values are not a real date; report "unparseable"
        # so the caller can warn and skip rather than crash.
        return None, None
    return f"{year}-{month:02d}", ts

# Gather monthly CSVs (strictly YYYY_MM.csv; any other CSV in the folder is ignored)
all_csvs = sorted(IN_DIR.glob("*.csv"))
monthly_csvs = [p for p in all_csvs if re.fullmatch(r"\d{4}_\d{2}\.csv", p.name)]

if not monthly_csvs:
    raise SystemExit(f"[ABORT] No monthly CSVs found in: {IN_DIR}")

frames = []
for p in monthly_csvs:
    # Validate the filename first so a file that will be rejected is never read.
    ym_str, ts = parse_from_filename(p.name)
    if ym_str is None:
        print(f"[WARN] Could not parse year_month from filename: {p.name}")
        continue

    # Best-effort load: one unreadable monthly file is reported and skipped.
    try:
        df = pd.read_csv(p, low_memory=False)
    except Exception as e:
        print(f"[SKIP] Failed to read {p.name}: {e}")
        continue

    # Add metadata columns
    df.insert(0, "year_month", ym_str)
    df.insert(1, "datetime", ts)

    frames.append(df)
    print(f"[OK] {p.name} → year_month={ym_str}, rows={len(df)}")

# pd.concat raises ValueError on an empty list, so stop with a clear message instead.
if not frames:
    raise SystemExit(f"[ABORT] None of the {len(monthly_csvs)} monthly CSVs could be loaded from: {IN_DIR}")

# Concatenate
merged = pd.concat(frames, ignore_index=True, sort=False)

# Sort by datetime if present (mergesort is stable, so rows within a month keep their order)
if "datetime" in merged.columns:
    merged = merged.sort_values("datetime", kind="mergesort").reset_index(drop=True)

# Write output
merged.to_csv(OUT_CSV, index=False)

# Report the number of files actually merged, not the number discovered.
skipped = len(monthly_csvs) - len(frames)
if skipped:
    print(f"[WARN] Skipped {skipped} of {len(monthly_csvs)} monthly CSVs (see messages above)")
print(f"\n✅ Merged {len(frames)} files → {OUT_CSV}")
print(f"Total rows: {len(merged):,} | Columns: {len(merged.columns)}")
print("First 6 columns:", merged.columns[:6].tolist())

[OK] 2018_08.csv → year_month=2018-08, rows=29769
[OK] 2018_09.csv → year_month=2018-09, rows=30511
[OK] 2018_10.csv → year_month=2018-10, rows=34702
[OK] 2018_11.csv → year_month=2018-11, rows=34078
[OK] 2018_12.csv → year_month=2018-12, rows=35393
[OK] 2019_01.csv → year_month=2019-01, rows=34451
[OK] 2019_02.csv → year_month=2019-02, rows=30600
[OK] 2019_03.csv → year_month=2019-03, rows=35586
[OK] 2019_04.csv → year_month=2019-04, rows=33954
[OK] 2019_05.csv → year_month=2019-05, rows=34622
[OK] 2019_06.csv → year_month=2019-06, rows=35260
[OK] 2019_07.csv → year_month=2019-07, rows=68737
[OK] 2019_08.csv → year_month=2019-08, rows=71300
[OK] 2019_09.csv → year_month=2019-09, rows=69650
[OK] 2019_10.csv → year_month=2019-10, rows=73445
[OK] 2019_11.csv → year_month=2019-11, rows=73707
[OK] 2019_12.csv → year_month=2019-12, rows=75462
[OK] 2020_01.csv → year_month=2020-01, rows=74468
[OK] 2020_02.csv → year_month=2020-02, rows=67506
[OK] 2020_03.csv → year_month=2020-03, rows=73743
