In [3]:
import re
from pathlib import Path
import numpy as np
import pandas as pd

# Where the raw monthly workbooks live, where the merged workbook is written,
# and the province label stamped on every output row.
INPUT_DIR = Path("Jakarta Curah Hujan")  # <- your folder
OUTPUT_FILE = "Cleaned_Rainfall_Jakarta.xlsx"
PROVINCE = "DKI Jakarta"

# Three-letter month tokens accepted in filenames (Indonesian and English
# spellings); MON_RE captures the token, MONTH_MAP turns it into 1..12.
MON_RE = r"(jan|feb|mar|apr|mei|may|jun|jul|agu|ags|aug|sep|okt|oct|nov|des|dec)"
MONTH_MAP = {
    "jan": 1,
    "feb": 2,
    "mar": 3,
    "apr": 4,
    "mei": 5,
    "may": 5,
    "jun": 6,
    "jul": 7,
    "agu": 8,
    "ags": 8,
    "aug": 8,
    "sep": 9,
    "okt": 10,
    "oct": 10,
    "nov": 11,
    "des": 12,
    "dec": 12,
}

# Region normalization: filename shortcodes (and common misspellings) -> full name.
def normalize_region(token: str) -> str:
    """Translate a region shortcode taken from a filename into its full name.

    The token is lower-cased, stripped of everything but a-z, and runs of a
    repeated letter are squeezed to a single letter (so 'JJapus' is read as
    'japus') before it is looked up. A token that is not recognised is
    returned exactly as it was passed in.
    """
    known_regions = {
        "jakut": "Jakarta Utara",
        "jakbar": "Jakarta Barat",
        "jaksel": "Jakarta Selatan",
        "jaktim": "Jakarta Timur",
        "jatim": "Jakarta Timur",     # variant used in the filenames
        "jakpus": "Jakarta Pusat",
        "japus": "Jakarta Pusat",     # variant seen in the file list
    }

    letters = re.sub(r'[^a-z]', '', token.lower())
    key = re.sub(r'(.)\1+', r'\1', letters)

    if key in known_regions:
        return known_regions[key]
    return token

def parse_from_filename(fname: str):
    """
    Split a workbook filename into (year, month, region).

    Expected shape, once every non-alphanumeric character has been removed
    from the stem: YYYY + month token + region letters,
    e.g. 2020AgsJakut.xlsx, 2024OktJJapus.xlsx

    Returns (None, None, None) when the name does not follow that shape;
    otherwise the year as int, the month number looked up in MONTH_MAP and
    the region passed through normalize_region().
    """
    # Drop spaces/_/dashes/dots so "2020_Ags-Jakut" and "2020AgsJakut" look alike.
    compact = re.sub(r'[^A-Za-z0-9]+', '', Path(fname).stem)

    pattern = rf"^((19|20)\d{{2}}){MON_RE}([A-Za-z]+)$"
    found = re.match(pattern, compact, flags=re.IGNORECASE)
    if found is None:
        return None, None, None

    # Groups: 1 = full year, 2 = century prefix (unused), 3 = month token, 4 = region.
    year_txt, _century, month_txt, region_txt = found.groups()

    return (
        int(year_txt),
        MONTH_MAP.get(month_txt.lower()),
        normalize_region(region_txt),
    )

def extract_bmkg_table(path: Path, year_hint=None, month_hint=None, region_hint=None):
    """
    Read one workbook and return its daily rainfall as a tidy DataFrame.

    Steps:
    - finds the first row containing the word 'TANGGAL' (case-insensitive)
      and re-reads the sheet using that row as the header
    - keeps the TANGGAL column plus every column whose name contains 'RR'
    - coerces the RR columns to numeric; non-numeric cells such as '-' and
      the values 8888 / 9999 become NaN
    - computes Avg_Rain_mm as the row-wise mean of the RR columns (NaN skipped)
    - drops rows whose TANGGAL cannot be parsed as a date (day-first)

    Parameters
    ----------
    path : Path
        Excel file to read.
    year_hint, month_hint, region_hint
        Values written verbatim into the Year / Month / Region columns.

    Returns
    -------
    DataFrame with columns Province, Region, Year, Month, Date, Avg_Rain_mm.
    An empty frame with those columns is returned (after printing a warning
    where applicable) when the file cannot be read, has no 'TANGGAL' header
    row, or has no RR column.
    """
    out_cols = ["Province", "Region", "Year", "Month", "Date", "Avg_Rain_mm"]
    header_pat = r"\bTANGGAL\b"

    # Best-effort read: one unreadable file must not abort the whole batch.
    try:
        raw = pd.read_excel(path, header=None)
    except Exception as e:
        print(f"⚠️ Cannot read: {path} ({e})")
        return pd.DataFrame(columns=out_cols)

    # Rows in which any cell contains the word TANGGAL.
    row_has_header = raw.apply(
        lambda col: col.astype(str).str.contains(header_pat, case=False, na=False)
    ).any(axis=1)
    hdr = raw.index[row_has_header]
    if len(hdr) == 0:
        print(f"⚠️ No 'TANGGAL' header in: {path}")
        return pd.DataFrame(columns=out_cols)

    start = int(hdr[0])
    df = pd.read_excel(path, header=start)
    df.columns = df.columns.astype(str).str.strip()

    if "TANGGAL" not in df.columns:
        # The header row was located case-insensitively, so look the column up
        # the same way (e.g. 'Tanggal') before falling back to the first column.
        candidates = [
            c for c in df.columns if re.search(header_pat, c, flags=re.IGNORECASE)
        ]
        date_col = candidates[0] if candidates else df.columns[0]
        df = df.rename(columns={date_col: "TANGGAL"})

    keep = [c for c in df.columns if c == "TANGGAL" or "RR" in c]
    df = df[keep].copy()

    rr = [c for c in df.columns if c != "TANGGAL"]
    if not rr:
        return pd.DataFrame(columns=out_cols)

    for c in rr:
        # errors="coerce" already maps '-' (and any other text) to NaN.
        df[c] = pd.to_numeric(df[c], errors="coerce").replace(
            {8888: np.nan, 9999: np.nan}
        )

    df = df.rename(columns={"TANGGAL": "Date"})
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce", dayfirst=True)
    df["Avg_Rain_mm"] = df[rr].mean(axis=1, skipna=True)

    # Rows without a parseable date (blank or non-date cells) are discarded.
    out = df.dropna(subset=["Date"]).copy()
    out["Province"] = PROVINCE
    out["Region"] = region_hint
    out["Year"] = year_hint
    out["Month"] = month_hint
    return out[out_cols]

# -------- run over folder --------
# Collect every workbook in INPUT_DIR (.xlsx first, then .xls); stop if none.
excel_files = [*INPUT_DIR.glob("*.xlsx"), *INPUT_DIR.glob("*.xls")]
if len(excel_files) == 0:
    raise SystemExit(f"No Excel files in {INPUT_DIR.resolve()}")

# parts   -> per-file tables that produced at least one row
# skipped -> filenames whose year/month/region could not be parsed
parts = []
skipped = []
for workbook in excel_files:
    year, month, region = parse_from_filename(workbook.name)
    if any(piece is None for piece in (year, month, region)):
        skipped.append(workbook.name)
        continue
    table = extract_bmkg_table(workbook, year, month, region)
    if table.empty:
        continue
    parts.append(table)

# Report parse issues (first 20)
if skipped:
    print("⚠️ Could not parse these filenames (showing up to 20):")
    for name in skipped[:20]:
        print(f"  - {name}")

if parts:
    merged = pd.concat(parts, ignore_index=True)
else:
    merged = pd.DataFrame(columns=["Province","Region","Year","Month","Date","Avg_Rain_mm"])

# final tidy + safety: nullable-integer year/month, real datetimes, trimmed labels
for int_col in ("Year", "Month"):
    merged[int_col] = pd.to_numeric(merged[int_col], errors="coerce").astype("Int64")
merged["Date"] = pd.to_datetime(merged["Date"], errors="coerce")
for text_col in ("Region", "Province"):
    merged[text_col] = merged[text_col].astype(str).str.strip()

# Discard rows whose region label consists of digits only.
digits_only_region = merged["Region"].str.fullmatch(r"\d+")
merged = merged.loc[~digits_only_region].copy()

merged = merged.sort_values(["Province","Region","Year","Month","Date"])
merged = merged.reset_index(drop=True)
merged.to_excel(OUTPUT_FILE, index=False)

print(f"✅ Saved merged dataset: {OUTPUT_FILE}")
print(f"Rows: {len(merged)}")
years_seen = sorted(merged['Year'].dropna().unique().tolist())
regions_seen = sorted(merged['Region'].dropna().unique().tolist())
print("Years:", years_seen)
print("Regions:", regions_seen)
print(merged.head())


  df["Date"] = pd.to_datetime(df["Date"], errors="coerce", dayfirst=True)
  df["Date"] = pd.to_datetime(df["Date"], errors="coerce", dayfirst=True)
  df["Date"] = pd.to_datetime(df["Date"], errors="coerce", dayfirst=True)
  df["Date"] = pd.to_datetime(df["Date"], errors="coerce", dayfirst=True)
  df["Date"] = pd.to_datetime(df["Date"], errors="coerce", dayfirst=True)
  df["Date"] = pd.to_datetime(df["Date"], errors="coerce", dayfirst=True)
  df["Date"] = pd.to_datetime(df["Date"], errors="coerce", dayfirst=True)
  df["Date"] = pd.to_datetime(df["Date"], errors="coerce", dayfirst=True)
  df["Date"] = pd.to_datetime(df["Date"], errors="coerce", dayfirst=True)
  df["Date"] = pd.to_datetime(df["Date"], errors="coerce", dayfirst=True)
  df["Date"] = pd.to_datetime(df["Date"], errors="coerce", dayfirst=True)
  df["Date"] = pd.to_datetime(df["Date"], errors="coerce", dayfirst=True)
  df["Date"] = pd.to_datetime(df["Date"], errors="coerce", dayfirst=True)
  df["Date"] = pd.to_datetime(df["Date

✅ Saved merged dataset: Cleaned_Rainfall_Jakarta.xlsx
Rows: 3870
Years: [2020, 2021, 2022, 2023, 2024]
Regions: ['Jakarta Pusat', 'Jakarta Timur', 'Jakarta Utara']
      Province         Region  Year  Month       Date  Avg_Rain_mm
0  DKI Jakarta  Jakarta Pusat  2020      1 2020-01-01        145.3
1  DKI Jakarta  Jakarta Pusat  2020      1 2020-01-02         58.8
2  DKI Jakarta  Jakarta Pusat  2020      1 2020-01-03          NaN
3  DKI Jakarta  Jakarta Pusat  2020      1 2020-01-04          2.5
4  DKI Jakarta  Jakarta Pusat  2020      1 2020-01-05          0.2
