In [1]:
import glob
import os
import re

import numpy as np
import pandas as pd

# Monthly rainfall workbooks to append to the cleaned dataset.
# glob returns matches in filesystem-dependent order, so sort them to keep the
# row order of the merged output reproducible between runs and machines.
new_files = sorted(glob.glob("./Rain_Data_Banten/Jan*.xlsx"))

# Cell values treated as "no usable reading"; they are masked before averaging
# so they cannot inflate a row mean.
# NOTE(review): presumably the data provider's missing-data codes -- confirm.
_MISSING_RAIN_CODES = (8888, 9999)


def _year_and_region_from_path(path):
    """Return ``(year, region)`` parsed from the workbook's file name only."""
    # Only the file name is inspected: digits or "Jan" in a directory name
    # must not leak into the year or the region label.
    stem = os.path.splitext(os.path.basename(path))[0]
    # Prefer a stand-alone 4-digit group; fall back to the first 4-digit run.
    year_match = re.search(r"(?<!\d)\d{4}(?!\d)", stem) or re.search(r"\d{4}", stem)
    if year_match is None:
        raise ValueError(f"No 4-digit year found in file name: {path!r}")
    year_text = year_match.group()
    # Everything after the leading "Jan" tag, minus the year, is the region.
    _, month_tag, tail = stem.partition("Jan")
    label = tail if month_tag else stem
    return int(year_text), label.replace(year_text, "").strip()


def _numeric_rain(series):
    """Coerce a rainfall column to numbers and blank out the missing codes."""
    values = pd.to_numeric(series, errors="coerce")
    return values.mask(values.isin(list(_MISSING_RAIN_CODES)))


def extract_rainfall_data(path):
    """Extract per-row average rainfall from one rainfall workbook.

    The sheet is scanned for the first row containing the text "TANGGAL"
    (case-insensitive); that row is used as the header. Every column whose
    name contains "RR" is converted to numbers (non-numeric cells and the
    codes 8888/9999 become NaN) and averaged row by row.

    Args:
        path: Path to an ``.xlsx`` file whose name holds the "Jan" tag, a
            4-digit year and a region label, e.g. ``Jan2022 Serang.xlsx``.

    Returns:
        A DataFrame with columns ``Region``, ``Year``, ``Date`` and
        ``Avg_Rain_mm``, or an empty DataFrame when no "TANGGAL" header row
        is found.

    Raises:
        ValueError: If the file name contains no 4-digit year.
    """
    raw = pd.read_excel(path, header=None)
    has_marker = raw.apply(
        lambda col: col.astype(str).str.contains("TANGGAL", case=False, na=False)
    ).any(axis=1)
    header_rows = raw.index[has_marker]
    if len(header_rows) == 0:
        return pd.DataFrame()

    # Re-read with the detected row as header so pandas names the columns
    # and parses only the rows below it as data.
    df = pd.read_excel(path, header=header_rows[0])
    df.columns = df.columns.astype(str).str.strip()

    # Header detection is case-insensitive, so the date column lookup must be
    # too; fall back to the first column only if nothing matches.
    if "TANGGAL" in df.columns:
        date_col = "TANGGAL"
    else:
        date_col = next(
            (c for c in df.columns if "TANGGAL" in c.upper()), df.columns[0]
        )
    rain_cols = [c for c in df.columns if "RR" in c and c != date_col]

    # Build a fresh frame instead of assigning into a slice of ``df``.
    rain = pd.DataFrame({c: _numeric_rain(df[c]) for c in rain_cols}, index=df.index)

    year, region = _year_and_region_from_path(path)
    return pd.DataFrame(
        {
            "Region": region,
            "Year": year,
            "Date": df[date_col],
            "Avg_Rain_mm": rain.mean(axis=1, skipna=True),
        }
    )

# Existing cleaned dataset that the new files are appended to.
base = pd.read_excel("Cleaned_Rainfall_Data_Java.xlsx")

# Parse every new workbook. Files without a TANGGAL header come back as empty
# frames and are dropped so they do not take part in the concatenation.
new_dfs = [frame for frame in map(extract_rainfall_data, new_files) if not frame.empty]

# pd.concat raises ValueError on an empty list, so the "nothing new" case is
# handled explicitly and simply carries the base data forward.
if new_dfs:
    new_data = pd.concat(new_dfs, ignore_index=True)
    all_data = pd.concat([base, new_data], ignore_index=True)
else:
    new_data = pd.DataFrame(columns=base.columns)
    all_data = base.copy()

# Treat the 8888 / 9999 codes as missing. Assign the result back: an in-place
# replace on ``all_data[col]`` acts on a temporary object and does not update
# the frame when pandas Copy-on-Write is active.
all_data["Avg_Rain_mm"] = all_data["Avg_Rain_mm"].replace([8888, 9999], np.nan)

# The output file is also the input file, so re-running this cell would append
# the same rows again; dropping exact duplicate rows keeps the merge idempotent.
all_data = all_data.drop_duplicates(ignore_index=True)

# Save (overwrites the base workbook).
all_data.to_excel("Cleaned_Rainfall_Data_Java.xlsx", index=False)
print("✅ Merged dataset saved as Cleaned_Rainfall_Data_Java.xlsx")


✅ Merged dataset saved as Cleaned_Rainfall_Data_Java.xlsx
