# In [None]:
import pandas as pd

# -------- FILE PATHS --------
MASTER_FILE = "Cleaned_Rainfall_Data_Java.xlsx"
NEW_FILE    = "Cleaned_Rainfall_Jakarta.xlsx"
# NOTE(review): OUTPUT_FILE is the same path as MASTER_FILE, so the master
# workbook is overwritten in place on every run. Keep a backup if that is
# not intended.
OUTPUT_FILE = "Cleaned_Rainfall_Data_Java.xlsx"

# -------- SCHEMA --------
# Columns every frame must expose after normalization.
EXPECTED_COLUMNS = ["Province", "Region", "Year", "Month", "Date", "Avg_rain_mm"]
# Columns that together identify one observation (used for de-dup and sort).
KEY_COLUMNS = ["Province", "Region", "Year", "Month", "Date"]


def _clean_text(series):
    """Strip whitespace from a text column while keeping missing values missing.

    A plain ``astype(str)`` would turn None/NaN into the literal strings
    "None"/"nan"; masking with the original null positions avoids that so a
    later ``dropna()`` still works.
    """
    return series.astype(str).str.strip().where(series.notna())


def prepare_frame(df):
    """Return a normalized copy of *df* ready to be merged.

    * Header names are stripped and capitalized using the frame's OWN
      columns (so "PROVINCE", " region " etc. all map onto the schema).
    * Any column of ``EXPECTED_COLUMNS`` that is absent is added as empty.
    * Province/Region are trimmed text, Year/Month nullable integers,
      Date a datetime, Avg_rain_mm numeric; unparseable values become missing.

    The input frame is not modified.
    """
    # str(c) guards against non-string headers (e.g. numeric column labels).
    out = df.rename(columns={c: str(c).strip().capitalize() for c in df.columns})

    for col in EXPECTED_COLUMNS:
        if col not in out.columns:
            out[col] = None

    out["Province"] = _clean_text(out["Province"])
    out["Region"] = _clean_text(out["Region"])
    out["Year"] = pd.to_numeric(out["Year"], errors="coerce").astype("Int64")
    out["Month"] = pd.to_numeric(out["Month"], errors="coerce").astype("Int64")
    out["Date"] = pd.to_datetime(out["Date"], errors="coerce")
    out["Avg_rain_mm"] = pd.to_numeric(out["Avg_rain_mm"], errors="coerce")
    return out


def combine_frames(master, new):
    """Stack two prepared frames, drop duplicate observations, and sort.

    Rows are duplicates when they agree on ``KEY_COLUMNS``; the first
    occurrence wins, so rows from *master* take precedence over *new*.
    """
    combined = pd.concat([master, new], ignore_index=True)
    combined = combined.drop_duplicates(subset=KEY_COLUMNS, keep="first")
    return combined.sort_values(KEY_COLUMNS, ignore_index=True)


# In a notebook/script run __name__ is "__main__", so this executes exactly as
# the original cell did and leaves master/new/combined as globals; the guard
# only prevents file I/O when the helpers above are imported.
if __name__ == "__main__":
    # -------- LOAD BOTH --------
    master = prepare_frame(pd.read_excel(MASTER_FILE))
    new = prepare_frame(pd.read_excel(NEW_FILE))

    # -------- COMBINE --------
    combined = combine_frames(master, new)

    # -------- SAVE OUTPUT --------
    combined.to_excel(OUTPUT_FILE, index=False)

    print(f"✅ Merged dataset saved as: {OUTPUT_FILE}")
    print(f"Total rows: {len(combined)}")
    print("Provinces included:", sorted(combined['Province'].dropna().unique().tolist()))
    print("Regions sample:", sorted(combined['Region'].dropna().unique().tolist())[:20])


# ✅ Merged dataset saved as: Cleaned_Rainfall_Data_Java.xlsx
# Total rows: 22489
# Provinces included: ['Banten', 'DKI Jakarta', 'Jawa Tengah', 'Jawa Timur']
# Regions sample: ['Banjarnegara', 'Cilacap', 'Jakarta Pusat', 'Jakarta Timur', 'Jakarta Utara', 'KotSerang', 'KotTangerang', 'KotTangsel', 'Semarang', 'Tegal', 'banyuwangi', 'kabTangerang', 'malang', 'pasuruan', 'sidoarjo', 'surabaya']
