This notebook builds an updated dataset that preserves all of the CHIRPS data (with gauge precipitation) and appends MSWEP as an additional precipitation variable.

In [1]:
from pathlib import Path
import pandas as pd
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [2]:
# Directory layout used by the processing cell below (all paths are relative
# to the notebook's working directory):
#   BACKUP_DIR   - input basin files; every CAMELS_UY_*.nc found here is processed.
#   DATA_DIR     - output directory; each input file is rewritten here under the
#                  same file name (created if it does not exist).
#   UYPRECIP_DIR - source of the additional precipitation series; one
#                  "<basin_id>.nc" file per basin is looked up here. Despite the
#                  variable name, the path points at the MSWEP product.
BACKUP_DIR = Path("./filtered_data_gauge_and_CHIRPS/time_series")
DATA_DIR   = Path("./filtered_data_gauge_CHIRPS_MSWEP/time_series")
UYPRECIP_DIR = Path("../MSWEP/filtered_data_MSWEP/time_series")

In [3]:
# ---------------------------------------------------------------------------
# Append MSWEP precipitation to every basin file.
#
# For each CAMELS_UY_*.nc in BACKUP_DIR the dataset is copied to DATA_DIR.
# When a matching "<basin_id>.nc" exists in UYPRECIP_DIR, its precipitation
# series is added as the new variable GAUGE_VAR, matched on the date
# coordinate. Existing variables are never modified. Dates without an MSWEP
# value are left as NaN.
#
# NOTE: several names and log messages below say "gauge" for historical
# reasons; the data being appended is MSWEP. The log strings are kept as they
# were so previously recorded output stays comparable.
# ---------------------------------------------------------------------------
PRECIP_VAR = "prcp_mm_day"        # precipitation variable that must exist in the basin file
GAUGE_VAR = "prcp_mswep_mm_day"   # name of the new variable that receives MSWEP precipitation
TIME_VAR = "date"                 # name of the time coordinate used for alignment
MSWEP_SOURCE_VAR = "prcp_mm_day"  # variable read from the per-basin MSWEP file


def _read_netcdf(path):
    """Load a NetCDF file fully into memory and release its file handle.

    Returns an in-memory ``xarray.Dataset``; the underlying file is closed
    before returning, so it can safely be overwritten afterwards.
    """
    with xr.open_dataset(path) as opened:
        return opened.load()


def _align_to_dates(target_time, source_time, source_values):
    """Place ``source_values`` onto ``target_time`` by matching timestamps.

    Parameters
    ----------
    target_time : pandas.DatetimeIndex
        Timestamps of the basin dataset (may contain duplicates).
    source_time : pandas.DatetimeIndex
        Timestamps of the MSWEP series; need not be sorted.
    source_values : array-like
        One value per entry of ``source_time`` (must be one-dimensional).

    Returns
    -------
    numpy.ndarray
        Float array with ``len(target_time)`` entries, NaN where
        ``target_time`` has no counterpart in ``source_time``.
    """
    by_date = pd.Series(np.asarray(source_values, dtype=float), index=source_time)
    # Label-based reindexing needs a unique source index; keep the first
    # occurrence of any repeated timestamp.
    by_date = by_date[~by_date.index.duplicated(keep="first")]
    return by_date.reindex(target_time).to_numpy()


def _with_mswep(ds, mswep_path):
    """Return ``(dataset, status_message)`` for one basin.

    ``dataset`` is ``ds`` with GAUGE_VAR added when the MSWEP file is usable,
    otherwise ``ds`` unchanged. ``status_message`` is the line to log.
    """
    if PRECIP_VAR not in ds or TIME_VAR not in ds.coords:
        return ds, "  ⏭ Missing precip or time coord – copying unchanged"

    if not mswep_path.exists():
        return ds, "  ⏭ No gauge data"

    mswep = _read_netcdf(mswep_path)

    if MSWEP_SOURCE_VAR not in mswep:
        return ds, "  ⏭ Gauge data missing prcp_mm_day – skipping"

    # Locate the MSWEP time axis. Files written via DataFrame.to_xarray()
    # with an unnamed index carry their dates in a variable called "index",
    # which is what the previous `df_gauge.index` fallback relied on.
    if TIME_VAR in mswep.coords:
        mswep_time = pd.to_datetime(mswep[TIME_VAR].values)
    elif "index" in mswep.variables:
        mswep_time = pd.to_datetime(mswep["index"].values)
    else:
        return ds, f"  ⏭ Gauge data missing {TIME_VAR} coord – skipping"

    ds_time = pd.to_datetime(ds[TIME_VAR].values)
    common_dates = np.intersect1d(ds_time, mswep_time)
    if len(common_dates) == 0:
        return ds, "  ⏭ No overlapping dates"

    # Align by timestamp label rather than by position so that unsorted or
    # duplicated dates cannot shift values onto the wrong day.
    aligned = _align_to_dates(ds_time, mswep_time, mswep[MSWEP_SOURCE_VAR].values)

    # Give the new variable the same dims/coords as the existing precipitation
    # variable (assumed to be 1-D along the time axis; a shape mismatch raises
    # here instead of broadcasting silently). Keep its float dtype when it has
    # one, otherwise use float64 so NaN can be represented.
    precip = ds[PRECIP_VAR]
    out_dtype = precip.dtype if np.issubdtype(precip.dtype, np.floating) else np.float64
    mswep_da = xr.DataArray(
        aligned.astype(out_dtype),
        coords=precip.coords,
        dims=precip.dims,
        # Metadata comes from the MSWEP source variable, not from the
        # CHIRPS/gauge variable whose shape is being reused.
        attrs=dict(mswep[MSWEP_SOURCE_VAR].attrs),
        name=GAUGE_VAR,
    )

    status = f"  ✔ Replacing precipitation for {len(common_dates)} days"
    return ds.assign({GAUGE_VAR: mswep_da}), status


DATA_DIR.mkdir(parents=True, exist_ok=True)

basin_files = list(BACKUP_DIR.glob("CAMELS_UY_*.nc"))

for nc_in in basin_files:
    basin_id = nc_in.stem
    print(basin_id)
    nc_out = DATA_DIR / nc_in.name
    nc_file = UYPRECIP_DIR / f"{basin_id}.nc"

    print(f"\nProcessing basin: {basin_id}")

    # ---------- Load forcing (in memory, file handle released) ----------
    ds = _read_netcdf(nc_in)

    # ---------- Add MSWEP precipitation where possible ----------
    ds, status = _with_mswep(ds, nc_file)
    print(status)

    # ---------- Write ----------
    # Every basin is written, changed or not, so DATA_DIR is always complete.
    ds.to_netcdf(nc_out)

if basin_files:
    print("\n✅ All basins processed successfully.")
else:
    # An empty glob usually means BACKUP_DIR is wrong; do not report success.
    print(f"\n⚠ No CAMELS_UY_*.nc files found in {BACKUP_DIR}")


CAMELS_UY_7

Processing basin: CAMELS_UY_7
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_2

Processing basin: CAMELS_UY_2
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_16

Processing basin: CAMELS_UY_16
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_9

Processing basin: CAMELS_UY_9
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_5

Processing basin: CAMELS_UY_5
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_11

Processing basin: CAMELS_UY_11
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_6

Processing basin: CAMELS_UY_6
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_8

Processing basin: CAMELS_UY_8
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_3

Processing basin: CAMELS_UY_3
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_15

Processing basin: CAMELS_UY_15
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_10

Processing basin: CAMELS_UY_10
  ✔ Replacing precipitation for 11322 days

✅ All basins processed successfully.


In [4]:
# Inspect one output file to confirm the new precipitation variable is present.
# The data is loaded into memory and the file is closed right away: a lazily
# opened dataset keeps a handle on the file, which can make re-running the
# processing cell fail when it tries to overwrite the same path.
with xr.open_dataset(DATA_DIR / "CAMELS_UY_10.nc") as opened:
    CAMELS_UY_10 = opened.load()
CAMELS_UY_10