This notebook builds an updated dataset that preserves all CARAVAN data (including the gauge-based precipitation) and appends an additional precipitation variable, `prcp_chirps_mm_day`, based on CHIRPS precipitation.

In [1]:
from pathlib import Path
import pandas as pd
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [2]:
# Source directory: the per-basin CAMELS_UY_*.nc files are read from here.
BACKUP_DIR = Path("../preparing_data/cleaned_filtered_data_gauge_precip/time_series")
# Destination directory: one NetCDF file per basin is written here under the same file name.
DATA_DIR   = Path("./filtered_data_gauge_and_CHIRPS/time_series")
# Directory holding the per-basin precipitation CSVs, named "<basin_id>_precip.csv".
UYPRECIP_DIR = Path("./precip_timeseries")

In [3]:
# For every CAMELS_UY_*.nc file in BACKUP_DIR: read the dataset, attach the
# precipitation series from the matching CSV as an extra variable (GAUGE_VAR),
# and write the result to DATA_DIR. The existing PRECIP_VAR is never modified.
PRECIP_VAR = "prcp_mm_day"          # precipitation variable already present in the input files
GAUGE_VAR = "prcp_chirps_mm_day"    # name of the additional precipitation variable that is appended
TIME_VAR = "date"                   # name of the time coordinate in the NetCDF files

DATA_DIR.mkdir(parents=True, exist_ok=True)

for nc_in in BACKUP_DIR.glob("CAMELS_UY_*.nc"):
    basin_id = nc_in.stem
    print(basin_id)
    nc_out = DATA_DIR / nc_in.name
    csv_file = UYPRECIP_DIR / f"{basin_id}_precip.csv"

    print(f"\nProcessing basin: {basin_id}")

    # ---------- Load forcing ----------
    # load_dataset reads everything into memory and closes the file immediately,
    # so no file handle is left open on the `continue` paths or if an error occurs.
    ds = xr.load_dataset(nc_in)

    if PRECIP_VAR not in ds or TIME_VAR not in ds.coords:
        print("  ⏭ Missing precip or time coord – copying unchanged")
        ds.to_netcdf(nc_out)
        continue

    # ---------- Load the additional precipitation series ----------
    # Basins without a CSV are skipped entirely: no output file is written for them.
    if not csv_file.exists():
        print("  ⏭ No gauge data")
        continue

    # The CSV uses a "time" column; rename it so it matches the dataset's time coordinate.
    df_gauge = (
        pd.read_csv(csv_file, parse_dates=["time"])
        .rename(columns={"time": TIME_VAR})
        .set_index(TIME_VAR)
    )

    # Duplicated dates would make the label-based selection below return more rows
    # than `common_dates` and break the assignment, so keep only the first occurrence.
    duplicated_dates = df_gauge.index.duplicated(keep="first")
    if duplicated_dates.any():
        print(f"  ⚠ Dropping {int(duplicated_dates.sum())} duplicated dates in {csv_file.name}")
        df_gauge = df_gauge[~duplicated_dates]

    # ---------- Align dates ----------
    ds_time = pd.to_datetime(ds[TIME_VAR].values)
    common_dates = ds_time.intersection(df_gauge.index)

    # Basins with no overlapping dates are skipped as well (no output file written).
    if len(common_dates) == 0:
        print("  ⏭ No overlapping dates")
        continue

    # NOTE: despite the wording of this message, PRECIP_VAR is left untouched;
    # the CSV values are stored in the separate GAUGE_VAR variable below.
    print(f"  ✔ Replacing precipitation for {len(common_dates)} days")

    # ---------- Add the new precipitation variable ----------
    # Start from an all-NaN array shaped like PRECIP_VAR so dates missing from the CSV stay NaN.
    # NOTE(review): full_like presumably also carries over PRECIP_VAR's attrs - confirm they suit the new variable.
    gauge_precip = xr.full_like(ds[PRECIP_VAR], np.nan)
    # Fill in the common dates with the CSV values (same date order on both sides).
    gauge_precip.loc[{TIME_VAR: common_dates}] = df_gauge.loc[common_dates, "precipitation"].values

    # Add to dataset
    ds[GAUGE_VAR] = gauge_precip

    # NOTE(review): attribute text kept as-is; confirm it still describes this dataset.
    ds.attrs["precip_update"] = "Gauge data used where available"

    # ---------- Write ----------
    ds.to_netcdf(nc_out)

print("\n✅ All basins processed successfully.")

CAMELS_UY_7

Processing basin: CAMELS_UY_7
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_2

Processing basin: CAMELS_UY_2
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_16

Processing basin: CAMELS_UY_16
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_9

Processing basin: CAMELS_UY_9
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_5

Processing basin: CAMELS_UY_5
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_11

Processing basin: CAMELS_UY_11
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_6

Processing basin: CAMELS_UY_6
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_8

Processing basin: CAMELS_UY_8
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_3

Processing basin: CAMELS_UY_3
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_15

Processing basin: CAMELS_UY_15
  ✔ Replacing precipitation for 11322 days
CAMELS_UY_10

Processing basin: CAMELS_UY_10
  ✔ Replacing precipitation for 11322 days

✅ All basins processed successfully.


In [4]:
# DATA_DIR = Path("../preparing_data/filtered_data/time_series")


In [5]:
# Sanity check: inspect one of the written files.
# load_dataset (instead of open_dataset) reads the data into memory and closes the
# file right away, so the processing cell can be re-run and overwrite this file
# without an open handle lingering in the kernel.
CAMELS_UY_10 = xr.load_dataset(DATA_DIR / "CAMELS_UY_10.nc")
CAMELS_UY_10