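This notebook aggregates CARAVAN time series into non-overlapping 7-, 14-, and 30-day blocks: precipitation is summed over each block, while observed streamflow, maximum/minimum temperature, and solar radiation are averaged.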

In [1]:
import os
import pandas as pd
import xarray as xr
from pathlib import Path
import matplotlib.pyplot as plt

# 7D aggregation (blocks labelled by their last day)

In [2]:
# Locations used by the 7-day aggregation: the basin list, the folder holding
# the per-basin input files, and the folder that receives the aggregated files.
basins_file, input_dir, output_dir = (
    "basins.txt",
    "../preparing_data/filtered_data/time_series",
    "./7D_aggregation/time_series",
)

# Create the output folder (with any missing parents); a pre-existing folder is fine.
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [3]:
# Bounds of the study period; they are used to slice the `date` coordinate.
start_date, end_date = (
    pd.Timestamp(day) for day in ("1989-01-01", "2008-09-30")
)

In [4]:
# Variables grouped by how they are aggregated over each block:
# `vars_sum` holds the series that is summed, `vars_mean` the ones that are averaged.
vars_sum = ["prcp_mm_day"]
vars_mean = ["QObs_mm_d", "tmax_C", "tmin_C", "srad_W_m2"]

In [5]:
# Read the basin identifiers, one per line, ignoring blank lines and
# surrounding whitespace.
basins = []
with open(basins_file, "r") as handle:
    for raw_line in handle:
        basin_id = raw_line.strip()
        if basin_id:
            basins.append(basin_id)

In [6]:
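# NOTE: earlier resample-based version, kept commented out for reference only;
# the aggregation actually run is the coarsen-based cell that follows.
# for basin in basins: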
#     in_file  = os.path.join(input_dir, f"{basin}.nc")
#     out_file = os.path.join(output_dir, f"{basin}.nc")

#     if not os.path.exists(in_file):
#         print(f"⚠️ File not found, skipping: {in_file}")
#         continue

#     print(f"Processing basin: {basin}")

#     caravan = xr.open_dataset(in_file)

#     # Select period
#     caravan = caravan.sel(date=slice(start_date, end_date))

#     # Select variables
#     ds = caravan[vars_sum + vars_mean]

#     # 7-day aggregation
#     ds_7d = xr.Dataset()

#     # Sum variables
#     ds_7d["prcp_mm_day"] = (
#         ds["prcp_mm_day"]
#         .resample(date="7D", label="left", closed="left")
#         .sum()
#     )

#     # Mean variables
#     for v in vars_mean:
#         ds_7d[v] = (
#             ds[v]
#             .resample(date="7D", label="left", closed="left")
#             .mean()
#         )

#     # Drop incomplete windows
#     ds_7d = ds_7d.dropna(dim="date")

#     # Save
#     ds_7d.to_netcdf(out_file)

#     caravan.close()

# print("✅ All basins processed.")

In [7]:
# Aggregate every basin's time series into non-overlapping 7-step blocks along
# `date` and write one NetCDF file per basin to `output_dir`.
# assumes the `date` axis is daily and gap-free, so 7 steps == 7 days — TODO confirm
window_days = 7  # block length; drives both the coarsen window and the block labels

for basin in basins:

    in_file  = os.path.join(input_dir, f"{basin}.nc")
    out_file = os.path.join(output_dir, f"{basin}.nc")

    if not os.path.exists(in_file):
        print(f"⚠️ File not found, skipping: {in_file}")
        continue

    print(f"Processing basin: {basin}")

    # The context manager closes the input file even if a later step raises.
    with xr.open_dataset(in_file) as caravan:

        # Restrict to the study period and to the variables being aggregated.
        ds = caravan.sel(date=slice(start_date, end_date))[vars_sum + vars_mean]

        ds_7d = xr.Dataset()

        # Block totals. Driven by `vars_sum` rather than a hard-coded name so the
        # configuration cell stays the single source of truth.
        # boundary="trim" drops trailing steps that do not fill a whole block.
        # NOTE(review): xarray reductions skip NaN by default for float data, so
        # a block of missing values would sum to 0 — confirm inputs have no gaps.
        for v in vars_sum:
            ds_7d[v] = (
                ds[v]
                .coarsen(date=window_days, boundary="trim")
                .sum()
            )

        # Block means.
        for v in vars_mean:
            ds_7d[v] = (
                ds[v]
                .coarsen(date=window_days, boundary="trim")
                .mean()
            )

        # Label each block with its LAST timestamp: taking every
        # `window_days`-th date starting at index `window_days - 1` yields
        # exactly one label per complete block.
        original_dates = ds.date.values
        new_dates = original_dates[window_days - 1 :: window_days]
        ds_7d = ds_7d.assign_coords(date=new_dates)

        # Write while the input is still open so any lazily loaded data is readable.
        ds_7d.to_netcdf(out_file)

print("✅ All basins processed.")


Processing basin: CAMELS_UY_2
Processing basin: CAMELS_UY_3
Processing basin: CAMELS_UY_5
Processing basin: CAMELS_UY_6
Processing basin: CAMELS_UY_7
Processing basin: CAMELS_UY_8
Processing basin: CAMELS_UY_9
Processing basin: CAMELS_UY_10
Processing basin: CAMELS_UY_11
Processing basin: CAMELS_UY_15
Processing basin: CAMELS_UY_16
✅ All basins processed.


In [8]:
# Spot-check: reopen one of the aggregated 7-day files and display its contents.
camels_2_path = './7D_aggregation/time_series/CAMELS_UY_2.nc'
camels_2 = xr.open_dataset(camels_2_path)
camels_2

In [9]:
# Close the dataset that was opened for inspection (held in `camels_2`).
camels_2.close()

# 14D aggregation

In [10]:
# Locations used by the 14-day aggregation: the basin list, the folder holding
# the per-basin input files, and the folder that receives the aggregated files.
basins_file, input_dir, output_dir = (
    "basins.txt",
    "../preparing_data/filtered_data/time_series",
    "./14D_aggregation/time_series",
)

# Create the output folder (with any missing parents); a pre-existing folder is fine.
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [11]:
# Bounds of the study period (same values as in the 7-day section); they are
# used to slice the `date` coordinate.
start_date, end_date = map(pd.Timestamp, ("1989-01-01", "2008-09-30"))

In [12]:
# Variables grouped by how they are aggregated over each block:
# `vars_sum` holds the series that is summed, `vars_mean` the ones that are averaged.
vars_sum = ["prcp_mm_day"]
vars_mean = ["QObs_mm_d", "tmax_C", "tmin_C", "srad_W_m2"]

In [13]:
# Read the basin identifiers, one per line, ignoring blank lines and
# surrounding whitespace.
basins = []
with open(basins_file, "r") as handle:
    for raw_line in handle:
        basin_id = raw_line.strip()
        if basin_id:
            basins.append(basin_id)

In [14]:
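# NOTE: earlier resample-based version, kept commented out for reference only;
# the aggregation actually run is the coarsen-based cell that follows.
# for basin in basins: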
#     in_file  = os.path.join(input_dir, f"{basin}.nc")
#     out_file = os.path.join(output_dir, f"{basin}.nc")

#     if not os.path.exists(in_file):
#         print(f"⚠️ File not found, skipping: {in_file}")
#         continue

#     print(f"Processing basin: {basin}")

#     caravan = xr.open_dataset(in_file)

#     # Select period
#     caravan = caravan.sel(date=slice(start_date, end_date))

#     # Select variables
#     ds = caravan[vars_sum + vars_mean]

#     # 14-day aggregation
#     ds_7d = xr.Dataset()

#     # Sum variables
#     ds_7d["prcp_mm_day"] = (
#         ds["prcp_mm_day"]
#         .resample(date="14D", label="left", closed="left")
#         .sum()
#     )

#     # Mean variables
#     for v in vars_mean:
#         ds_7d[v] = (
#             ds[v]
#             .resample(date="14D", label="left", closed="left")
#             .mean()
#         )

#     # Drop incomplete windows
#     ds_7d = ds_7d.dropna(dim="date")

#     # Save
#     ds_7d.to_netcdf(out_file)

#     caravan.close()

# print("✅ All basins processed.")

In [15]:
# Aggregate every basin's time series into non-overlapping 14-step blocks along
# `date` and write one NetCDF file per basin to `output_dir`.
# assumes the `date` axis is daily and gap-free, so 14 steps == 14 days — TODO confirm
window_days = 14  # block length; drives both the coarsen window and the block labels

for basin in basins:

    in_file  = os.path.join(input_dir, f"{basin}.nc")
    out_file = os.path.join(output_dir, f"{basin}.nc")

    if not os.path.exists(in_file):
        print(f"⚠️ File not found, skipping: {in_file}")
        continue

    print(f"Processing basin: {basin}")

    # The context manager closes the input file even if a later step raises.
    with xr.open_dataset(in_file) as caravan:

        # Restrict to the study period and to the variables being aggregated.
        ds = caravan.sel(date=slice(start_date, end_date))[vars_sum + vars_mean]

        # Named for the 14-day window (the previous `ds_7d` name was a leftover
        # from the 7-day section).
        ds_14d = xr.Dataset()

        # Block totals. Driven by `vars_sum` rather than a hard-coded name so the
        # configuration cell stays the single source of truth.
        # boundary="trim" drops trailing steps that do not fill a whole block.
        # NOTE(review): xarray reductions skip NaN by default for float data, so
        # a block of missing values would sum to 0 — confirm inputs have no gaps.
        for v in vars_sum:
            ds_14d[v] = (
                ds[v]
                .coarsen(date=window_days, boundary="trim")
                .sum()
            )

        # Block means.
        for v in vars_mean:
            ds_14d[v] = (
                ds[v]
                .coarsen(date=window_days, boundary="trim")
                .mean()
            )

        # Label each block with its LAST timestamp: taking every
        # `window_days`-th date starting at index `window_days - 1` yields
        # exactly one label per complete block.
        original_dates = ds.date.values
        new_dates = original_dates[window_days - 1 :: window_days]
        ds_14d = ds_14d.assign_coords(date=new_dates)

        # Write while the input is still open so any lazily loaded data is readable.
        ds_14d.to_netcdf(out_file)

print("✅ All basins processed.")


Processing basin: CAMELS_UY_2
Processing basin: CAMELS_UY_3
Processing basin: CAMELS_UY_5
Processing basin: CAMELS_UY_6
Processing basin: CAMELS_UY_7
Processing basin: CAMELS_UY_8
Processing basin: CAMELS_UY_9
Processing basin: CAMELS_UY_10
Processing basin: CAMELS_UY_11
Processing basin: CAMELS_UY_15
Processing basin: CAMELS_UY_16
✅ All basins processed.


In [16]:
# Spot-check: reopen one of the aggregated 14-day files and display its contents.
camels_3_path = './14D_aggregation/time_series/CAMELS_UY_3.nc'
camels_3 = xr.open_dataset(camels_3_path)
camels_3

In [17]:
# Close the dataset that was opened for inspection (held in `camels_3`).
camels_3.close()

# 30D aggregation (blocks labelled by their last day)

In [26]:
# Locations used by the 30-day aggregation: the basin list, the folder holding
# the per-basin input files, and the folder that receives the aggregated files.
basins_file, input_dir, output_dir = (
    "basins.txt",
    "../preparing_data/filtered_data/time_series",
    "./30D_aggregation/time_series",
)

# Create the output folder (with any missing parents); a pre-existing folder is fine.
Path(output_dir).mkdir(parents=True, exist_ok=True)


In [27]:
# Bounds of the study period (same values as in the earlier sections); they
# are used to slice the `date` coordinate.
start_date, end_date = [
    pd.Timestamp(year=y, month=m, day=d)
    for (y, m, d) in ((1989, 1, 1), (2008, 9, 30))
]

In [28]:
# Variables grouped by how they are aggregated over each block:
# `vars_sum` holds the series that is summed, `vars_mean` the ones that are averaged.
vars_sum = ["prcp_mm_day"]
vars_mean = ["QObs_mm_d", "tmax_C", "tmin_C", "srad_W_m2"]

In [29]:
# Read the basin identifiers, one per line, ignoring blank lines and
# surrounding whitespace.
basins = []
with open(basins_file, "r") as handle:
    for raw_line in handle:
        basin_id = raw_line.strip()
        if basin_id:
            basins.append(basin_id)

In [30]:
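# NOTE: earlier resample-based version, kept commented out for reference only;
# the aggregation actually run is the coarsen-based cell that follows.
# for basin in basins: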
#     in_file  = os.path.join(input_dir, f"{basin}.nc")
#     out_file = os.path.join(output_dir, f"{basin}.nc")

#     if not os.path.exists(in_file):
#         print(f"⚠️ File not found, skipping: {in_file}")
#         continue

#     print(f"Processing basin: {basin}")

#     caravan = xr.open_dataset(in_file)

#     # Select period
#     caravan = caravan.sel(date=slice(start_date, end_date))

#     # Select variables
#     ds = caravan[vars_sum + vars_mean]

#     # 30-day aggregation
#     ds_7d = xr.Dataset()

#     # Sum variables
#     ds_7d["prcp_mm_day"] = (
#         ds["prcp_mm_day"]
#         .resample(date="30D", label="left", closed="left")
#         .sum()
#     )

#     # Mean variables
#     for v in vars_mean:
#         ds_7d[v] = (
#             ds[v]
#             .resample(date="30D", label="left", closed="left")
#             .mean()
#         )

#     # Drop incomplete windows
#     ds_7d = ds_7d.dropna(dim="date")

#     # Save
#     ds_7d.to_netcdf(out_file)

#     caravan.close()

# print("✅ All basins processed.")

In [31]:
# Aggregate every basin's time series into non-overlapping 30-step blocks along
# `date` and write one NetCDF file per basin to `output_dir`.
# assumes the `date` axis is daily and gap-free, so 30 steps == 30 days — TODO confirm
window_days = 30  # block length; drives both the coarsen window and the block labels

for basin in basins:

    in_file  = os.path.join(input_dir, f"{basin}.nc")
    out_file = os.path.join(output_dir, f"{basin}.nc")

    if not os.path.exists(in_file):
        print(f"⚠️ File not found, skipping: {in_file}")
        continue

    print(f"Processing basin: {basin}")

    # The context manager closes the input file even if a later step raises.
    with xr.open_dataset(in_file) as caravan:

        # Restrict to the study period and to the variables being aggregated.
        ds = caravan.sel(date=slice(start_date, end_date))[vars_sum + vars_mean]

        # Named for the 30-day window (the previous `ds_7d` name was a leftover
        # from the 7-day section).
        ds_30d = xr.Dataset()

        # Block totals. Driven by `vars_sum` rather than a hard-coded name so the
        # configuration cell stays the single source of truth.
        # boundary="trim" drops trailing steps that do not fill a whole block.
        # NOTE(review): xarray reductions skip NaN by default for float data, so
        # a block of missing values would sum to 0 — confirm inputs have no gaps.
        for v in vars_sum:
            ds_30d[v] = (
                ds[v]
                .coarsen(date=window_days, boundary="trim")
                .sum()
            )

        # Block means.
        for v in vars_mean:
            ds_30d[v] = (
                ds[v]
                .coarsen(date=window_days, boundary="trim")
                .mean()
            )

        # Label each block with its LAST timestamp: taking every
        # `window_days`-th date starting at index `window_days - 1` yields
        # exactly one label per complete block.
        original_dates = ds.date.values
        new_dates = original_dates[window_days - 1 :: window_days]
        ds_30d = ds_30d.assign_coords(date=new_dates)

        # Write while the input is still open so any lazily loaded data is readable.
        ds_30d.to_netcdf(out_file)

print("✅ All basins processed.")


Processing basin: CAMELS_UY_2
Processing basin: CAMELS_UY_3
Processing basin: CAMELS_UY_5
Processing basin: CAMELS_UY_6
Processing basin: CAMELS_UY_7
Processing basin: CAMELS_UY_8
Processing basin: CAMELS_UY_9
Processing basin: CAMELS_UY_10
Processing basin: CAMELS_UY_11
Processing basin: CAMELS_UY_15
Processing basin: CAMELS_UY_16
✅ All basins processed.


In [32]:
# Spot-check: reopen one of the aggregated 30-day files and display its contents.
camels_5_path = Path("./30D_aggregation/time_series/CAMELS_UY_5.nc")
camels_5 = xr.open_dataset(camels_5_path)
camels_5

In [33]:
# Close the dataset that was opened for inspection (held in `camels_5`).
camels_5.close()