In [None]:
import geopandas as gpd
import rasterio
from rasterio.mask import mask
import pandas as pd
from pathlib import Path
import os
import numpy as np
from concurrent.futures import ProcessPoolExecutor

# --- Paths ---
# Directory scanned below for zipped basin shapefiles (*.zip).
path_shapefiles = Path('../MSWEP/shapefiles')
# Directory scanned below for CHIRPS rasters (*.tif).
chirps_folder = Path('/inputs/CHIRPS_v3/')
# Destination directory for the per-basin CSV files; created if missing
# (its parent directory must already exist).
output_dir = Path('./precip_timeseries')
output_dir.mkdir(exist_ok=True)

# --- Mapping shapefile names to CAMELS_UY IDs ---
# Keys are the shapefile archive names without the ".zip" extension; values
# are the CAMELS_UY identifiers used to name the output CSV files.
basins_mapping = {
    'paso_mazangano': 'CAMELS_UY_10',
    'picada_de_coelho': 'CAMELS_UY_7',
    'sarandi_del_yi': 'CAMELS_UY_12',
    'paso_de_las_toscas': 'CAMELS_UY_8',
    'paso_de_las_piedras_rn': 'CAMELS_UY_15',
    'paso_del_borracho': 'CAMELS_UY_6',
    'bequelo': 'CAMELS_UY_16',
    'paso_de_las_piedras': 'CAMELS_UY_2',
    'paso_baltasar': 'CAMELS_UY_5',
    'fraile_muerto': 'CAMELS_UY_11',
    'paso_de_los_mellizos': 'CAMELS_UY_14',
    'paso_manuel_diaz': 'CAMELS_UY_3',
    'paso_aguiar': 'CAMELS_UY_9',
    'paso_de_la_compania': 'CAMELS_UY_1',
    'tacuarembo': 'CAMELS_UY_4',
    'durazno': 'CAMELS_UY_13'
}

# --- List shapefiles and CHIRPS files ---
# os.listdir() returns entries in arbitrary order; sort so that basins are
# processed in the same (alphabetical) order on every run, matching the
# already-sorted raster list.
shapefiles = sorted(f for f in os.listdir(path_shapefiles) if f.endswith('.zip'))
# Raster paths sorted lexicographically by path.
chirps_files = sorted(chirps_folder.glob("*.tif"))

# --- Function to process one file for a basin ---
def process_file(file_path, gdf):
    """Return the date and basin-mean value for a single raster file.

    The date is built from the last three dot-separated fields of the file
    stem (presumably year, month and day), joined with '-' and parsed by
    pandas. The raster is clipped to the geometries in ``gdf``; every cell
    masked out by ``rasterio.mask.mask`` is filled with NaN and therefore
    ignored by the mean.

    Args:
        file_path: Path object (must expose ``.stem``) of the raster to read.
        gdf: Object exposing a ``geometry`` attribute holding the basin
            shapes (a GeoDataFrame in this script).

    Returns:
        Tuple ``(date, value)``: the parsed timestamp and the NaN-aware mean
        of the first band of the clipped raster.
    """
    # Trailing three stem fields -> "a-b-c" string handed to pandas.
    stem_fields = file_path.stem.split('.')
    date = pd.to_datetime('-'.join(stem_fields[-3:]))

    with rasterio.open(file_path) as src:
        clipped, _transform = mask(
            src,
            gdf.geometry,
            crop=True,
            filled=True,
            nodata=np.nan,
        )

    # The clipped array is fully in memory, so the dataset can be closed
    # before averaging. Band axis comes first; only the first band is used.
    value = np.nanmean(clipped[0])

    return date, value

# --- Loop over basins ---
# For every basin shapefile, compute the basin-mean value of each CHIRPS
# raster in parallel and write the resulting time series to
# <output_dir>/<CAMELS_UY id>_precip.csv.

# Fail fast on inputs that would otherwise waste a long run or clobber
# existing results:
#   * no rasters found -> every basin CSV would be overwritten with an
#     empty series;
#   * a shapefile missing from basins_mapping -> KeyError raised only after
#     all preceding basins have already been processed.
if shapefiles and not chirps_files:
    raise FileNotFoundError(f"No .tif files found in {chirps_folder}")

unmapped = [f for f in shapefiles if Path(f).stem not in basins_mapping]
if unmapped:
    raise KeyError(f"No CAMELS_UY ID in basins_mapping for: {unmapped}")

n_files = len(chirps_files)

for shp_file in shapefiles:
    # Path.stem drops only the trailing ".zip"; str.replace would also strip
    # a ".zip" occurring in the middle of the name.
    basin_name = Path(shp_file).stem
    camels_id = basins_mapping[basin_name]

    print(f"Processing basin {basin_name} -> {camels_id}")

    # load shapefile and reproject to EPSG:4326
    # NOTE(review): assumes the rasters are in EPSG:4326 as well — confirm.
    gdf = gpd.read_file(path_shapefiles / shp_file).to_crs("EPSG:4326")

    # --- Parallel processing of daily files ---
    all_ts = []
    with ProcessPoolExecutor() as executor:
        futures = [executor.submit(process_file, f, gdf) for f in chirps_files]
        # Results are consumed in submission order, so the progress counter
        # follows the order of chirps_files; a failure in any worker is
        # re-raised here by future.result().
        for i, future in enumerate(futures, 1):
            date, value = future.result()
            all_ts.append((date, value))
            print(f"{basin_name} - {date.date()} processed ({i}/{n_files})")

    # build the date-indexed series
    full_ts = pd.Series(
        data=[v for _, v in all_ts],
        index=[d for d, _ in all_ts],
        name="precipitation"
    ).sort_index()

    # save CSV
    output_file = output_dir / f"{camels_id}_precip.csv"
    full_ts.to_csv(output_file)
    print(f"Saved {output_file}")


Processing basin durazno -> CAMELS_UY_13
durazno - 1989-01-01 processed (1/11322)
durazno - 1989-01-02 processed (2/11322)
durazno - 1989-01-03 processed (3/11322)
durazno - 1989-01-04 processed (4/11322)
durazno - 1989-01-05 processed (5/11322)
durazno - 1989-01-06 processed (6/11322)
durazno - 1989-01-07 processed (7/11322)
durazno - 1989-01-08 processed (8/11322)
durazno - 1989-01-09 processed (9/11322)
durazno - 1989-01-10 processed (10/11322)
durazno - 1989-01-11 processed (11/11322)
durazno - 1989-01-12 processed (12/11322)
durazno - 1989-01-13 processed (13/11322)
durazno - 1989-01-14 processed (14/11322)
durazno - 1989-01-15 processed (15/11322)
durazno - 1989-01-16 processed (16/11322)
durazno - 1989-01-17 processed (17/11322)
durazno - 1989-01-18 processed (18/11322)
durazno - 1989-01-19 processed (19/11322)
durazno - 1989-01-20 processed (20/11322)
durazno - 1989-01-21 processed (21/11322)
durazno - 1989-01-22 processed (22/11322)
durazno - 1989-01-23 processed (23/11322)
du