In [1]:
import sys
from pathlib import Path

# Make the project's src/ directory importable.
# The project root is taken to be the parent of the current working directory.
project_root = Path.cwd().parent
src_path = project_root.joinpath("src")
if not (str(src_path) in sys.path):
    # Front of the list so this checkout is searched before other entries.
    sys.path.insert(0, str(src_path))
    print(f"Added {src_path} to Python path")


Added /Users/cooper/Desktop/hydro-forecasting/src to Python path


In [2]:
from hydro_forecasting.data.caravanify import Caravanify, CaravanifyConfig
from hydro_forecasting.data.caravanify_parquet import CaravanifyParquet, CaravanifyParquetConfig
from hydro_forecasting.data.preprocessing import check_data_quality

import time
import pandas as pd
import numpy as np
from tqdm import tqdm


---

In [3]:
# Settings for the CSV-backed reader: USA gauges, all three attribute switches enabled.
csv_reader_settings = {
    "attributes_dir": "/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/USA/post_processed/attributes",
    "timeseries_dir": "/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/USA/post_processed/timeseries/csv",
    "gauge_id_prefix": "USA",
    "use_hydroatlas_attributes": True,
    "use_caravan_attributes": True,
    "use_other_attributes": True,
}
config = CaravanifyConfig(**csv_reader_settings)

caravan = Caravanify(config)

# The gauge ids reported by the CSV-backed reader are reused as the training set below.
ids_for_training = caravan.get_all_gauge_ids()

In [4]:
# Settings for the Parquet-backed reader; same prefix and attribute switches as the CSV reader.
# NOTE(review): timeseries_dir ends in "timeseries/csv" although this is the Parquet
# reader -- confirm the directory really holds the Parquet files.
parquet_reader_settings = dict(
    attributes_dir="/Users/cooper/Desktop/CaravanifyParquet/USA/post_processed/attributes",
    timeseries_dir="/Users/cooper/Desktop/CaravanifyParquet/USA/post_processed/timeseries/csv",
    gauge_id_prefix="USA",
    use_hydroatlas_attributes=True,
    use_caravan_attributes=True,
    use_other_attributes=True,
)
config_parquet = CaravanifyParquetConfig(**parquet_reader_settings)

caravan_parquet = CaravanifyParquet(config_parquet)

In [5]:
def create_basin_index(gauge_ids: list[str], caravan_instance, input_length=10, output_length=5) -> list[dict]:
    """Build an index of gap-free input/output windows for the given basins.

    Loads the time series for ``gauge_ids`` through ``caravan_instance``, passes
    them through ``check_data_quality``, and then scans each basin in the
    quality-checked frame for every position where
    ``input_length + output_length`` consecutive rows have neither
    ``streamflow`` nor ``total_precipitation_sum`` missing.

    Timing information for the three stages (load, quality check, indexing) is
    printed to stdout.

    Args:
        gauge_ids: Basin identifiers handed to ``caravan_instance.load_stations``.
        caravan_instance: Object providing ``load_stations(gauge_ids)`` and
            ``get_time_series()``.
        input_length: Number of rows in the input part of a window.
        output_length: Number of rows in the output part, which directly
            follows the input part.

    Returns:
        One dict per valid window with the keys ``file_path``,
        ``static_file_path``, ``gauge_id``, ``start_idx``, ``end_idx``,
        ``input_end_date`` and ``valid_sequence``. ``start_idx`` and ``end_idx``
        (exclusive) are row positions within the basin's group, and
        ``input_end_date`` is the ``date`` value of the last input row. The two
        path entries are placeholder strings.

    Raises:
        ValueError: If ``input_length`` or ``output_length`` is smaller than 1.
    """
    if input_length < 1 or output_length < 1:
        raise ValueError(
            "input_length and output_length must both be >= 1, "
            f"got input_length={input_length}, output_length={output_length}"
        )

    # Load and prepare data
    start_time = time.time()
    caravan_instance.load_stations(gauge_ids)
    ts_data = caravan_instance.get_time_series()
    print(f"Loading data: {time.time() - start_time:.2f}s")

    # Apply quality checks
    quality_start = time.time()
    quality_check_result = check_data_quality(
        df=ts_data,
        required_columns=["date", "streamflow", "total_precipitation_sum"],
        max_missing_pct=5,
        max_gap_length=30,
        min_train_years=5,
        max_imputation_gap_size=5,
        group_identifier="gauge_id",
        train_prop=0.7,
        val_prop=0.15,
        test_prop=0.15,
    )
    # Only the filtered frame is needed here; the accompanying report is ignored.
    valid_data, _ = quality_check_result.unwrap()
    print(f"Quality check: {time.time() - quality_start:.2f}s")

    # Create index entries
    index_start = time.time()
    all_index_entries = []
    total_seq_length = input_length + output_length
    cols_to_check = ["streamflow", "total_precipitation_sum"]
    # Loop-invariant kernel: summing the 0/1 flags over one full window.
    window_kernel = np.ones(total_seq_length, dtype=int)

    # Group once by gauge_id instead of filtering the frame per basin.
    basin_groups = valid_data.groupby("gauge_id")

    for gauge_id, basin_data in tqdm(basin_groups, desc="Processing basins"):
        if len(basin_data) < total_seq_length:
            continue

        # Extract needed data as arrays.
        # NOTE(review): positions below follow the group's row order; this
        # presumably is chronological -- confirm against the data loader.
        basin_values = basin_data[cols_to_check].to_numpy()
        dates = basin_data["date"].to_numpy()

        # 1 where every checked column is present in that row, 0 otherwise.
        combined_valid = (~np.isnan(basin_values).any(axis=1)).astype(int)

        # A window starting at row i is usable only if all rows
        # i .. i + total_seq_length - 1 are complete. The input part is rows
        # i .. i + input_length - 1 and the output part follows immediately, so
        # a single sliding sum over the whole window covers both parts.
        # mode="valid" yields exactly one value per window that fits entirely
        # inside the basin, so no extra bounds check is needed.
        complete_rows_in_window = np.convolve(combined_valid, window_kernel, mode="valid")
        valid_positions = np.where(complete_rows_in_window == total_seq_length)[0]

        # Create index entries
        entries = [{
            "file_path": f"path/to/timeseries/{gauge_id}.csv",
            "static_file_path": "path/to/static_attributes.csv",
            "gauge_id": gauge_id,
            "start_idx": idx,
            "end_idx": idx + total_seq_length,
            "input_end_date": dates[idx + input_length - 1],
            "valid_sequence": True,
        } for idx in valid_positions]

        all_index_entries.extend(entries)

    print(f"Index creation: {time.time() - index_start:.2f}s")
    print(f"Total time: {time.time() - start_time:.2f}s")
    print(f"Created {len(all_index_entries)} valid sequences")

    return all_index_entries


In [6]:
# CSV-backed variant, currently disabled; the Parquet-backed run in the next cell is used instead.
# index_entries = create_basin_index(ids_for_training, caravan)

In [None]:
# Build the window index with the Parquet-backed reader, using the default window lengths.
index_entries = create_basin_index(
    gauge_ids=ids_for_training,
    caravan_instance=caravan_parquet,
)

Loading data: 5.34s
