In [11]:
import sys
from pathlib import Path

# Make the project's `src/` directory importable from this notebook.
# The project root is taken to be the parent of the current working directory.
project_root = Path.cwd().parent
src_path = project_root.joinpath("src")
if sys.path.count(str(src_path)) == 0:
    # Prepend (index 0) so this directory is searched before the other entries.
    sys.path[:0] = [str(src_path)]
    print(f"Added {src_path} to Python path")


In [12]:
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
import random
import pyarrow.parquet as pq

from hydro_forecasting.data.caravanify import Caravanify, CaravanifyConfig
from hydro_forecasting.data.caravanify_parquet import CaravanifyParquet, CaravanifyParquetConfig

---

In [15]:
# Configure the Parquet-backed loader for gauges carrying the "USA" ID prefix.
# NOTE(review): every directory below is an absolute path on one developer's machine,
# so this cell cannot run elsewhere — consider deriving them from a configurable data root.
# NOTE(review): attributes_dir/timeseries_dir live under CaravanifyParquet/ while
# shapefile_dir lives under CAMELS-CH/, and timeseries_dir ends in "csv" although the
# Parquet config class is used — confirm these are the intended locations.
config = CaravanifyParquetConfig(
    attributes_dir="/Users/cooper/Desktop/CaravanifyParquet/USA/post_processed/attributes",
    timeseries_dir="/Users/cooper/Desktop/CaravanifyParquet/USA/post_processed/timeseries/csv",
    shapefile_dir="/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/USA/post_processed/shapefiles",
    # human_influence_path="/Users/cooper/Desktop/CAMELS-CH/src/human_influence_index/results/human_influence_classification.csv",
    gauge_id_prefix="USA",
    use_hydroatlas_attributes=True,
    use_caravan_attributes=True,
    use_other_attributes=True,
)

caravan = CaravanifyParquet(config)
# Keep only the first 20 gauge IDs returned by the loader for this quick check.
basins = caravan.get_all_gauge_ids()[:20]

caravan.load_stations(basins)

# Display a single static-attribute column (presumably for the stations loaded above).
static = caravan.get_static_attributes()
static["ele_mt_sav"]

0     276.198974
1     103.274219
2     174.744464
3     303.634479
4     378.958111
5     644.379447
6     626.000000
7     548.306359
8     282.385672
9      43.392693
10    369.000000
11     62.022910
12    187.342696
13    140.423908
14    527.042664
15    739.598304
16    422.189310
17    463.621079
18    456.324693
19    450.489605
Name: ele_mt_sav, dtype: float64

---

In [None]:
# Configure the non-Parquet `Caravanify` loader with the same attribute options.
# NOTE(review): these are absolute paths on one developer's machine — consider
# deriving them from a configurable data root.
# NOTE: this cell rebinds `config`, `caravan` and `basins`; cells further down that
# use `basins` therefore see the full gauge-ID list assigned here.
config = CaravanifyConfig(
    attributes_dir="/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/USA/post_processed/attributes",
    timeseries_dir="/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/USA/post_processed/timeseries/csv",
    shapefile_dir="/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/USA/post_processed/shapefiles",
    # human_influence_path="/Users/cooper/Desktop/CAMELS-CH/src/human_influence_index/results/human_influence_classification.csv",
    gauge_id_prefix="USA",
    use_hydroatlas_attributes=True,
    use_caravan_attributes=True,
    use_other_attributes=True,
)

caravan = Caravanify(config)
# No slicing here: every gauge ID returned by the loader is kept.
basins = caravan.get_all_gauge_ids()

In [None]:
def load_gauge_parquet(gauge_ids: list[str], base_dir: Path) -> pd.DataFrame:
    """
    Load and concatenate the per-gauge parquet files for a list of gauge IDs.

    Each file is looked up at ``base_dir / f"{gauge_id}.parquet"``. A ``gauge_id``
    column is set on every loaded frame so rows remain attributable to their
    gauge after concatenation.

    Args:
        gauge_ids (list[str]): Gauge IDs as used in the file names
            (e.g. with the 'USA_' prefix).
        base_dir (Path): Directory containing the parquet files. A plain
            string is accepted as well.

    Returns:
        pd.DataFrame: Rows of all successfully loaded files, in the order of
        ``gauge_ids``, with a fresh 0..n-1 index. When nothing could be loaded
        (empty ``gauge_ids`` or every read failed) an empty frame that only has
        a ``gauge_id`` column is returned.

    Raises:
        FileNotFoundError: If the parquet file of any gauge ID does not exist.
    """
    base_dir = Path(base_dir)
    frames = []

    for gauge_id in gauge_ids:
        file_path = base_dir / f"{gauge_id}.parquet"
        if not file_path.exists():
            raise FileNotFoundError(
                f"No parquet file found for gauge ID {gauge_id} at {file_path}"
            )
        try:
            df = pd.read_parquet(file_path)
            df["gauge_id"] = gauge_id  # Tag rows with their gauge
        except Exception as e:
            # Best effort: report the unreadable file and keep loading the rest.
            print(f"Error loading {file_path}: {e}")
        else:
            frames.append(df)

    if not frames:
        # pd.concat raises ValueError on an empty list; return an empty frame that
        # still supports grouping/filtering by "gauge_id" instead.
        return pd.DataFrame(columns=["gauge_id"])

    return pd.concat(frames, ignore_index=True)


In [None]:
def create_basin_index(
    gauge_ids: list[str],
    base_dir: Path,
    static_file_path: Path,
    input_length=70,
    output_length=10,
):
    """
    Build an index of all fully valid input+output sequences per gauge.

    A sequence starting at row ``idx`` of a gauge covers ``input_length`` input
    rows immediately followed by ``output_length`` output rows. It is indexed
    only when none of these rows has a missing value in ``streamflow`` or
    ``total_precipitation_sum``.

    Args:
        gauge_ids (list[str]): Gauge IDs to load via ``load_gauge_parquet``.
        base_dir (Path): Directory containing ``<gauge_id>.parquet`` files.
        static_file_path (Path): Path recorded verbatim (as a string) in every
            entry; it is not opened here.
        input_length (int): Number of rows in the input window. Must be > 0.
        output_length (int): Number of rows in the output window. Must be > 0.

    Returns:
        list[dict]: One entry per valid sequence with the keys ``file_path``,
        ``static_file_path``, ``gauge_id``, ``start_idx`` (row offset within the
        gauge's rows), ``end_idx`` (exclusive, ``start_idx + input_length +
        output_length``), ``input_end_date`` (date of the last input row) and
        ``valid_sequence`` (always True).

    Raises:
        ValueError: If ``input_length`` or ``output_length`` is not positive.
    """
    if input_length <= 0 or output_length <= 0:
        raise ValueError("input_length and output_length must be positive integers")

    # Load and prepare data
    start_time = time.time()
    valid_data = load_gauge_parquet(gauge_ids, base_dir)
    print(f"Loading data: {time.time() - start_time:.2f}s")

    # Create index entries
    index_start = time.time()
    all_index_entries = []
    total_seq_length = input_length + output_length
    cols_to_check = ["streamflow", "total_precipitation_sum"]

    # Group once by gauge_id
    basin_groups = valid_data.groupby("gauge_id")

    for gauge_id, basin_data in tqdm(basin_groups, desc="Processing basins"):
        if len(basin_data) < total_seq_length:
            continue

        # Create actual file path for this gauge
        ts_file_path = base_dir / f"{gauge_id}.parquet"
        dates = basin_data["date"].to_numpy()

        # True where every checked column has a value in that row.
        row_is_valid = basin_data[cols_to_check].notna().all(axis=1).to_numpy()

        # The input window [idx, idx + input_length) and the output window that
        # directly follows it are contiguous, so a start is valid exactly when
        # all `total_seq_length` rows from idx onwards are valid. (The previous
        # implementation padded the output mask at the front, which compared
        # start idx against the window at idx - input_length instead of
        # idx + input_length.)
        valid_before = np.concatenate(
            ([0], np.cumsum(row_is_valid, dtype=np.int64))
        )
        valid_in_window = (
            valid_before[total_seq_length:] - valid_before[:-total_seq_length]
        )
        # Has len(basin_data) - total_seq_length + 1 elements, so every start
        # found here leaves room for the complete sequence.
        valid_positions = np.flatnonzero(valid_in_window == total_seq_length)

        # Create index entries with actual file paths
        all_index_entries.extend(
            {
                "file_path": str(ts_file_path),
                "static_file_path": str(static_file_path),
                "gauge_id": gauge_id,
                "start_idx": int(idx),
                "end_idx": int(idx) + total_seq_length,
                "input_end_date": dates[idx + input_length - 1],
                "valid_sequence": True,
            }
            for idx in valid_positions
        )

    print(f"Index creation: {time.time() - index_start:.2f}s")
    print(f"Total time: {time.time() - start_time:.2f}s")
    print(f"Created {len(all_index_entries)} valid sequences")

    return all_index_entries


# Example usage: index every gauge in `basins` from the processed parquet folder.
data_folder = Path("/Users/cooper/Desktop/CaravanifyParquet/USA/test_preprocessing/USA/processed_data")
# Placeholder location; create_basin_index only records it as a string in each entry.
static_file = Path("/path/to/static_attributes.csv")
index_entries = create_basin_index(
    gauge_ids=basins,
    base_dir=data_folder,
    static_file_path=static_file,
)

In [None]:
# Columns read when the caller does not ask for a specific subset.
_DEFAULT_RANGE_COLUMNS = (
    "date",
    "snow_depth_water_equivalent_mean",
    "surface_net_solar_radiation_mean",
    "surface_net_thermal_radiation_mean",
    "potential_evaporation_sum_ERA5_LAND",
    "potential_evaporation_sum_FAO_PENMAN_MONTEITH",
    "temperature_2m_mean",
    "temperature_2m_min",
    "temperature_2m_max",
    "total_precipitation_sum",
)


def read_parquet_range(file_path, start_idx, end_idx, columns=None):
    """
    Read rows ``[start_idx, end_idx)`` of a parquet file into a DataFrame.

    Args:
        file_path: Path of the parquet file to read.
        start_idx: Offset of the first row to return.
        end_idx: Offset one past the last row to return.
        columns: Optional iterable of column names to read. When omitted, the
            default set in ``_DEFAULT_RANGE_COLUMNS`` is used, which is the
            column list this function previously hard-coded.

    Returns:
        pandas.DataFrame: The selected columns for the requested row range.
    """
    selected_columns = list(_DEFAULT_RANGE_COLUMNS if columns is None else columns)

    # Use memory mapping for large files. Note that the table is read first and
    # the row range is sliced out afterwards.
    table = pq.read_table(
        file_path,
        memory_map=True,
        columns=selected_columns,
    )
    sliced_table = table.slice(offset=start_idx, length=end_idx - start_idx)
    return sliced_table.to_pandas()


# Example usage: time random-access reads of indexed sequences.
# Cap the sample size at the number of available entries; random.sample raises
# ValueError when asked for more items than the population contains.
n_samples = min(2048, len(index_entries))
# Dedicated, seeded generator so repeated runs time the same set of reads
# without touching the global `random` state.
random_indices = random.Random(42).sample(range(len(index_entries)), n_samples)

start = time.time()
for i in tqdm(random_indices, desc="Reading ranges"):
    entry = index_entries[i]
    df = read_parquet_range(
        file_path=entry["file_path"],
        start_idx=entry["start_idx"],
        end_idx=entry["end_idx"],
    )
    # Drop the frame right away so only one range is held in memory at a time.
    del df
print(f"Time taken to read ranges: {time.time() - start:.5f}s")