In [1]:
import sys
from pathlib import Path
import torch 
import pandas as pd

# Make the project's `src` directory importable from this notebook.
# The project root is taken to be the parent of the current working directory.
project_root = Path.cwd().parent
src_path = project_root.joinpath("src")
src_entry = str(src_path)
already_importable = src_entry in sys.path
if not already_importable:
    # Prepend so this entry is searched before the existing sys.path entries.
    sys.path.insert(0, src_entry)
    print(f"Added {src_path} to Python path")


Added /Users/cooper/Desktop/hydro-forecasting/src to Python path


In [2]:
from hydro_forecasting.data.lazy_datamodule import HydroLazyDataModule
from hydro_forecasting.preprocessing.grouped import GroupedPipeline
from sklearn.pipeline import Pipeline
from hydro_forecasting.preprocessing.standard_scale import StandardScaleTransformer
from hydro_forecasting.data.caravanify_parquet import (
    CaravanifyParquet,
    CaravanifyParquetConfig,
)

from hydro_forecasting.models.ealstm import LitEALSTM, EALSTMConfig
from hydro_forecasting.model_evaluation.evaluators import TSForecastEvaluator
from hydro_forecasting.model_evaluation.hp_from_yaml import hp_from_yaml


---

In [3]:
# Load the "ealstm" hyperparameters from the YAML file in the project's
# `notebooks` directory. The path is built from `project_root` (set in the
# first cell from the current working directory) rather than a user-specific
# absolute path, so the notebook is not tied to one machine's home directory.
yaml_path = project_root / "notebooks" / "ealstm.yaml"

ealstm_hp = hp_from_yaml("ealstm", yaml_path)
# Bare last expression: display the loaded hyperparameters.
ealstm_hp

{'bias': True,
 'bidirectional': True,
 'bidirectional_fusion': 'concat',
 'dropout': 0.09091248360355031,
 'future_hidden_size': 79,
 'future_input_size': 9,
 'future_layers': 3,
 'group_identifier': 'gauge_id',
 'hidden_size': 79,
 'input_len': 36,
 'input_size': 10,
 'learning_rate': 0.0008706020878304854,
 'num_layers': 3,
 'output_len': 10,
 'scheduler_factor': 0.5,
 'scheduler_patience': 5,
 'static_size': 10}

In [4]:
# Settings for the Caravan data source with the "CA" gauge-id prefix; all
# three attribute groups (hydroatlas, caravan, other) are switched on.
ca_source_settings = {
    "attributes_dir": "/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes",
    "timeseries_dir": "/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/csv",
    "shapefile_dir": "/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/CA/post_processed/shapefiles",
    "gauge_id_prefix": "CA",
    "use_hydroatlas_attributes": True,
    "use_caravan_attributes": True,
    "use_other_attributes": True,
}
config_ca = CaravanifyParquetConfig(**ca_source_settings)
caravan_ca = CaravanifyParquet(config_ca)

# Restrict the run to the first 10 gauge ids returned by the data source.
all_ca_gauge_ids = caravan_ca.get_all_gauge_ids()
basin_ids = all_ca_gauge_ids[:10]

# Optional filter, currently disabled:
# basin_ids = [bid for bid in basin_ids if bid != "CA_15030"]

caravan_ca.load_stations(basin_ids)

In [5]:
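# NOTE: The USA region is currently disabled. Uncommenting this cell appends the first 10 USA gauge ids to `basin_ids`.
# config_us = CaravanifyParquetConfig(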
#     attributes_dir="/Users/cooper/Desktop/CaravanifyParquet/USA/post_processed/attributes",
#     timeseries_dir="/Users/cooper/Desktop/CaravanifyParquet/USA/post_processed/timeseries/csv",
#     shapefile_dir="/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/USA/post_processed/shapefiles",
#     gauge_id_prefix="USA",
#     use_hydroatlas_attributes=True,
#     use_caravan_attributes=True,
#     use_other_attributes=True,
# )

# caravan_us = CaravanifyParquet(config_us)
# basin_ids += caravan_us.get_all_gauge_ids()[:10]

In [6]:
# Dynamic input columns handed to the datamodule as `forcing_features` and
# scaled by the feature pipeline. The list has 9 entries.
# NOTE(review): 9 matches 'future_input_size' in the loaded hyperparameters;
# presumably the two must stay in sync — confirm against the model code.
forcing_features = [
    "snow_depth_water_equivalent_mean",
    "surface_net_solar_radiation_mean",
    "surface_net_thermal_radiation_mean",
    "potential_evaporation_sum_ERA5_LAND",
    "potential_evaporation_sum_FAO_PENMAN_MONTEITH",
    "temperature_2m_mean",
    "temperature_2m_min",
    "temperature_2m_max",
    "total_precipitation_sum",
]

# Static attribute columns handed to the datamodule as `static_features` and
# to the static preprocessing pipeline. The list has 10 entries.
# NOTE(review): 10 matches 'static_size' in the loaded hyperparameters;
# presumably the two must stay in sync — confirm against the model code.
static_features = [
    # "gauge_id",
    "p_mean",
    "area",
    "ele_mt_sav",
    "high_prec_dur",
    "frac_snow",
    "high_prec_freq",
    "slp_dg_sav",
    "cly_pc_sav",
    "aridity_ERA5_LAND",
    "aridity_FAO_PM",
]

In [7]:
def _scaler_pipeline():
    """Return a new one-step sklearn Pipeline holding a StandardScaleTransformer."""
    return Pipeline([("scaler", StandardScaleTransformer())])


def _grouped_scaler(columns):
    """Wrap a new scaler pipeline in a GroupedPipeline over ``columns``, grouped by "gauge_id"."""
    return GroupedPipeline(
        _scaler_pipeline(),
        columns=columns,
        group_identifier="gauge_id",
    )


# Forcing features and the streamflow target each get their own grouped
# scaler; static attributes use a plain (ungrouped) scaler pipeline.
feature_pipeline = _grouped_scaler(forcing_features)
target_pipeline = _grouped_scaler(["streamflow"])
static_pipeline = _scaler_pipeline()

# Configuration passed to the datamodule as `preprocessing_configs`.
preprocessing_config = {
    "features": {"pipeline": feature_pipeline},
    "target": {"pipeline": target_pipeline},
    "static_features": {"pipeline": static_pipeline, "columns": static_features},
}

In [8]:
# Region code -> directory holding that region's time series files.
# Both "CA" and "USA" are listed even though `basin_ids` currently contains
# only gauges taken from the CA source.
region_time_series_base_dirs = {
    "CA": "/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/csv/CA",
    "USA": "/Users/cooper/Desktop/CaravanifyParquet/USA/post_processed/timeseries/csv/USA",
}

# Region code -> directory holding that region's static attribute files.
region_static_attributes_base_dirs = {
    "CA": "/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes/CA",
    "USA": "/Users/cooper/Desktop/CaravanifyParquet/USA/post_processed/attributes/USA",
}

# Build the datamodule. The input/output window lengths come from the loaded
# hyperparameters so the data windows match what the model is configured for.
# The train/val/test proportions given here sum to 1.0 (0.5 + 0.25 + 0.25).
# NOTE(review): the exact meaning of `min_train_years`,
# `max_imputation_gap_size`, `is_autoregressive` and `files_per_batch` is
# defined by HydroLazyDataModule, which is not visible here — confirm there.
datamodule = HydroLazyDataModule(
    region_time_series_base_dirs=region_time_series_base_dirs,
    region_static_attributes_base_dirs=region_static_attributes_base_dirs,
    path_to_preprocessing_output_directory="/Users/cooper/Desktop/hydro-forecasting/tests/yolo_6",
    group_identifier="gauge_id",
    batch_size=2048,
    input_length=ealstm_hp["input_len"],
    output_length=ealstm_hp["output_len"],
    forcing_features=forcing_features,
    static_features=static_features,
    target="streamflow",
    preprocessing_configs=preprocessing_config,
    num_workers=4,
    min_train_years=5,
    train_prop=0.5,
    val_prop=0.25,
    test_prop=0.25,
    max_imputation_gap_size=5,
    list_of_gauge_ids_to_process=basin_ids,
    is_autoregressive=True,
    files_per_batch=20,
)

# Run preprocessing and dataset creation explicitly so the datasets can be
# inspected in the following cells before any trainer is involved.
datamodule.prepare_data()
datamodule.setup()



INFO: Attempting to load static attributes for 10 gauge IDs
INFO: Processing static attributes for region 'CA'
INFO: Loaded caravan attributes for 10 gauges in CA
INFO: Loaded hydroatlas attributes for 10 gauges in CA
INFO: Loaded other attributes for 10 gauges in CA
INFO: Horizontally merging 3 attribute files for region 'CA'
INFO: Vertically stacking attribute data from 1 regions
SUCCESS: Loaded and merged static attributes for 10 unique basins from 3 files across 1 regions.
INFO: Successfully loaded static attributes for 10 basins.

INFO: Loading time series data for pipeline fitting...
INFO: Loaded time series data for 10 basins
INFO: Split time series data into train (33798), val (16896), test (16910)
INFO: Fitted 3 pipelines



Processing basins: 100%|██████████| 10/10 [00:00<00:00, 11.93it/s]



INFO: Processing static attributes...
SUCCESS: Saved transformed static attributes for 9 basins to /Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/processed_static_data.parquet

SUCCESS: Completed processing 9 of 10 basins
INFO: Created training dataset with 32429 samples
INFO: Created validation dataset with 16435 samples
INFO: Created test dataset with 16249 samples


## Verify a test sample (NaN check on the X, y, and static tensors)

In [9]:
# Inspect one sample of the test split: for every tensor in the sample, print
# its shape, a short preview, and whether (and where) it contains NaNs.
test_dataset = datamodule.test_dataset
# Compare against None explicitly. `not test_dataset` would also be True for
# an existing dataset whose __len__ is 0, which would report an empty dataset
# as "not found" and make the "empty" branch below unreachable.
if test_dataset is None:
    print("Test dataset not found.")
elif len(test_dataset) == 0:
    print("Test dataset is empty.")
else:
    print(f"Test dataset size: {len(test_dataset)}")
    # --- Get a Sample ---
    # NOTE(review): the index is hard-coded; an out-of-range value is reported
    # by the IndexError handler below rather than raised.
    sample_index = 1654
    print(f"Getting sample at index {sample_index}...")
    try:
        sample = test_dataset[sample_index]

        # --- Check for NaNs in the Sample Tensors ---
        print("\n--- Checking for NaNs in sample tensors ---")
        for key, tensor in sample.items():
            if not isinstance(tensor, torch.Tensor):
                print(f"Item '{key}' is not a tensor (type: {type(tensor)})")
                continue

            # Compute the NaN mask once and reuse it for the flag and the indices.
            nan_mask = torch.isnan(tensor)
            has_nan = nan_mask.any().item()
            print(f"Tensor'{key}' shape: {tensor.shape}, Contains NaNs: {has_nan}")
            print(f"  Sample tensor '{key}': {tensor[:5]}")
            if has_nan:
                # Optional: Print where NaNs occur
                nan_indices = torch.nonzero(nan_mask)
                print(f"  NaN indices in '{key}': {nan_indices.tolist()[:5]}...") # Print first 5

    except IndexError:
        print(f"Error: Index {sample_index} out of bounds for dataset size {len(test_dataset)}")
    except Exception as e:
        # Best-effort diagnostic cell: report the problem instead of aborting.
        print(f"An error occurred while getting or checking the sample: {e}")

Test dataset size: 16249
Getting sample at index 1654...

--- Checking for NaNs in sample tensors ---
Tensor'X' shape: torch.Size([36, 10]), Contains NaNs: False
  Sample tensor 'X': tensor([[-0.6782, -0.8469, -0.8102,  0.5967, -0.8725,  0.9365, -0.4239, -0.4371,
         -0.2749, -0.5279],
        [-0.6950, -0.8764, -0.8102,  0.6090, -0.9294,  1.1605, -0.4695, -0.4624,
         -0.4882,  0.8459],
        [-0.7062, -0.9207, -0.9600,  0.7004, -1.0301,  0.4864, -1.0457, -0.8347,
         -1.2916,  0.4341],
        [-0.7229, -0.8519, -0.8851,  0.7068, -0.8755, -0.4745, -1.2536, -1.4113,
         -1.3374, -0.6357],
        [-0.7341, -0.9059, -0.8768,  0.7066, -0.8907, -0.1430, -1.0498, -1.4568,
         -1.5299, -0.6357]])
Tensor'y' shape: torch.Size([10]), Contains NaNs: False
  Sample tensor 'y': tensor([-0.7006, -0.6894, -0.6782, -0.6671, -0.6726])
Tensor'static' shape: torch.Size([10]), Contains NaNs: False
  Sample tensor 'static': tensor([-0.2210,  2.7662,  2.5964,  0.7564, -0.9938])

## Let's try training a model

In [10]:
# Window lengths as configured on the datamodule.
input_length, output_length = datamodule.input_length, datamodule.output_length

# Build the model configuration from the loaded hyperparameters, then wrap it
# in the Lightning module.
config = EALSTMConfig(**ealstm_hp)
model = LitEALSTM(config)

In [11]:
import pytorch_lightning as pl

# Trainer settings: a single epoch on one GPU device.
trainer_options = {
    "max_epochs": 1,
    "accelerator": "gpu",
    "devices": 1,
}
trainer = pl.Trainer(**trainer_options)

# Fit the model on the data provided by the datamodule.
trainer.fit(model, datamodule)

You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


INFO: prepare_data() has already been run; skipping.



  | Name          | Type     | Params | Mode 
---------------------------------------------------
0 | mse_criterion | MSELoss  | 0      | train
1 | model         | BiEALSTM | 191 K  | train
---------------------------------------------------
191 K     Trainable params
0         Non-trainable params
191 K     Total params
0.767     Total estimated model params size (MB)
58        Modules in train mode
0         Modules in eval mode


INFO: Created training dataset with 32429 samples
INFO: Created validation dataset with 16435 samples
Sanity Checking: |          | 0/? [00:00<?, ?it/s]NaNs found in target tensor y for index 1652
NaNs found in target tensor y for index 1653
NaNs found in target tensor y for index 1654
NaNs found in target tensor y for index 1655
NaNs found in target tensor y for index 1656
NaNs found in target tensor y for index 1657
NaNs found in target tensor y for index 1658
NaNs found in target tensor y for index 1659
NaNs found in target tensor y for index 1660
NaNs found in target tensor y for index 1661
NaNs found in target tensor y for index 10075                              
NaNs found in target tensor y for index 10076
NaNs found in target tensor y for index 10077
NaNs found in target tensor y for index 10078
NaNs found in target tensor y for index 10079
NaNs found in target tensor y for index 10080
NaNs found in target tensor y for index 10081
NaNs found in target tensor y for index 10082



Detected KeyboardInterrupt, attempting graceful shutdown ...
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/cooper/.local/share/uv/python/cpython-3.12.8-macos-aarch64-none/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/cooper/.local/share/uv/python/cpython-3.12.8-macos-aarch64-none/lib/python3.12/multiprocessing/spawn.py", line 132, in _main
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/cooper/.local/share/uv/python/cpython-3.12.8-macos-aarch64-none/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/cooper/Desktop/hydro-forecasting/src/hydro_forecasting/data/lazy_datamodule.py", line 3, in <module>
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^

RuntimeError: Please call `iter(combined_loader)` first.

In [None]:
# Forecast steps to evaluate: 1 through `output_length`, inclusive.
forecast_horizons = list(range(1, output_length + 1))

# Model name -> (model, datamodule) pair handed to the evaluator.
models_and_datamodules = {"EALSTM": (model, datamodule)}

# The evaluator is given CPU trainer settings (one device).
evaluator = TSForecastEvaluator(
    horizons=forecast_horizons,
    models_and_datamodules=models_and_datamodules,
    trainer_kwargs={"accelerator": "cpu", "devices": 1},
)

In [None]:
# Run the evaluator's test pass and keep what it returns; the next cell
# indexes the result by model name.
results = evaluator.test_models()

In [None]:
# Display the "df" entry of the EALSTM results (presumably a DataFrame of
# evaluation metrics — confirm against TSForecastEvaluator).
results["EALSTM"]["df"]