In [1]:
import sys
from pathlib import Path
import torch 
import pandas as pd

# Add src directory to Python path
project_root = Path.cwd().parent  
src_path = project_root / "src"
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))
    print(f"Added {src_path} to Python path")


Added /Users/cooper/Desktop/hydro-forecasting/src to Python path


In [2]:
from hydro_forecasting.data.lazy_datamodule import HydroLazyDataModule
from hydro_forecasting.preprocessing.grouped import GroupedPipeline
from sklearn.pipeline import Pipeline
from hydro_forecasting.preprocessing.standard_scale import StandardScaleTransformer
from hydro_forecasting.data.caravanify_parquet import (
    CaravanifyParquet,
    CaravanifyParquetConfig,
)

from hydro_forecasting.models.ealstm import LitEALSTM, EALSTMConfig
from hydro_forecasting.model_evaluation.evaluators import TSForecastEvaluator
from hydro_forecasting.model_evaluation.hp_from_yaml import hp_from_yaml


---

In [3]:
yaml_path = Path("/Users/cooper/Desktop/hydro-forecasting/notebooks/ealstm.yaml")

ealstm_hp = hp_from_yaml("ealstm", yaml_path)
ealstm_hp

{'bias': True,
 'bidirectional': True,
 'bidirectional_fusion': 'concat',
 'dropout': 0.09091248360355031,
 'future_hidden_size': 79,
 'future_input_size': 9,
 'future_layers': 3,
 'group_identifier': 'gauge_id',
 'hidden_size': 79,
 'input_len': 36,
 'input_size': 10,
 'learning_rate': 0.0008706020878304854,
 'num_layers': 3,
 'output_len': 10,
 'scheduler_factor': 0.5,
 'scheduler_patience': 5,
 'static_size': 10}

In [4]:
config_ca = CaravanifyParquetConfig(
    attributes_dir="/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes",
    timeseries_dir="/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/csv",
    shapefile_dir="/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/CA/post_processed/shapefiles",
    gauge_id_prefix="CA",
    use_hydroatlas_attributes=True,
    use_caravan_attributes=True,
    use_other_attributes=True,
)

caravan_ca = CaravanifyParquet(config_ca)
basin_ids = caravan_ca.get_all_gauge_ids()[:10]

# basin_ids = [bid for bid in basin_ids if bid != "CA_15030"]

caravan_ca.load_stations(basin_ids)

In [5]:
# config_us = CaravanifyParquetConfig(
#     attributes_dir="/Users/cooper/Desktop/CaravanifyParquet/USA/post_processed/attributes",
#     timeseries_dir="/Users/cooper/Desktop/CaravanifyParquet/USA/post_processed/timeseries/csv",
#     shapefile_dir="/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/USA/post_processed/shapefiles",
#     gauge_id_prefix="USA",
#     use_hydroatlas_attributes=True,
#     use_caravan_attributes=True,
#     use_other_attributes=True,
# )

# caravan_us = CaravanifyParquet(config_us)
# basin_ids += caravan_us.get_all_gauge_ids()[:10]

In [6]:
forcing_features = [
    "snow_depth_water_equivalent_mean",
    "surface_net_solar_radiation_mean",
    "surface_net_thermal_radiation_mean",
    "potential_evaporation_sum_ERA5_LAND",
    "potential_evaporation_sum_FAO_PENMAN_MONTEITH",
    "temperature_2m_mean",
    "temperature_2m_min",
    "temperature_2m_max",
    "total_precipitation_sum",
]

static_features = [
    # "gauge_id",
    "p_mean",
    "area",
    "ele_mt_sav",
    "high_prec_dur",
    "frac_snow",
    "high_prec_freq",
    "slp_dg_sav",
    "cly_pc_sav",
    "aridity_ERA5_LAND",
    "aridity_FAO_PM",
]

In [7]:
feature_pipeline = GroupedPipeline(
    Pipeline([("scaler", StandardScaleTransformer())]),
    columns=forcing_features,
    group_identifier="gauge_id",
)

target_pipeline = GroupedPipeline(
    Pipeline([("scaler", StandardScaleTransformer())]),
    columns=["streamflow"],
    group_identifier="gauge_id",
)

static_pipeline = Pipeline([("scaler", StandardScaleTransformer())])

preprocessing_config = {
    "features": {"pipeline": feature_pipeline},
    "target": {"pipeline": target_pipeline},
    "static_features": {"pipeline": static_pipeline, "columns": static_features},
}

In [8]:
region_time_series_base_dirs = {
    "CA": "/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/csv/CA",
    "USA": "/Users/cooper/Desktop/CaravanifyParquet/USA/post_processed/timeseries/csv/USA",
}

region_static_attributes_base_dirs = {
    "CA": "/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes/CA",
    "USA": "/Users/cooper/Desktop/CaravanifyParquet/USA/post_processed/attributes/USA",
}

datamodule = HydroLazyDataModule(
    region_time_series_base_dirs=region_time_series_base_dirs,
    region_static_attributes_base_dirs=region_static_attributes_base_dirs,
    path_to_preprocessing_output_directory="/Users/cooper/Desktop/hydro-forecasting/tests/yolo_6",
    group_identifier="gauge_id",
    batch_size=2048,
    input_length=ealstm_hp["input_len"],
    output_length=ealstm_hp["output_len"],
    forcing_features=forcing_features,
    static_features=static_features,
    target="streamflow",
    preprocessing_configs=preprocessing_config,
    num_workers=4,
    min_train_years=5,
    train_prop=0.5,
    val_prop=0.25,
    test_prop=0.25,
    max_imputation_gap_size=5,
    list_of_gauge_ids_to_process=basin_ids,
    is_autoregressive=True,
    files_per_batch=20,
)

datamodule.prepare_data()
datamodule.setup()



INFO: Attempting to load static attributes for 10 gauge IDs
INFO: Processing static attributes for region 'CA'
INFO: Loaded caravan attributes for 10 gauges in CA
INFO: Loaded hydroatlas attributes for 10 gauges in CA
INFO: Loaded other attributes for 10 gauges in CA
INFO: Horizontally merging 3 attribute files for region 'CA'
INFO: Vertically stacking attribute data from 1 regions
SUCCESS: Loaded and merged static attributes for 10 unique basins from 3 files across 1 regions.
INFO: Successfully loaded static attributes for 10 basins.

INFO: Loading time series data for pipeline fitting...
INFO: Loaded time series data for 10 basins
INFO: Split time series data into train (33798), val (16896), test (16910)
INFO: Fitted 3 pipelines



Processing basins: 100%|██████████| 10/10 [00:00<00:00, 12.53it/s]



INFO: Processing static attributes...
SUCCESS: Saved transformed static attributes for 9 basins to /Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/processed_static_data.parquet

SUCCESS: Completed processing 9 of 10 basins
INFO: Created training dataset with 32429 samples
INFO: Created validation dataset with 16435 samples
INFO: Created test dataset with 16249 samples


## Verify static data

In [17]:
test_dataset = datamodule.test_dataset
if not test_dataset:
    print("Test dataset not found.")
elif len(test_dataset) == 0:
    print("Test dataset is empty.")
else:
    print(f"Test dataset size: {len(test_dataset)}")
    # --- Get a Sample ---
    sample_index = 10
    print(f"Getting sample at index {sample_index}...")
    try:
        sample = test_dataset[sample_index]

        # --- Check for NaNs in the Sample Tensors ---
        print("\n--- Checking for NaNs in sample tensors ---")
        for key, tensor in sample.items():
            if isinstance(tensor, torch.Tensor):
                has_nan = torch.isnan(tensor).any().item()
                print(f"{type(tensor)} '{key}' shape: {tensor.shape}, Contains NaNs: {has_nan}")
                if has_nan:
                    # Optional: Print where NaNs occur
                    nan_indices = torch.nonzero(torch.isnan(tensor))
                    print(f"  NaN indices in '{key}': {nan_indices.tolist()[:5]}...") # Print first 5
            else:
                print(f"Item '{key}' is not a tensor (type: {type(tensor)})")

    except IndexError:
        print(f"Error: Index {sample_index} out of bounds for dataset size {len(test_dataset)}")
    except Exception as e:
        print(f"An error occurred while getting or checking the sample: {e}")

Test dataset size: 16249
Getting sample at index 10...

--- Checking for NaNs in sample tensors ---
<class 'torch.Tensor'> 'X' shape: torch.Size([36, 10]), Contains NaNs: False
<class 'torch.Tensor'> 'y' shape: torch.Size([10]), Contains NaNs: False
<class 'torch.Tensor'> 'static' shape: torch.Size([10]), Contains NaNs: False
<class 'torch.Tensor'> 'future' shape: torch.Size([10, 9]), Contains NaNs: False
Item 'gauge_id' is not a tensor (type: <class 'str'>)
Item 'input_end_date' is not a tensor (type: <class 'int'>)


## Verify time series data

In [10]:
path_to_ts = Path("/Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/processed_data")

# Verify that the "streamflow" column exists in the .parquet files and does not contain NaNs
def check_streamflow_in_parquet_files(path_to_ts):
    for file in path_to_ts.glob("*.parquet"):
        df = pd.read_parquet(file)
        if "streamflow" not in df.columns:
            print(f"'streamflow' column not found in {file}")
        else:
            if df["streamflow"].isnull().any():
                print(f"NaN values found in 'streamflow' column of {file}")
            else:
                print(f"'streamflow' column is valid in {file}")
check_streamflow_in_parquet_files(path_to_ts)

'streamflow' column is valid in /Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/processed_data/USA_01057000.parquet
NaN values found in 'streamflow' column of /Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/processed_data/CA_15034.parquet
'streamflow' column is valid in /Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/processed_data/USA_01022500.parquet
'streamflow' column is valid in /Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/processed_data/CA_15025.parquet
NaN values found in 'streamflow' column of /Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/processed_data/CA_15013.parquet
'streamflow' column is valid in /Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/processed_data/USA_01055000.parquet
'streamflow' column is valid in /Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/processed_data/USA_01073000.parquet
'streamflow' column is valid in /Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/processed_data/USA_01054200.parquet
NaN values found in 's

## Let's try training a model

In [11]:
input_length = datamodule.input_length
output_length = datamodule.output_length

config = EALSTMConfig(**ealstm_hp)


# Instantiate the Lightning module.
model = LitEALSTM(config)

In [12]:
import pytorch_lightning as pl

trainer = pl.Trainer(
    max_epochs=1,
    accelerator="gpu",
    devices=1,
)

# Train the model
trainer.fit(model, datamodule)

You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs




INFO: Attempting to load static attributes for 9 gauge IDs
INFO: Processing static attributes for region 'CA'
INFO: Loaded caravan attributes for 9 gauges in CA
INFO: Loaded hydroatlas attributes for 9 gauges in CA
INFO: Loaded other attributes for 9 gauges in CA
INFO: Horizontally merging 3 attribute files for region 'CA'
INFO: Vertically stacking attribute data from 1 regions
SUCCESS: Loaded and merged static attributes for 9 unique basins from 3 files across 1 regions.
INFO: Successfully loaded static attributes for 9 basins.

INFO: Loading time series data for pipeline fitting...
INFO: Loaded time series data for 9 basins
INFO: Split time series data into train (33153), val (16574), test (16586)
INFO: Fitted 3 pipelines



Processing basins: 100%|██████████| 9/9 [00:00<00:00, 11.50it/s]



INFO: Processing static attributes...
SUCCESS: Saved transformed static attributes for 9 basins to /Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/processed_static_data.parquet

SUCCESS: Completed processing 9 of 9 basins



  | Name          | Type     | Params | Mode 
---------------------------------------------------
0 | mse_criterion | MSELoss  | 0      | train
1 | model         | BiEALSTM | 191 K  | train
---------------------------------------------------
191 K     Trainable params
0         Non-trainable params
191 K     Total params
0.767     Total estimated model params size (MB)
58        Modules in train mode
0         Modules in eval mode


INFO: Created training dataset with 32429 samples
INFO: Created validation dataset with 16435 samples
                                                                           

/Users/cooper/Desktop/hydro-forecasting/.venv/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:310: The number of training batches (16) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 0: 100%|██████████| 16/16 [00:07<00:00,  2.15it/s, v_num=43]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 16/16 [00:07<00:00,  2.14it/s, v_num=43]


In [13]:
models_and_datamodules = {
    "EALSTM": (model, datamodule),
}

evaluator = TSForecastEvaluator(
    horizons=list(range(1, output_length + 1)),
    models_and_datamodules=models_and_datamodules,
    trainer_kwargs={
        "accelerator": "cpu",
        "devices": 1,
    },
)

In [14]:
results = evaluator.test_models()

You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Testing EALSTM...


INFO: Attempting to load static attributes for 9 gauge IDs
INFO: Processing static attributes for region 'CA'
INFO: Loaded caravan attributes for 9 gauges in CA
INFO: Loaded hydroatlas attributes for 9 gauges in CA
INFO: Loaded other attributes for 9 gauges in CA
INFO: Horizontally merging 3 attribute files for region 'CA'
INFO: Vertically stacking attribute data from 1 regions
SUCCESS: Loaded and merged static attributes for 9 unique basins from 3 files across 1 regions.
INFO: Successfully loaded static attributes for 9 basins.

INFO: Loading time series data for pipeline fitting...
INFO: Loaded time series data for 9 basins
INFO: Split time series data into train (33153), val (16574), test (16586)
INFO: Fitted 3 pipelines



/Users/cooper/Desktop/hydro-forecasting/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
Processing basins: 100%|██████████| 9/9 [00:00<00:00, 11.17it/s]



INFO: Processing static attributes...
SUCCESS: Saved transformed static attributes for 9 basins to /Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/processed_static_data.parquet

SUCCESS: Completed processing 9 of 9 basins
INFO: Created test dataset with 16249 samples
Testing DataLoader 0: 100%|██████████| 8/8 [00:02<00:00,  3.52it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss                   nan
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
DEBUG: Raw predictions contain NaNs: True
DEBUG: Raw observations contain NaNs: True
Evaluating results with shape: preds=(16249, 10), obs=(16249, 10), basin_ids=(16249,)


In [18]:
results["EALSTM"]["df"]

Unnamed: 0,horizon,prediction,observed,basin_id,date
0,1,,8.000000,CA_15013,2017-05-28
1,2,,7.500000,CA_15013,2017-05-29
2,3,,7.500000,CA_15013,2017-05-30
3,4,,6.559999,CA_15013,2017-05-31
4,5,,6.559999,CA_15013,2017-06-01
...,...,...,...,...,...
162485,6,,0.860000,CA_15044,2022-12-27
162486,7,,0.860000,CA_15044,2022-12-28
162487,8,,0.860000,CA_15044,2022-12-29
162488,9,,0.860000,CA_15044,2022-12-30
