In [12]:
import sys
from pathlib import Path

# Make the in-repo `src/` directory importable. The repository root is taken
# to be the parent of the kernel's current working directory.
project_root = Path.cwd().parent
src_path = project_root.joinpath("src")

# Prepend only when missing, so re-running this cell never stacks duplicates.
if sys.path.count(str(src_path)) == 0:
    sys.path.insert(0, str(src_path))
    print(f"Added {src_path} to Python path")


In [13]:
from hydro_forecasting.data.datamodule import HydroInMemoryDataModule 
from hydro_forecasting.preprocessing.grouped import GroupedPipeline
from sklearn.pipeline import Pipeline
from hydro_forecasting.preprocessing.standard_scale import StandardScaleTransformer
from hydro_forecasting.preprocessing.normalize import NormalizeTransformer
from hydro_forecasting.data.caravanify_parquet import (
    CaravanifyParquet,
    CaravanifyParquetConfig,
)

from hydro_forecasting.models.tide import LitTiDE, TiDEConfig
from hydro_forecasting.model_evaluation.hp_from_yaml import hp_from_yaml

from hydro_forecasting.model_evaluation.evaluators import TSForecastEvaluator

import polars as pl

---

In [14]:
# Absolute, machine-specific location of the TiDE hyperparameter YAML file.
yaml_path = Path("/Users/cooper/Desktop/hydro-forecasting/notebooks/tide.yaml")

# hp_from_yaml("tide", ...) yields the hyperparameter dict that is displayed as
# this cell's output; later cells index it and unpack it into TiDEConfig.
tide_hp = hp_from_yaml("tide", yaml_path)
# Bare last expression: display the loaded hyperparameters.
tide_hp

{'decoder_output_size': 24,
 'dropout': 0.4040330172235821,
 'future_forcing_projection_size': 0,
 'future_input_size': 9,
 'group_identifier': 'gauge_id',
 'hidden_size': 110,
 'input_len': 34,
 'input_size': 10,
 'learning_rate': 0.00029399848560567596,
 'num_decoder_layers': 2,
 'num_encoder_layers': 2,
 'output_len': 10,
 'past_feature_projection_size': 0,
 'scheduler_factor': 0.5,
 'scheduler_patience': 5,
 'static_size': 10,
 'temporal_decoder_hidden_size': 51,
 'use_layer_norm': False}

In [15]:
# Configuration for the "CA" region: gauge-id prefix first, then the three
# input directories, then the attribute-source switches.
config_ca = CaravanifyParquetConfig(
    gauge_id_prefix="CA",
    attributes_dir="/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes",
    timeseries_dir="/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/csv",
    shapefile_dir="/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/CA/post_processed/shapefiles",
    use_caravan_attributes=True,
    use_hydroatlas_attributes=True,
    use_other_attributes=True,
)
caravan_ca = CaravanifyParquet(config_ca)

# Start the basin list with the first ten CA gauge ids.
basin_ids = caravan_ca.get_all_gauge_ids()[:10]

# basin_ids = [bid for bid in basin_ids if bid != "CA_15030"]

# Load the selected CA stations into the CaravanifyParquet instance.
caravan_ca.load_stations(basin_ids)

In [16]:
# Configuration for the "USA" region; mirrors the CA configuration with the
# USA directories and gauge-id prefix.
config_us = CaravanifyParquetConfig(
    attributes_dir="/Users/cooper/Desktop/CaravanifyParquet/USA/post_processed/attributes",
    timeseries_dir="/Users/cooper/Desktop/CaravanifyParquet/USA/post_processed/timeseries/csv",
    shapefile_dir="/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/USA/post_processed/shapefiles",
    gauge_id_prefix="USA",
    use_hydroatlas_attributes=True,
    use_caravan_attributes=True,
    use_other_attributes=True,
)

caravan_us = CaravanifyParquet(config_us)

# Add the first ten USA gauge ids to the basin list.
# The list is rebuilt with an order-preserving de-duplication instead of an
# in-place `+=`, so re-running this cell is idempotent: the previous form
# appended the same ten ids again on every re-run.
# NOTE(review): unlike the CA cell, `load_stations` is not called for the USA
# instance — confirm that this is intentional.
us_basin_ids = caravan_us.get_all_gauge_ids()[:10]
basin_ids = list(dict.fromkeys([*basin_ids, *us_basin_ids]))

In [17]:
# Forcing columns handed to the datamodule and to the feature pipeline.
# Nine entries; the hyperparameters loaded earlier show future_input_size = 9.
forcing_features = [
    "snow_depth_water_equivalent_mean",
    "surface_net_solar_radiation_mean",
    "surface_net_thermal_radiation_mean",
    "potential_evaporation_sum_ERA5_LAND",
    "potential_evaporation_sum_FAO_PENMAN_MONTEITH",
    "temperature_2m_mean",
    "temperature_2m_min",
    "temperature_2m_max",
    "total_precipitation_sum",
]

# Static attribute columns handed to the datamodule and to the static pipeline.
# Ten entries; the hyperparameters loaded earlier show static_size = 10.
static_features = [
    "p_mean",
    "area",
    "ele_mt_sav",
    "high_prec_dur",
    "frac_snow",
    "high_prec_freq",
    "slp_dg_sav",
    "cly_pc_sav",
    "aridity_ERA5_LAND",
    "aridity_FAO_PM",
]

# Name of the target column passed to the datamodule as `target`.
target = "streamflow"

In [18]:
# Pipeline for the forcing columns: standard scaling followed by normalization,
# wrapped in GroupedPipeline with "gauge_id" as the group identifier
# (presumably fitted per gauge — confirm against GroupedPipeline).
feature_pipeline = GroupedPipeline(
    Pipeline(
        [("scaler", StandardScaleTransformer()), ("normalizer", NormalizeTransformer())]
    ),
    columns=forcing_features,
    group_identifier="gauge_id",
)

# Same two steps for the target column. The column list is derived from the
# notebook-level `target` variable rather than a repeated "streamflow" literal,
# so the pipeline cannot drift from the target handed to the datamodule.
target_pipeline = GroupedPipeline(
    Pipeline(
        [("scaler", StandardScaleTransformer()), ("normalizer", NormalizeTransformer())]
    ),
    columns=[target],
    group_identifier="gauge_id",
)

# Static attributes only get standard scaling, and no grouping wrapper.
static_pipeline = Pipeline([("scaler", StandardScaleTransformer())])

# Bundle of the three pipelines, passed to the datamodule as
# `preprocessing_configs`.
preprocessing_config = {
    "features": {"pipeline": feature_pipeline},
    "target": {"pipeline": target_pipeline},
    "static_features": {"pipeline": static_pipeline, "columns": static_features},
}

In [19]:
# Per-region input directories. The keys match the gauge-id prefixes ("CA",
# "USA") used in the CaravanifyParquet configurations.
region_time_series_base_dirs = {
    "CA": "/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/csv/CA",
    "USA": "/Users/cooper/Desktop/CaravanifyParquet/USA/post_processed/timeseries/csv/USA",
}
region_static_attributes_base_dirs = {
    "CA": "/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes/CA",
    "USA": "/Users/cooper/Desktop/CaravanifyParquet/USA/post_processed/attributes/USA",
}

# Chunked-loading settings forwarded to the datamodule as `chunk_size` and
# `recompute_every`.
DATA_CHUNK_SIZE = 10
RECOMPUTE_CHUNKS_EVERY = 10

datamodule = HydroInMemoryDataModule(
    # --- input locations and output directory for processed data ---
    region_time_series_base_dirs=region_time_series_base_dirs,
    region_static_attributes_base_dirs=region_static_attributes_base_dirs,
    path_to_preprocessing_output_directory="/Users/cooper/Desktop/hydro-forecasting/tests/yolo_6",
    # --- which basins and which columns ---
    list_of_gauge_ids_to_process=basin_ids,
    group_identifier="gauge_id",
    forcing_features=forcing_features,
    static_features=static_features,
    target=target,
    # --- window lengths come from the tuned TiDE hyperparameters ---
    input_length=tide_hp["input_len"],
    output_length=tide_hp["output_len"],
    is_autoregressive=True,
    # --- preprocessing and data-quality settings ---
    preprocessing_configs=preprocessing_config,
    min_train_years=5,
    max_imputation_gap_size=5,
    # --- train / validation / test proportions ---
    train_prop=0.5,
    val_prop=0.25,
    test_prop=0.25,
    # --- loading and batching ---
    batch_size=2048,
    num_workers=4,
    chunk_size=DATA_CHUNK_SIZE,
    recompute_every=RECOMPUTE_CHUNKS_EVERY,
    # load_engine='polars' # Optional: keep default or specify 'pyarrow'
)

# Run preprocessing (or reuse a previous run), then set the datamodule up.
datamodule.prepare_data()
datamodule.setup()

INFO:hydro_forecasting.data.datamodule:Starting data preparation...
INFO:hydro_forecasting.data.datamodule:Generated Run UUID for current config: a0aaa4c5-598c-52b9-a814-edd6bec93b02
INFO:hydro_forecasting.data.datamodule:Checking for existing processed data at: /Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/a0aaa4c5-598c-52b9-a814-edd6bec93b02
INFO:hydro_forecasting.data.datamodule:No reusable data found or reuse failed for UUID a0aaa4c5-598c-52b9-a814-edd6bec93b02. Reason: Run directory not found.. Running preprocessing...
INFO:hydro_forecasting.data.preprocessing:Config saved to /Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/a0aaa4c5-598c-52b9-a814-edd6bec93b02/config.json
INFO:hydro_forecasting.data.preprocessing:Processing static features...
INFO:hydro_forecasting.data.preprocessing:Static features saved to /Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/a0aaa4c5-598c-52b9-a814-edd6bec93b02/processed_static_features.parquet
INFO:hydro_forecasting.data.preprocessin

INFO: Processed 20 basins, 19 passed quality checks


INFO:hydro_forecasting.data.preprocessing:--- Finished Batch 1 ---
INFO:hydro_forecasting.data.preprocessing:Finished processing all time series batches. Attempted 20 basins.
INFO:hydro_forecasting.data.preprocessing:Fitted time series pipelines saved to /Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/a0aaa4c5-598c-52b9-a814-edd6bec93b02/fitted_time_series_pipelines.joblib
INFO:hydro_forecasting.data.preprocessing:Summary quality report saved to /Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/a0aaa4c5-598c-52b9-a814-edd6bec93b02/quality_summary.json
INFO:hydro_forecasting.data.preprocessing:SUCCESS: Preprocessing completed successfully. Output at /Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/a0aaa4c5-598c-52b9-a814-edd6bec93b02
INFO:hydro_forecasting.data.datamodule:Hydro processor completed successfully.
INFO:hydro_forecasting.data.datamodule:Loading fitted pipelines...
INFO:hydro_forecasting.data.datamodule:Successfully loaded 3 categories of fitted pipelines.
INFO:h

## Verify processed data (disabled sanity checks)

In [None]:
# Disabled debugging cell: pulls one sample from the test dataset and reports
# which of its tensors contain NaNs.
# NOTE(review): the code below uses `torch`, which is not imported in the
# visible import cells — add `import torch` before re-enabling it.
# test_dataset = datamodule.test_dataset
# if not test_dataset:
#     print("Test dataset not found.")
# elif len(test_dataset) == 0:
#     print("Test dataset is empty.")
# else:
#     print(f"Test dataset size: {len(test_dataset)}")
#     # --- Get a Sample ---
#     sample_index = 1654
#     print(f"Getting sample at index {sample_index}...")
#     try:
#         sample = test_dataset[sample_index]

#         # --- Check for NaNs in the Sample Tensors ---
#         print("\n--- Checking for NaNs in sample tensors ---")
#         for key, tensor in sample.items():
#             if isinstance(tensor, torch.Tensor):
#                 has_nan = torch.isnan(tensor).any().item()
#                 print(f"Tensor'{key}' shape: {tensor.shape}, Contains NaNs: {has_nan}")
#                 print(f"  Sample tensor '{key}': {tensor[:5]}")
#                 if has_nan:
#                     # Optional: Print where NaNs occur
#                     nan_indices = torch.nonzero(torch.isnan(tensor))
#                     print(f"  NaN indices in '{key}': {nan_indices.tolist()[:5]}...") # Print first 5
#             else:
#                 print(f"Item '{key}' is not a tensor (type: {type(tensor)})")

#     except IndexError:
#         print(f"Error: Index {sample_index} out of bounds for dataset size {len(test_dataset)}")
#     except Exception as e:
#         print(f"An error occurred while getting or checking the sample: {e}")

In [None]:
# Disabled debugging cell: looks up one validation index entry, reads its
# parquet file and shows the "streamflow" values of the indexed row range.
# NOTE(review): the code below uses `pd`, which is not imported in the visible
# import cells — add `import pandas as pd` before re-enabling it.
# ie = datamodule.val_index_entries[1661]

# file_path = ie["file_path"]
# start_idx = ie["start_idx"]
# end_idx = ie["end_idx"]
# gauge_id = ie["gauge_id"]

# data = pd.read_parquet(file_path)

# # Slice the data
# data_slice = data.iloc[start_idx:end_idx]
# print(f"Data slice shape: {data_slice.shape}")

# data_slice["streamflow"]

## Let's try training a model

In [20]:
# Window lengths as stored on the datamodule; `output_length` is reused by the
# evaluation cell to build the list of horizons.
input_length, output_length = datamodule.input_length, datamodule.output_length

# Build the TiDE configuration from the YAML hyperparameters and wrap it in
# the Lightning module.
config = TiDEConfig(**tide_hp)
model = LitTiDE(config)

In [21]:
# Import only the Trainer class. Binding the whole package to `pl` (as this
# cell did before) silently replaced the `polars as pl` alias created in the
# imports cell, so the meaning of `pl` depended on which cells had been run.
from pytorch_lightning import Trainer

# Short run of five epochs on a single GPU-type device.
trainer = Trainer(
    max_epochs=5,
    accelerator="gpu",
    devices=1,
)

# Train the model
# NOTE(review): the saved output of this cell ends in
# "ValueError: num_samples should be a positive integer value, but got
# num_samples=0" after the dataset logs "Found 0 valid samples" for both the
# validation and the training chunk — investigate the datamodule settings
# before relying on the cells that follow.
trainer.fit(model, datamodule)

Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
INFO:hydro_forecasting.data.datamodule:Data preparation has already run.

  | Name          | Type      | Params | Mode 
----------------------------------------------------
0 | mse_criterion | MSELoss   | 0      | train
1 | model         | TiDEModel | 250 K  | train
----------------------------------------------------
250 K     Trainable params
0         Non-trainable params
250 K     Total params
1.001     Total estimated model params size (MB)
40        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

INFO:hydro_forecasting.data.datamodule:Loading validation data for 19 basins...
INFO:hydro_forecasting.data.in_memory_dataset:Loading chunk for stage 'val' with 19 basins...
INFO:hydro_forecasting.data.in_memory_dataset:Chunk data loaded. Shape: (45866, 12). Memory usage: 2.10 MB
INFO:hydro_forecasting.data.in_memory_dataset:Precomputing index for in-memory chunk...
INFO:hydro_forecasting.data.in_memory_dataset:Chunk loading and indexing complete for stage 'val'. Processed 19, Skipped 0. Found 0 valid samples. Time: 0.21s
/Users/cooper/Desktop/hydro-forecasting/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


                                                  

/Users/cooper/Desktop/hydro-forecasting/.venv/lib/python3.12/site-packages/pytorch_lightning/utilities/data.py:106: Total length of `DataLoader` across ranks is zero. Please make sure this was your intention.
INFO:hydro_forecasting.data.datamodule:Epoch 0: Loading training chunk 1/2 with 10 basins.
INFO:hydro_forecasting.data.in_memory_dataset:Loading chunk for stage 'train' with 10 basins...
INFO:hydro_forecasting.data.in_memory_dataset:Chunk data loaded. Shape: (44200, 12). Memory usage: 2.02 MB
INFO:hydro_forecasting.data.in_memory_dataset:Precomputing index for in-memory chunk...
INFO:hydro_forecasting.data.in_memory_dataset:Chunk loading and indexing complete for stage 'train'. Processed 10, Skipped 0. Found 0 valid samples. Time: 0.12s




ValueError: num_samples should be a positive integer value, but got num_samples=0

In [None]:
# Display name -> (model, datamodule) pair handed to the evaluator.
models_and_datamodules = {"TiDE": (model, datamodule)}

# Evaluate every forecast step from 1 to `output_length`; the trainer settings
# request a single CPU device.
evaluator = TSForecastEvaluator(
    models_and_datamodules=models_and_datamodules,
    horizons=[*range(1, output_length + 1)],
    trainer_kwargs={"accelerator": "cpu", "devices": 1},
)

In [None]:
# Run the evaluator's test pass; later cells index the result by model name.
results = evaluator.test_models()

In [None]:
# Pull the "df" entry of the TiDE results; the plotting cell filters it by
# "basin_id" and reads its "date", "prediction" and "observed" columns.
df = results["TiDE"]["df"]

In [None]:
# Bare expression: render `df` as the cell output.
df

In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

# Basin to inspect, and the matching rows of the results frame.
basin_id = "CA_15013"
df_basin = df[df["basin_id"] == basin_id]

# Explicit figure/axes pair instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df_basin["date"], df_basin["prediction"], label="Prediction", color="blue")
ax.plot(df_basin["date"], df_basin["observed"], label="Observed", color="orange")

ax.set_title(f"Observed vs Prediction for {basin_id}")
ax.set_xlabel("Date")
ax.set_ylabel("Streamflow")
ax.legend()

# One major tick per month, labelled as year-month, with slanted labels.
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m"))
fig.autofmt_xdate()

sns.despine()
plt.show()