In [1]:
import sys
from pathlib import Path

# Make the project's "src" directory importable from this notebook.
# The project root is taken to be the parent of the current working directory.
project_root = Path.cwd().parent
src_path = project_root.joinpath("src")
# Only prepend (and report) when the entry is not already on the path, so
# re-running this cell does not stack duplicate entries.
if sys.path.count(str(src_path)) == 0:
    sys.path.insert(0, str(src_path))
    print(f"Added {src_path} to Python path")


Added /Users/cooper/Desktop/hydro-forecasting/src to Python path


In [2]:
from hydro_forecasting.data.lazy_datamodule import HydroLazyDataModule
from hydro_forecasting.preprocessing.grouped import GroupedPipeline
from sklearn.pipeline import Pipeline
from hydro_forecasting.preprocessing.standard_scale import StandardScaleTransformer
from hydro_forecasting.data.caravanify_parquet import (
    CaravanifyParquet,
    CaravanifyParquetConfig,
)

from hydro_forecasting.models.tide import LitTiDE, TiDEConfig


---

In [3]:
# Describe where the "CA" dataset lives on disk and which attribute tables to load.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# not run elsewhere without editing them.
config = CaravanifyParquetConfig(
    gauge_id_prefix="CA",
    # Data locations
    timeseries_dir="/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/csv",
    attributes_dir="/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes",
    shapefile_dir="/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/CA/post_processed/shapefiles",
    # Attribute sources to include
    use_caravan_attributes=True,
    use_hydroatlas_attributes=True,
    use_other_attributes=True,
)

caravan = CaravanifyParquet(config)

# Every gauge id known to the loader except CA_15030.
# The reason for excluding CA_15030 is not recorded in this notebook — TODO document.
basin_ids = [
    gauge_id for gauge_id in caravan.get_all_gauge_ids() if gauge_id != "CA_15030"
]

In [4]:
# Static (per-basin) attribute columns handed to the datamodule and the model.
# "gauge_id" is intentionally not part of this list (it was disabled in an
# earlier revision of this cell).
static_features = [
    "p_mean",
    "area",
    "ele_mt_sav",
    "high_prec_dur",
    "frac_snow",
    "high_prec_freq",
    "slp_dg_sav",
    "cly_pc_sav",
    "aridity_ERA5_LAND",
    "aridity_FAO_PM",
]

# Time-varying forcing columns used as dynamic model inputs.
forcing_features = [
    # snow and radiation
    "snow_depth_water_equivalent_mean",
    "surface_net_solar_radiation_mean",
    "surface_net_thermal_radiation_mean",
    # potential evaporation (two estimates)
    "potential_evaporation_sum_ERA5_LAND",
    "potential_evaporation_sum_FAO_PENMAN_MONTEITH",
    # 2 m air temperature
    "temperature_2m_mean",
    "temperature_2m_min",
    "temperature_2m_max",
    # precipitation
    "total_precipitation_sum",
]

In [5]:
def _gauge_grouped_scaler(columns):
    """Return a GroupedPipeline that standard-scales ``columns``, grouped by "gauge_id".

    A fresh sklearn Pipeline and StandardScaleTransformer are created on every
    call, so the returned pipelines do not share any fitted state.
    """
    # presumably GroupedPipeline fits the wrapped pipeline once per gauge —
    # confirm against the GroupedPipeline implementation
    return GroupedPipeline(
        Pipeline([("scaler", StandardScaleTransformer())]),
        columns=columns,
        group_identifier="gauge_id",
    )


# Forcings and the target share the same grouped scaling recipe.
feature_pipeline = _gauge_grouped_scaler(forcing_features)
target_pipeline = _gauge_grouped_scaler(["streamflow"])

# Static attributes go through a plain (ungrouped) scaling pipeline.
static_pipeline = Pipeline([("scaler", StandardScaleTransformer())])

# Mapping handed to the datamodule: one entry per data kind.
preprocessing_config = {
    "features": {"pipeline": feature_pipeline},
    "target": {"pipeline": target_pipeline},
    "static_features": {"pipeline": static_pipeline, "columns": static_features},
}

In [6]:
# Build the lazy datamodule for the CA basins selected above.
# NOTE(review): the three directory arguments are absolute, machine-specific paths.
datamodule = HydroLazyDataModule(
    # --- where data is read from / written to ---
    path_to_time_series_directory="/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/csv/CA",
    path_to_static_attributes_directory="/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes/CA",
    path_to_preprocessing_output_directory="/Users/cooper/Desktop/hydro-forecasting/tests/yolo_6",
    # --- which basins and columns ---
    list_of_gauge_ids_to_process=basin_ids,
    group_identifier="gauge_id",
    forcing_features=forcing_features,
    static_features=static_features,
    target="streamflow",
    # --- preprocessing ---
    preprocessing_configs=preprocessing_config,
    max_imputation_gap_size=5,
    min_train_years=5,
    # --- split proportions (0.5 + 0.25 + 0.25 = 1.0) ---
    train_prop=0.5,
    val_prop=0.25,
    test_prop=0.25,
    # --- sequence window and loading ---
    input_length=70,
    output_length=10,
    is_autoregressive=True,
    batch_size=2048,
    num_workers=4,
)

## Let's try training a model

In [7]:
# Take the window sizes from the datamodule so the model and data stay in sync.
input_length, output_length = datamodule.input_length, datamodule.output_length

# NOTE(review): this rebinds `config`, which earlier in the notebook refers to
# the CaravanifyParquetConfig instance; after this cell it is a TiDEConfig.
config = TiDEConfig(
    # sequence geometry
    input_len=input_length,
    output_len=output_length,
    # input widths, derived from the feature lists
    input_size=len(forcing_features),
    future_input_size=len(forcing_features),
    static_size=len(static_features),
    # network size
    hidden_size=16,
    num_encoder_layers=2,
    num_decoder_layers=2,
    decoder_output_size=16,
    temporal_decoder_hidden_size=16,
    past_feature_projection_size=4,
    future_forcing_projection_size=4,
    # regularisation and optimisation
    use_layer_norm=True,
    dropout=0.1,
    learning_rate=1e-3,
)

# Wrap the configuration in the Lightning module that will be trained.
model = LitTiDE(config)

In [8]:
import pytorch_lightning as pl

# One-epoch run to check that the model and datamodule work together end to end.
# accelerator="auto" lets Lightning pick the available hardware (GPU/MPS when
# present, otherwise CPU); a hard-coded "gpu" errors out on machines without one.
trainer = pl.Trainer(
    max_epochs=1,
    accelerator="auto",
    devices=1,
)

# Train the model
trainer.fit(model, datamodule)

You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/cooper/Desktop/hydro-forecasting/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


Found 77 basin files
Loading caravan attributes from /Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes/CA/attributes_caravan_CA.parquet
Loading hydroatlas attributes from /Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes/CA/attributes_hydroatlas_CA.parquet
Loading other attributes from /Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes/CA/attributes_other_CA.parquet
Merging 3 attribute DataFrames
Loaded static attributes for 77 basins
Fitting preprocessing pipelines on all basins...


Loading basin data for fitting: 100%|██████████| 78/78 [00:00<00:00, 273.33it/s]


Found following uniquwe columns in sample data: ['date', 'snow_depth_water_equivalent_mean', 'surface_net_solar_radiation_mean', 'surface_net_thermal_radiation_mean', 'surface_pressure_mean', 'temperature_2m_mean', 'dewpoint_temperature_2m_mean', 'u_component_of_wind_10m_mean', 'v_component_of_wind_10m_mean', 'volumetric_soil_water_layer_1_mean', 'volumetric_soil_water_layer_2_mean', 'volumetric_soil_water_layer_3_mean', 'volumetric_soil_water_layer_4_mean', 'snow_depth_water_equivalent_min', 'surface_net_solar_radiation_min', 'surface_net_thermal_radiation_min', 'surface_pressure_min', 'temperature_2m_min', 'dewpoint_temperature_2m_min', 'u_component_of_wind_10m_min', 'v_component_of_wind_10m_min', 'volumetric_soil_water_layer_1_min', 'volumetric_soil_water_layer_2_min', 'volumetric_soil_water_layer_3_min', 'volumetric_soil_water_layer_4_min', 'snow_depth_water_equivalent_max', 'surface_net_solar_radiation_max', 'surface_net_thermal_radiation_max', 'surface_pressure_max', 'temperature

Processing basins: 100%|██████████| 77/77 [00:02<00:00, 38.33it/s]


Processing static attributes...
Saved transformed static attributes for 77 basins to /Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/processed_static_data/static_attributes.parquet
Processing complete. 77 basins retained out of 77.



  | Name          | Type      | Params | Mode 
----------------------------------------------------
0 | mse_criterion | MSELoss   | 0      | train
1 | model         | TiDEModel | 22.5 K | train
----------------------------------------------------
22.5 K    Trainable params
0         Non-trainable params
22.5 K    Total params
0.090     Total estimated model params size (MB)
61        Modules in train mode
0         Modules in eval mode


Created training dataset with 251719 samples
Created validation dataset with 131725 samples
Epoch 0: 100%|██████████| 123/123 [00:29<00:00,  4.21it/s, v_num=1]        

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 123/123 [00:29<00:00,  4.20it/s, v_num=1]


In [9]:
# Preview only the first few index entries. The training split alone reports
# hundreds of thousands of samples, so displaying the whole list would flood
# the notebook output and bloat the saved file.
datamodule.index_entries[:3]

[{'file_path': '/Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/processed_data/CA_15013.parquet',
  'gauge_id': 'CA_15013',
  'start_idx': np.int64(70),
  'end_idx': np.int64(150),
  'input_end_date': np.datetime64('2000-05-20T00:00:00.000000000'),
  'valid_sequence': True,
  'stage': 'train',
  'static_file_path': PosixPath('/Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/processed_static_data/static_attributes.parquet')},
 {'file_path': '/Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/processed_data/CA_15013.parquet',
  'gauge_id': 'CA_15013',
  'start_idx': np.int64(71),
  'end_idx': np.int64(151),
  'input_end_date': np.datetime64('2000-05-21T00:00:00.000000000'),
  'valid_sequence': True,
  'stage': 'train',
  'static_file_path': PosixPath('/Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/processed_static_data/static_attributes.parquet')},
 {'file_path': '/Users/cooper/Desktop/hydro-forecasting/tests/yolo_6/processed_data/CA_15013.parquet',
  'gauge_id': 'CA_150