In [1]:
# Shell escape that rebuilds the JupyterLab frontend; its result is not assigned to any variable.
# NOTE(review): the saved output shows "Build failed" — this looks like environment maintenance
# rather than part of the experiment; consider removing the cell before sharing the notebook.
!jupyter lab build

[LabBuildApp] JupyterLab 4.0.8
[LabBuildApp] Building in /usr/local/share/jupyter/lab
[LabBuildApp] ERROR | Build failed.
Troubleshooting: If the build failed due to an out-of-memory error, you
may be able to fix it by disabling the `dev_build` and/or `minimize` options.

If you are building via the `jupyter lab build` command, you can disable
these options like so:

jupyter lab build --dev-build=False --minimize=False

You can also disable these options for all JupyterLab builds by adding these
lines to a Jupyter config file named `jupyter_config.py`:

c.LabBuildApp.minimize = False
c.LabBuildApp.dev_build = False

If you don't already have a `jupyter_config.py` file, you can create one by
adding a blank file of that name to any of the Jupyter config directories.
The config directories can be listed by running:

jupyter --paths

Explanation:

- `dev-build`: This option controls whether a `dev` or a more streamlined
`production` build is used. This option will default to `False` (i.e.,

In [2]:
import sys
from pathlib import Path

# Make the project's `src` directory importable from this notebook.
# The project root is taken to be two levels above the current working directory.
project_root = Path.cwd().parent.parent
src_path = project_root.joinpath("src")
if sys.path.count(str(src_path)) == 0:
    # Prepend so that the checkout's modules are found first
    sys.path.insert(0, str(src_path))
    print(f"Added {src_path} to Python path")

In [3]:
import polars as pl
import torch
from sklearn.pipeline import Pipeline
from hydro_forecasting.data.caravanify_parquet import (
    CaravanifyParquet,
    CaravanifyParquetConfig,
)
from hydro_forecasting.preprocessing.grouped import GroupedPipeline
from hydro_forecasting.preprocessing.normalize import NormalizeTransformer
from hydro_forecasting.preprocessing.standard_scale import StandardScaleTransformer
from hydro_forecasting.experiment_utils.train_model_from_scratch import train_model_from_scratch

  warn(


---

## Experiment constants

In [4]:
# Region identifiers. Each one is interpolated into the CaravanifyParquet
# attribute / time-series directory paths and is also passed as the
# gauge-ID prefix when basin IDs are loaded.
REGIONS = [
    "CL",
    "CH",
    "USA",
    "camelsaus",
    "camelsgb",
    "camelsbr",
    "hysets",
    "lamah",
]

# Country tag: selects the sub-directory of model YAML files and is appended
# to the experiment name.
COUNTRY = "tajikistan"




## Loading the basin IDs (gauge_ids)

In [5]:
def load_basin_ids(
    regions,
    human_influence_path: str,
    human_influence_categories: list,
    data_root: str = "/workspace/CaravanifyParquet",
):
    """
    Collect gauge (basin) IDs from several regions, optionally filtered by human influence.

    For every region a ``CaravanifyParquet`` instance is created from the region's
    post-processed attribute and time-series directories, all of its gauge IDs are
    fetched, and (when a human influence file is available) only the IDs belonging
    to the requested categories are kept.

    Args:
        regions: Iterable of region names to process. Each name is used both as a
            directory component under ``data_root`` and as the gauge-ID prefix.
        human_influence_path: Path to the human influence classification file.
            ``None`` or an empty string disables filtering. A path that does not
            exist also disables filtering, and a warning is printed.
        human_influence_categories: Categories of human influence to keep.
        data_root: Directory that contains one sub-directory per region.

    Returns:
        Sorted list of unique basin IDs across all regions.
    """
    # Path(None) raises TypeError and Path("") silently becomes the current
    # directory (which exists), so decide whether a path was actually supplied
    # *before* converting it.
    influence_path = Path(human_influence_path) if human_influence_path else None
    apply_filter = influence_path is not None and influence_path.exists()
    if influence_path is not None and not apply_filter:
        print(f"Warning: human influence file not found, basin IDs will not be filtered: {influence_path}")

    unique_ids = set()

    for region in regions:
        print(f"Loading basin IDs for region: {region}")
        region_dir = f"{data_root}/{region}/post_processed"

        config = CaravanifyParquetConfig(
            attributes_dir=f"{region_dir}/attributes",
            timeseries_dir=f"{region_dir}/timeseries/csv",
            # NOTE(review): this is None when no path was supplied — confirm the config accepts that.
            human_influence_path=influence_path,
            gauge_id_prefix=region,
            use_hydroatlas_attributes=True,
            use_caravan_attributes=True,
            use_other_attributes=True,
        )

        caravan = CaravanifyParquet(config)
        region_basin_ids = caravan.get_all_gauge_ids()

        if apply_filter:
            # The second element of the returned pair is not needed here.
            region_basin_ids, _ = caravan.filter_gauge_ids_by_human_influence(
                region_basin_ids, human_influence_categories
            )

        unique_ids.update(region_basin_ids)

    # Return unique, sorted basin IDs
    return sorted(unique_ids)


# Gather basin IDs across all REGIONS, keeping only gauges classified as
# "Low" or "Medium" human influence.
# NOTE(review): the saved output reports 0 of 135 CH gauges passing the filter —
# verify that CH gauge IDs are present in the classification file.
basin_ids = load_basin_ids(
    REGIONS,
    human_influence_path="/workspace/hydro-forecasting/scripts/human_influence_index/results/human_influence_classification.parquet",
    human_influence_categories=["Low", "Medium"],
)

Loading basin IDs for region: CL
Original gauge_ids: 505
Filtered gauge_ids: 487
Loading basin IDs for region: CH
Original gauge_ids: 135
Filtered gauge_ids: 0
No gauge_ids matched the specified human influence categories.
Loading basin IDs for region: USA
Original gauge_ids: 671
Filtered gauge_ids: 567
Loading basin IDs for region: camelsaus
Original gauge_ids: 222
Filtered gauge_ids: 215
Loading basin IDs for region: camelsgb
Original gauge_ids: 671
Filtered gauge_ids: 538
Loading basin IDs for region: camelsbr
Original gauge_ids: 870
Filtered gauge_ids: 868
Loading basin IDs for region: hysets
Original gauge_ids: 12162
Filtered gauge_ids: 8463
Loading basin IDs for region: lamah
Original gauge_ids: 859
Filtered gauge_ids: 812


## Datamodule Configs

In [6]:
# Region name -> directory of that region's post-processed time series.
# NOTE(review): the path component is named "csv" although the loader is the parquet
# variant — confirm the directory actually holds the expected file format.
region_time_series_base_dirs = {
    region: f"/workspace/CaravanifyParquet/{region}/post_processed/timeseries/csv/{region}"
    for region in REGIONS
}

# Region name -> directory of that region's post-processed static attributes.
region_static_attributes_base_dirs = {
    region: f"/workspace/CaravanifyParquet/{region}/post_processed/attributes/{region}" for region in REGIONS
}

# Passed to the datamodule as "path_to_preprocessing_output_directory".
path_to_preprocessing_output_directory = "/workspace/hydro-forecasting/experiments/low-medium-hii/data_cache"

In [7]:
# Column names handed to the datamodule as "forcing_features"; the same list
# selects the columns transformed by the feature pipeline.
forcing_features = [
    "snow_depth_water_equivalent_mean",
    "surface_net_solar_radiation_mean",
    "surface_net_thermal_radiation_mean",
    "potential_evaporation_sum_ERA5_LAND",
    "potential_evaporation_sum_FAO_PENMAN_MONTEITH",
    "temperature_2m_mean",
    "temperature_2m_min",
    "temperature_2m_max",
    "total_precipitation_sum",
]

# Column names handed to the datamodule as "static_features"; also listed as
# the columns of the static-feature preprocessing step.
static_features = [
    "p_mean",
    "area",
    "ele_mt_sav",
    "high_prec_dur",
    "frac_snow",
    "high_prec_freq",
    "slp_dg_sav",
    "cly_pc_sav",
    "aridity_ERA5_LAND",
    "aridity_FAO_PM",
]

# Column name handed to the datamodule as "target".
target = "streamflow"

In [8]:
# Forcing features: standard-scale then normalize, wrapped in a GroupedPipeline keyed on "gauge_id".
# NOTE(review): presumably this fits the inner pipeline separately per gauge — confirm against GroupedPipeline.
feature_pipeline = GroupedPipeline(
    Pipeline([("scaler", StandardScaleTransformer()), ("normalizer", NormalizeTransformer())]),
    columns=forcing_features,
    group_identifier="gauge_id",
)

# Target: same two steps with its own Pipeline instance. The column comes from
# `target` (rather than a repeated "streamflow" literal) so the pipeline always
# transforms the column the datamodule is told to predict.
target_pipeline = GroupedPipeline(
    Pipeline([("scaler", StandardScaleTransformer()), ("normalizer", NormalizeTransformer())]),
    columns=[target],
    group_identifier="gauge_id",
)

# Static attributes: a plain (ungrouped) pipeline with only the scaling step.
static_pipeline = Pipeline([("scaler", StandardScaleTransformer())])

# Only the static entry carries an explicit "columns" list; the grouped
# pipelines above receive their columns at construction time.
preprocessing_config = {
    "features": {"pipeline": feature_pipeline},
    "target": {"pipeline": target_pipeline},
    "static_features": {"pipeline": static_pipeline, "columns": static_features},
}

In [9]:
# Keyword configuration forwarded to train_model_from_scratch as `datamodule_config`.
datamodule_config = {
    # Per-region input directories and the preprocessing cache location defined above.
    "region_time_series_base_dirs": region_time_series_base_dirs,
    "region_static_attributes_base_dirs": region_static_attributes_base_dirs,
    "path_to_preprocessing_output_directory": path_to_preprocessing_output_directory,
    # Same identifier column the grouped preprocessing pipelines use.
    "group_identifier": "gauge_id",
    "batch_size": 2048,
    "forcing_features": forcing_features,
    "static_features": static_features,
    "target": target,
    "num_workers": 10,
    "min_train_years": 10,
    # Train / validation / test proportions; the three values sum to 1.0.
    "train_prop": 0.5,
    "val_prop": 0.25,
    "test_prop": 0.25,
    "max_imputation_gap_size": 5,
    # NOTE(review): the saved training log reports train chunks of 2000 basins
    # (4 chunks from 6473 basins), which does not match 1500 — the stored output
    # appears to come from a run with a different chunk_size; re-run to confirm.
    "chunk_size": 1500,
    "validation_chunk_size": 4000,
    "is_autoregressive": True,
    "preprocessing_configs": preprocessing_config,
}

## Training Configs

In [10]:
# Trainer-level settings forwarded to train_model_from_scratch as `training_config`.
training_config = {
    "max_epochs": 300,
    # Use the GPU when torch can see one, otherwise fall back to CPU.
    "accelerator": "cuda" if torch.cuda.is_available() else "cpu",
    "devices": 1,
    "early_stopping_patience": 30,
    # NOTE(review): the saved log contains a Lightning warning that reloading dataloaders
    # every epoch together with pin_memory=True / persistent_workers=True can be unstable;
    # the log also shows a different training chunk being used each epoch, so this value
    # is presumably required for chunk rotation — confirm before changing either setting.
    "reload_dataloaders_every_n_epochs": 1,
}

## Output, model, and run configs

In [11]:
# Root directory under which the experiment's outputs are written.
output_dir = "/workspace/hydro-forecasting/experiments/low-medium-hii"

# Model architectures to train, in training order.
model_types = ["tide", "ealstm", "tsmixer", "tft"]

# One YAML file per model type, in the same order as `model_types`.
# Derived from `model_types` so the two lists cannot drift apart in order or length.
yaml_paths = [
    f"/workspace/hydro-forecasting/experiments/yaml-files/{COUNTRY}/{model_type}.yaml" for model_type in model_types
]

experiment_name = f"low-medium-hii_{COUNTRY}"
num_runs = 1
override_previous_attempts = False

## Training the models from scratch

In [None]:
# Train every model in `model_types` on the filtered basin IDs using the
# configurations assembled above; the return value is kept in `train_results`.
# NOTE(review): this cell has no execution count and the visible saved log stops
# mid-training, so the stored output appears to come from an interrupted or
# still-running execution — re-run to completion before relying on it.
train_results = train_model_from_scratch(
    gauge_ids=basin_ids,
    datamodule_config=datamodule_config,
    training_config=training_config,
    output_dir=output_dir,
    model_types=model_types,
    yaml_paths=yaml_paths,
    experiment_name=experiment_name,
    num_runs=num_runs,
    override_previous_attempts=override_previous_attempts,
)

INFO:hydro_forecasting.experiment_utils.train_model_from_scratch:Starting training for experiment 'low-medium-hii_tajikistan'
INFO:hydro_forecasting.experiment_utils.train_model_from_scratch:Output directory: /workspace/hydro-forecasting/experiments/low-medium-hii/low-medium-hii_tajikistan
INFO:hydro_forecasting.experiment_utils.train_model_from_scratch:Models to train: tide, ealstm, tsmixer, tft
INFO:hydro_forecasting.experiment_utils.train_model_from_scratch:Number of runs per model: 1
INFO:hydro_forecasting.experiment_utils.train_model_from_scratch:Processing model (1/4): tide
INFO:hydro_forecasting.data.in_memory_datamodule:Starting data preparation...
INFO:hydro_forecasting.data.in_memory_datamodule:Generated Run UUID for current config: e82931a3-45c7-56ef-a2a8-4f74f03e347d
INFO:hydro_forecasting.data.in_memory_datamodule:Checking for existing processed data at: /workspace/hydro-forecasting/experiments/low-medium-hii/data_cache/e82931a3-45c7-56ef-a2a8-4f74f03e347d
INFO:hydro_forec

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 0: Val Dataloader using cached validation data with 12472112 samples from 4000 basins.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:432: The combination of `DataLoader(`pin_memory=True`, `persistent_workers=True`) and `Trainer(reload_dataloaders_every_n_epochs > 0)` can lead to instability due to limitations in PyTorch (https://github.com/pytorch/pytorch/issues/91252). We recommend setting `pin_memory=False` in this case.
INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 0: Train Dataloader using chunk 1/4 with 2000 basins.
INFO:hydro_forecasting.data.in_memory_datamodule:Stage 'train' chunk data loaded for 2000 basins. Shape: (12900815, 12). Est. Mem: 590.58 MB


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 0.161
INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 1: Train Dataloader using chunk 2/4 with 2000 basins.
INFO:hydro_forecasting.data.in_memory_datamodule:Stage 'train' chunk data loaded for 2000 basins. Shape: (12937903, 12). Est. Mem: 592.28 MB
INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 1: Val Dataloader using cached validation data with 12472112 samples from 4000 basins.


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.083 >= min_delta = 0.0. New best score: 0.078
INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 2: Train Dataloader using chunk 3/4 with 2000 basins.
INFO:hydro_forecasting.data.in_memory_datamodule:Stage 'train' chunk data loaded for 2000 basins. Shape: (12962450, 12). Est. Mem: 593.40 MB
INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 2: Val Dataloader using cached validation data with 12472112 samples from 4000 basins.


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.004 >= min_delta = 0.0. New best score: 0.074
INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 3: Train Dataloader using chunk 4/4 with 473 basins.
INFO:hydro_forecasting.data.in_memory_datamodule:Stage 'train' chunk data loaded for 473 basins. Shape: (3023514, 12). Est. Mem: 138.41 MB
INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 3: Val Dataloader using cached validation data with 12472112 samples from 4000 basins.


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:hydro_forecasting.data.in_memory_datamodule:Completed full pass through training shared chunks. Recomputing.
INFO:hydro_forecasting.data.in_memory_datamodule:Initializing/Re-initializing training shared chunks from 6473 basins.
INFO:hydro_forecasting.data.in_memory_datamodule:Created 4 training shared chunks.
INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 4: Train Dataloader using chunk 1/4 with 2000 basins.
INFO:hydro_forecasting.data.in_memory_datamodule:Stage 'train' chunk data loaded for 2000 basins. Shape: (12984509, 12). Est. Mem: 594.41 MB
INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 4: Val Dataloader using cached validation data with 12472112 samples from 4000 basins.


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.010 >= min_delta = 0.0. New best score: 0.064
INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 5: Train Dataloader using chunk 2/4 with 2000 basins.
INFO:hydro_forecasting.data.in_memory_datamodule:Stage 'train' chunk data loaded for 2000 basins. Shape: (12852909, 12). Est. Mem: 588.39 MB
INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 5: Val Dataloader using cached validation data with 12472112 samples from 4000 basins.


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.064
INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 6: Train Dataloader using chunk 3/4 with 2000 basins.
INFO:hydro_forecasting.data.in_memory_datamodule:Stage 'train' chunk data loaded for 2000 basins. Shape: (12996859, 12). Est. Mem: 594.98 MB
INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 6: Val Dataloader using cached validation data with 12472112 samples from 4000 basins.


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.005 >= min_delta = 0.0. New best score: 0.058
INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 7: Train Dataloader using chunk 4/4 with 473 basins.
INFO:hydro_forecasting.data.in_memory_datamodule:Stage 'train' chunk data loaded for 473 basins. Shape: (2990405, 12). Est. Mem: 136.90 MB
INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 7: Val Dataloader using cached validation data with 12472112 samples from 4000 basins.


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:hydro_forecasting.data.in_memory_datamodule:Completed full pass through training shared chunks. Recomputing.
INFO:hydro_forecasting.data.in_memory_datamodule:Initializing/Re-initializing training shared chunks from 6473 basins.
INFO:hydro_forecasting.data.in_memory_datamodule:Created 4 training shared chunks.
INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 8: Train Dataloader using chunk 1/4 with 2000 basins.
INFO:hydro_forecasting.data.in_memory_datamodule:Stage 'train' chunk data loaded for 2000 basins. Shape: (12933175, 12). Est. Mem: 592.06 MB
INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 8: Val Dataloader using cached validation data with 12472112 samples from 4000 basins.


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 9: Train Dataloader using chunk 2/4 with 2000 basins.
INFO:hydro_forecasting.data.in_memory_datamodule:Stage 'train' chunk data loaded for 2000 basins. Shape: (12859267, 12). Est. Mem: 588.68 MB
INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 9: Val Dataloader using cached validation data with 12472112 samples from 4000 basins.


Validation: |          | 0/? [00:00<?, ?it/s]