In [1]:
# !jupyter lab build

In [2]:
import sys
from pathlib import Path

# Add src directory to Python path
project_root = Path.cwd().parent.parent
src_path = project_root / "src"
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))
    print(f"Added {src_path} to Python path")

Added /Users/cooper/Desktop/hydro-forecasting/src to Python path


In [3]:
from sklearn.pipeline import Pipeline

from hydro_forecasting.data.caravanify_parquet import (
    CaravanifyParquet,
    CaravanifyParquetConfig,
)
from hydro_forecasting.experiment_utils.finetune_pretrained_model import finetune_pretrained_models
from hydro_forecasting.preprocessing.grouped import GroupedPipeline
from hydro_forecasting.preprocessing.normalize import NormalizeTransformer
from hydro_forecasting.preprocessing.standard_scale import StandardScaleTransformer

---

## Experiment constants

In [4]:
REGIONS = [
    "CA"
]

COUNTRY = "tajikistan"

## Loading the data (as gauge_ids)

In [5]:
def load_basin_ids(country: str) -> list[str]:
    """
    Function to load basins for a given country in Central Asia
    """
    # Make country lowercase and make the first letter uppercase
    country = country.lower()
    country = country.capitalize()

    if country != "Tajikistan" and country != "Kyrgyzstan":
        print("Country not supported")
        return []

    configs = CaravanifyParquetConfig(
        attributes_dir="/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes",
        timeseries_dir="/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/csv",
        gauge_id_prefix="CA",
        use_hydroatlas_attributes=True,
        use_caravan_attributes=True,
        use_other_attributes=True,
    )

    caravan = CaravanifyParquet(configs)
    ca_basins = caravan.get_all_gauge_ids()
    caravan.load_stations(ca_basins)
    static_data = caravan.get_static_attributes()

    return list(static_data[static_data["country"] == country]["gauge_id"].unique())

basin_ids = load_basin_ids(COUNTRY)

## Datamodule Configs

In [6]:
region_time_series_base_dirs = {
    region: f"/Users/cooper/Desktop/CaravanifyParquet/{region}/post_processed/timeseries/csv/{region}"
    for region in REGIONS
}

region_static_attributes_base_dirs = {
    region: f"/Users/cooper/Desktop/CaravanifyParquet/{region}/post_processed/attributes/{region}" for region in REGIONS
}

path_to_preprocessing_output_directory = (
    "/Users/cooper/Desktop/hydro-forecasting/experiments/finetune/data_cache/{COUNTRY}"
)

In [7]:
forcing_features = [
    "snow_depth_water_equivalent_mean",
    "surface_net_solar_radiation_mean",
    "surface_net_thermal_radiation_mean",
    "potential_evaporation_sum_ERA5_LAND",
    "potential_evaporation_sum_FAO_PENMAN_MONTEITH",
    "temperature_2m_mean",
    "temperature_2m_min",
    "temperature_2m_max",
    "total_precipitation_sum",
]

static_features = [
    "p_mean",
    "area",
    "ele_mt_sav",
    "high_prec_dur",
    "frac_snow",
    "high_prec_freq",
    "slp_dg_sav",
    "cly_pc_sav",
    "aridity_ERA5_LAND",
    "aridity_FAO_PM",
]

target = "streamflow"

In [8]:
feature_pipeline = GroupedPipeline(
    Pipeline([("scaler", StandardScaleTransformer()), ("normalizer", NormalizeTransformer())]),
    columns=forcing_features,
    group_identifier="gauge_id",
)

target_pipeline = GroupedPipeline(
    Pipeline([("scaler", StandardScaleTransformer()), ("normalizer", NormalizeTransformer())]),
    columns=["streamflow"],
    group_identifier="gauge_id",
)

static_pipeline = Pipeline([("scaler", StandardScaleTransformer())])

preprocessing_config = {
    "features": {"pipeline": feature_pipeline},
    "target": {"pipeline": target_pipeline},
    "static_features": {"pipeline": static_pipeline, "columns": static_features},
}

In [9]:
datamodule_config = {
    "region_time_series_base_dirs": region_time_series_base_dirs,
    "region_static_attributes_base_dirs": region_static_attributes_base_dirs,
    "path_to_preprocessing_output_directory": path_to_preprocessing_output_directory,
    "group_identifier": "gauge_id",
    "batch_size": 2048,
    "forcing_features": forcing_features,
    "static_features": static_features,
    "target": target,
    "num_workers": 4,
    "min_train_years": 5,
    "train_prop": 0.5,
    "val_prop": 0.25,
    "test_prop": 0.25,
    "max_imputation_gap_size": 5,
    "chunk_size": 100,
    "validation_chunk_size": 100,
    "is_autoregressive": True,
    "preprocessing_configs": preprocessing_config,
}

## Training Configs

In [10]:
training_config = {
    "max_epochs": 200,
    "accelerator": "mps",
    "devices": 1,
    "early_stopping_patience": 15,
    "reload_dataloaders_every_n_epochs": False,
}

## Remaining Configs

In [11]:
output_dir = "/Users/cooper/Desktop/hydro-forecasting/experiments/finetune"
pretrained_checkpoint_dir = (
    "/Users/cooper/Desktop/hydro-forecasting/experiments/low-medium-hii/low-medium-hii_tajikistan/checkpoints"
)

model_types = ["tide", "ealstm", "tsmixer"]
yaml_paths = [
    f"/Users/cooper/Desktop/hydro-forecasting/experiments/yaml-files/{COUNTRY}/tide.yaml",
    f"/Users/cooper/Desktop/hydro-forecasting/experiments/yaml-files/{COUNTRY}/ealstm.yaml",
    f"/Users/cooper/Desktop/hydro-forecasting/experiments/yaml-files/{COUNTRY}/tsmixer.yaml",
    # f"/Users/cooper/Desktop/hydro-forecasting/experiments/yaml-files/{COUNTRY}/tft.yaml",
]
experiment_name = f"finetune_{COUNTRY}"
num_runs = 1
override_previous_attempts = False

## Training the models from scratch

In [12]:
train_results = finetune_pretrained_models(
    gauge_ids=basin_ids,
    pretrained_checkpoint_dir=pretrained_checkpoint_dir,
    datamodule_config=datamodule_config,
    training_config=training_config,
    output_dir=output_dir,
    model_types=model_types,
    pretrained_yaml_paths=yaml_paths,
    experiment_name=experiment_name,
    num_runs=num_runs,
    override_previous_attempts=override_previous_attempts,
    lr_reduction_factor=10,
    select_best_from_pretrained=True
)

INFO:hydro_forecasting.experiment_utils.finetune_pretrained_model:Found pre-trained checkpoint for tide: /Users/cooper/Desktop/hydro-forecasting/experiments/low-medium-hii/low-medium-hii_tajikistan/checkpoints/tide/run_0/attempt_0/tide-run0-attempt_0-epoch=65-val_loss=0.0504.ckpt
INFO:hydro_forecasting.experiment_utils.finetune_pretrained_model:Found pre-trained checkpoint for ealstm: /Users/cooper/Desktop/hydro-forecasting/experiments/low-medium-hii/low-medium-hii_tajikistan/checkpoints/ealstm/run_0/attempt_0/ealstm-run0-attempt_0-epoch=141-val_loss=0.0349.ckpt
INFO:hydro_forecasting.experiment_utils.finetune_pretrained_model:Found pre-trained checkpoint for tsmixer: /Users/cooper/Desktop/hydro-forecasting/experiments/low-medium-hii/low-medium-hii_tajikistan/checkpoints/tsmixer/run_0/attempt_0/tsmixer-run0-attempt_0-epoch=135-val_loss=0.0388.ckpt
INFO:hydro_forecasting.experiment_utils.training_runner:Starting experiment 'finetune_tajikistan'
INFO:hydro_forecasting.experiment_utils.tr

INFO: Processed 16 basins, 16 passed quality checks


INFO:hydro_forecasting.data.preprocessing:--- Finished Batch 1 ---
INFO:hydro_forecasting.data.preprocessing:Finished processing all time series batches. Attempted 16 basins.
INFO:hydro_forecasting.data.preprocessing:Fitted time series pipelines saved to /Users/cooper/Desktop/hydro-forecasting/experiments/finetune/data_cache/{COUNTRY}/c7d32095-72fe-5f6e-9136-b4705bc8c2d3/fitted_time_series_pipelines.joblib
INFO:hydro_forecasting.data.preprocessing:Summary quality report saved to /Users/cooper/Desktop/hydro-forecasting/experiments/finetune/data_cache/{COUNTRY}/c7d32095-72fe-5f6e-9136-b4705bc8c2d3/quality_summary.json
INFO:hydro_forecasting.data.preprocessing:SUCCESS: Preprocessing completed successfully. Output at /Users/cooper/Desktop/hydro-forecasting/experiments/finetune/data_cache/{COUNTRY}/c7d32095-72fe-5f6e-9136-b4705bc8c2d3
INFO:hydro_forecasting.data.in_memory_datamodule:Hydro processor completed successfully.
INFO:hydro_forecasting.data.in_memory_datamodule:Successfully loaded 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 0: Val Dataloader using cached validation data with 24877 samples from 16 basins.


                                                                           

INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 0: Train Dataloader using chunk 1/1 with 16 basins.




INFO:hydro_forecasting.data.in_memory_datamodule:Stage 'train' chunk data loaded for 16 basins. Shape: (53215, 12). Est. Mem: 2.44 MB
/Users/cooper/Desktop/hydro-forecasting/.venv/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:310: The number of training batches (26) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 0: 100%|██████████| 26/26 [00:01<00:00, 14.77it/s, v_num=pt_0]

Metric val_loss improved. New best score: 0.034


Epoch 1: 100%|██████████| 26/26 [00:01<00:00, 24.59it/s, v_num=pt_0]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.033


Epoch 2: 100%|██████████| 26/26 [00:01<00:00, 22.34it/s, v_num=pt_0]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.033


Epoch 5: 100%|██████████| 26/26 [00:01<00:00, 19.48it/s, v_num=pt_0]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.032


Epoch 6: 100%|██████████| 26/26 [00:01<00:00, 22.72it/s, v_num=pt_0]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.032


Epoch 10: 100%|██████████| 26/26 [00:01<00:00, 19.80it/s, v_num=pt_0]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.032


Epoch 25: 100%|██████████| 26/26 [00:01<00:00, 24.63it/s, v_num=pt_0]

Monitored metric val_loss did not improve in the last 15 records. Best score: 0.032. Signaling Trainer to stop.


Epoch 25: 100%|██████████| 26/26 [00:01<00:00, 23.82it/s, v_num=pt_0]


INFO:hydro_forecasting.experiment_utils.training_runner:Run 0 completed with best val_loss: 0.03197910264134407
INFO:hydro_forecasting.experiment_utils.checkpoint_manager:Updated overall_best_model_info.txt at /Users/cooper/Desktop/hydro-forecasting/experiments/finetune/finetune_tajikistan/checkpoints/tide to point to: run_0/attempt_0/tide-run0-attempt_0-epoch=10-val_loss=0.0320.ckpt
INFO:hydro_forecasting.experiment_utils.training_runner:Overall best model for tide: run_0/attempt_0/tide-run0-attempt_0-epoch=10-val_loss=0.0320.ckpt
INFO:hydro_forecasting.experiment_utils.training_runner:Best metrics: {'val_loss': 0.03197910264134407, 'run_index': 0, 'seed': 42}
INFO:hydro_forecasting.experiment_utils.training_runner:Processing model (2/3): ealstm
INFO:hydro_forecasting.experiment_utils.training_runner:Processing model: ealstm
INFO:hydro_forecasting.data.in_memory_datamodule:Starting data preparation...
INFO:hydro_forecasting.data.in_memory_datamodule:Generated Run UUID for current conf

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 0: Val Dataloader using cached validation data with 25325 samples from 16 basins.


                                                                           

INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 0: Train Dataloader using chunk 1/1 with 16 basins.




INFO:hydro_forecasting.data.in_memory_datamodule:Stage 'train' chunk data loaded for 16 basins. Shape: (53215, 12). Est. Mem: 2.44 MB


Epoch 0: 100%|██████████| 26/26 [00:08<00:00,  3.04it/s, v_num=pt_0]

Metric val_loss improved. New best score: 0.028


Epoch 1: 100%|██████████| 26/26 [00:07<00:00,  3.28it/s, v_num=pt_0]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.027


Epoch 2: 100%|██████████| 26/26 [00:07<00:00,  3.27it/s, v_num=pt_0]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.027


Epoch 3: 100%|██████████| 26/26 [00:07<00:00,  3.28it/s, v_num=pt_0]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.027


Epoch 18: 100%|██████████| 26/26 [00:07<00:00,  3.30it/s, v_num=pt_0]

Monitored metric val_loss did not improve in the last 15 records. Best score: 0.027. Signaling Trainer to stop.


Epoch 18: 100%|██████████| 26/26 [00:07<00:00,  3.29it/s, v_num=pt_0]


INFO:hydro_forecasting.experiment_utils.training_runner:Run 0 completed with best val_loss: 0.026931406930088997
INFO:hydro_forecasting.experiment_utils.checkpoint_manager:Updated overall_best_model_info.txt at /Users/cooper/Desktop/hydro-forecasting/experiments/finetune/finetune_tajikistan/checkpoints/ealstm to point to: run_0/attempt_0/ealstm-run0-attempt_0-epoch=03-val_loss=0.0269.ckpt
INFO:hydro_forecasting.experiment_utils.training_runner:Overall best model for ealstm: run_0/attempt_0/ealstm-run0-attempt_0-epoch=03-val_loss=0.0269.ckpt
INFO:hydro_forecasting.experiment_utils.training_runner:Best metrics: {'val_loss': 0.026931406930088997, 'run_index': 0, 'seed': 42}
INFO:hydro_forecasting.experiment_utils.training_runner:Processing model (3/3): tsmixer
INFO:hydro_forecasting.experiment_utils.training_runner:Processing model: tsmixer
INFO:hydro_forecasting.data.in_memory_datamodule:Starting data preparation...
INFO:hydro_forecasting.data.in_memory_datamodule:Generated Run UUID for 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 0: Val Dataloader using cached validation data with 24477 samples from 16 basins.


                                                                           

INFO:hydro_forecasting.data.in_memory_datamodule:Epoch 0: Train Dataloader using chunk 1/1 with 16 basins.
INFO:hydro_forecasting.data.in_memory_datamodule:Stage 'train' chunk data loaded for 16 basins. Shape: (53215, 12). Est. Mem: 2.44 MB




/Users/cooper/Desktop/hydro-forecasting/.venv/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:310: The number of training batches (25) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 0: 100%|██████████| 25/25 [00:07<00:00,  3.50it/s, v_num=pt_0]

Metric val_loss improved. New best score: 0.032


Epoch 1: 100%|██████████| 25/25 [00:05<00:00,  4.19it/s, v_num=pt_0]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.030


Epoch 3: 100%|██████████| 25/25 [00:05<00:00,  4.17it/s, v_num=pt_0]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.030


Epoch 6: 100%|██████████| 25/25 [00:05<00:00,  4.23it/s, v_num=pt_0]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.030


Epoch 21: 100%|██████████| 25/25 [00:05<00:00,  4.24it/s, v_num=pt_0]

Monitored metric val_loss did not improve in the last 15 records. Best score: 0.030. Signaling Trainer to stop.


Epoch 21: 100%|██████████| 25/25 [00:05<00:00,  4.20it/s, v_num=pt_0]


INFO:hydro_forecasting.experiment_utils.training_runner:Run 0 completed with best val_loss: 0.02990146353840828
INFO:hydro_forecasting.experiment_utils.checkpoint_manager:Updated overall_best_model_info.txt at /Users/cooper/Desktop/hydro-forecasting/experiments/finetune/finetune_tajikistan/checkpoints/tsmixer to point to: run_0/attempt_0/tsmixer-run0-attempt_0-epoch=06-val_loss=0.0299.ckpt
INFO:hydro_forecasting.experiment_utils.training_runner:Overall best model for tsmixer: run_0/attempt_0/tsmixer-run0-attempt_0-epoch=06-val_loss=0.0299.ckpt
INFO:hydro_forecasting.experiment_utils.training_runner:Best metrics: {'val_loss': 0.02990146353840828, 'run_index': 0, 'seed': 42}
