In [1]:
import sys
from pathlib import Path
sys.path.append(str(Path().absolute().parent))
import time

In [2]:
import optuna
from pytorch_lightning.callbacks import EarlyStopping
import pytorch_lightning as pl

In [3]:
from pytorch_lightning.callbacks import (
    ModelCheckpoint,
    EarlyStopping,
    LearningRateMonitor,
)
import torch
from torch.nn import MSELoss
from torch.optim import Adam
import matplotlib.pyplot as plt
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import glob
from pathlib import Path

from src.data_models.camels_ch import CamelsCH, CamelsCHConfig, get_all_gauge_ids
from src.data_models.dataset import HydroDataset
from src.data_models.preprocessing import (
    scale_time_series,
    scale_static_attributes,
    inverse_scale_static_attributes,
    inverse_scale_time_series,
)
from src.data_models.caravanify import Caravanify, CaravanifyConfig

from utils.metrics import nash_sutcliffe_efficiency
from src.data_models.datamodule import HydroDataModule

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from src.preprocessing.transformers import GroupedTransformer, LogTransformer

In [4]:
from src.models.lstm import LitLSTM
from src.models.ealstm import LitEALSTM
from src.models.TSMixer import LitTSMixer
from src.models.evaluators import TSForecastEvaluator
from torch.optim import Adam
from torch.nn import MSELoss

---

In [5]:
config = CaravanifyConfig(
    attributes_dir="/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/CA/post_processed/attributes",
    timeseries_dir="/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/CA/post_processed/timeseries/csv",
    gauge_id_prefix="CA",
    use_hydroatlas_attributes=True,
    use_caravan_attributes=True,
    use_other_attributes=True,
)


caravan = Caravanify(config)
ids_for_training = caravan.get_all_gauge_ids()[:3]

print(f"Total number of stations: {len(ids_for_training)}")

caravan.load_stations(ids_for_training)

ts_data = caravan.get_time_series()
static_data = caravan.get_static_attributes()

Total number of stations: 3


In [6]:
ts_data["date"] = pd.to_datetime(ts_data["date"])

ts_data["julian_day"] = ts_data["date"].dt.dayofyear

ts_columns = [
    # "potential_evaporation_sum_ERA5_LAND",
    # "potential_evaporation_sum_FAO_PENMAN_MONTEITH",
    "streamflow",
    # "julian_day",
    # "temperature_2m_mean",
    "total_precipitation_sum",
]

In [7]:
whole_data = ts_columns + ["gauge_id", "date"]
ts_data = ts_data[whole_data]

In [8]:
statics_to_keep = [
    "gauge_id",
    "p_mean",
    "area",
    "ele_mt_sav",
    "high_prec_dur",
    "frac_snow",
    "high_prec_freq",
    "slp_dg_sav",
    "cly_pc_sav",
    "aridity_ERA5_LAND",
    "aridity_FAO_PM",
]

static_columns = static_data.columns
static_columns = [col for col in list(static_columns) if col in statics_to_keep]

static_data = static_data[static_columns]

In [9]:
features = [
    col for col in ts_data.columns if col not in ["gauge_id", "date", "streamflow"]
]
ts_columns = features + ["streamflow"]  # Ensure target is not in features

# Feature pipeline: log + scale
feature_pipeline = Pipeline([("log", LogTransformer()), ("scaler", StandardScaler())])


target_pipeline = GroupedTransformer(
    Pipeline([("log", LogTransformer()), ("scaler", StandardScaler())]),
    columns=["streamflow"],
    group_identifier="gauge_id",
)

static_pipeline = Pipeline([("scaler", StandardScaler())])

preprocessing_configs = {
    "features": {"pipeline": feature_pipeline},
    "target": {"pipeline": target_pipeline},
    "static_features": {"pipeline": static_pipeline},
}

## Tuning Hyperparameters

In [10]:
output_length = 10
static_columns = [c for c in static_columns if c not in ["gauge_id"]]


def objective(trial):
    # Define the hyperparameters to tune
    batch_size = trial.suggest_int("batch_size", 16, 128)
    input_length = trial.suggest_int("input_length", 14, 60)
    hidden_size = trial.suggest_int("hidden_size", 32, 256)

    # Create data module with the trial's batch size and input length
    data_module = HydroDataModule(
        time_series_df=ts_data,
        static_df=static_data,
        group_identifier="gauge_id",
        preprocessing_config=preprocessing_configs,
        batch_size=batch_size,  # Use trial's batch size
        input_length=input_length,  # Use trial's input length
        output_length=output_length,
        num_workers=4,
        features=ts_columns,
        static_features=static_columns,
        target="streamflow",
        min_train_years=2,
        val_years=1,
        test_years=3,
        max_missing_pct=10,
    )

    # Create model with trial's hidden size
    model = LitTSMixer(
        input_len=input_length,  # Match data module's input length
        output_len=output_length,
        input_size=len(ts_columns),
        static_size=len(static_columns),
        hidden_size=hidden_size,  # Use trial's hidden size
    )

    # Configure trainer with early stopping
    trainer = pl.Trainer(
        max_epochs=1,  # Keep this low for initial testing
        accelerator="cpu",
        devices=1,
        callbacks=[EarlyStopping(monitor="val_loss", patience=3, mode="min")],
        enable_progress_bar=False,  # Reduce output clutter during optimization
    )

    # Train and get the best validation loss
    trainer.fit(model, data_module)

    return trainer.callback_metrics["val_loss"].item()


# Create a study object and specify the direction of optimization
study = optuna.create_study(direction="minimize")

# Run the optimization
study.optimize(objective, n_trials=10)  # Start with 10 trials for testing

# Print the best parameters and score
print("Best parameters:", study.best_params)
print("Best validation loss:", study.best_value)

# You can also print a summary of the optimization
print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print(
    "  Number of pruned trials: ",
    len([t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]),
)
print(
    "  Number of complete trials: ",
    len([t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]),
)

[I 2025-02-19 06:41:57,935] A new study created in memory with name: no-name-41a9d076-6a4d-488a-aa48-d5e35ffd0dc1
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.


Original basins: 3
Retained basins: 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.processed_static[col] = transformed[:, i]

  | Name          | Type    | Params | Mode 
--------------------------------------------------
0 | model         | TSMixer | 851 K  | train
1 | mse_criterion | MSELoss | 0      | train
--------------------------------------------------
851 K     Trainable params
0         Non-trainable params
851 K     Total params
3.404     Total estimated model params size (MB)
41        Modules in train mode
0         Modules in eval mode


Created 13215 valid sequences
Created 606 valid sequences


`Trainer.fit` stopped: `max_epochs=1` reached.
[I 2025-02-19 06:42:10,634] Trial 0 finished with value: 0.06316959112882614 and parameters: {'batch_size': 126, 'input_length': 53, 'hidden_size': 201}. Best is trial 0 with value: 0.06316959112882614.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.processed_static[col] = transformed[:, i]

  | Name          | Type    | Params | Mode 
--------------------------------------------------
0 | model         | TSMixer | 564 K 

Original basins: 3
Retained basins: 2
Created 13272 valid sequences
Created 644 valid sequences


`Trainer.fit` stopped: `max_epochs=1` reached.
[I 2025-02-19 06:42:26,462] Trial 1 finished with value: 0.06423867493867874 and parameters: {'batch_size': 23, 'input_length': 34, 'hidden_size': 197}. Best is trial 0 with value: 0.06316959112882614.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.processed_static[col] = transformed[:, i]

  | Name          | Type    | Params | Mode 
--------------------------------------------------
0 | model         | TSMixer | 175 K  

Original basins: 3
Retained basins: 2
Created 13316 valid sequences
Created 672 valid sequences


`Trainer.fit` stopped: `max_epochs=1` reached.
[I 2025-02-19 06:42:35,584] Trial 2 finished with value: 0.06703387200832367 and parameters: {'batch_size': 64, 'input_length': 20, 'hidden_size': 94}. Best is trial 0 with value: 0.06316959112882614.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.processed_static[col] = transformed[:, i]

  | Name          | Type    | Params | Mode 
--------------------------------------------------
0 | model         | TSMixer | 758 K  |

Original basins: 3
Retained basins: 2
Created 13215 valid sequences
Created 606 valid sequences


`Trainer.fit` stopped: `max_epochs=1` reached.
[I 2025-02-19 06:42:48,666] Trial 3 finished with value: 0.05730516463518143 and parameters: {'batch_size': 58, 'input_length': 53, 'hidden_size': 179}. Best is trial 3 with value: 0.05730516463518143.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.processed_static[col] = transformed[:, i]

  | Name          | Type    | Params | Mode 
--------------------------------------------------
0 | model         | TSMixer | 753 K  

Original basins: 3
Retained basins: 2
Created 13209 valid sequences
Created 602 valid sequences


`Trainer.fit` stopped: `max_epochs=1` reached.
[I 2025-02-19 06:43:06,625] Trial 4 finished with value: 0.05534825474023819 and parameters: {'batch_size': 17, 'input_length': 55, 'hidden_size': 172}. Best is trial 4 with value: 0.05534825474023819.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.processed_static[col] = transformed[:, i]

  | Name          | Type    | Params | Mode 
--------------------------------------------------
0 | model         | TSMixer | 1.1 M  

Original basins: 3
Retained basins: 2
Created 13209 valid sequences
Created 602 valid sequences


`Trainer.fit` stopped: `max_epochs=1` reached.
[I 2025-02-19 06:43:25,599] Trial 5 finished with value: 0.058484047651290894 and parameters: {'batch_size': 17, 'input_length': 55, 'hidden_size': 247}. Best is trial 4 with value: 0.05534825474023819.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.processed_static[col] = transformed[:, i]

  | Name          | Type    | Params | Mode 
--------------------------------------------------
0 | model         | TSMixer | 254 K 

Original basins: 3
Retained basins: 2
Created 13275 valid sequences
Created 646 valid sequences


`Trainer.fit` stopped: `max_epochs=1` reached.
[I 2025-02-19 06:43:38,847] Trial 6 finished with value: 0.05589953437447548 and parameters: {'batch_size': 17, 'input_length': 33, 'hidden_size': 91}. Best is trial 4 with value: 0.05534825474023819.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.processed_static[col] = transformed[:, i]

  | Name          | Type    | Params | Mode 
--------------------------------------------------
0 | model         | TSMixer | 701 K  |

Original basins: 3
Retained basins: 2
Created 13203 valid sequences
Created 598 valid sequences


`Trainer.fit` stopped: `max_epochs=1` reached.
[I 2025-02-19 06:43:53,822] Trial 7 finished with value: 0.06362254172563553 and parameters: {'batch_size': 22, 'input_length': 57, 'hidden_size': 155}. Best is trial 4 with value: 0.05534825474023819.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.processed_static[col] = transformed[:, i]

  | Name          | Type    | Params | Mode 
--------------------------------------------------
0 | model         | TSMixer | 378 K  

Original basins: 3
Retained basins: 2
Created 13224 valid sequences
Created 612 valid sequences


`Trainer.fit` stopped: `max_epochs=1` reached.
[I 2025-02-19 06:44:03,660] Trial 8 finished with value: 0.06368408352136612 and parameters: {'batch_size': 121, 'input_length': 50, 'hidden_size': 94}. Best is trial 4 with value: 0.05534825474023819.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.processed_static[col] = transformed[:, i]

  | Name          | Type    | Params | Mode 
--------------------------------------------------
0 | model         | TSMixer | 446 K  

Original basins: 3
Retained basins: 2
Created 13320 valid sequences
Created 674 valid sequences


`Trainer.fit` stopped: `max_epochs=1` reached.
[I 2025-02-19 06:44:15,625] Trial 9 finished with value: 0.057196177542209625 and parameters: {'batch_size': 36, 'input_length': 19, 'hidden_size': 250}. Best is trial 4 with value: 0.05534825474023819.


Best parameters: {'batch_size': 17, 'input_length': 55, 'hidden_size': 172}
Best validation loss: 0.05534825474023819
Study statistics: 
  Number of finished trials:  10
  Number of pruned trials:  0
  Number of complete trials:  10
