In [1]:
import sys
from pathlib import Path
sys.path.append(str(Path().absolute().parent))
import time

In [2]:
import torch
import lightning.pytorch as pl
from lightning.pytorch import Trainer
from pytorch_forecasting import TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.metrics import QuantileLoss
from pytorch_forecasting.data import GroupNormalizer
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
import pandas as pd
from io import StringIO
import numpy as np
import glob 
from pathlib import Path

from src.benchmark_tft.data_loading import combine_camels_data
from src.data_models.camels_ch import CamelsCH, CamelsCHConfig, get_all_gauge_ids

  from tqdm.autonotebook import tqdm


---

## Getting the data

In [3]:
camels_config = CamelsCHConfig(
    timeseries_dir="/Users/cooper/Desktop/CAMELS-CH/data/timeseries/observation_based/",
    timeseries_pattern="CAMELS_CH_obs_based_*.csv",
    static_attributes_dir="/Users/cooper/Desktop/CAMELS-CH/data/static_attributes",
    use_climate=False,
    use_geology=False,
    use_glacier=False,
    use_human_influence=False,
    use_hydrogeology=False,
    use_hydrology=False,
    use_landcover=False,
    use_soil=False,
    use_topographic=False,
)

# gauge_ids = get_all_gauge_ids(camels_config)

# print(f"There are {len(gauge_ids)} gauge ids")

camels = CamelsCH(camels_config)
camels.load_stations(["2486"])

Loaded time series data for 1 stations


In [4]:
data = camels.get_time_series()
data = data[
    [
        "date",
        "discharge_spec(mm/d)",
        "precipitation(mm/d)",
        "temperature_mean(degC)",
        "gauge_id",
    ]
]

data

Unnamed: 0,date,discharge_spec(mm/d),precipitation(mm/d),temperature_mean(degC),gauge_id
0,1981-01-01,,2.47,-0.97,2486
1,1981-01-02,,1.46,-2.45,2486
2,1981-01-03,,28.91,1.15,2486
3,1981-01-04,,23.48,0.84,2486
4,1981-01-05,,7.89,-5.07,2486
...,...,...,...,...,...
14605,2020-12-27,2.240,2.66,-2.03,2486
14606,2020-12-28,2.048,6.02,-0.45,2486
14607,2020-12-29,1.725,3.41,-0.04,2486
14608,2020-12-30,1.587,2.92,-1.99,2486


In [5]:
data = data.dropna(subset=["discharge_spec(mm/d)"])

data.loc[:, "precipitation(mm/d)"] = data["precipitation(mm/d)"].fillna(0)

data.loc[:, "temperature_mean(degC)"] = data["temperature_mean(degC)"].fillna(
    data["temperature_mean(degC)"].mean()
)

In [6]:
data.loc[:, "time_idx"] = data["date"].rank(method="dense").astype(int) - 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:, "time_idx"] = data["date"].rank(method="dense").astype(int) - 1


## Preparing the data

In [7]:
max_encoder_length = 365
max_prediction_length = 1

training_cutoff = data["time_idx"].max() - max_prediction_length * 365 

training = TimeSeriesDataSet(
   data[lambda x: x["time_idx"] <= training_cutoff],
   time_idx="time_idx",
   target="discharge_spec(mm/d)", 
   group_ids=["gauge_id"],
   max_encoder_length=max_encoder_length,
   min_encoder_length=max_encoder_length // 2,
   max_prediction_length=max_prediction_length,
   min_prediction_length=1,
   time_varying_known_reals=["precipitation(mm/d)", "temperature_mean(degC)"],
   time_varying_unknown_reals=["discharge_spec(mm/d)"],
   target_normalizer=GroupNormalizer(groups=["gauge_id"]),
   add_relative_time_idx=True,
   add_target_scales=True,
   add_encoder_length=True,
   allow_missing_timesteps=True
)

# Create validation set using last max_prediction_length timesteps
validation = TimeSeriesDataSet.from_dataset(
   training,
   data[lambda x: x["time_idx"] > training_cutoff],
   predict=True,
   stop_randomization=True
)

batch_size = 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

In [8]:
callbacks = [
    EarlyStopping(monitor="val_loss", patience=3, mode="min"),
    ModelCheckpoint(
        monitor="val_loss",
        dirpath="checkpoints",
        filename="tft-{epoch:02d}-{val_loss:.2f}",
        save_top_k=3,
        mode="min",
    ),
]

trainer = Trainer(
    max_epochs=30,
    accelerator="cpu",
    devices=[0] if torch.cuda.is_available() else 1,
    gradient_clip_val=0.1,
    limit_train_batches=50,
    enable_checkpointing=True,
    logger=True,
    callbacks=callbacks,
)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [9]:
tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=0.03,
    hidden_size=16,
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=8,
    output_size=7,
    loss=QuantileLoss(),
    log_interval=10,
    reduce_on_plateau_patience=4,
    optimizer="adam",
)

trainer.fit(tft, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:209: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:209: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.
/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /Users/cooper/Desktop/CAMELS-CH/notebooks/checkpoints exists and is not empty.

   | Name                               | Type                            | Params | Mode 
----------------------------------------------------------------------------------------------

                                                                           

/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Epoch 7: 100%|██████████| 50/50 [00:18<00:00,  2.74it/s, v_num=22, train_loss_step=0.308, val_loss=0.0838, train_loss_epoch=0.401]


In [14]:
best_model_path = "/Users/cooper/Desktop/CAMELS-CH/notebooks/checkpoints/tft-epoch=00-val_loss=0.16.ckpt"

hindcast_cutoff = data["time_idx"].max() - max_prediction_length * 365

# Load the best model and set to eval mode
best_model = TemporalFusionTransformer.load_from_checkpoint(best_model_path)
best_model.eval()

# Evaluate on a validation/test dataset (built via from_dataset)
predictions = best_model.predict(val_dataloader, return_y=True)
# Compute a metric (e.g., SMAPE)
smape = SMAPE()(predictions.output, predictions.y)
print("SMAPE:", smape.item())

# Hindcast example: create a hindcast dataset (using predict_mode=True ensures only the last forecast point is used)
hindcast_dataset = TimeSeriesDataSet.from_dataset(
    training,
    data[lambda x: x["time_idx"] > hindcast_cutoff],
    predict=True,
    stop_randomization=True
)
hindcast_dataloader = hindcast_dataset.to_dataloader(train=False, batch_size=128)
hindcast_predictions = best_model.predict(hindcast_dataloader)
# You can then compare hindcast_predictions with the known historical targets
best_model.plot_prediction(hindcast_predictions.x, hindcast_predictions.output, idx=0)


/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:209: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:209: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/cooper/Desktop/CAMELS-CH/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in th

SMAPE: 1000000000.0


AttributeError: 'Tensor' object has no attribute 'x'