In [1]:
import sys
from pathlib import Path
sys.path.append(str(Path().absolute().parent))
import time

In [2]:
import torch
import lightning.pytorch as pl
from lightning.pytorch import Trainer
from pytorch_forecasting import TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.metrics import QuantileLoss
from pytorch_forecasting.data import GroupNormalizer
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
import pandas as pd
from io import StringIO
import numpy as np
import glob
from pathlib import Path

from src.data_models.camels_ch import CamelsCH, CamelsCHConfig, get_all_gauge_ids
from src.data_models.dataset import HydroDataset
from src.data_models.preprocessing import (
    scale_time_series,
    scale_static_attributes,
    inverse_scale_static_attributes,
)

  from tqdm.autonotebook import tqdm


---

## Getting the data

In [3]:
camels_config = CamelsCHConfig(
    timeseries_dir="/Users/cooper/Desktop/CAMELS-CH/data/timeseries/observation_based/",
    timeseries_pattern="CAMELS_CH_obs_based_*.csv",
    static_attributes_dir="/Users/cooper/Desktop/CAMELS-CH/data/static_attributes",
    use_climate=False,
    use_geology=False,
    use_glacier=False,
    use_human_influence=False,
    use_hydrogeology=False,
    use_hydrology=True,
    use_landcover=False,
    use_soil=False,
    use_topographic=False,
)

camels = CamelsCH(camels_config)
camels.load_stations(["2018", "2019", "2020"])

Loaded time series data for 3 stations
Loading hydrology attributes
Loaded static attributes for 3 stations


In [4]:
static = camels.get_static_attributes()
# q_mean	runoff_ratio	stream_elas	slope_fdc	baseflow_index_landson	hfd_mean
static = static[["gauge_id", "q_mean", "runoff_ratio", "stream_elas", "slope_fdc", "baseflow_index_landson", "hfd_mean"]]
static

Unnamed: 0,gauge_id,q_mean,runoff_ratio,stream_elas,slope_fdc,baseflow_index_landson,hfd_mean
0,2018,3.556,0.817,1.045,1.781,0.767,237.41
1,2019,5.671,1.119,0.589,2.332,0.754,255.538
2,2020,3.692,0.811,1.013,1.694,0.683,223.718


## Preprocessing

In [5]:
data = camels.get_time_series()
data = data[
    [
        "gauge_id",
        "date",
        "discharge_spec(mm/d)",
        "precipitation(mm/d)",
        "temperature_mean(degC)",
    ]
]

# Split into train/test
df_train = data.iloc[:240]  # 80 samples per basin
df_test = data.iloc[240:]  # 20 samples per basin

# Scale by basin
scaled_train, scaled_test, params = scale_time_series(
    df_train,
    df_test,
    features=["discharge_spec(mm/d)", "precipitation(mm/d)", "temperature_mean(degC)"],
    by_basin=True,  # Set False for global scaling
)

In [6]:
df_train.head() 

Unnamed: 0,gauge_id,date,discharge_spec(mm/d),precipitation(mm/d),temperature_mean(degC)
0,2018,1981-01-01,1.303,4.27,-1.91
1,2018,1981-01-02,1.307,8.07,-3.47
2,2018,1981-01-03,1.354,26.46,1.05
3,2018,1981-01-04,3.062,31.43,-1.67
4,2018,1981-01-05,2.549,6.18,-6.32


In [7]:
scaled_train.head()

Unnamed: 0,gauge_id,date,discharge_spec(mm/d),precipitation(mm/d),temperature_mean(degC)
0,2018,1981-01-01,-1.42615,-0.055622,-0.945363
1,2018,1981-01-02,-1.424359,0.403326,-1.154683
2,2018,1981-01-03,-1.403311,2.624393,-0.548191
3,2018,1981-01-04,-0.63842,3.224649,-0.91316
4,2018,1981-01-05,-0.868156,0.17506,-1.537095


In [8]:
attributes = [col for col in static.columns if col != 'gauge_id']

# Scale the attributes
scaled_df, scaling_params = scale_static_attributes(static, attributes)

# Add back gauge_id if needed
scaled_df['gauge_id'] = static['gauge_id']

# Inverse scale to verify
original_df = inverse_scale_static_attributes(scaled_df[attributes], scaling_params)
original_df['gauge_id'] = static['gauge_id']

# Print results
print("\nOriginal values:")
print(static)
print("\nScaled values:")
print(scaled_df)
print("\nInverse scaled values:")
print(original_df)

# Verify the scaling worked correctly
np.testing.assert_array_almost_equal(
    static[attributes].values,
    original_df[attributes].values
)
print("\nVerification passed: Original and inverse-scaled values match!")


Original values:
  gauge_id  q_mean  runoff_ratio  stream_elas  slope_fdc  \
0     2018   3.556         0.817        1.045      1.781   
1     2019   5.671         1.119        0.589      2.332   
2     2020   3.692         0.811        1.013      1.694   

   baseflow_index_landson  hfd_mean  
0                   0.767   237.410  
1                   0.754   255.538  
2                   0.683   223.718  

Scaled values:
   baseflow_index_landson  hfd_mean    q_mean  runoff_ratio  slope_fdc  \
0               -0.457515  2.215087 -0.426016     -0.456950  -0.446063   
1               -0.457662  2.419821 -0.402130     -0.453539  -0.439840   
2               -0.458463  2.060452 -0.424480     -0.457018  -0.447045   

   stream_elas gauge_id  
0    -0.454375     2018  
1    -0.459525     2019  
2    -0.454737     2020  

Inverse scaled values:
   baseflow_index_landson  hfd_mean  q_mean  runoff_ratio  slope_fdc  \
0                   0.767   237.410   3.556         0.817      1.781   
1   

## Wrap the data in a PyTorch dataset

In [None]:
# Create dataset
dataset = HydroDataset(
    time_series_df=camels.get_time_series(),
    static_df=camels.get_static_attributes(),
    input_length=365,
    output_length=5,
    features=["discharge_spec(mm/d)", "precipitation(mm/d)", "temperature_mean(degC)"],
    target="discharge_spec(mm/d)",
    static_features=["elev_mean", "slope_mean"],
)

# Test dataset
print(f"Dataset size: {len(dataset)}")
sample = dataset[0]
print("\nSample shapes:")
print(f"X: {sample['X'].shape}")
print(f"y: {sample['y'].shape}")
print(f"static: {sample['static'].shape}")
print(f"gauge_id: {sample['gauge_id']}")

In [None]:
sample['static']

## Make the dataset iterable by creating a DataLoader