In [1]:
import sys
from pathlib import Path
sys.path.append(str(Path().absolute().parent))
import time

In [2]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
import torch
from torch.nn import MSELoss
from torch.optim import Adam
import matplotlib.pyplot as plt
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import glob
from pathlib import Path

from src.data_models.camels_ch import CamelsCH, CamelsCHConfig, get_all_gauge_ids
from src.data_models.dataset import HydroDataset
from src.data_models.preprocessing import (
    scale_time_series,
    scale_static_attributes,
    inverse_scale_static_attributes,
    inverse_scale_time_series,
)
from src.data_models.caravanify import Caravanify, CaravanifyConfig 

from utils.metrics import nash_sutcliffe_efficiency
from src.data_models.datamodule import HydroDataModule

---

## Testing Caravanify

In [3]:
# Root of the post-processed Caravan extract. Both input directories hang off
# this single location, so it is the only line to change when the data lives
# somewhere else.
CARAVAN_ROOT = Path("/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/CH/post_processed")

config = CaravanifyConfig(
    attributes_dir=str(CARAVAN_ROOT / "attributes"),
    timeseries_dir=str(CARAVAN_ROOT / "timeseries" / "csv"),
    gauge_id_prefix="CH",
    # All three attribute sources are switched on.
    use_hydroatlas_attributes=True,
    use_caravan_attributes=True,
    use_other_attributes=True,
)

caravan = Caravanify(config)

# Every gauge id the loader reports is requested; later cells narrow the list.
ids_for_training = caravan.get_all_gauge_ids()
print(f"Total number of stations: {len(ids_for_training)}")

caravan.load_stations(ids_for_training)

# Get data
# NOTE(review): the original note describes ts_data as MultiIndex (gauge_id, date),
# yet later cells treat "gauge_id" and "date" as ordinary columns — confirm which holds.
ts_data = caravan.get_time_series()
static_data = caravan.get_static_attributes()  # Columns merged from enabled attributes

Total number of stations: 135


In [None]:
# Every time-series column except the identifier and the timestamp is kept;
# the resulting list is what the data module receives as `features`.
id_columns = ("gauge_id", "date")
ts_columns = [name for name in ts_data.columns if name not in id_columns]

In [None]:
# Static columns retained for modelling. "gauge_id" stays in the list so the
# filtered frame can still be matched to its basin.
statics_to_keep = [
    "gauge_id",
    "p_mean", "area", "ele_mt_sav",
    "high_prec_dur", "frac_snow", "high_prec_freq",
    "slp_dg_sav", "cly_pc_sav",
    "aridity_ERA5_LAND", "aridity_FAO_PM",
]

# Walk the frame's own columns so the source column order is preserved.
static_columns = [name for name in static_data.columns if name in statics_to_keep]

static_data = static_data.loc[:, static_columns]
static_data

# 1. Load and prepare CAMELS-CH data

In [None]:
# camels_config = CamelsCHConfig(
#     timeseries_dir="/Users/cooper/Desktop/CAMELS-CH/data/timeseries/observation_based/",
#     timeseries_pattern="CAMELS_CH_obs_based_*.csv",
#     static_attributes_dir="/Users/cooper/Desktop/CAMELS-CH/data/static_attributes",
#     use_climate=False,
#     use_geology=False,
#     use_glacier=False,
#     use_human_influence=False,
#     use_hydrogeology=False,
#     use_hydrology=False,
#     use_landcover=False,
#     use_soil=False,
#     use_topographic=False,
# )

In [None]:
# all_gauge_ids = get_all_gauge_ids(camels_config)

# ids_for_training = all_gauge_ids[:5]

# camels = CamelsCH(camels_config)
# camels.load_stations(ids_for_training)

In [None]:
# data = camels.get_time_series()
# data = data[
#     [
#         "gauge_id",
#         "date",
#         "discharge_spec(mm/d)",
#     ]
# ]

# data

In [None]:
# static = camels.get_static_attributes()
# sc = static.columns

# # for i in range(len(sc)):
# #     print(f"{i}: {sc[i]}")
# static_attributes = [
#     "gauge_id",
#     "area", 
#     "elev_mean",  
#     "slope_mean",  
#     "aridity",  
#     "p_seasonality",  
#     "frac_snow",  
#     "porosity",  
#     "conductivity",  
#     "p_mean",  
#     "geo_porosity",  
# ]
# static = static[static_attributes]
# static

# 2. Configure preprocessing

In [None]:
# Scaling recipe passed to the data module: dynamic features and the target
# are scaled "per_basin", static features use "global" scaling, and no log
# transform is requested anywhere.
preprocessing_config = dict(
    features=dict(scale_method="per_basin", log_transform=[]),
    target=dict(scale_method="per_basin", log_transform=False),
    static_features=dict(scale_method="global"),
)

# 3. Create DataModule

In [None]:
# Number of forecast steps; the same value is reused as the model's output size.
output_length = 10

# Related keyword arguments are grouped so the constructor call stays readable.
loader_kwargs = dict(batch_size=32, num_workers=4)
window_kwargs = dict(input_length=30, output_length=output_length)
split_kwargs = dict(train_years=15, val_years=3, min_test_years=6)

data_module = HydroDataModule(
    time_series_df=ts_data,
    static_df=static_data,
    group_identifier="gauge_id",
    preprocessing_config=preprocessing_config,
    features=ts_columns,
    static_features=static_columns,
    target="streamflow",
    **loader_kwargs,
    **window_kwargs,
    **split_kwargs,
)

In [None]:
# Display the target reported by the data module; the model is later built with this same value.
data_module.target

# 4. Create model and train

In [None]:
from src.models.lstm import LitLSTM
from src.models.ealstm import LitEALSTM
from torch.optim import Adam
from torch.nn import MSELoss

# 5. Evaluate and plot results

In [None]:
# Seed the Python, NumPy and PyTorch random generators before the model is
# built, so that weight initialisation does not differ between kernel restarts.
SEED = 42
pl.seed_everything(SEED)

# Alternative architecture, kept for quick switching:
# model = LitLSTM(
#     input_size=len(ts_columns),
#     hidden_size=16,
#     num_layers=1,
#     output_size=output_length,
#     target=data_module.target,
# )

model = LitEALSTM(
    input_size_dyn=len(ts_columns),
    # The -1 presumably discounts the "gauge_id" entry that static_columns
    # still carries — TODO confirm against the data module.
    input_size_stat=len(static_columns) - 1,
    hidden_size=64,
    output_size=output_length,
    target=data_module.target,
)

# Keep only the single checkpoint with the lowest validation loss.
checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    mode="min",
)

# Stop once validation loss has failed to improve for three checks in a row.
early_stopping = EarlyStopping(monitor="val_loss", patience=3, mode="min")

# Configure trainer
trainer = pl.Trainer(
    max_epochs=5,
    accelerator="cpu",
    devices=1,
    callbacks=[checkpoint_callback, early_stopping],
)

# Train the model
trainer.fit(model, data_module)

In [None]:
quality_report = data_module.quality_report

# The keys of the report's "excluded_basins" entry are the basins that were dropped.
excluded_basins = list(quality_report["excluded_basins"].keys())

# Narrow the working id list to the basins that were not excluded.
ids_for_training = [
    gauge for gauge in ids_for_training if gauge not in excluded_basins
]

In [None]:
# Show the gauge ids that remain after the excluded basins were filtered out.
ids_for_training

In [None]:
# Run the test loop, then read the results the model exposes on `test_results`.
# NOTE(review): `model` is passed in directly, so this presumably scores the
# weights from the end of training rather than the saved "best-checkpoint"
# file — confirm that this is intended.
trainer.test(model, data_module)
test_results = model.test_results

In [None]:
# Forecast table and per-horizon scores exposed by the model after testing.
results_df = model.test_results["forecast_df"]
horizon_metrics = model.test_results["horizon_metrics"]

# Parallel lists: one x position (horizon) and one bar height (NSE) per entry.
horizons = list(horizon_metrics.keys())
nse_values = [metrics["NSE"] for metrics in horizon_metrics.values()]

# Create bar plot
fig, ax = plt.subplots(figsize=(10, 6))
colors = sns.color_palette("Blues", 1)
ax.bar(horizons, nse_values, color=colors)

# Customize plot
ax.set_xlabel("Forecast Horizon (Days)")
ax.set_ylabel("Nash-Sutcliffe Efficiency")
ax.set_title("Forecast Skill by Prediction Horizon")
ax.grid(True, linestyle="--", alpha=0.3)
sns.despine(ax=ax)

# Set x-axis ticks to show all horizons
ax.set_xticks(horizons)

# Add value labels on top of each bar. Each label is anchored at the bar's own
# x value instead of a running index, so it stays on its bar even when the
# horizons are not exactly 1..n.
for horizon, value in zip(horizons, nse_values):
    ax.text(horizon, value, f"{value:.3f}", ha="center", va="bottom")

fig.tight_layout()
plt.show()

In [None]:
# Preview the first ten rows of the forecast table.
results_df.head(10)

In [None]:
def _nse_per_horizon(basin_frame):
    """Return one basin's NSE for every horizon from 1 up to its largest horizon."""
    scores = []
    for step in range(1, max(basin_frame["horizon"]) + 1):
        step_rows = basin_frame[basin_frame["horizon"] == step]
        scores.append(
            nash_sutcliffe_efficiency(
                step_rows["observed"].values, step_rows["prediction"].values
            )
        )
    return scores


# NSE by basin and horizon, keyed by basin in order of first appearance.
basin_metrics = {
    basin: _nse_per_horizon(results_df[results_df["basin_id"] == basin])
    for basin in results_df["basin_id"].unique()
}

# Rank basins by their horizon-1 score, best first, and rebuild the dict in that order.
sorted_basins = sorted(
    basin_metrics, key=lambda basin: basin_metrics[basin][0], reverse=True
)
basin_metrics = {basin: basin_metrics[basin] for basin in sorted_basins}

# Plot settings: the bars of one horizon group share a total width of 0.8.
fig, ax = plt.subplots(figsize=(12, 6))
n_basins = len(basin_metrics)
bar_width = 0.8 / n_basins

# Blue shades, one per basin; the first two palette entries are dropped.
colors = sns.color_palette("Blues", n_basins + 2)[2:]

# One bar series per basin, each shifted right by its rank within the group.
for position, (basin, scores) in enumerate(basin_metrics.items()):
    offsets = np.arange(len(scores)) + position * bar_width
    ax.bar(offsets, scores, bar_width, label=f"Basin {basin}", color=colors[position])

# Labels, grid and legend
ax.set_xlabel("Forecast Horizon (Days)", fontsize=12)
ax.set_ylabel("Nash-Sutcliffe Efficiency", fontsize=12)
ax.set_title("Forecast Skill by Basin and Horizon", fontsize=14, pad=20)
ax.grid(True, linestyle="--", alpha=0.3)
ax.legend(title="Basin ID", title_fontsize=10, fontsize=10)

# Centre each tick under its group of bars; the horizon count is taken from
# the first basin in the ranking.
n_horizons = len(next(iter(basin_metrics.values())))
ax.set_xticks(np.arange(n_horizons) + bar_width * (n_basins - 1) / 2)
ax.set_xticklabels(np.arange(1, n_horizons + 1))

# Remove top and right spines
sns.despine(ax=ax)

fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def plot_predictions(results_df, n_timesteps=None, horizon=1, n_cols=2):
    """Plot observed against predicted values, one panel per basin.

    Args:
        results_df: DataFrame with "basin_id", "horizon", "observed" and
            "prediction" columns.
        n_timesteps: If truthy, only the last ``n_timesteps`` rows of each
            basin are drawn. This assumes the rows are already in
            chronological order (TODO confirm against the code that builds
            the frame).
        horizon: Forecast horizon whose rows are plotted. Defaults to 1.
        n_cols: Number of panel columns in the figure grid. Defaults to 2.

    Raises:
        ValueError: If ``n_cols`` is smaller than 1, or if no rows match
            ``horizon`` (an empty grid cannot be drawn).
    """
    if n_cols < 1:
        raise ValueError(f"n_cols must be at least 1, got {n_cols}.")

    # Restrict the table to the requested forecast horizon.
    horizon_data = results_df[results_df["horizon"] == horizon]

    if n_timesteps:
        # Keep the last n_timesteps rows of each basin.
        horizon_data = (
            horizon_data.groupby("basin_id").tail(n_timesteps).reset_index(drop=True)
        )

    basin_ids = horizon_data["basin_id"].unique()
    n_basins = len(basin_ids)
    if n_basins == 0:
        # Fail here with a clear message rather than inside plt.subplots.
        raise ValueError(f"No forecasts found for horizon {horizon}; nothing to plot.")

    n_rows = -(-n_basins // n_cols)  # ceiling division

    # squeeze=False always returns a 2-D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(7.5 * n_cols, 5 * n_rows), squeeze=False
    )
    axes = axes.flatten()

    for ax, basin in zip(axes, basin_ids):
        basin_data = horizon_data[horizon_data["basin_id"] == basin]

        nse = nash_sutcliffe_efficiency(
            basin_data["observed"].values, basin_data["prediction"].values
        )

        # The x axis is a plain running index over the basin's rows.
        x = np.arange(len(basin_data))
        ax.plot(x, basin_data["observed"], label="Observed", color="#1d4ed8")
        ax.plot(
            x,
            basin_data["prediction"],
            label="Predicted",
            color="#dc2626",
            alpha=0.8,
        )

        ax.set_title(f"Basin {basin} (NSE: {nse:.3f})", fontsize=12)
        ax.set_xlabel("Time Step", fontsize=10)
        ax.set_ylabel("Discharge", fontsize=10)
        ax.grid(True, linestyle="--", alpha=0.3)
        ax.legend(fontsize=9)
        sns.despine(ax=ax)

    # Remove the unused panels left over when the grid is not completely filled.
    for spare_ax in axes[n_basins:]:
        fig.delaxes(spare_ax)

    plt.tight_layout()
    plt.show()


# Example usage: draw the last 365 rows of every basin in the forecast table.
plot_predictions(results_df, n_timesteps=365) 