In [1]:
import sys
from pathlib import Path

# Make the in-repo sources importable without installing the package. The
# project root is taken to be the parent of the current working directory,
# and the sources are expected under its ``src`` sub-directory.
project_root = Path.cwd().parent
src_path = project_root.joinpath("src")

# Prepend the directory only once, so re-running this cell does not stack
# duplicate entries on sys.path.
src_path_str = str(src_path)
if src_path_str not in sys.path:
    sys.path.insert(0, src_path_str)
    print(f"Added {src_path} to Python path")

Added /Users/cooper/Desktop/hydro-forecasting/src to Python path


In [2]:
from sklearn.pipeline import Pipeline

from hydro_forecasting.data.caravanify_parquet import (
    CaravanifyParquet,
    CaravanifyParquetConfig,
)
from hydro_forecasting.experiment_utils.finetune_pretrained_model import finetune_pretrained_models
from hydro_forecasting.experiment_utils.train_model_from_scratch import train_model_from_scratch
from hydro_forecasting.preprocessing.grouped import GroupedPipeline
from hydro_forecasting.preprocessing.normalize import NormalizeTransformer
from hydro_forecasting.preprocessing.standard_scale import StandardScaleTransformer

---

## Loading the data (as gauge_ids)

In [3]:
# Describe where the post-processed "CA" dataset lives on disk and which
# attribute groups are enabled for it.
config_ca = CaravanifyParquetConfig(
    attributes_dir="/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes",
    # NOTE(review): this path ends in "/csv" although the loader is named
    # CaravanifyParquet — confirm the expected file format.
    timeseries_dir="/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/csv",
    # NOTE(review): shapefiles come from a different root (CAMELS-CH/...) than the
    # attributes and time series above — confirm this is intentional.
    shapefile_dir="/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/CA/post_processed/shapefiles",
    gauge_id_prefix="CA",
    use_hydroatlas_attributes=True,
    use_caravan_attributes=True,
    use_other_attributes=True,
)

caravan_ca = CaravanifyParquet(config_ca)
# Keep only the first 10 gauge IDs returned by the loader.
basin_ids = caravan_ca.get_all_gauge_ids()[:10]

# Optional filter, currently disabled: drop gauge "CA_15030" from the selection.
# basin_ids = [bid for bid in basin_ids if bid != "CA_15030"]

# Load the selected gauges into the loader object.
caravan_ca.load_stations(basin_ids)

In [4]:
# Fetch the time series from the loader (presumably for the gauges passed to
# load_stations) and print it as a quick sanity check.
# NOTE(review): print() shows the plain-text repr; ending the cell with
# `ts.head()` would use the notebook's rich table rendering instead.
ts = caravan_ca.get_time_series()
print(ts)

       gauge_id       date  snow_depth_water_equivalent_mean  \
0      CA_15013 2000-01-02                        130.250000   
1      CA_15013 2000-01-03                        144.619995   
2      CA_15013 2000-01-04                        167.690002   
3      CA_15013 2000-01-05                        176.240005   
4      CA_15013 2000-01-06                        176.330002   
...         ...        ...                               ...   
91305  CA_15044 2024-12-27                         68.959999   
91306  CA_15044 2024-12-28                         68.970001   
91307  CA_15044 2024-12-29                         68.980003   
91308  CA_15044 2024-12-30                         69.209999   
91309  CA_15044 2024-12-31                         70.550003   

       surface_net_solar_radiation_mean  surface_net_thermal_radiation_mean  \
0                             15.010000                           -9.280000   
1                             10.590000                          -11.7600

## Datamodule Configs

In [None]:
# Input directories keyed by region code; only "CA" is configured here.
region_time_series_base_dirs = {"CA": "/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/csv/CA"}

region_static_attributes_base_dirs = {"CA": "/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes/CA"}

# Directory handed to the datamodule config as its preprocessing output location.
# NOTE(review): absolute, machine-specific paths — the notebook will not run
# elsewhere without editing them.
path_to_preprocessing_output_directory = "/Users/cooper/Desktop/hydro-forecasting/tests/yolo_7"

In [None]:
# Time-series input columns. The first entries match column names visible in
# the printed time-series output earlier in this notebook.
forcing_features = [
    "snow_depth_water_equivalent_mean",
    "surface_net_solar_radiation_mean",
    "surface_net_thermal_radiation_mean",
    "potential_evaporation_sum_ERA5_LAND",
    "potential_evaporation_sum_FAO_PENMAN_MONTEITH",
    "temperature_2m_mean",
    "temperature_2m_min",
    "temperature_2m_max",
    "total_precipitation_sum",
]

# Per-gauge attribute columns; these are the columns given to the static
# preprocessing pipeline.
static_features = [
    "p_mean",
    "area",
    "ele_mt_sav",
    "high_prec_dur",
    "frac_snow",
    "high_prec_freq",
    "slp_dg_sav",
    "cly_pc_sav",
    "aridity_ERA5_LAND",
    "aridity_FAO_PM",
]

# Name of the target column passed to the datamodule config.
target = "streamflow"

In [None]:
# Forcing features: standard-scale then normalize, wrapped in a GroupedPipeline
# keyed on "gauge_id" (presumably so the transforms are fitted per gauge —
# TODO confirm against GroupedPipeline).
feature_pipeline = GroupedPipeline(
    Pipeline([("scaler", StandardScaleTransformer()), ("normalizer", NormalizeTransformer())]),
    columns=forcing_features,
    group_identifier="gauge_id",
)

# Same two steps for the target. The column list is built from `target` rather
# than a repeated "streamflow" literal, so the pipeline cannot drift from the
# target configured for the datamodule.
target_pipeline = GroupedPipeline(
    Pipeline([("scaler", StandardScaleTransformer()), ("normalizer", NormalizeTransformer())]),
    columns=[target],
    group_identifier="gauge_id",
)

# Static attributes use a plain (ungrouped) pipeline with scaling only.
static_pipeline = Pipeline([("scaler", StandardScaleTransformer())])

# Bundle the three pipelines under the keys the datamodule config carries as
# "preprocessing_configs".
preprocessing_config = {
    "features": {"pipeline": feature_pipeline},
    "target": {"pipeline": target_pipeline},
    "static_features": {"pipeline": static_pipeline, "columns": static_features},
}

In [None]:
# Everything the datamodule needs, gathered in one dict that is passed to both
# the from-scratch training call and the fine-tuning call.
datamodule_config = {
    # Where to read inputs from and where preprocessing output goes.
    "region_time_series_base_dirs": region_time_series_base_dirs,
    "region_static_attributes_base_dirs": region_static_attributes_base_dirs,
    "path_to_preprocessing_output_directory": path_to_preprocessing_output_directory,
    # Same identifier the grouped preprocessing pipelines use.
    "group_identifier": "gauge_id",
    "batch_size": 2048,
    "forcing_features": forcing_features,
    "static_features": static_features,
    "target": target,
    "num_workers": 4,
    "min_train_years": 5,
    # The three split proportions sum to 1.0.
    "train_prop": 0.5,
    "val_prop": 0.25,
    "test_prop": 0.25,
    "max_imputation_gap_size": 5,
    "chunk_size": 10,
    "is_autoregressive": True,
    "preprocessing_configs": preprocessing_config,
}

## Training Configs

In [None]:
# Training settings passed unchanged to both the from-scratch and the
# fine-tuning helpers.
training_config = {
    "max_epochs": 5,
    # NOTE(review): "mps" ties this notebook to Apple-silicon hardware — confirm
    # before running elsewhere.
    "accelerator": "mps",
    "devices": 1,
    "precision": 16,
    # NOTE(review): patience (10) is larger than max_epochs (5); if patience is
    # counted in epochs, early stopping can never fire in a run this short.
    "early_stopping_patience": 10,
    "reload_dataloaders_every_n_epochs": 1,
}

## Remaining Configs

In [None]:
# Experiment-level settings shared by the training and fine-tuning calls.
# NOTE(review): output_dir is the same directory as
# path_to_preprocessing_output_directory — confirm that sharing it is intended.
output_dir = "/Users/cooper/Desktop/hydro-forecasting/tests/yolo_7"
# model_types and yaml_paths are parallel lists; presumably yaml_paths[i]
# configures model_types[i] — TODO confirm against the training helper.
model_types = ["tide", "ealstm"]
yaml_paths = [
    "/Users/cooper/Desktop/hydro-forecasting/experiments/TiDE_on_low_medium/yaml_files/tide.yaml",
    "/Users/cooper/Desktop/hydro-forecasting/experiments/TiDE_on_low_medium/yaml_files/ealstm.yaml",
]
experiment_name = "first_test"
num_runs = 2
override_previous_attempts = False

## Training the models from scratch

In [None]:
# Launch from-scratch training for the selected gauges with the datamodule,
# training and experiment settings defined in the cells above. The return
# value is a wrapper that a later cell opens with `.unwrap()`.
train_results = train_model_from_scratch(
    gauge_ids=basin_ids,
    datamodule_config=datamodule_config,
    training_config=training_config,
    output_dir=output_dir,
    model_types=model_types,
    yaml_paths=yaml_paths,
    experiment_name=experiment_name,
    num_runs=num_runs,
    override_previous_attempts=override_previous_attempts,
)

In [None]:
# Replace the wrapper returned by train_model_from_scratch with its payload.
# Guarded so the cell is safe to re-run: after the first run `train_results`
# already holds the payload, which presumably no longer exposes `.unwrap()`,
# and an unguarded second call would raise AttributeError.
if hasattr(train_results, "unwrap"):
    train_results = train_results.unwrap()

In [None]:
# Show the training results as this cell's output.
train_results

In [None]:
# Fine-tune the models produced by the from-scratch experiment on the same
# gauges, reusing the datamodule and training settings.
#
# Reference (from the signature previously pasted in this cell):
# finetune_pretrained_models also accepts `pretrained_run_index` and
# `pretrained_attempt_index` (both default to None), which are left unset here.
# The values passed for select_best_from_pretrained, lr_reduction_factor,
# num_runs, base_seed and override_previous_attempts equal the defaults
# listed in that signature.
finetune_results = finetune_pretrained_models(
    gauge_ids=basin_ids,
    # Built from `experiment_name` instead of a repeated "first_test" literal,
    # so this path keeps pointing at the from-scratch run if it is renamed.
    pretrained_experiment_output_dir=f"{output_dir}/{experiment_name}/checkpoints",
    model_types=model_types,
    pretrained_yaml_paths=yaml_paths,
    datamodule_config=datamodule_config,
    training_config=training_config,
    output_dir=output_dir,
    experiment_name="finetune_test",
    select_best_from_pretrained=True,
    lr_reduction_factor=10.0,
    num_runs=1,
    base_seed=42,
    override_previous_attempts=False,
)