In [None]:
# !jupyter lab build

In [None]:
import sys
from pathlib import Path

# Make the project's src/ directory importable from this notebook.
# The project root is taken to be two levels above the current working directory.
project_root = Path.cwd().parent.parent
src_path = project_root.joinpath("src")
if sys.path.count(str(src_path)) == 0:
    # Put it first so it is searched before the other entries
    sys.path.insert(0, str(src_path))
    print(f"Added {src_path} to Python path")

In [None]:
from sklearn.pipeline import Pipeline

from hydro_forecasting.data.caravanify_parquet import (
    CaravanifyParquet,
    CaravanifyParquetConfig,
)
from hydro_forecasting.experiment_utils.hyperparameter_tune_model import hyperparameter_tune_model
from hydro_forecasting.preprocessing.grouped import GroupedPipeline
from hydro_forecasting.preprocessing.normalize import NormalizeTransformer
from hydro_forecasting.preprocessing.standard_scale import StandardScaleTransformer

---

## Experiment constants

In [None]:
# Region codes; each one is used to build the per-region data directories below.
REGIONS = ["CA"]

# Country whose basins are selected for this experiment (also part of the experiment name).
COUNTRY = "tajikistan"

## Loading the data (as gauge_ids)

In [None]:
def load_basin_ids(
    country: str,
    attributes_dir: str = "/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes",
    timeseries_dir: str = "/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/csv",
) -> list[str]:
    """
    Load the gauge ids of the Central Asia ("CA") basins that belong to one country.

    Args:
        country: Country name, matched case-insensitively. Only "Tajikistan" and
            "Kyrgyzstan" are supported.
        attributes_dir: Directory passed to CaravanifyParquetConfig as ``attributes_dir``.
            Defaults to the location this notebook was written against.
        timeseries_dir: Directory passed to CaravanifyParquetConfig as ``timeseries_dir``.
            Defaults to the location this notebook was written against.

    Returns:
        The unique ``gauge_id`` values whose ``country`` static attribute equals the
        normalised country name. For an unsupported country a notice is printed and
        an empty list is returned.
    """
    supported_countries = ("Tajikistan", "Kyrgyzstan")

    # Normalise the spelling, e.g. "tajikistan" / "TAJIKISTAN" -> "Tajikistan"
    country = country.lower().capitalize()

    if country not in supported_countries:
        print("Country not supported")
        return []

    configs = CaravanifyParquetConfig(
        attributes_dir=attributes_dir,
        timeseries_dir=timeseries_dir,
        gauge_id_prefix="CA",
        use_hydroatlas_attributes=True,
        use_caravan_attributes=True,
        use_other_attributes=True,
    )

    # Every CA station is loaded first; the country filter is applied on the static attributes afterwards.
    caravan = CaravanifyParquet(configs)
    ca_basins = caravan.get_all_gauge_ids()
    caravan.load_stations(ca_basins)
    static_data = caravan.get_static_attributes()

    in_country = static_data["country"] == country
    return list(static_data[in_country]["gauge_id"].unique())

# Gauge ids for the configured COUNTRY; load_basin_ids returns an empty list for unsupported countries.
basin_ids = load_basin_ids(COUNTRY)

## Datamodule Configs

In [None]:
# Region code -> directory with that region's post-processed time series files.
region_time_series_base_dirs = dict(
    (region, f"/Users/cooper/Desktop/CaravanifyParquet/{region}/post_processed/timeseries/csv/{region}")
    for region in REGIONS
)

# Region code -> directory with that region's post-processed static attributes.
region_static_attributes_base_dirs = dict(
    (region, f"/Users/cooper/Desktop/CaravanifyParquet/{region}/post_processed/attributes/{region}")
    for region in REGIONS
)

# Country-specific directory handed to the datamodule for its preprocessing output.
path_to_preprocessing_output_directory = f"/Users/cooper/Desktop/hydro-forecasting/experiments/HPT/data_cache/{COUNTRY}"

In [None]:
# Time-varying input columns. They are the columns of the grouped feature pipeline
# and are passed to the datamodule under "forcing_features".
forcing_features = [
    "snow_depth_water_equivalent_mean",
    "surface_net_solar_radiation_mean",
    "surface_net_thermal_radiation_mean",
    "potential_evaporation_sum_ERA5_LAND",
    "potential_evaporation_sum_FAO_PENMAN_MONTEITH",
    "temperature_2m_mean",
    "temperature_2m_min",
    "temperature_2m_max",
    "total_precipitation_sum",
]

# Static attribute columns. They are the "columns" of the static preprocessing entry
# and are passed to the datamodule under "static_features".
static_features = [
    "p_mean",
    "area",
    "ele_mt_sav",
    "high_prec_dur",
    "frac_snow",
    "high_prec_freq",
    "slp_dg_sav",
    "cly_pc_sav",
    "aridity_ERA5_LAND",
    "aridity_FAO_PM",
]

# Target variable name handed to the datamodule under "target".
target = "streamflow"

In [None]:
def _scale_then_normalize() -> Pipeline:
    """Return a new, unfitted pipeline: StandardScaleTransformer followed by NormalizeTransformer."""
    return Pipeline([("scaler", StandardScaleTransformer()), ("normalizer", NormalizeTransformer())])


# Forcing features are transformed separately for each gauge_id group.
feature_pipeline = GroupedPipeline(
    _scale_then_normalize(),
    columns=forcing_features,
    group_identifier="gauge_id",
)

# The target gets the same per-gauge treatment. Its column comes from `target`
# so the pipeline and the datamodule config cannot drift apart.
target_pipeline = GroupedPipeline(
    _scale_then_normalize(),
    columns=[target],
    group_identifier="gauge_id",
)

# Static attributes are only standard-scaled and are not wrapped in a GroupedPipeline.
static_pipeline = Pipeline([("scaler", StandardScaleTransformer())])

# Bundle handed to the datamodule under "preprocessing_configs".
preprocessing_config = {
    "features": {"pipeline": feature_pipeline},
    "target": {"pipeline": target_pipeline},
    "static_features": {"pipeline": static_pipeline, "columns": static_features},
}

In [None]:
# Settings for the datamodule; passed to hyperparameter_tune_model as `datamodule_config`.
datamodule_config = {
    "output_length": 10,
    # Data locations built in the cells above
    "region_time_series_base_dirs": region_time_series_base_dirs,
    "region_static_attributes_base_dirs": region_static_attributes_base_dirs,
    "path_to_preprocessing_output_directory": path_to_preprocessing_output_directory,
    # Same identifier the grouped preprocessing pipelines use
    "group_identifier": "gauge_id",
    "batch_size": 2048,
    # Column selections defined above
    "forcing_features": forcing_features,
    "static_features": static_features,
    "target": target,
    "num_workers": 4,
    "min_train_years": 5,
    # Split proportions; the three values sum to 1.0
    "train_prop": 0.5,
    "val_prop": 0.25,
    "test_prop": 0.25,
    "max_imputation_gap_size": 5,
    "chunk_size": 100,
    "validation_chunk_size": 100,
    "is_autoregressive": True,
    # Feature / target / static pipelines assembled above
    "preprocessing_configs": preprocessing_config,
}

## Training Configs

In [None]:
# Training settings; passed to hyperparameter_tune_model as `training_config`.
training_config = {
    "max_epochs": 200,
    # NOTE(review): "mps" looks like the Apple-silicon backend; it will likely need
    # changing on other hardware - confirm.
    "accelerator": "mps",
    "devices": 1,
    "early_stopping_patience": 15,
    # NOTE(review): if this is forwarded to Lightning's Trainer, the option is documented
    # as an int (0 = never reload). False compares equal to 0 - confirm how it is consumed.
    "reload_dataloaders_every_n_epochs": False,
}

## Remaining Configs

In [None]:
# Directory passed to hyperparameter_tune_model as `output_dir_study`.
output_dir = "/Users/cooper/Desktop/hydro-forecasting/experiments/HPT"
# Model identifier passed as `model_type`; also embedded in the experiment name.
model_type = "tide"
# Built from the model type and the country, e.g. "HPT_tide_tajikistan".
experiment_name = f"HPT_{model_type}_{COUNTRY}"
# Number of trials requested through `n_trials`.
num_trials = 5

## Training the models from scratch

In [None]:
# Launch the hyperparameter search for the selected basins with the configs assembled above.
# NOTE(review): presumably the study is persisted under output_dir and the search spaces are
# read from search_spaces_dir - confirm against hyperparameter_tune_model.
hpt_study = hyperparameter_tune_model(
    model_type=model_type,
    gauge_ids=basin_ids,
    datamodule_config=datamodule_config,
    training_config=training_config,
    output_dir_study=output_dir,
    experiment_name=experiment_name,
    n_trials=num_trials,
    search_spaces_dir="/Users/cooper/Desktop/hydro-forecasting/experiments/HPT/search-spaces",
)

In [None]:
import optuna
import pandas as pd
import matplotlib.pyplot as plt
import yaml
from pathlib import Path

# Set your parameters here - corrected path and study name
# NOTE(review): these literals repeat output_dir and experiment_name from earlier in the notebook
# (for model_type="tide", COUNTRY="tajikistan"); update them by hand if those change.
db_path = "/Users/cooper/Desktop/hydro-forecasting/experiments/HPT/HPT_tide_tajikistan/HPT_tide_tajikistan_study.db"
study_name = "HPT_tide_tajikistan"  # Use exact name from logs
# Destination of the best configuration written by the analysis below
output_yaml = "/Users/cooper/Desktop/hydro-forecasting/experiments/HPT/best_params.yaml"


# Define analysis function
def analyze_optuna_study(storage_path, study_name):
    """Load an Optuna study from SQLite storage and report its best trial.

    Args:
        storage_path: Path of the SQLite database file, or a complete ``sqlite:///`` URL.
            Both ``str`` and ``pathlib.Path`` values are accepted.
        study_name: Name of the study inside the storage.

    Returns:
        A ``(study, best_trial, model_config)`` tuple. ``model_config`` is the best trial's
        ``"model_config"`` user attribute when present, otherwise its ``"hparams"`` user
        attribute, otherwise a copy of its sampled parameters.
        ``(None, None, None)`` is returned when the study holds no trials or when no
        trial has completed yet.
    """
    # Accept Path objects as well as strings
    storage_path = str(storage_path)
    # Only prepend the SQLite scheme when the caller passed a bare file path
    storage_url = storage_path if storage_path.startswith("sqlite:///") else f"sqlite:///{storage_path}"

    # Load the study
    print(f"Loading study '{study_name}' from {storage_url}")
    study = optuna.load_study(study_name=study_name, storage=storage_url)

    # Show the part of the summary that is available even for an empty study
    print("\nStudy Summary:")
    print(f"  Number of trials: {len(study.trials)}")
    print(f"  Direction: {study.direction}")

    if not study.trials:
        print("No trials found in study.")
        return None, None, None

    # Optuna raises ValueError from best_trial / best_value while no trial has completed,
    # so the best trial must be fetched only after the guard above and with this fallback.
    try:
        best_trial = study.best_trial
    except ValueError as err:
        print(f"No completed trials found in study: {err}")
        return None, None, None

    print(f"  Best value: {best_trial.value}")

    print(f"\nBest Trial (#{best_trial.number}):")
    print(f"  Value: {best_trial.value}")

    # Display best hyperparameters
    print("\nBest Hyperparameters:")
    for param_name, param_value in best_trial.params.items():
        print(f"  {param_name}: {param_value}")

    # A configuration stored in the user attributes takes precedence over the sampled parameters
    user_attrs = getattr(best_trial, "user_attrs", None) or {}
    if "model_config" in user_attrs:
        model_config = user_attrs["model_config"]
    elif "hparams" in user_attrs:
        model_config = user_attrs["hparams"]
    else:
        model_config = best_trial.params.copy()

    return study, best_trial, model_config


# Load the study and pull out its best trial together with the matching configuration
study, best_trial, best_config = analyze_optuna_study(db_path, study_name)

# Persist the best configuration; skipped when the analysis produced none
if best_config:
    yaml_path = Path(output_yaml)
    print(f"\nSaving best model config to {yaml_path}")
    with yaml_path.open("w") as f:
        yaml.dump(best_config, f)

# The history plot needs a loaded study with more than one trial
if not (study and len(study.trials) > 1):
    print("\nNot enough trials to plot optimization history")
else:
    # One row per trial
    df = study.trials_dataframe()

    # Objective value against trial number
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(df["number"], df["value"], "o-")
    ax.set(
        title="Optimization History",
        xlabel="Trial Number",
        ylabel="Objective Value (val_loss)",
    )
    plt.show()

    # Parameter importances are only attempted with at least 5 trials
    if len(study.trials) >= 5:
        try:
            importances = optuna.importance.get_param_importances(study)
            importance_df = pd.DataFrame(
                importances.items(), columns=["Parameter", "Importance"]
            ).sort_values("Importance", ascending=False)

            fig, ax = plt.subplots(figsize=(10, 6))
            ax.barh(importance_df["Parameter"], importance_df["Importance"])
            ax.set(title="Parameter Importances", xlabel="Importance")
            plt.tight_layout()
            plt.show()
        except Exception as e:
            # Best effort: a failed importance evaluation must not abort the cell
            print(f"Could not calculate parameter importances: {e}")