In [1]:
import sys
from pathlib import Path

# Put the project's ``src`` folder at the front of ``sys.path`` so packages
# located there can be imported (the cells below import ``hydro_forecasting``).
# The project root is taken to be the parent of the current working directory,
# so this only works when the notebook runs from a folder directly below it.
project_root = Path.cwd().parent
src_path = project_root.joinpath("src")
if sys.path.count(str(src_path)) == 0:
    # Index 0 gives this checkout priority over any other copy on the path.
    sys.path.insert(0, str(src_path))
    print(f"Added {src_path} to Python path")


Added /Users/cooper/Desktop/hydro-forecasting/src to Python path


In [2]:
from hydro_forecasting.preprocessing.grouped import GroupedPipeline
from sklearn.pipeline import Pipeline
from hydro_forecasting.preprocessing.standard_scale import StandardScaleTransformer
from hydro_forecasting.data.caravanify_parquet import (
    CaravanifyParquet,
    CaravanifyParquetConfig,
)
from hydro_forecasting.data.preprocessing import run_hydro_processor

from returns.result import Failure, Success, Result


---

In [3]:
# Caravanify configuration for the gauges prefixed "CA", with all three
# attribute sources (HydroATLAS, Caravan, other) switched on.
# NOTE(review): shapefile_dir points into a different root (CAMELS-CH/...) than
# the attributes/timeseries directories — confirm this is intended.
config_ca = CaravanifyParquetConfig(
    gauge_id_prefix="CA",
    attributes_dir="/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes",
    timeseries_dir="/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/csv",
    shapefile_dir="/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/CA/post_processed/shapefiles",
    use_caravan_attributes=True,
    use_hydroatlas_attributes=True,
    use_other_attributes=True,
)
caravan_ca = CaravanifyParquet(config_ca)

# Keep the run small: only the first ten gauge ids returned are processed.
all_gauge_ids = caravan_ca.get_all_gauge_ids()
basin_ids = all_gauge_ids[:10]



In [4]:
# Time-series input columns. They are the columns handled by `feature_pipeline`
# and, together with `target`, make up `required_columns` for the processor run.
forcing_features = [
    "snow_depth_water_equivalent_mean",
    "surface_net_solar_radiation_mean",
    "surface_net_thermal_radiation_mean",
    "potential_evaporation_sum_ERA5_LAND",
    "potential_evaporation_sum_FAO_PENMAN_MONTEITH",
    "temperature_2m_mean",
    "temperature_2m_min",
    "temperature_2m_max",
    "total_precipitation_sum",
]

# Static attribute columns, passed as the "columns" of the "static_features"
# entry in `preprocessing_config`.
static_features = [
    # "gauge_id" is left out of this list (presumably because it is the group
    # identifier rather than a feature to scale — TODO confirm).
    "p_mean",
    "area",
    "ele_mt_sav",
    "high_prec_dur",
    "frac_snow",
    "high_prec_freq",
    "slp_dg_sav",
    "cly_pc_sav",
    "aridity_ERA5_LAND",
    "aridity_FAO_PM",
]

# Target column(s); appended to `forcing_features` to form `required_columns`.
target = ["streamflow"]

In [5]:
# Forcing columns: a single-step scaler pipeline wrapped in a GroupedPipeline
# keyed on "gauge_id" (presumably fitted separately per gauge — confirm against
# the GroupedPipeline implementation).
feature_pipeline = GroupedPipeline(
    Pipeline([("scaler", StandardScaleTransformer())]),
    columns=forcing_features,
    group_identifier="gauge_id",
)

# Target column(s): same construction with its own scaler instance. The columns
# come from `target` rather than a repeated literal, so this pipeline cannot
# drift from the `required_columns` built from the same list.
target_pipeline = GroupedPipeline(
    Pipeline([("scaler", StandardScaleTransformer())]),
    columns=target,
    group_identifier="gauge_id",
)

# Static attributes: a plain sklearn Pipeline, not wrapped in a GroupedPipeline.
static_pipeline = Pipeline([("scaler", StandardScaleTransformer())])

# Mapping handed to run_hydro_processor as `preprocessing_config`. Only the
# static entry carries an explicit column list; the grouped pipelines were
# given their columns at construction time.
preprocessing_config = {
    "features": {"pipeline": feature_pipeline},
    "target": {"pipeline": target_pipeline},
    "static_features": {"pipeline": static_pipeline, "columns": static_features},
}

In [6]:
# Root of the post-processed Caravanify exports. Each region has its own
# sub-tree, and the region code appears again as the innermost folder.
caravanify_root = "/Users/cooper/Desktop/CaravanifyParquet"
regions = ("CA", "USA")

# Region code -> directory holding that region's time series files.
region_time_series_base_dirs = {
    region: f"{caravanify_root}/{region}/post_processed/timeseries/csv/{region}"
    for region in regions
}

# Region code -> directory holding that region's static attribute files.
region_static_attributes_base_dirs = {
    region: f"{caravanify_root}/{region}/post_processed/attributes/{region}"
    for region in regions
}

# NOTE(review): looks like a placeholder value — confirm what
# run_hydro_processor expects in `datamodule_config`.
datamodule_config = {"test": "test"}

# Run the preprocessing for the selected basins.
# run_hydro_processor is annotated as returning Result[ProcessingOutput, str]:
# a Success wraps the ProcessingOutput, a Failure wraps an error message.
# Every tunable argument is passed explicitly so the whole run configuration is
# visible here instead of relying on the function's defaults.
run_results = run_hydro_processor(
    region_time_series_base_dirs=region_time_series_base_dirs,
    region_static_attributes_base_dirs=region_static_attributes_base_dirs,
    path_to_preprocessing_output_directory=Path(
        "/Users/cooper/Desktop/hydro-forecasting/tests/testing_run_hydro_processor"
    ),
    required_columns=forcing_features + target,
    run_uuid="000001",
    datamodule_config=datamodule_config,
    preprocessing_config=preprocessing_config,
    min_train_years=5.0,
    max_imputation_gap_size=5,
    group_identifier="gauge_id",
    train_prop=0.5,
    val_prop=0.25,
    test_prop=0.25,
    list_of_gauge_ids_to_process=basin_ids,
    basin_batch_size=50,
)

if isinstance(run_results, Failure):
    print("Failed to run hydro processor")
    # Stop the notebook here. Merely printing would leave `processing_output`
    # undefined (or stale from an earlier run in the same kernel), so later
    # cells would fail with a NameError or silently show old results.
    raise RuntimeError(f"run_hydro_processor failed: {run_results.failure()}")

print("Successfully ran hydro processor")
processing_output = run_results.unwrap()

SUCCESS: Config saved to /Users/cooper/Desktop/hydro-forecasting/tests/testing_run_hydro_processor/000001/config.json
SUCCESS: Static features saved to /Users/cooper/Desktop/hydro-forecasting/tests/testing_run_hydro_processor/000001/processed_static_features.parquet
SUCCESS: Static features pipeline saved to /Users/cooper/Desktop/hydro-forecasting/tests/testing_run_hydro_processor/000001/fitted_static_pipeline.joblib
INFO: Processed 10 basins, 9 passed quality checks
SUCCESS: Preprocessing completed successfully. Output at /Users/cooper/Desktop/hydro-forecasting/tests/testing_run_hydro_processor/000001
Successfully ran hydro processor


In [7]:
# Show the ProcessingOutput unwrapped from the run result in the previous cell.
# `processing_output` is only assigned there when the run succeeded.
print(processing_output)

ProcessingOutput(run_output_dir=PosixPath('/Users/cooper/Desktop/hydro-forecasting/tests/testing_run_hydro_processor/000001'), processed_timeseries_dir=PosixPath('/Users/cooper/Desktop/hydro-forecasting/tests/testing_run_hydro_processor/000001/processed_time_series'), processed_static_attributes_path=PosixPath('/Users/cooper/Desktop/hydro-forecasting/tests/testing_run_hydro_processor/000001/processed_static_features.parquet'), fitted_time_series_pipelines_path=PosixPath('/Users/cooper/Desktop/hydro-forecasting/tests/testing_run_hydro_processor/000001/fitted_time_series_pipelines.joblib'), fitted_static_pipeline_path=PosixPath('/Users/cooper/Desktop/hydro-forecasting/tests/testing_run_hydro_processor/000001/fitted_static_pipeline.joblib'), quality_reports_dir=PosixPath('/Users/cooper/Desktop/hydro-forecasting/tests/testing_run_hydro_processor/000001/quality_reports'), summary_quality_report_path=PosixPath('/Users/cooper/Desktop/hydro-forecasting/tests/testing_run_hydro_processor/000001/