In [1]:
import sys
from pathlib import Path

# Make the project's src/ directory importable from this notebook.
# The project root is taken to be the parent of the current working directory.
project_root = Path.cwd().parent
src_path = project_root.joinpath("src")
if sys.path.count(str(src_path)) == 0:
    # Prepend so modules under src/ are found before other sys.path entries.
    sys.path.insert(0, str(src_path))
    print(f"Added {src_path} to Python path")

Added /Users/cooper/Desktop/hydro-forecasting/src to Python path


In [2]:
from hydro_forecasting.preprocessing.grouped import GroupedPipeline
from sklearn.pipeline import Pipeline
from hydro_forecasting.preprocessing.standard_scale import StandardScaleTransformer
from hydro_forecasting.data.caravanify_parquet import (
    CaravanifyParquet,
    CaravanifyParquetConfig,
)
from hydro_forecasting.preprocessing.static_preprocessing import process_static_data, save_static_pipeline, load_static_pipeline

from returns.result import Success
import polars as pl


---

In [3]:
# Forcing (time-series input) column names; passed as `columns` to the
# feature pipeline later in the notebook.
forcing_features = [
    # snow
    "snow_depth_water_equivalent_mean",
    # radiation
    "surface_net_solar_radiation_mean",
    "surface_net_thermal_radiation_mean",
    # potential evaporation
    "potential_evaporation_sum_ERA5_LAND",
    "potential_evaporation_sum_FAO_PENMAN_MONTEITH",
    # 2 m temperature
    "temperature_2m_mean",
    "temperature_2m_min",
    "temperature_2m_max",
    # precipitation
    "total_precipitation_sum",
]

# Static attribute column names.
# NOTE: "gauge_id" is not listed here; the static-attribute selection appends
# it explicitly.
static_features = [
    "p_mean",
    "area",
    "ele_mt_sav",
    "high_prec_dur",
    "frac_snow",
    "high_prec_freq",
    "slp_dg_sav",
    "cly_pc_sav",
    "aridity_ERA5_LAND",
    "aridity_FAO_PM",
]

# Target column name(s).
target = ["streamflow"]

In [4]:
# Data locations and attribute-source switches for the USA dataset.
# NOTE(review): shapefile_dir lives under a different root (CAMELS-CH/...) than
# the attributes/timeseries directories, and timeseries_dir ends in "csv" even
# though the loader is CaravanifyParquet — confirm both are intended.
config_ca = CaravanifyParquetConfig(
    gauge_id_prefix="USA",
    attributes_dir="/Users/cooper/Desktop/CaravanifyParquet/USA/post_processed/attributes",
    timeseries_dir="/Users/cooper/Desktop/CaravanifyParquet/USA/post_processed/timeseries/csv",
    shapefile_dir="/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/USA/post_processed/shapefiles",
    use_hydroatlas_attributes=True,
    use_caravan_attributes=True,
    use_other_attributes=True,
)
caravan_ca = CaravanifyParquet(config_ca)

# Restrict the run to the first 20 gauge IDs returned by the loader.
all_gauge_ids = caravan_ca.get_all_gauge_ids()
basin_ids = all_gauge_ids[:20]
caravan_ca.load_stations(basin_ids)

# Keep only the selected static features plus the gauge identifier column.
static_columns = [*static_features, "gauge_id"]
static_data = caravan_ca.get_static_attributes()[static_columns]

In [5]:
# Inspect the selected static attributes (chosen feature columns plus gauge_id).
static_data

Unnamed: 0,p_mean,area,ele_mt_sav,high_prec_dur,frac_snow,high_prec_freq,slp_dg_sav,cly_pc_sav,aridity_ERA5_LAND,aridity_FAO_PM,gauge_id
0,3.175454,2297.683155,276.198974,1.105263,0.374538,0.047433,41.550101,8.311524,3.763929,0.492557,USA_01013500
1,3.203837,619.102595,103.274219,1.08377,0.33544,0.056674,22.505966,7.313426,4.539418,0.580916,USA_01022500
2,3.184069,3666.943011,174.744464,1.093123,0.31613,0.052225,20.638897,8.365937,4.263635,0.532879,USA_01030500
3,3.289056,764.824149,303.634479,1.093185,0.306273,0.053799,38.904182,8.46036,3.947586,0.510585,USA_01031500
4,3.27561,902.895076,378.958111,1.10043,0.299063,0.052498,67.159417,8.040634,3.694839,0.501584,USA_01047000
5,3.402069,395.44491,644.379447,1.07804,0.368406,0.040657,88.993129,8.0,3.128363,0.462018,USA_01052500
6,3.664167,180.835783,626.0,1.104073,0.374272,0.050103,124.0,8.0,3.001874,0.455642,USA_01054200
7,3.177602,250.426117,548.306359,1.088496,0.386999,0.050513,88.614819,8.0,3.721149,0.518598,USA_01055000
8,3.26927,197.428264,282.385672,1.104278,0.310281,0.056537,67.212435,9.0,4.042607,0.534917,USA_01057000
9,3.193947,32.107003,43.392693,1.101828,0.2446,0.057769,17.241375,6.958592,4.992413,0.624726,USA_01073000


---

In [6]:
def _grouped_standard_scaler(columns):
    """Build a grouped standard-scaling pipeline for the given columns.

    Args:
        columns: Names of the columns the grouped pipeline is applied to.

    Returns:
        A ``GroupedPipeline`` wrapping a single-step sklearn ``Pipeline``
        (``StandardScaleTransformer``), grouped by ``"gauge_id"``.
    """
    return GroupedPipeline(
        Pipeline([("scaler", StandardScaleTransformer())]),
        columns=columns,
        group_identifier="gauge_id",
        chunk_size=50,
        n_jobs=-1,  # NOTE(review): presumably "use all cores" (joblib convention) — confirm
    )


# Forcing and target columns share the same grouped scaler configuration.
feature_pipeline = _grouped_standard_scaler(forcing_features)
# Reuse the notebook-level `target` list instead of repeating the column name.
target_pipeline = _grouped_standard_scaler(target)

# Static attributes are scaled with a plain (ungrouped) sklearn pipeline.
static_pipeline = Pipeline([("scaler", StandardScaleTransformer())])

# Configuration handed to process_static_data: one pipeline per data kind,
# with the static entry also naming the columns it applies to.
preprocessing_config = {
    "features": {"pipeline": feature_pipeline},
    "target": {"pipeline": target_pipeline},
    "static_features": {"pipeline": static_pipeline, "columns": static_features},
}

In [7]:
# Fit the static-feature pipeline on the selected gauges and write the
# processed attributes to parquet. The call returns a Result container.
results = process_static_data(
    region_static_attributes_base_dirs={"USA": "/Users/cooper/Desktop/CaravanifyParquet/USA/post_processed/attributes/USA"},
    list_of_gauge_ids=basin_ids,
    preprocessing_config=preprocessing_config,
    output_path="/Users/cooper/Desktop/hydro-forecasting/tests/USA_static_test.parquet",
)

# Fail fast: later cells need `path_to_static` and `fitted_pipeline`. Only
# printing the failure would leave them undefined (or stale from an earlier
# run) and let "Run All" continue on bad state.
if not isinstance(results, Success):
    raise RuntimeError(f"Error processing static data: {results.failure()}")

path_to_static, fitted_pipeline = results.unwrap()
print(f"Static data saved to {path_to_static}")


Static data saved to /Users/cooper/Desktop/hydro-forecasting/tests/USA_static_test.parquet


In [8]:
# Read the processed static attributes back from the parquet file at path_to_static.
test_read = pl.read_parquet(path_to_static)

In [9]:
# Summary statistics of the processed static attributes read back from parquet.
test_read.describe()

statistic,gauge_id,p_mean,area,ele_mt_sav,high_prec_dur,frac_snow,high_prec_freq,slp_dg_sav,cly_pc_sav,aridity_ERA5_LAND,aridity_FAO_PM
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""20""",20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,-5.5067e-15,-1.1102e-16,1.3323e-16,-7.8382e-15,1.0436e-15,1.0436e-15,-3.7748e-16,8.8818e-16,-5.9952e-16,-1.6209e-15
"""std""",,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"""min""","""USA_01013500""",-1.044237,-0.643325,-1.567372,-1.391677,-2.248621,-1.819549,-1.416819,-1.496782,-1.44635,-1.341857
"""25%""",,-0.599657,-0.557057,-0.854668,-0.725535,-0.290942,-0.804096,-0.787128,-0.721697,-0.526026,-0.609332
"""50%""",,-0.354345,-0.403738,0.094028,0.034842,0.140249,0.175933,0.130528,0.053388,-0.327021,-0.142608
"""75%""",,0.043317,0.003896,0.513199,0.342738,0.585141,0.919811,0.758958,0.410206,0.526425,0.655574
"""max""","""USA_01144000""",2.579207,3.26227,1.879575,2.159251,1.371585,1.592843,1.837654,2.361826,1.882815,2.075727


In [10]:
# Display the fitted pipeline returned by process_static_data.
fitted_pipeline

In [11]:
# Persist the fitted static pipeline; the call returns a Result container.
save_results = save_static_pipeline(
    pipeline=fitted_pipeline,
    filepath="/Users/cooper/Desktop/hydro-forecasting/tests/USA_static_test_pipeline.joblib",
)

# Fail fast: if saving failed, a later load from the same path could silently
# pick up a stale file left by an earlier run.
if not isinstance(save_results, Success):
    raise RuntimeError(f"Error saving pipeline: {save_results.failure()}")

print(f"Pipeline saved to {save_results.unwrap()}")

Pipeline saved to /Users/cooper/Desktop/hydro-forecasting/tests/USA_static_test_pipeline.joblib


In [None]:
# Load the persisted static pipeline; the call returns a Result container.
pipeline_path = "/Users/cooper/Desktop/hydro-forecasting/tests/USA_static_test_pipeline.joblib"
load_results = load_static_pipeline(filepath=pipeline_path)

# Fail fast: the next cell displays `loaded_pipeline`, which would otherwise be
# undefined (or stale) after a failed load.
if not isinstance(load_results, Success):
    raise RuntimeError(f"Error loading pipeline: {load_results.failure()}")

loaded_pipeline = load_results.unwrap()
# Report the source file path; unwrap() yields the pipeline object itself, so
# interpolating it printed the pipeline repr instead of a location.
print(f"Pipeline loaded from {pipeline_path}")

Pipeline loaded from Pipeline(steps=[('scaler', StandardScaleTransformer())])


In [13]:
# Display the pipeline loaded back from the joblib file.
loaded_pipeline