In [1]:
import sys
from pathlib import Path

# Make the project's "src" folder importable. The project root is taken to be
# the parent of the current working directory, so this relies on where the
# notebook kernel was started.
project_root = Path.cwd().parent
src_path = project_root.joinpath("src")
# Only prepend (and report) when the entry is not on sys.path yet, so that
# re-running the cell does not add duplicates.
if sys.path.count(str(src_path)) == 0:
    sys.path[:0] = [str(src_path)]
    print(f"Added {src_path} to Python path")


Added /Users/cooper/Desktop/hydro-forecasting/src to Python path


In [2]:
import pandas as pd

from hydro_forecasting.preprocessing.grouped import GroupedTransformer
from sklearn.pipeline import Pipeline
from hydro_forecasting.preprocessing.standard_scale import StandardScaleTransformer
from hydro_forecasting.preprocessing.log_scale import LogTransformer
from hydro_forecasting.data.preprocessing import run_hydro_processor

---

In [3]:
def _grouped_standard_scaler(columns):
    """Build a GroupedTransformer around a one-step StandardScaleTransformer pipeline.

    Every call creates a fresh Pipeline and scaler instance, so the returned
    transformers share no objects with each other.

    Args:
        columns: Column names forwarded to GroupedTransformer's ``columns`` argument.

    Returns:
        A GroupedTransformer with ``group_identifier="gauge_id"``.
        NOTE(review): presumably this means one fit per gauge - confirm
        against the GroupedTransformer implementation.
    """
    scaling_steps = [("scaler", StandardScaleTransformer())]
    return GroupedTransformer(
        Pipeline(scaling_steps),
        columns=columns,
        group_identifier="gauge_id",
    )


# Time-varying inputs and the target each get their own grouped transformer.
feature_pipeline = _grouped_standard_scaler(
    ["total_precipitation_sum", "potential_evaporation_sum_ERA5_LAND"]
)
target_pipeline = _grouped_standard_scaler(["streamflow"])

# Static attributes use a plain pipeline without the grouped wrapper.
static_pipeline = Pipeline([("scaler", StandardScaleTransformer())])
static_features_config = {"pipeline": static_pipeline, "columns": ["ele_mt_sav"]}

# Mapping passed to run_hydro_processor through its preprocessing_config argument.
preprocessing_config = {
    "features": {"pipeline": feature_pipeline},
    "target": {"pipeline": target_pipeline},
    "static_features": static_features_config,
}

In [4]:
# Sanity check: display the column list the feature transformer was constructed with.
feature_pipeline.columns

['total_precipitation_sum', 'potential_evaporation_sum_ERA5_LAND']

In [5]:
# Keyword arguments for the processing run, collected in one mapping so the
# inputs can be read at a glance before the call.
processor_kwargs = {
    "input_dir": "/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/csv/CA",
    "output_dir": "/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/testing_run_hydro_processor",
    "required_columns": ["streamflow", "total_precipitation_sum", "potential_evaporation_sum_ERA5_LAND"],
    "preprocessing_config": preprocessing_config,
    "static_dir": "/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes/CA",
    # NOTE(review): presumably the number of worker processes - confirm against run_hydro_processor.
    "processes": 8,
    "basin_ids": [
        "CA_15030",
        "CA_15013",
        "CA_15014",
        "CA_15015",
        "CA_15016",
        "CA_15017",
        "CA_15018",
        "CA_15019",
    ],
}
result = run_hydro_processor(**processor_kwargs)

# Pull the individual entries out of the returned mapping for use in later cells.
quality_report = result["quality_report"]
fitted_pipelines = result["fitted_pipelines"]
processed_dir = result["processed_dir"]

Found 3 basin files
Loading caravan attributes from /Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes/CA/attributes_caravan_CA.parquet
Loading hydroatlas attributes from /Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes/CA/attributes_hydroatlas_CA.parquet
Loading other attributes from /Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes/CA/attributes_other_CA.parquet
Merging 3 attribute DataFrames
Loaded static attributes for 3 basins
Fitting preprocessing pipelines on all basins...


Loading basin data for fitting: 100%|██████████| 78/78 [00:00<00:00, 263.93it/s]


Loaded 3 basins for pipeline fitting
Split data into train (271191 rows), val (135581 rows), test (135656 rows)
Fitted 3 pipelines


Processing basins: 100%|██████████| 3/3 [00:00<00:00,  4.36it/s]

Processing static attributes...
Saved transformed static attributes for 2 basins to /Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/testing_run_hydro_processor/processed_static_data/static_attributes.parquet
Processing complete. 2 basins retained out of 3.
1 basins excluded due to quality issues.





In [6]:
# Display the object taken from result["fitted_pipelines"] to inspect what the run returned.
fitted_pipelines

{'features': GroupedTransformer(columns=['total_precipitation_sum',
                             'potential_evaporation_sum_ERA5_LAND'],
                    group_identifier='gauge_id',
                    pipeline=Pipeline(steps=[('scaler',
                                              StandardScaleTransformer())])),
 'target': GroupedTransformer(columns=['streamflow'], group_identifier='gauge_id',
                    pipeline=Pipeline(steps=[('scaler',
                                              StandardScaleTransformer())])),
 'static': Pipeline(steps=[('scaler', StandardScaleTransformer())])}

## Checking that the processing worked

In [7]:
# Same directory that was passed to run_hydro_processor as input_dir.
input_dir = Path(
    "/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/csv/CA"
)
# "processed_data" subfolder inside the output_dir given to run_hydro_processor.
output_dir = Path(
    "/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/testing_run_hydro_processor/processed_data"
)

# List the files in the output directory. Path.iterdir() yields entries in
# arbitrary order, so sort them to keep the printed listing stable between runs.
for file in sorted(output_dir.iterdir()):
    print(file.name)


CA_15013.parquet
CA_15016.parquet


In [8]:
# The transformed static attributes are written to a sibling folder of the
# processed time series directory, so derive the location from output_dir
# (defined in the previous cell) instead of repeating the absolute path.
static_attributes_path = (
    output_dir.parent / "processed_static_data" / "static_attributes.parquet"
)
data = pd.read_parquet(static_attributes_path)

# Gauge IDs present in the transformed static attributes; compare these with
# the parquet files listed in the previous cell.
data["gauge_id"].unique()

array(['CA_15013', 'CA_15016'], dtype=object)