In [1]:
import sys
from pathlib import Path

# Add src directory to Python path
project_root = Path.cwd().parent
src_path = project_root / "src"
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))
    print(f"Added {src_path} to Python path")

Added /Users/cooper/Desktop/hydro-forecasting/src to Python path


In [2]:
import polars as pl
from returns.result import Success
from sklearn.pipeline import Pipeline

from hydro_forecasting.data.caravanify_parquet import (
    CaravanifyParquet,
    CaravanifyParquetConfig,
)
from hydro_forecasting.data.clean_data import (
    clean_data,
    save_quality_report_to_json,
    summarize_quality_reports_from_folder,
)
from hydro_forecasting.data.preprocessing import ProcessingConfig
from hydro_forecasting.preprocessing.grouped import GroupedPipeline
from hydro_forecasting.preprocessing.standard_scale import StandardScaleTransformer
from hydro_forecasting.preprocessing.time_series_preprocessing import (
    fit_time_series_pipelines,
    load_time_series_pipelines,
    save_time_series_pipelines,
    transform_time_series_data,
)

---

In [3]:
forcing_features = [
    "snow_depth_water_equivalent_mean",
    "surface_net_solar_radiation_mean",
    "surface_net_thermal_radiation_mean",
    "potential_evaporation_sum_ERA5_LAND",
    "potential_evaporation_sum_FAO_PENMAN_MONTEITH",
    "temperature_2m_mean",
    "temperature_2m_min",
    "temperature_2m_max",
    "total_precipitation_sum",
]

static_features = [
    # "gauge_id",
    "p_mean",
    "area",
    "ele_mt_sav",
    "high_prec_dur",
    "frac_snow",
    "high_prec_freq",
    "slp_dg_sav",
    "cly_pc_sav",
    "aridity_ERA5_LAND",
    "aridity_FAO_PM",
]

target = ["streamflow"]

In [4]:
configs = ProcessingConfig(
    required_columns=forcing_features + target,
)

In [5]:
config_ca = CaravanifyParquetConfig(
    attributes_dir="/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes",
    timeseries_dir="/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/csv",
    shapefile_dir="/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/CA/post_processed/shapefiles",
    gauge_id_prefix="CA",
    use_hydroatlas_attributes=True,
    use_caravan_attributes=True,
    use_other_attributes=True,
)

caravan_ca = CaravanifyParquet(config_ca)
basin_ids = caravan_ca.get_all_gauge_ids()
# basin_ids = ["CA_15030"]

# basin_ids = [bid for bid in basin_ids if bid != "CA_15030"]

caravan_ca.load_stations(basin_ids)

time_series = caravan_ca.get_time_series()[forcing_features + ["date", "gauge_id"] + target]

In [6]:
type(time_series)
time_series = pl.from_pandas(time_series).lazy()

## Testing the `clean_data` function

In [7]:
result = clean_data(time_series, configs)

if isinstance(result, Success):
    cleaned_df, quality_report = result.unwrap()
else:
    error_msg = result.failure()
    print("Data cleaning failed:", error_msg)

INFO: Processed 78 basins, 77 passed quality checks


In [8]:
base_path = "/Users/cooper/Desktop/hydro-forecasting/tests"


for gauge_id, report in quality_report.items():
    succs_flag, path, error = save_quality_report_to_json(
        report=report,
        path=f"/Users/cooper/Desktop/hydro-forecasting/tests/quality_reports/{gauge_id}.json",
    )

    if succs_flag:
        print(f"Quality report saved to {path}")
    else:
        print(f"Failed to save quality report: {error}")

Quality report saved to /Users/cooper/Desktop/hydro-forecasting/tests/quality_reports/CA_15214.json
Quality report saved to /Users/cooper/Desktop/hydro-forecasting/tests/quality_reports/CA_15102.json
Quality report saved to /Users/cooper/Desktop/hydro-forecasting/tests/quality_reports/CA_15022.json
Quality report saved to /Users/cooper/Desktop/hydro-forecasting/tests/quality_reports/CA_15081.json
Quality report saved to /Users/cooper/Desktop/hydro-forecasting/tests/quality_reports/CA_16101.json
Quality report saved to /Users/cooper/Desktop/hydro-forecasting/tests/quality_reports/CA_16136.json
Quality report saved to /Users/cooper/Desktop/hydro-forecasting/tests/quality_reports/CA_17329.json
Quality report saved to /Users/cooper/Desktop/hydro-forecasting/tests/quality_reports/CA_15039.json
Quality report saved to /Users/cooper/Desktop/hydro-forecasting/tests/quality_reports/CA_15189.json
Quality report saved to /Users/cooper/Desktop/hydro-forecasting/tests/quality_reports/CA_16146.json


In [9]:
summary_results = summarize_quality_reports_from_folder(
    folder_path="/Users/cooper/Desktop/hydro-forecasting/tests/quality_reports",
    save_path="/Users/cooper/Desktop/hydro-forecasting/tests/summary.json",
)

if isinstance(summary_results, Success):
    summary_df = summary_results.unwrap()
    print("Summary DataFrame:")
    print(summary_df)
else:
    error_msg = summary_results.failure()
    print("Failed to summarize quality reports:", error_msg)

Summary DataFrame:
SummaryQualityReport(original_basins=103, passed_basins=102, failed_basins=1, excluded_basins={'CA_15030': 'Insufficient training data (2.13 years available).                          Minimum required training years: 5.0'})


## Testing the pipeline fitting and transforming

In [10]:
feature_pipeline = GroupedPipeline(
    Pipeline([("scaler", StandardScaleTransformer())]),
    columns=forcing_features,
    group_identifier="gauge_id",
    chunk_size=50,
    n_jobs=-1,
)

target_pipeline = GroupedPipeline(
    Pipeline([("scaler", StandardScaleTransformer())]),
    columns=["streamflow"],
    group_identifier="gauge_id",
    chunk_size=50,
    n_jobs=-1,
)

static_pipeline = Pipeline([("scaler", StandardScaleTransformer())])

preprocessing_config = {
    "features": {"pipeline": feature_pipeline},
    "target": {"pipeline": target_pipeline},
    "static_features": {"pipeline": static_pipeline, "columns": static_features},
}

In [11]:
# Convert cleaned_df back to a pandas DataFrame
cleaned_df = cleaned_df.to_pandas()

result = fit_time_series_pipelines(
    cleaned_df,
    features_pipeline=feature_pipeline,
    target_pipeline=target_pipeline,
)

if isinstance(result, Success):
    fitted_pipelines = result.unwrap()
else:
    error_msg = result.failure()
    print("Fitting time series pipelines failed:", error_msg)

In [12]:
type(cleaned_df)

pandas.core.frame.DataFrame

In [13]:
result = transform_time_series_data(cleaned_df, fitted_pipelines)

if isinstance(result, Success):
    transformed_df = result.unwrap()
else:
    error_msg = result.failure()
    print("Transforming time series data failed:", error_msg)

In [14]:
def validate_transformation(df, columns):
    for column in columns:
        # Make sure mean = 0 and std = 1
        mean = df[column].mean()
        std = df[column].std()
        if not (abs(mean) < 1e-6 and abs(std - 1) < 1e-6):
            print(f"Column {column} failed validation: mean = {mean}, std = {std}")
            return False
    return True


validation = validate_transformation(
    transformed_df,
    target + forcing_features,
)

if validation:
    print("All columns passed validation.")
else:
    print("Some columns failed validation.")

Column streamflow failed validation: mean = 2.4565534051390614e-09, std = 0.9999235272407532
Some columns failed validation.


In [15]:
save_results = save_time_series_pipelines(
    fitted_pipelines, "/Users/cooper/Desktop/hydro-forecasting/tests/fitted_pipelines.joblib"
)

if isinstance(save_results, Success):
    print("Pipelines saved successfully.")
else:
    error_msg = save_results.failure()
    print("Saving pipelines failed:", error_msg)

Pipelines saved successfully.


In [16]:
load_results = load_time_series_pipelines("/Users/cooper/Desktop/hydro-forecasting/tests/fitted_pipelines.joblib")

if isinstance(load_results, Success):
    loaded_pipelines = load_results.unwrap()
    print("Pipelines loaded successfully.")
else:
    error_msg = load_results.failure()
    print("Loading pipelines failed:", error_msg)

Pipelines loaded successfully.


In [17]:
loaded_pipelines.keys()

dict_keys(['features', 'target'])

In [18]:
loaded_pipelines["target"]