In [1]:
import sys
from pathlib import Path

# Expose the project's `src` folder (sibling of the current working directory) to the import system.
project_root = Path.cwd().parent
src_path = project_root / "src"
src_entry = str(src_path)
already_importable = src_entry in sys.path
if not already_importable:
    # Prepend so this entry is searched before everything else on sys.path.
    sys.path.insert(0, src_entry)
    print(f"Added {src_path} to Python path")

Added /Users/cooper/Desktop/hydro-forecasting/src to Python path


In [2]:
import polars as pl
from returns.result import Success
from sklearn.pipeline import Pipeline

from hydro_forecasting.data.caravanify_parquet import (
    CaravanifyParquet,
    CaravanifyParquetConfig,
)
from hydro_forecasting.data.clean_data import (
    clean_data,
    save_quality_report_to_json,
    summarize_quality_reports_from_folder,
)
from hydro_forecasting.data.preprocessing import ProcessingConfig
from hydro_forecasting.preprocessing.grouped import GroupedPipeline
from hydro_forecasting.preprocessing.standard_scale import StandardScaleTransformer
from hydro_forecasting.preprocessing.time_series_preprocessing import (
    fit_time_series_pipelines,
    load_time_series_pipelines,
    save_time_series_pipelines,
    transform_time_series_data,
)

---

In [3]:
# Time-varying forcing columns. Together with `target` they form the
# `required_columns` of the cleaning config, and they are the columns handled
# by the grouped feature pipeline further down.
forcing_features = [
    "snow_depth_water_equivalent_mean",
    "surface_net_solar_radiation_mean",
    "surface_net_thermal_radiation_mean",
    "potential_evaporation_sum_ERA5_LAND",
    "potential_evaporation_sum_FAO_PENMAN_MONTEITH",
    "temperature_2m_mean",
    "temperature_2m_min",
    "temperature_2m_max",
    "total_precipitation_sum",
]

# Static attribute columns; used as the "columns" entry of the
# "static_features" section of `preprocessing_config`.
static_features = [
    # "gauge_id",
    "p_mean",
    "area",
    "ele_mt_sav",
    "high_prec_dur",
    "frac_snow",
    "high_prec_freq",
    "slp_dg_sav",
    "cly_pc_sav",
    "aridity_ERA5_LAND",
    "aridity_FAO_PM",
]

# Kept as a one-element list so it can be concatenated with `forcing_features`.
target = ["streamflow"]

In [4]:
# Columns handed to ProcessingConfig as `required_columns`: all forcings followed by the target.
required = [*forcing_features, *target]
configs = ProcessingConfig(required_columns=required)

In [5]:
# Configure the loader for the "CA"-prefixed gauges.
# NOTE(review): all three directories are absolute, user-specific paths, and `shapefile_dir`
# lives under a different root (CAMELS-CH/data/CARAVANIFY) than the other two — confirm this is intended.
config_ca = CaravanifyParquetConfig(
    attributes_dir="/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/attributes",
    timeseries_dir="/Users/cooper/Desktop/CaravanifyParquet/CA/post_processed/timeseries/csv",
    shapefile_dir="/Users/cooper/Desktop/CAMELS-CH/data/CARAVANIFY/CA/post_processed/shapefiles",
    gauge_id_prefix="CA",
    use_hydroatlas_attributes=True,
    use_caravan_attributes=True,
    use_other_attributes=True,
)

caravan_ca = CaravanifyParquet(config_ca)
# Use every gauge id the loader reports.
basin_ids = caravan_ca.get_all_gauge_ids()
# Debug toggle — restrict the run to a single basin:
# basin_ids = ["CA_15030"]

# Debug toggle — run on everything except that basin:
# basin_ids = [bid for bid in basin_ids if bid != "CA_15030"]

caravan_ca.load_stations(basin_ids)

# Keep only the forcing columns, the "date" / "gauge_id" columns and the target.
time_series = caravan_ca.get_time_series()[forcing_features + ["date", "gauge_id"] + target]

In [9]:
# NOTE(review): `_load_static_attributes` is an underscore-prefixed (private) method called directly;
# presumably `load_stations` does not populate the static attributes on its own — confirm,
# or use a public loader if one exists.
caravan_ca._load_static_attributes(basin_ids)
statics = caravan_ca.get_static_attributes()
# Bare last expression so the notebook renders the static-attribute table.
statics

Unnamed: 0,gauge_id,area,country,gauge_lat,gauge_lon,gauge_name,aet_mm_s01,aet_mm_s02,aet_mm_s03,aet_mm_s04,...,high_prec_freq,low_prec_dur,low_prec_freq,moisture_index_ERA5_LAND,moisture_index_FAO_PM,p_mean,pet_mean_ERA5_LAND,pet_mean_FAO_PM,seasonality_ERA5_LAND,seasonality_FAO_PM
0,CA_15013,254.786460,Kyrgyzstan,42.652788,78.921090,15013_Kyrgyzstan,0.574803,1.858523,13.705932,35.421733,...,0.035202,3.227445,0.555020,0.251483,0.589364,2.243786,2.178943,1.192798,1.141084,0.795366
1,CA_15016,330.553623,Kyrgyzstan,42.585012,78.883448,15016_Kyrgyzstan,0.436708,0.877056,10.758896,32.752111,...,0.025684,2.860555,0.484094,0.695427,0.758767,2.835258,1.220026,0.926226,0.693118,0.474648
2,CA_15020,310.918652,Kyrgyzstan,42.471215,78.531209,15020_Kyrgyzstan,0.034713,0.496307,9.859397,31.402012,...,0.023598,2.607448,0.438201,0.789998,0.803484,3.604888,1.148668,0.957226,0.583906,0.390540
3,CA_15022,204.078792,Kyrgyzstan,42.467058,78.537862,15022_Kyrgyzstan,0.030236,1.026004,10.086078,32.057409,...,0.022164,2.675425,0.451369,0.750376,0.780492,3.267226,1.204651,0.964687,0.627135,0.418075
4,CA_15025,318.622912,Kyrgyzstan,42.424441,78.434206,15025_Kyrgyzstan,0.447321,1.103616,10.419676,30.914526,...,0.024250,2.578210,0.431943,0.782148,0.799853,3.717301,1.219889,0.991977,0.611725,0.392526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,CA_17329,1458.923315,Tajikistan,39.201001,68.625162,17329_Tajikistan,4.412864,8.291545,22.890385,49.335976,...,0.035072,3.139078,0.479661,0.498847,0.572292,3.498126,1.562412,1.189357,1.517544,1.247466
74,CA_17338,336.704776,Tajikistan,39.219434,68.495320,17338_Tajikistan,5.060887,9.423805,25.131122,52.214927,...,0.033898,3.153784,0.510691,0.542981,0.601437,3.100668,1.385385,1.178698,1.335271,0.997391
75,CA_17344,1106.347332,Tajikistan,39.489174,67.716582,17344_Tajikistan,6.850364,11.711532,29.982706,60.248460,...,0.040808,3.341915,0.541591,0.295944,0.379404,2.820111,2.007286,1.451853,1.687907,1.487768
76,CA_17453,1980.865238,Tajikistan,38.345023,71.416836,17453_Tajikistan,1.023119,2.959895,14.159885,38.223298,...,0.032986,3.667254,0.543155,0.601215,0.606002,2.327325,0.774188,0.679953,1.554424,1.340067


In [6]:
# Convert the pandas time-series frame into a polars LazyFrame; this is the object
# passed to `clean_data` in the next section.
# The conversion is guarded so the cell can be re-run safely: on a second run
# `time_series` is already a LazyFrame, which `pl.from_pandas` does not accept.
# (The former bare `type(time_series)` line was dropped: it was not the last
# expression of the cell, so it never displayed anything.)
if not isinstance(time_series, pl.LazyFrame):
    time_series = pl.from_pandas(time_series).lazy()

## Testing the `clean_data` function

In [None]:
# Run the cleaning step on the lazy time-series frame with the cleaning config.
result = clean_data(time_series, configs)

if isinstance(result, Success):
    # The success payload unpacks into the cleaned frame and the quality reports.
    cleaned_df, quality_report = result.unwrap()
else:
    error_msg = result.failure()
    # Fail fast: the following cells need `cleaned_df` and `quality_report`. Merely printing
    # would let them crash with a NameError or silently reuse values from an earlier run.
    raise RuntimeError(f"Data cleaning failed: {error_msg}")

In [None]:
# Root folder for the artefacts written by this notebook.
base_path = "/Users/cooper/Desktop/hydro-forecasting/tests"
# Built from `base_path` (previously defined but unused) so the location is spelled out only once.
reports_dir = f"{base_path}/quality_reports"


# Write one JSON quality report per gauge id.
for gauge_id, report in quality_report.items():
    # The saver returns a (success flag, path, error) triple.
    saved_ok, path, error = save_quality_report_to_json(
        report=report,
        path=f"{reports_dir}/{gauge_id}.json",
    )

    if saved_ok:
        print(f"Quality report saved to {path}")
    else:
        print(f"Failed to save quality report: {error}")

In [None]:
# Summarize the per-gauge JSON reports found in the quality_reports folder.
summary_results = summarize_quality_reports_from_folder(
    save_path="/Users/cooper/Desktop/hydro-forecasting/tests/summary.json",
    folder_path="/Users/cooper/Desktop/hydro-forecasting/tests/quality_reports",
)

# Handle the failure branch first, then show the summary.
if not isinstance(summary_results, Success):
    error_msg = summary_results.failure()
    print("Failed to summarize quality reports:", error_msg)
else:
    summary_df = summary_results.unwrap()
    print("Summary DataFrame:")
    print(summary_df)

## Testing pipeline fitting and transformation

In [None]:
def _grouped_standard_scaler(columns):
    """Build a GroupedPipeline, grouped by "gauge_id", that applies a StandardScaleTransformer to `columns`."""
    return GroupedPipeline(
        Pipeline([("scaler", StandardScaleTransformer())]),
        columns=columns,
        group_identifier="gauge_id",
        chunk_size=50,
        n_jobs=-1,
    )


# Forcings and target get the same grouped scaler, each on its own columns.
feature_pipeline = _grouped_standard_scaler(forcing_features)
target_pipeline = _grouped_standard_scaler(["streamflow"])

# Static attributes use a plain (ungrouped) sklearn pipeline.
static_pipeline = Pipeline([("scaler", StandardScaleTransformer())])

preprocessing_config = {
    "features": {"pipeline": feature_pipeline},
    "target": {"pipeline": target_pipeline},
    "static_features": {"pipeline": static_pipeline, "columns": static_features},
}

In [None]:
# Convert cleaned_df back to a pandas DataFrame.
# Guarded so the cell can be re-run: after the first run `cleaned_df` is already the
# converted frame and no longer offers `.to_pandas()`, so an unguarded call would raise.
if hasattr(cleaned_df, "to_pandas"):
    cleaned_df = cleaned_df.to_pandas()

# Fit the grouped feature and target pipelines on the cleaned data.
result = fit_time_series_pipelines(
    cleaned_df,
    features_pipeline=feature_pipeline,
    target_pipeline=target_pipeline,
)

if isinstance(result, Success):
    fitted_pipelines = result.unwrap()
else:
    error_msg = result.failure()
    # Fail fast: later cells need `fitted_pipelines`; printing alone would leave it
    # undefined (or stale from an earlier run).
    raise RuntimeError(f"Fitting time series pipelines failed: {error_msg}")

In [None]:
# Sanity check: display the runtime type of `cleaned_df` after the conversion in the previous cell.
type(cleaned_df)

In [None]:
# Apply the fitted pipelines to the cleaned data.
result = transform_time_series_data(cleaned_df, fitted_pipelines)

if isinstance(result, Success):
    transformed_df = result.unwrap()
else:
    error_msg = result.failure()
    # Fail fast: the validation cell needs `transformed_df`; printing alone would leave it
    # undefined (or stale from an earlier run).
    raise RuntimeError(f"Transforming time series data failed: {error_msg}")

In [None]:
def validate_transformation(df, columns, tol=1e-6):
    """Check that each listed column is standardized (mean close to 0, std close to 1).

    Every column is checked and every failing column is reported, instead of
    stopping at the first failure.

    Args:
        df: Frame-like object; `df[column]` must provide `.mean()` and `.std()`.
            The std estimator is whatever that object's `.std()` defaults to.
        columns: Iterable of column names to check.
        tol: Absolute tolerance applied to both `|mean|` and `|std - 1|`.

    Returns:
        True if all columns are within tolerance (also True when `columns` is
        empty), otherwise False.
    """
    all_passed = True
    for column in columns:
        mean = df[column].mean()
        std = df[column].std()
        # Phrased as `not (ok and ok)` so that NaN statistics count as a failure.
        if not (abs(mean) < tol and abs(std - 1) < tol):
            print(f"Column {column} failed validation: mean = {mean}, std = {std}")
            all_passed = False
    return all_passed


# Validate the target first, then every forcing column.
columns_to_check = target + forcing_features
validation = validate_transformation(transformed_df, columns_to_check)

print("All columns passed validation." if validation else "Some columns failed validation.")

In [None]:
# Write the fitted pipelines to a joblib file.
pipelines_file = "/Users/cooper/Desktop/hydro-forecasting/tests/fitted_pipelines.joblib"
save_results = save_time_series_pipelines(fitted_pipelines, pipelines_file)

# Handle the failure branch first.
if not isinstance(save_results, Success):
    error_msg = save_results.failure()
    print("Saving pipelines failed:", error_msg)
else:
    print("Pipelines saved successfully.")

In [None]:
# Read the pipelines back from the joblib file written by the save cell.
load_results = load_time_series_pipelines("/Users/cooper/Desktop/hydro-forecasting/tests/fitted_pipelines.joblib")

if isinstance(load_results, Success):
    loaded_pipelines = load_results.unwrap()
    print("Pipelines loaded successfully.")
else:
    error_msg = load_results.failure()
    # Fail fast: the inspection cells below need `loaded_pipelines`; printing alone would
    # leave it undefined (or stale from an earlier run).
    raise RuntimeError(f"Loading pipelines failed: {error_msg}")

In [None]:
# Show which entries the reloaded pipeline mapping contains.
loaded_pipelines.keys()

In [None]:
# Display the entry stored under the "target" key of the reloaded pipelines.
loaded_pipelines["target"]