In [1]:
import os

from transfer_learning_publication.data import CaravanDataSource
from transfer_learning_publication.transforms import PipelineBuilder, Log, ZScore

In [2]:
# Location of the Caravan hive data. Overridable through the
# CARAVAN_BASE_PATH environment variable so the notebook is not tied to a
# single machine; an unset or empty variable falls back to the original path.
CARAVAN_BASE_PATH = (
    os.environ.get("CARAVAN_BASE_PATH") or "/Users/cooper/Desktop/LSH_hive_data"
)
REGION = "camels"
# Number of gauges, taken from the front of the gauge list, to load.
N_BASINS = 2
# Timeseries variables requested for each selected gauge.
VARIABLES = ["streamflow", "total_precipitation_sum"]

caravan = CaravanDataSource(
    base_path=CARAVAN_BASE_PATH,
    region=REGION,
)

# All gauge ids known to the data source for the configured region.
basins = caravan.list_gauge_ids()

# Timeseries restricted to the first N_BASINS gauges and the chosen variables.
ts_data = caravan.get_timeseries(
    gauge_ids=basins[:N_BASINS],
    variables=VARIABLES,
)

In [4]:
# List the column names present in the loaded timeseries.
column_names = ts_data.collect_schema().names()
column_names

['REGION_NAME', 'gauge_id', 'streamflow', 'total_precipitation_sum', 'date']

In [None]:
# Assemble the preprocessing steps one at a time, grouped by "gauge_id":
#   1. Log, registered per basin, on streamflow;
#   2. ZScore, registered globally, on streamflow and precipitation.
pipeline_builder = PipelineBuilder(group_identifier="gauge_id")
pipeline_builder = pipeline_builder.add_per_basin(Log(), columns=["streamflow"])
pipeline_builder = pipeline_builder.add_global(
    ZScore(),
    columns=["streamflow", "total_precipitation_sum"],
)
preprocessing_pipelines = pipeline_builder.build()

# Fit the pipelines on the loaded timeseries and transform it, then run the
# transformed data back through the inverse for comparison with the input.
data_transformed = preprocessing_pipelines.fit_transform(ts_data)
data_inv = preprocessing_pipelines.inverse_transform(data_transformed)

In [None]:
# Show the head of the raw, transformed, and inverse-transformed data as three
# separate rich outputs rather than a single plain-text tuple repr.
# `display` is the IPython built-in that notebook kernels inject.
display(ts_data.head(), data_transformed.head(), data_inv.head())

In [None]:
# Destination for the transformed timeseries. Overridable through the
# TRANSFORMED_DATA_DIR environment variable; an unset or empty variable falls
# back to the original path.
TRANSFORMED_DATA_DIR = (
    os.environ.get("TRANSFORMED_DATA_DIR")
    or "/Users/cooper/Desktop/transfer-learning-publication/data"
)

# NOTE(review): the meaning of the third positional argument (True) is not
# visible in this notebook — presumably an overwrite flag; confirm against
# CaravanDataSource.write_timeseries and pass it by keyword once known.
caravan.write_timeseries(
    data_transformed,
    TRANSFORMED_DATA_DIR,
    True,
)