In [17]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit

# Make the project's `src` package importable from this notebook: take the
# parent directory of sys.path[0], resolve it to an absolute path, and append
# it to the module search path.
# NOTE(review): this presumes sys.path[0] is the notebook's own directory, one
# level below the project root -- confirm for the kernel this is run with.
PROJECT_ROOT = os.path.abspath(os.path.join(sys.path[0], os.pardir))
sys.path.append(PROJECT_ROOT)
# classify3 is called by WindowedDatasetSplitter.stratified_shuffle_split to
# turn each window's mean value into a class label, given two quantile cut-offs.
from src.datasets import classify3


In [9]:
# Read the flow/image table for one station and parse its "timestamp" column
# into pandas datetimes in a single expression.
path_to_data_file = '../../../data/Streamflow/fpe_stations/AVERYBB/AVERYBB-20230829/data/flow-images.csv'
data = pd.read_csv(path_to_data_file).assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)


In [10]:
# def add_time_segment_columns(data: pd.DataFrame, window_size: str, include_year: bool = True) -> pd.DataFrame:
#     """
#     Adds columns to the DataFrame that represent the time segments (windows) that each row belongs to.

#     Args:
#         data (pd.DataFrame): The input DataFrame with a "timestamp" column.
#         window_size (str): The size of the time segments (windows). Must be "week" or "day".
#         include_year (bool): Whether or not to include the "year" column. Default is True.

#     Returns:
#         pd.DataFrame: The input DataFrame with additional columns "window" and "year" (if include_year is True) representing the time segments.

#     Example:
#         >>> data = pd.DataFrame({
#         ...     "timestamp": pd.date_range("2022-01-01", periods=744, freq="H"),
#         ...     "value": np.random.randn(744)
#         ... })
#         >>> data = add_time_segment_columns(data, "day", include_year=True)
#         >>> data.head()
#                     timestamp     value  window  year
#         0 2022-01-01 00:00:00 -0.051771       1  2022
#         1 2022-01-01 01:00:00 -0.358163       1  2022
#         2 2022-01-01 02:00:00 -0.080947       1  2022
#         3 2022-01-01 03:00:00 -0.126756       1  2022
#         4 2022-01-01 04:00:00 -0.401780       1  2022

#     Raises:
#         ValueError: If window_size is not "week" or "day".
#     """
#     if window_size == "week":
#         data["window"] = data["timestamp"].dt.isocalendar().week
#     elif window_size == "day":
#         data["window"] = data["timestamp"].dt.day
#     else:
#         raise ValueError("Window size must be 'week' or 'day'.")
    
#     if include_year:
#         data["year"] = data["timestamp"].dt.isocalendar().year
    
#     return data



In [59]:
class WindowedDatasetSplitter:
    """
    Splits a dataset into train, validation, and test sets based on the time segments (windows) of the data.

    Every row is assigned to a window derived from its "timestamp" (an ISO week
    or a calendar day). Whole windows, never individual rows, are then dealt out
    to the three subsets, so rows that share a window always land in the same
    subset.
    """

    def __init__(self, data: pd.DataFrame, window_size: str, include_year: bool = True):
        """
        Initializes the WindowedDatasetSplitter class with the specified data, window size, and include_year flag.

        The splitter works on its own copy of ``data``; the caller's DataFrame is
        left untouched. The copy, with the window columns added, is available as
        ``self.data``.

        Args:
            data (pd.DataFrame): The input DataFrame with a datetime-like "timestamp" column.
            window_size (str): The size of the time segments (windows). Must be "week" or "day".
            include_year (bool): Whether or not to include the "year" column. Default is True.
                When False, windows with the same week/day number in different
                years are treated as one window.

        Example:
            >>> data = pd.DataFrame({
            ...     "timestamp": pd.date_range("2022-01-01", periods=744, freq="H"),
            ...     "value": np.random.randn(744)
            ... })
            >>> splitter = WindowedDatasetSplitter(data, "day", include_year=True)
            >>> splitter.data.head()
                        timestamp     value  window  year
            0 2022-01-01 00:00:00 -0.051771       1  2022
            1 2022-01-01 01:00:00 -0.358163       1  2022
            2 2022-01-01 02:00:00 -0.080947       1  2022
            3 2022-01-01 03:00:00 -0.126756       1  2022
            4 2022-01-01 04:00:00 -0.401780       1  2022

        Raises:
            ValueError: If window_size is not "week" or "day".
        """
        # Validate before doing any work so a bad argument never leaves a
        # half-initialised object behind.
        if window_size not in ("week", "day"):
            raise ValueError("Window size must be 'week' or 'day'.")
        # Copy: adding the window columns must not modify the caller's frame.
        self.data = data.copy()
        self.window_size = window_size
        self.include_year = include_year
        self._get_windows()

    def _get_windows(self):
        """Add the "window" column (and "year" when requested) to ``self.data``."""
        timestamps = self.data["timestamp"]
        if self.window_size == "week":
            # ISO week numbers must be paired with the ISO year: the first days
            # of January can belong to the last ISO week of the previous year.
            iso = timestamps.dt.isocalendar()
            window, year = iso.week, iso.year
        elif self.window_size == "day":
            # Day-of-year paired with the calendar year identifies a single day.
            # (Day-of-month would merge the 1st, 2nd, ... of every month into
            # one window.)
            window, year = timestamps.dt.dayofyear, timestamps.dt.year
        else:
            raise ValueError("Window size must be 'week' or 'day'.")

        self.data["window"] = window
        if self.include_year:
            self.data["year"] = year

    def _window_columns(self):
        """Return the column names that together identify one window."""
        return ["year", "window"] if self.include_year else ["window"]

    def _rows_for_windows(self, window_ids, chosen):
        """
        Return the rows of ``self.data`` whose window id is in ``chosen``.

        Rows are ordered by ascending window id and keep their original relative
        order inside each window. An empty ``chosen`` yields an empty DataFrame
        with the same columns.
        """
        positions = np.flatnonzero(np.isin(window_ids, chosen))
        # Stable sort keeps the within-window row order unchanged.
        order = np.argsort(window_ids[positions], kind="stable")
        return self.data.iloc[positions[order]]

    def shuffle_split(self, train_size: float, val_size: float, test_size: float, random_state: int = 42):
        """
        Randomly assigns the data time segments into train, validation, and test sets of specified sizes.

        The windows are shuffled, then the first ``int(train_size * n)`` go to
        train, the windows up to ``int((train_size + val_size) * n)`` go to
        validation, and the remainder to test (``n`` = number of windows).

        Args:
            train_size (float): The size of the training set as a fraction of the total time segments in the dataset.
            val_size (float): The size of the validation set as a fraction of the total time segments in the dataset.
            test_size (float): The size of the test set as a fraction of the total time segments in the dataset.
            random_state (int): The random state to use for the shuffle. Default is 42.
                A private generator is seeded with it; numpy's global random
                state is not touched.

        Returns:
            dict: ``{"train": DataFrame, "val": DataFrame, "test": DataFrame}``.
            A subset that receives no window is an empty DataFrame.

        Raises:
            AssertionError: If the three fractions do not sum to 1.

        Example:
            >>> data = pd.DataFrame({
            ...     "timestamp": pd.date_range("2022-01-01", periods=744, freq="H"),
            ...     "value": np.random.randn(744)
            ... })
            >>> splitter = WindowedDatasetSplitter(data, "day", include_year=True)
            >>> splits = splitter.shuffle_split(0.7, 0.2, 0.1)
            >>> splits["train"].shape  # 21 of the 31 day-windows, 24 rows each
            (504, 4)
            >>> splits["val"].shape
            (144, 4)
            >>> splits["test"].shape
            (96, 4)
        """
        np.testing.assert_almost_equal(train_size + val_size + test_size, 1.0)

        # Number the windows 0..n-1 in sorted key order. Rows whose window key
        # is missing get -1 and are therefore never selected.
        window_ids = self.data.groupby(self._window_columns()).ngroup().to_numpy()
        n_windows = int(window_ids.max()) + 1 if window_ids.size else 0

        shuffled = list(range(n_windows))
        np.random.RandomState(random_state).shuffle(shuffled)

        train_end = int(train_size * n_windows)
        val_end = int((train_size + val_size) * n_windows)
        return {
            "train": self._rows_for_windows(window_ids, shuffled[:train_end]),
            "val": self._rows_for_windows(window_ids, shuffled[train_end:val_end]),
            "test": self._rows_for_windows(window_ids, shuffled[val_end:]),
        }

    def stratified_shuffle_split(self, train_size: float, val_size: float, test_size: float, stratify_by: str = "flow_cfs", random_state: int = 42):
        """
        Assigns whole windows to train, validation, and test sets, stratified by each window's mean value.

        Each window's mean of ``stratify_by`` is computed and labelled by
        ``classify3`` using the 25% and 75% quantiles of those means as
        cut-offs. The test windows are drawn first with a stratified shuffle
        split; the remaining windows are split again into train and validation.

        Args:
            train_size (float): Fraction of windows for the training set.
            val_size (float): Fraction of windows for the validation set.
            test_size (float): Fraction of windows for the test set.
            stratify_by (str): Numeric column whose per-window mean drives the stratification. Default is "flow_cfs".
            random_state (int): Seed for the first split; ``random_state + 1`` seeds the train/validation split. Default is 42.

        Returns:
            dict: ``{"train": DataFrame, "val": DataFrame, "test": DataFrame}``.
            The frames are sorted by "timestamp" and carry three extra columns:
            "mean_<stratify_by>", "value_class", and "window_index".

        Raises:
            AssertionError: If the three fractions do not sum to 1.
        """
        np.testing.assert_almost_equal(train_size + val_size + test_size, 1.0)
        df = self.data.sort_values(by="timestamp", ignore_index=True)

        # compute mean value for each time window
        time_window_cols = self._window_columns()
        mean_col = "mean_" + stratify_by
        window_val_means = (
            df[[stratify_by] + time_window_cols]
            .groupby(time_window_cols)
            .mean()
            .rename(columns={stratify_by: mean_col})
        )
        window_val_quantiles = np.quantile(
            window_val_means[mean_col].values, [0.25, 0.75], axis=0
        )
        # classify3 is a project helper (src.datasets); it receives the lower
        # cut-off, the upper cut-off and the value to label.
        window_val_means["value_class"] = window_val_means[mean_col].map(
            lambda x: classify3(window_val_quantiles[0], window_val_quantiles[1], x)
        )
        window_val_means["window_index"] = range(len(window_val_means.index))

        # Attach the per-window mean, class and index to every row.
        df = (
            df.set_index(time_window_cols)
            .join(window_val_means, on=time_window_cols)
            .reset_index()
        )
        windows = window_val_means.reset_index()

        # First split: hold out the test windows.
        X = windows["window_index"]
        y = windows["value_class"]
        sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
        window_idx_train_val, window_idx_test = next(sss.split(X, y))
        X_trv = [X[i] for i in sorted(window_idx_train_val)]
        X_t = [X[i] for i in sorted(window_idx_test)]
        y_trv = [y[i] for i in sorted(window_idx_train_val)]

        # Second split: val_size is a fraction of the whole dataset, so rescale
        # it to a fraction of the windows left after removing the test set.
        rescaled_frac_val = val_size / (1 - test_size)
        sss_trv = StratifiedShuffleSplit(
            n_splits=1, test_size=rescaled_frac_val, random_state=random_state + 1
        )
        window_idx_train, window_idx_val = next(sss_trv.split(X_trv, y_trv))
        X_tr = [X_trv[i] for i in sorted(window_idx_train)]
        X_v = [X_trv[i] for i in sorted(window_idx_val)]

        train_inds = np.where(df.window_index.isin(X_tr))[0]
        val_inds = np.where(df.window_index.isin(X_v))[0]
        test_inds = np.where(df.window_index.isin(X_t))[0]
        return {
            "train": df.iloc[train_inds],
            "val": df.iloc[val_inds],
            "test": df.iloc[test_inds],
        }




In [60]:
# Build a splitter over the loaded table using week-sized windows
# (include_year keeps its default of True), then preview the first two rows of
# the splitter's frame, which carries the derived "window" and "year" columns.
splitter = WindowedDatasetSplitter(data, "week")
splitter.data.head(2)

Unnamed: 0,station_name,station_id,imageset_id,image_id,timestamp,filename,url,flow_cfs,window,year
0,Avery Brook_Bridge_01171000,12,95,153582,2021-03-10 16:01:17+00:00,imagesets/fec63b82-d9fa-4844-ab9b-cda8999122b0...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,3.39,10,2021
1,Avery Brook_Bridge_01171000,12,95,153583,2021-03-10 16:02:24+00:00,imagesets/fec63b82-d9fa-4844-ab9b-cda8999122b0...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,3.39,10,2021


In [63]:
# Split the same windows two ways with a 70/20/10 train/val/test ratio:
# a plain shuffle of the windows, and a shuffle stratified by each window's
# mean of the default "flow_cfs" column. Both calls return a dict keyed
# "train" / "val" / "test".
# NOTE(review): the output saved under this cell shows a `val_class` column and
# displayed values, while this cell only assigns and stratified_shuffle_split
# writes `value_class` -- the output looks stale; re-run the notebook.
randomly_grouped = splitter.shuffle_split(0.7, 0.2, 0.1)
stratified_grouped = splitter.stratified_shuffle_split(0.7, 0.2, 0.1)



Unnamed: 0,year,window,station_name,station_id,imageset_id,image_id,timestamp,filename,url,flow_cfs,mean_flow_cfs,val_class,window_index
0,2021,10,Avery Brook_Bridge_01171000,12,95,153582,2021-03-10 16:01:17+00:00,imagesets/fec63b82-d9fa-4844-ab9b-cda8999122b0...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,3.39,8.019677,med,0
1,2021,10,Avery Brook_Bridge_01171000,12,95,153583,2021-03-10 16:02:24+00:00,imagesets/fec63b82-d9fa-4844-ab9b-cda8999122b0...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,3.39,8.019677,med,0
2,2021,10,Avery Brook_Bridge_01171000,12,95,153584,2021-03-10 16:16:18+00:00,imagesets/fec63b82-d9fa-4844-ab9b-cda8999122b0...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,3.39,8.019677,med,0
3,2021,10,Avery Brook_Bridge_01171000,12,95,153585,2021-03-10 16:31:18+00:00,imagesets/fec63b82-d9fa-4844-ab9b-cda8999122b0...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,3.39,8.019677,med,0
4,2021,10,Avery Brook_Bridge_01171000,12,95,153586,2021-03-10 16:46:18+00:00,imagesets/fec63b82-d9fa-4844-ab9b-cda8999122b0...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,3.390867,8.019677,med,0


59101
59101
59101


In [None]:
# class DatasetSplitter(object):
#     """Splitters split Datasets into train/validation/test sets."""

#     def split(self, dataset, frac_train, frac_val, frac_test) -> Dict:
#         np.testing.assert_almost_equal(frac_train + frac_val + frac_test, 1.0)
#         num_datapoints = len(dataset)
#         train_cutoff = int(frac_train * num_datapoints)
#         val_cutoff = int((frac_train + frac_val) * num_datapoints)
#         indices = np.arange(num_datapoints)
#         train_indices = indices[:train_cutoff]
#         val_indices = indices[train_cutoff:val_cutoff]
#         test_indices = indices[val_cutoff:]
#         return {
#             "train": dataset.iloc[train_indices],
#             "val": dataset.iloc[val_indices],
#             "test": dataset.iloc[test_indices],
#         }


# class RandomStratifiedWindowFlow(DatasetSplitter):
#     def split(
#         self, dataset, frac_train, frac_val, frac_test, seed=1, window="week"
#     ) -> Dict:
#         np.testing.assert_almost_equal(frac_train + frac_val + frac_test, 1.0)

#         df = dataset.copy()
#         df.sort_values(by="timestamp", inplace=True, ignore_index=True)
#         if window == "week":
#             df["window"] = df["timestamp"].dt.isocalendar().week
#         elif window == "day":
#             df["window"] = df["timestamp"].dt.day
#         else:
#             raise ValueError(
#                 "Window must be 'week' or 'day'. Other windows not yet supported."
#             )
#         df["year"] = df["timestamp"].dt.isocalendar().year

#         window_flow_means = (
#             df[["flow_cfs", "year", "window"]]
#             .groupby(["year", "window"])
#             .mean()
#             .rename(columns={"flow_cfs": "mean_flow_cfs"})
#         )
#         # for sites with no flow data, randomly assign a flow value
#         if np.all(np.isnan(window_flow_means["mean_flow_cfs"].values)):
#             window_flow_means["mean_flow_cfs"] = np.random.uniform(
#                 0, 1, len(window_flow_means)
#             )
#         window_flow_quantiles = np.quantile(
#             window_flow_means["mean_flow_cfs"].values, [0.25, 0.75], axis=0
#         )
#         window_flow_means["flow_class"] = window_flow_means["mean_flow_cfs"].map(
#             lambda x: classify3(window_flow_quantiles[0], window_flow_quantiles[1], x)
#         )
#         window_flow_means["window_index"] = range(len(window_flow_means.index))

#         df = (
#             df.set_index(["year", "window"])
#             .join(window_flow_means, on=["year", "window"])
#             .reset_index()
#         )
#         windows = window_flow_means.reset_index()

#         X = windows["window_index"]
#         y = windows["flow_class"]
#         sss = StratifiedShuffleSplit(n_splits=1, test_size=frac_test, random_state=seed)
#         window_idx_train_val, window_idx_test = list(sss.split(X, y))[0]
#         X_trv = [X[i] for i in sorted(window_idx_train_val)]
#         X_t = [X[i] for i in sorted(window_idx_test)]
#         y_trv = [y[i] for i in sorted(window_idx_train_val)]
#         # y_t = [y[i] for i in sorted(window_idx_test)]

#         rescaled_frac_val = frac_val / (1 - frac_test)
#         sss_trv = StratifiedShuffleSplit(
#             n_splits=1, test_size=rescaled_frac_val, random_state=seed + 1
#         )
#         window_idx_train, window_idx_val = list(sss_trv.split(X_trv, y_trv))[0]
#         X_tr = [X_trv[i] for i in sorted(window_idx_train)]
#         X_v = [X_trv[i] for i in sorted(window_idx_val)]
#         # y_tr = [y_trv[i] for i in sorted(window_idx_train)]
#         # y_v = [y_trv[i] for i in sorted(window_idx_val)]

#         train_inds = np.where(df.window_index.isin(X_tr))[0]
#         val_inds = np.where(df.window_index.isin(X_v))[0]
#         test_inds = np.where(df.window_index.isin(X_t))[0]
#         return {
#             "train": df.iloc[train_inds],
#             "val": df.iloc[val_inds],
#             "test": df.iloc[test_inds],
#         }