In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

In [2]:
# Build nested training subsets for every station, once per random seed.
#
# For each seed and each station directory, read `input/pairs.csv`, shuffle the
# rows whose `split` is 'train', and write one CSV per subset size into
# `input_<seed>/pairs_<n_train>.csv`. Every output file contains the first
# `n_train` shuffled train rows followed by all 'val' rows; rows with any other
# `split` value are not written.
path_to_jeff_data = Path('../../../../data/Streamflow/Jeff_data/stations')

# Use `.name`, not `.stem`: `.stem` drops everything after the last dot, which
# would truncate a directory name that happens to contain one.
station_folders = sorted(x.name for x in path_to_jeff_data.iterdir() if x.is_dir())

random_seeds = [4275, 3274, 1632, 8436, 2927]
# Step between consecutive subset sizes (loop-invariant, so set once here).
subset_size_increment = 500

for seed in random_seeds:
    # The global NumPy RNG is seeded once per seed and then consumed station by
    # station, so each station's shuffle depends on the (sorted) station order.
    np.random.seed(seed)
    print(f'Random seed: {seed}')
    for station in station_folders:
        # load data for station
        path_to_station = path_to_jeff_data / station
        print(f'Loading data for station {station}')
        print(f'Path to station: {path_to_station}')
        station_pairs = pd.read_csv(path_to_station / 'input' / 'pairs.csv')

        # get train and val splits
        train_pairs = station_pairs[station_pairs['split'] == 'train']
        val_pairs = station_pairs[station_pairs['split'] == 'val']

        # create nested subsets of train split
        # np.arange excludes the stop value, so every size is strictly smaller
        # than num_train; the full train split is never emitted as a subset.
        num_train = len(train_pairs)
        subset_sizes = np.arange(subset_size_increment, num_train, subset_size_increment)
        print(f'Subset sizes: {subset_sizes}')
        random_indices = np.random.permutation(num_train)
        train_pairs = train_pairs.iloc[random_indices]
        # Prefixes of one shuffled frame, so each subset contains all smaller ones.
        nested_subsets = [train_pairs.iloc[:size] for size in subset_sizes]

        # create new directory for this seed
        new_dir = path_to_station / f'input_{seed}'
        new_dir.mkdir(parents=True, exist_ok=True)

        # save new data with nested subsets of train split with val split
        for subset in nested_subsets:
            subset_station_pairs = pd.concat([subset, val_pairs], ignore_index=True)
            # `subset` holds only train rows and `val_pairs` only val rows, so
            # the number of train rows in the combined frame is len(subset).
            subset_num_train = len(subset)
            subset_station_pairs.to_csv(new_dir / f'pairs_{subset_num_train}.csv', index=False)

Random seed: 4275
Loading data for station 10-West Brook Lower_01171090
Path to station: ../../../../data/Streamflow/Jeff_data/stations/10-West Brook Lower_01171090
Subset sizes: [ 500 1000 1500]
Loading data for station 12-Avery Brook_Bridge_01171000
Path to station: ../../../../data/Streamflow/Jeff_data/stations/12-Avery Brook_Bridge_01171000
Subset sizes: [ 500 1000 1500 2000 2500]
Loading data for station 13-Avery Brook_Side_01171000
Path to station: ../../../../data/Streamflow/Jeff_data/stations/13-Avery Brook_Side_01171000
Subset sizes: [ 500 1000 1500]
Loading data for station 14-Avery Brook_River Right_01171000
Path to station: ../../../../data/Streamflow/Jeff_data/stations/14-Avery Brook_River Right_01171000
Subset sizes: [ 500 1000 1500]
Loading data for station 15-Avery Brook_River Left_01171000
Path to station: ../../../../data/Streamflow/Jeff_data/stations/15-Avery Brook_River Left_01171000
Subset sizes: [ 500 1000 1500]
Loading data for station 16-West Brook Reservoir_011