In [6]:
import numpy as np
import os
import dataset_utils as ds

In [18]:
# Dataset locations. All three live under one shared root, so the root is
# spelled once and the individual paths are derived from it.
vocal_datasets_root = "/vast/ci411/gerbil_data/vocal_datasets"

# Directory holding one sub-directory per experiment (trailing slash kept).
experiment_dir = f"{vocal_datasets_root}/combined/"
# Source file for the train/validation extraction.
speaker_dataset_path = f"{vocal_datasets_root}/preprocessed/speaker_dataset_full.h5"
# Source file from which simulated events are sampled.
big_simulated_dataset_path = f"{vocal_datasets_root}/simulated/default-big_vocal.h5"

In [8]:
# Number of events in the speaker dataset (presumably an integer count;
# confirm against dataset_utils.count_events). It is used further down as the
# size of the index range that gets split into training and validation sets.
n_events = ds.count_events(speaker_dataset_path)

In [26]:
# Number of events in the big simulated dataset (presumably an integer count;
# confirm against dataset_utils.count_events). It is used further down as the
# population size when drawing random simulated-event indices.
n_sim = ds.count_events(big_simulated_dataset_path)

In [10]:
# Define the train/validation split over the speaker dataset's event indices.
#
# The split is drawn from a dedicated, seeded generator rather than the global
# NumPy random state, so the same events land in the validation set after a
# kernel restart or when cells are re-run out of order.
val_ratio = 0.2  # fraction of events held out for validation
split_seed = 0  # change to draw a different (but still reproducible) split
split_rng = np.random.default_rng(split_seed)

# Shuffled copy of 0..n_events-1; the first `split` entries become validation.
all_idx = split_rng.permutation(n_events)
split = int(val_ratio * n_events)
val_idx = all_idx[:split]
train_idx = all_idx[split:]

# The two index sets must partition the dataset exactly.
assert len(val_idx) + len(train_idx) == n_events


In [13]:
# Build the baseline experiment ("00sim"): training and validation sets
# extracted from the speaker dataset only.
target_dir = os.path.join(experiment_dir, "00sim")
train_path = os.path.join(target_dir, "train_set.h5")
val_path = os.path.join(target_dir, "val_set.h5")

# Make sure the output directory exists so the cell also works on a fresh
# setup; exist_ok=True makes this a no-op when it is already there.
os.makedirs(target_dir, exist_ok=True)

# Remove outputs left over from a previous run so each file is rebuilt from
# scratch rather than reusing stale content.
for stale_path in (train_path, val_path):
    if os.path.exists(stale_path):
        os.remove(stale_path)

print("Extracting Training Data")
ds.extract_events(speaker_dataset_path, train_path, train_idx)
print("Extracting Validation Data")
ds.extract_events(speaker_dataset_path, val_path, val_idx)

Extracting Training Data


100%|██████████| 56732/56732 [08:43<00:00, 108.43it/s]


Extracting Validation Data


100%|██████████| 14182/14182 [02:07<00:00, 110.86it/s]


In [28]:
# Add simulated events to the training set of every "<NN>sim" experiment.
# The two-digit prefix NN encodes the simulated-to-existing ratio in tenths
# (e.g. "05sim" -> 0.5 simulated events per event already in train_set.h5).
#
# NOTE(review): extract_events is pointed at a train_set.h5 that already exists,
# so it presumably appends to it; if so, re-running this cell would add
# simulated events a second time. Confirm against dataset_utils.

sim_seed = 0  # fixed seed so the sampled simulated events are reproducible
sim_rng = np.random.default_rng(sim_seed)

# os.listdir returns entries in arbitrary order, so sort for a deterministic
# processing order and select experiments by name instead of by list position.
experiments = sorted(os.listdir(experiment_dir))
for exp in experiments:
    exp_dir = os.path.join(experiment_dir, exp)
    if not os.path.isdir(exp_dir):
        continue  # stray file in the experiment directory

    try:
        ratio = int(exp[:2]) / 10
    except ValueError:
        continue  # not an "<NN>sim" experiment (e.g. .ipynb_checkpoints)

    if ratio == 0:
        continue  # the 0-ratio baseline ("00sim") receives no simulated data

    train_path = os.path.join(exp_dir, "train_set.h5")
    n_training = ds.count_events(train_path)
    sim_sample = int(n_training * ratio)

    # Draw distinct indices into the simulated dataset.
    idx_list = sim_rng.choice(n_sim, size=sim_sample, replace=False)
    print(f"Extracting {sim_sample} samples to {train_path}")
    ds.extract_events(big_simulated_dataset_path, train_path, idx_list)

Extracting 5673 samples to /vast/ci411/gerbil_data/vocal_datasets/combined/01sim/train_set.h5


100%|██████████| 5673/5673 [02:39<00:00, 35.53it/s]


Extracting 11346 samples to /vast/ci411/gerbil_data/vocal_datasets/combined/02sim/train_set.h5


100%|██████████| 11346/11346 [05:13<00:00, 36.21it/s]


Extracting 17019 samples to /vast/ci411/gerbil_data/vocal_datasets/combined/03sim/train_set.h5


100%|██████████| 17019/17019 [08:23<00:00, 33.81it/s]


Extracting 22692 samples to /vast/ci411/gerbil_data/vocal_datasets/combined/04sim/train_set.h5


100%|██████████| 22692/22692 [10:35<00:00, 35.72it/s]


Extracting 28366 samples to /vast/ci411/gerbil_data/vocal_datasets/combined/05sim/train_set.h5


100%|██████████| 28366/28366 [13:15<00:00, 35.64it/s]


Extracting 34039 samples to /vast/ci411/gerbil_data/vocal_datasets/combined/06sim/train_set.h5


100%|██████████| 34039/34039 [16:11<00:00, 35.02it/s]


Extracting 39712 samples to /vast/ci411/gerbil_data/vocal_datasets/combined/07sim/train_set.h5


100%|██████████| 39712/39712 [19:32<00:00, 33.88it/s]


Extracting 45385 samples to /vast/ci411/gerbil_data/vocal_datasets/combined/08sim/train_set.h5


100%|██████████| 45385/45385 [21:31<00:00, 35.14it/s]


Extracting 51058 samples to /vast/ci411/gerbil_data/vocal_datasets/combined/09sim/train_set.h5


100%|██████████| 51058/51058 [22:52<00:00, 37.19it/s]


Extracting 56732 samples to /vast/ci411/gerbil_data/vocal_datasets/combined/10sim/train_set.h5


100%|██████████| 56732/56732 [24:42<00:00, 38.27it/s]
