In [1]:
# This notebook has some rudimentary tests to check my understanding of randomness in dataloading

import numpy as np
from torch.utils.data import DataLoader

from sourcesep.utils.config import load_config
from sourcesep.models.helpers import H5Dataset

import seaborn as sns

# Apply the seaborn theme with fonts scaled to 0.8x for any plots in this notebook
sns.set_theme(font_scale=0.8)
# IPython magic: ask the inline backend to render figures in 'retina' format
%config InlineBackend.figure_format='retina'

In [2]:
# Loop-size settings; presumably for a training run -- none of them is
# referenced by the cells visible in this notebook.
n_epochs = 2000
train_steps_per_epoch = 1000
val_steps_per_epoch = 20
batch_size = 2

# Resolve the simulation file: <root>/sims/<sim_name>.h5
paths = load_config(dataset_key="all")
sim_name = "2023-03-08"
sims_dir = paths["root"] / "sims"
h5_filename = str(sims_dir / f"{sim_name}.h5")

# Dataset over the simulation file, configured with n_timesamples=2000
dat = H5Dataset(n_timesamples=2000, h5_filename=h5_filename)

Paths are for dataset tagged: all


In [3]:
# 1) Manual iteration: pull samples one at a time with next().
print("iterating over Dataset ourselves, we get one sample at a time")
dat_iter_native = iter(dat)
for i in range(2):
    batch = next(dat_iter_native)
    print(i, batch.keys(), batch["O"].shape)

print("")

# 2) Let the for loop drive the iterator. It pulls exactly one sample per pass
#    and ends cleanly when the iterator is exhausted. Do not also call next()
#    inside the body: that would consume a second sample on every pass,
#    silently dropping every other one.
dat_iter_native = iter(dat)
for i, batch in enumerate(dat_iter_native):
    print(i, batch.keys(), batch["O"].shape)
    if i >= 2:
        break

# 3) DataLoader: samples are shuffled and stacked into batches of 20, so each
#    entry gains a leading batch dimension.
print("\nTorch dataloader - return batches")
dat_iter_torch = DataLoader(dat, shuffle=True, batch_size=20, num_workers=8)
for i, batch in enumerate(dat_iter_torch):
    print(i, batch.keys(), batch["O"].shape)
    if i >= 2:
        break

iterating over Dataset ourselves, we get one sample at a time
0 dict_keys(['O', 'A', 'N', 'M', 'H_ox', 'H_dox']) (300, 2000)
1 dict_keys(['O', 'A', 'N', 'M', 'H_ox', 'H_dox']) (300, 2000)

0 dict_keys(['O', 'A', 'N', 'M', 'H_ox', 'H_dox']) (300, 2000)
1 dict_keys(['O', 'A', 'N', 'M', 'H_ox', 'H_dox']) (300, 2000)
2 dict_keys(['O', 'A', 'N', 'M', 'H_ox', 'H_dox']) (300, 2000)

Torch dataloader - return batches
0 dict_keys(['O', 'A', 'N', 'M', 'H_ox', 'H_dox']) torch.Size([20, 300, 2000])
1 dict_keys(['O', 'A', 'N', 'M', 'H_ox', 'H_dox']) torch.Size([20, 300, 2000])
2 dict_keys(['O', 'A', 'N', 'M', 'H_ox', 'H_dox']) torch.Size([20, 300, 2000])


In [4]:
# A freshly seeded local RandomState always reproduces the same draw, and
# drawing from it leaves the global np.random stream untouched.

for i in range(10):
    # The global stream is consumed on every pass, even when its draw is
    # replaced by the seeded one below.
    global_draw = np.random.randint(0, 10, size=5)
    if i % 3:
        random_integers = global_draw
    else:
        print("next random draw is with a seed")
        rng = np.random.RandomState(42)
        random_integers = rng.randint(0, 10, size=5)
    print(random_integers)

next random draw is with a seed
[6 3 7 4 6]
[9 4 3 8 8]
[4 0 5 9 2]
next random draw is with a seed
[6 3 7 4 6]
[6 3 7 1 2]
[3 3 0 1 2]
next random draw is with a seed
[6 3 7 4 6]
[1 0 0 6 5]
[6 6 5 5 6]
next random draw is with a seed
[6 3 7 4 6]


In [5]:
# A generator with its own seeded RandomState: the sequence it yields depends
# only on the seed, so the first k values are the same for any n >= k.
def genrandoms(seed, n):
    """Yield ``n`` random draws from a private, freshly seeded generator.

    Args:
        seed: Seed passed to ``np.random.RandomState``.
        n: Number of draws to yield before the generator is exhausted.

    Yields:
        A length-1 integer array drawn from ``[0, 10)``.
    """
    rng = np.random.RandomState(seed)
    for _ in range(n):
        yield rng.randint(low=0, high=10, size=1)
    # Falling off the end is how a generator signals exhaustion. An explicit
    # `raise StopIteration` here is turned into RuntimeError by PEP 479.


# Each new generator restarts the same sequence, and stops after n_draws values
# even though up to 10 are requested:
seed = 42
for n_draws in [2, 5, 10]:
    print("Draw with fixed seed, n_draws =", n_draws)
    x = genrandoms(seed=seed, n=n_draws)
    for _ in range(10):
        try:
            print(next(x))
        except StopIteration:
            # Only exhaustion ends the loop; any other error should surface.
            break

Draw with fixed seed, n_draws = 2
[6]
[3]
Draw with fixed seed, n_draws = 5
[6]
[3]
[7]
[4]
[6]
Draw with fixed seed, n_draws = 10
[6]
[3]
[7]
[4]
[6]
[9]
[2]
[6]
[7]
[4]


In [6]:
# With seed=None the stream is not pinned: each newly created RandomState
# gives a different draw, unlike the fixed-seed case.
for _ in range(2):
    rng = np.random.RandomState(None)
    random_integers = rng.randint(low=0, high=10, size=5)
    print(random_integers)

[9 8 9 6 1]
[6 2 6 8 1]
