In [1]:
import os
import sys
sys.path.insert(0, "..")

from tqdm import tqdm

import torch

from my_datasets import *

In [2]:
# Seed torch's global RNG once, before any dataset is built or indexed, so that a
# fresh "Restart Kernel -> Run All" repeats the same sequence of random draws.
# Samples appear to be generated at access time: two separate `sf_dataset[0]`
# lookups further down in this notebook print different attention masks.
# Seeding does not make repeated indexing return the same item; it only makes a
# full top-to-bottom run repeatable.
# NOTE(review): assumes my_datasets draws from torch's global RNG — confirm; if it
# uses `random` or numpy instead, those generators need seeding as well.
SEED = 0
torch.manual_seed(SEED)

# Build the k-step autoregressive token dataset. The exact meaning of each
# argument is defined in my_datasets, which is not visible from this notebook.
dataset = AutoregKStepsTokensDataset(
    num_vars=64,
    num_rules_range=(8, 64),
    ante_prob_range=(0.2, 0.3),
    conseq_prob_range=(0.2, 0.3),
    chain_len_range=(2, 5),
    dataset_len=1000,
    num_steps=3,
)

In [3]:
# Fetch a single sample and look at the shapes of its two tensors.
item = dataset[0]
tokens, labels = item["tokens"], item["labels"]
# Last expression of the cell: rendered as the cell output.
(tokens.shape, labels.shape)

(torch.Size([64, 129]), torch.Size([3, 64]))

In [4]:
# Running mean of the label values over the whole dataset, shown live in the
# progress-bar description.
# Number of label entries per item is taken to be num_vars * num_steps, which
# matches the (3, 64) labels shape printed for dataset[0] with num_vars=64 and
# num_steps=3.
labels_per_item = dataset.inner_dataset.num_vars * dataset.num_steps

num_dones = 0
cum_label_hots = 0
pbar = tqdm(range(len(dataset)))
for i in pbar:
    item = dataset[i]
    num_dones += 1
    # Sum of this item's labels; presumably labels are 0/1 so this counts the
    # "hot" entries — TODO confirm against my_datasets.
    cum_label_hots += item["labels"].sum()
    # Normalise by the total number of label entries seen so far.
    avg_hots = cum_label_hots / (num_dones * labels_per_item)
    pbar.set_description(f"avghots: {avg_hots:.3f}")

avghots: 0.724: 100%|██████████| 1000/1000 [00:01<00:00, 945.47it/s]


In [15]:
# Build the tiled variant of the k-step autoregressive token dataset with a
# smaller variable count. Argument semantics live in my_datasets (not visible
# from this notebook).
sf_dataset = TiledAutoregKStepsTokensDataset(
    num_vars=16,
    num_rules_range=(32, 64),
    ante_prob_range=(0.2, 0.3),
    conseq_prob_range=(0.2, 0.3),
    chain_len_range=(4, 8),
    num_presteps_range=(0, 4),
    num_todo_steps=3,
    dataset_len=1000,
)

In [16]:
# Display only the "attention_mask" entry of the sample at index 0.
# NOTE(review): the printed mask is a run of 1s followed by 0s; the trailing 0s
# presumably mark padded positions — confirm against my_datasets.
sf_dataset[0]["attention_mask"]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0])

In [31]:
# Hold one sample in a variable so the same object can be inspected afterwards.
# Indexing again appears to yield a different sample: the attention masks printed
# for `sf_dataset[0]` elsewhere in this notebook do not match each other.
data = sf_dataset[0]

In [32]:
# Display the sample stored in `data` (a dict with "tokens", "attention_mask"
# and "labels" entries, as the output shows).
data

{'tokens': tensor([[1., 1., 0.,  ..., 0., 1., 0.],
         [0., 1., 1.,  ..., 1., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor([[1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1],
         [1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1],
         [1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1]])}

In [33]:
# Display the whole sample dict at index 1 for comparison with index 0.
sf_dataset[1]

{'tokens': tensor([[1., 0., 1.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 1., 1.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]),
 'labels': tensor([[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1],
         [1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1],
         [1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]])}

In [34]:
# Display the whole sample dict at index 2; the number of leading 1s in
# "attention_mask" differs between the samples printed in this notebook.
sf_dataset[2]

{'tokens': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 1., 0., 0.],
         [1., 0., 1.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor([[1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0],
         [1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0],
         [1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0]])}