In [5]:
import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset  # huggingface datasets

dataset = load_dataset("json", data_files='./data/json/dev.json')
# take only first 100 examples
split_dataset = dataset['train'].train_test_split(test_size=0.1, seed=2357, shuffle=True)
split_dataset['val'] = split_dataset.pop('test')  # rename the test split to val
from sentence_transformers import SentenceTransformer

model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)


# we now want to tokenize the dataset. first define the encoding function (gpt2 bpe)
def process(example):
    embds = model.encode(example['sentences'])
    out = {'id': example['id'], 'labels': example['labels'], 'embeddings': embds, 'len': len(embds)}
    return out


# tokenize the dataset
train_dataset = split_dataset['train'].select(range(100)).map(
    process,
    # remove_columns=['sentences'],
    desc="tokenizing the splits",
    num_proc=1,
)
# save the tokenized dataset
train_dataset.save_to_disk('./data/tokenized')

Found cached dataset json (/Users/bm/.cache/huggingface/datasets/json/default-14dc56ae28f0b495/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at /Users/bm/.cache/huggingface/datasets/json/default-14dc56ae28f0b495/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-e65036e96329467f.arrow and /Users/bm/.cache/huggingface/datasets/json/default-14dc56ae28f0b495/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-507da4f4a54b4983.arrow


tokenizing the splits:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

In [7]:
"""
Convert sentence embeddings and labels for trainable blocks
"""
import config
import random
fake_sent_embedding = model.encode([config.fake_sent])[0]
embedding_size = model.get_sentence_embedding_dimension()
test = False

def create_fake_block(block, lines):
    block_fake = block.copy()
    random.shuffle(block_fake)
    p = random.random()
    if p >= 0.5:
        for i in range(len(block_fake)):
            p = random.random()
            if p >= 0.5:
                l = lines[random.randint(0, len(lines) - 1)]
                block_fake[i] = (l, 0)
    return block_fake

def create_one_instance(block, lines):
    records = []
    fake_block = create_fake_block(block, lines)

    blocks = list(zip(block, fake_block))
    for item in blocks:
        real_sentence = item[0][0]
        real_label = item[0][1]
        fake_sentence = item[1][0]
        records.append((real_sentence,fake_sentence, real_label))
    return records

for example in train_dataset:
    labels = example['labels']
    embds = example['embeddings']
    raw_blocks = []
    stride = 1 if test else config.sent_stride
    i = 0
    idx = 0
    while i < len(labels):
        block = [(embd, label) for embd, label in zip(embds[i:i + config.sent_window], labels[i:i + config.sent_window])]
        if len(block) < config.sent_window:
            block.extend([(fake_sent_embedding, 0)] * (config.sent_window - len(block)))
        raw_blocks.append(block)
        i += stride

    if not test:
        raw_blocks = raw_blocks[:int(config.perc_blocks_train * len(raw_blocks))]
        random.shuffle(raw_blocks)

    doc_recs = []
    for rb in raw_blocks:
        records = create_one_instance(rb, embds)
        doc_recs.extend(records)

    # save doc_recs to numpy array





{'sentences': ['Michael Lynn Downs (born June 9, 1959) is a former American football safety in the NFL, having played for the Dallas Cowboys (1981–1988) and the Phoenix Cardinals (1989).', 'He is a 1977 graduate of South Oak Cliff High School in Dallas.', 'Downs graduated from South Oak Cliff High School in Dallas, Texas.', 'During high school, he was a member of the Honor Society.', 'He served as team captain of the football team and was selected as an All-District in football.', 'He also was a letterman in Track and Field.', 'Downs graduated from Rice University in 1981.', 'He received a BA in Business Management, Political Science and Physical Education.', 'While in college he was selected as All-Southwest conference and All-American honorable mention in football, as well as having served as team captain.', 'He was also punked by legendary Oilers running back, Earl Campbell.', 'Downs was signed by the Dallas Cowboys in 1981 as an undrafted free agent safety.', 'He was so impressive 

In [4]:
# concatenate all the ids in each dataset into one large file we can use for training
dset = train_dataset
arr_len = np.sum(dset['len'])
filename = os.path.join(os.path.dirname(__file__), f'tokenized.bin')
dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
total_batches = 1024

idx = 0
for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
    # Batch together samples for faster write
    batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
    arr_batch = np.concatenate(batch['ids'])
    # Write into mmap
    arr[idx : idx + len(arr_batch)] = arr_batch
    idx += len(arr_batch)
arr.flush()

KeyboardInterrupt: 

In [3]:
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
tr = SentenceTransformer('all-MiniLM-L6-v2')

class MyDataset(torch.utils.data.Dataset):
    def __init__(self):
        self.dataset = load_dataset("json", data_files="./data/json/val.json")['train']

    def __getitem__(self, idx):
        item = self.dataset[idx]
        text = item["sentences"]
        label = item["labels"]
        encoding = tr.encode(text)
        return encoding, torch.tensor(label)

    def __len__(self):
        return len(self.dataset)

from torch.utils.data import DataLoader

ds = MyDataset()
train_dataloader = DataLoader(ds, batch_size=64, shuffle=True)

for data in train_dataloader:
    print(data)
    break

2023-05-04 22:28:44,227 [INFO] Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2023-05-04 22:28:44,434 [INFO] Use pytorch device: cpu


  0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RuntimeError: stack expects each tensor to be equal size, but got [27, 384] at entry 0 and [68, 384] at entry 1