In [1]:
import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset  # huggingface datasets

# Load the raw JSON file; the loader exposes it under the 'train' key used below.
dataset = load_dataset("json", data_files='./data/json/dev.json')
# Hold out 10% of the examples with a fixed seed so the split is repeatable.
# (The 100-example subset is taken further down, when the train split is embedded.)
split_dataset = dataset['train'].train_test_split(test_size=0.1, seed=2357, shuffle=True)
split_dataset['val'] = split_dataset.pop('test')  # rename the test split to val
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used by process() and for the padding embedding.
# NOTE(review): the training cell later rebinds the name `model` to a
# TopicSegmentationModel, so cells calling model.encode fail if re-run after it.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)


# embed the sentences of each example with the sentence-transformer model
def process(example):
    """Encode the sentences of one dataset example.

    Returns a dict that carries the example's 'id' and 'labels' through
    unchanged, stores the result of ``model.encode`` over 'sentences' under
    'embeddings', and stores ``len()`` of that result under 'len'.
    """
    sentence_vectors = model.encode(example['sentences'])
    return {
        'id': example['id'],
        'labels': example['labels'],
        'embeddings': sentence_vectors,
        'len': len(sentence_vectors),
    }


# Embed only the first rows of the train split. The cap is clamped to the split
# size so that a dataset with fewer rows does not make select() fail on an
# out-of-range index.
max_train_examples = 100
train_split = split_dataset['train']
train_dataset = train_split.select(range(min(max_train_examples, len(train_split)))).map(
    process,
    # remove_columns=['sentences'],
    desc="tokenizing the splits",
    num_proc=1,
)
# save the embedded subset (the path and progress label keep their "tokenized" naming)
train_dataset.save_to_disk('./data/tokenized')

Found cached dataset json (/Users/bm/.cache/huggingface/datasets/json/default-14dc56ae28f0b495/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at /Users/bm/.cache/huggingface/datasets/json/default-14dc56ae28f0b495/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-e65036e96329467f.arrow and /Users/bm/.cache/huggingface/datasets/json/default-14dc56ae28f0b495/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-507da4f4a54b4983.arrow
Loading cached processed dataset at /Users/bm/.cache/huggingface/datasets/json/default-14dc56ae28f0b495/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-428753f94b90e10d.arrow


Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

In [2]:
"""
Convert sentence embeddings and labels for trainable blocks
"""
import config
import random
# Embedding of the placeholder sentence; used below to pad blocks that are
# shorter than config.sent_window.
fake_sent_embedding = model.encode([config.fake_sent])[0]
# Size of one sentence vector as reported by the embedding model.
embedding_size = model.get_sentence_embedding_dimension()
# When True, blocks are built with stride 1 and the truncate/shuffle step is skipped.
test = False

def create_fake_block(block, lines):
    """Build a corrupted counterpart of ``block``.

    A shallow copy of ``block`` is shuffled. With probability 0.5 the shuffled
    copy is returned as is; otherwise each position is, independently with
    probability 0.5, overwritten by ``(line, 0)`` where ``line`` is drawn
    uniformly from ``lines``. ``block`` itself is never modified.
    """
    shuffled = block.copy()
    random.shuffle(shuffled)

    # half of the time the fake block is only a permutation of the real one
    if random.random() < 0.5:
        return shuffled

    for pos in range(len(shuffled)):
        if random.random() < 0.5:
            continue
        replacement = lines[random.randint(0, len(lines) - 1)]
        shuffled[pos] = (replacement, 0)
    return shuffled

def create_one_instance(block, lines):
    """Pair every entry of ``block`` with the same position of a fake block.

    The fake block comes from ``create_fake_block(block, lines)``. Returns one
    ``(real_sentence, fake_sentence, real_label)`` tuple per position; the
    label stored in the fake entry is discarded.
    """
    fake_block = create_fake_block(block, lines)
    return [
        (real[0], fake[0], real[1])
        for real, fake in zip(block, fake_block)
    ]

# Flat list of (real_embedding, fake_embedding, label) records. Every block
# contributes exactly config.sent_window consecutive records.
# NOTE(review): `random` is not seeded in the cells shown, so the shuffling and
# fake-sentence sampling differ from run to run.
all_blocks = []

# The stride does not depend on the example, so resolve it once. A non-positive
# stride would never advance through a document.
stride = 1 if test else config.sent_stride
if stride <= 0:
    raise ValueError(f'sentence stride must be positive, got {stride}')
window = config.sent_window

for example in train_dataset:
    labels = example['labels']
    embds = example['embeddings']

    # slide a window over the document; short tail windows are padded with the
    # placeholder embedding and label 0
    raw_blocks = []
    for start in range(0, len(labels), stride):
        block = list(zip(embds[start:start + window], labels[start:start + window]))
        missing = window - len(block)
        if missing > 0:
            block.extend([(fake_sent_embedding, 0)] * missing)
        raw_blocks.append(block)

    if not test:
        # keep the leading fraction of the document's blocks, then shuffle them
        keep = int(config.perc_blocks_train * len(raw_blocks))
        raw_blocks = raw_blocks[:keep]
        random.shuffle(raw_blocks)

    # the document's own embeddings serve as the pool of replacement sentences
    for rb in raw_blocks:
        all_blocks.extend(create_one_instance(rb, embds))

In [3]:
# total number of (real, fake, label) records built by the previous cell
len(all_blocks)

2960

In [4]:
import numpy as np
import os
from tqdm import tqdm

# Structured record layout: two float32 vectors of length embedding_size
# ('real' and 'fake') followed by one int8 'label'.
dtype = np.dtype([('real', np.float32, embedding_size), ('fake', np.float32, embedding_size), ('label', np.int8)])
# directory that receives the binary record files
data_dir = "./data/processed"
def save_to_numpy(blocks, filename):
    """Write ``blocks`` to ``filename`` as a flat binary file of ``dtype`` records.

    Args:
        blocks: sequence of ``(real, fake, label)`` tuples convertible to the
            notebook-level structured ``dtype``.
        filename: destination path. Missing parent directories are created; a
            bare file name is written to the current directory.
    """
    # A bare file name has an empty dirname and os.makedirs('') raises, so only
    # create the parent when there is one. exist_ok avoids the check-then-create race.
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)

    n_records = len(blocks)
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(n_records,))

    # convert and copy one slice at a time so only a single batch is ever
    # materialised as a numpy array
    batch_size = 1024
    for start in tqdm(range(0, n_records, batch_size), desc=f'writing {filename}'):
        arr_batch = np.array(blocks[start:start + batch_size], dtype=dtype)
        arr[start:start + len(arr_batch)] = arr_batch
    arr.flush()
    # drop the reference so the mapping can be released
    del arr

save_to_numpy(all_blocks, os.path.join(data_dir, 'train.bin'))
# NOTE(review): val.bin is written from the same `all_blocks` as train.bin, so
# the validation file holds the training records and split_dataset['val'] is
# never used in the cells shown. Any validation loss computed from it is
# therefore a training loss. Build the validation records from the held-out
# split before relying on it.
save_to_numpy(all_blocks, os.path.join(data_dir, 'val.bin'))

writing ./data/processed/train.bin: 3it [00:00, 44.43it/s]
writing ./data/processed/val.bin: 3it [00:00, 46.27it/s]


In [5]:
# load the numpy array from disk (read-only; no shape is passed, so the number
# of records is whatever the file holds for this dtype)
train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=dtype, mode='r')
# number of records read back; expected to match len(all_blocks)
len(train_data)

2960

In [12]:
import torch
from datasets import load_dataset

# --- batching configuration ---
batch_size = 12  # if gradient_accumulation_steps > 1, this is the micro-batch size
block_size = config.sent_window  # records per sampled window
device = 'cpu'  # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
# coarse device family, for later use in torch.autocast
if 'cuda' in device:
    device_type = 'cuda'
else:
    device_type = 'cpu'


# read-only views over the record files written earlier
train_path = os.path.join(data_dir, 'train.bin')
val_path = os.path.join(data_dir, 'val.bin')
train_data = np.memmap(train_path, dtype=dtype, mode='r')
val_data = np.memmap(val_path, dtype=dtype, mode='r')
def get_batch(split):
    """Sample a random batch of whole blocks from the train or validation records.

    Args:
        split: 'train' selects ``train_data``; any other value selects ``val_data``.

    Returns:
        Tuple ``(x, y)`` moved to ``device``. ``x`` stacks the 'real' and 'fake'
        embeddings of each block, shape
        ``(batch_size, 2, block_size, embedding_size)``; ``y`` holds the 'label'
        field, shape ``(batch_size, block_size)``.

    Raises:
        ValueError: if the selected split holds fewer than ``block_size`` records.
    """
    data = train_data if split == 'train' else val_data

    # The files are written block after block, block_size records each. Drawing
    # a block index and scaling it keeps every window inside one block; an
    # arbitrary record offset would splice the tail of one shuffled block onto
    # the head of another, and could never reach the last block.
    num_blocks = len(data) // block_size
    if num_blocks == 0:
        raise ValueError(
            f"split '{split}' has {len(data)} records, fewer than one block of {block_size}"
        )
    starts = (torch.randint(num_blocks, (batch_size,)) * block_size).tolist()

    x = []
    y = []
    for start in starts:
        # copy the window out of the read-only memmap into ordinary memory
        window = np.copy(data[start:start + block_size])
        # Fields of a structured array are strided views whose stride need not
        # be a multiple of the element size; copy them so torch can wrap them.
        real = torch.from_numpy(np.copy(window['real']))
        fake = torch.from_numpy(np.copy(window['fake']))
        x.append(torch.stack([real, fake]))
        y.append(torch.from_numpy(np.copy(window['label'])))
    x = torch.stack(x)
    y = torch.stack(y)
    if device_type == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y



In [28]:
# sanity-check the shapes produced by get_batch before building the model:
# inputs are (batch, real/fake, position, embedding), labels are (batch, position)
sample_batch = get_batch('train')
assert sample_batch[0].shape == (batch_size, 2, block_size, embedding_size)
assert sample_batch[1].shape == (batch_size, block_size)

# create model
import torch

class TopicSegmentationModel(torch.nn.Module):
    """Two-layer perceptron mapping a block of sentence embeddings to one output per position.

    Only the 'real' channel of the input (index 0 on dim 1) is used; the 'fake'
    channel is ignored. Outputs are unnormalised scores (the notebook trains
    them with ``BCEWithLogitsLoss``).
    """

    def __init__(self, window_size=None, embed_dim=None):
        """
        Args:
            window_size: number of sentence positions per block. Defaults to the
                notebook-level ``block_size``.
            embed_dim: length of one sentence embedding. Defaults to the
                notebook-level ``embedding_size``.
        """
        super().__init__()
        if window_size is None:
            window_size = block_size
        if embed_dim is None:
            embed_dim = embedding_size

        self.linear1 = torch.nn.Linear(window_size * embed_dim, window_size)
        self.activation = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(window_size, window_size)

    def forward(self, x):
        """Return scores of shape ``(batch, window_size)`` for ``x`` of shape
        ``(batch, 2, window_size, embed_dim)``."""
        # Flatten each example using the batch dimension of the input itself, so
        # the model also works for batch sizes other than the training one.
        x_real = x[:, 0, :, :].reshape(x.shape[0], -1)
        x = self.linear1(x_real)
        x = self.activation(x)
        x = self.linear2(x)
        return x

# NOTE(review): this rebinds `model`, which earlier cells use for the
# SentenceTransformer; re-running those cells after this one will fail.
model = TopicSegmentationModel()
model.to(device)

# create optimizer
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are passed explicitly to keep the values the
# transformers class used by default (torch's own defaults are 1e-8 and 0.01).
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# binary cross-entropy on raw scores, one target per sentence position
loss_fn = torch.nn.BCEWithLogitsLoss()

# training loop
from tqdm import tqdm


def _loss_on_batch(split):
    """Draw one batch from ``split``; return its inputs, float targets, model outputs and loss."""
    inputs, targets = get_batch(split)
    outputs = model(inputs)
    targets = targets.float()
    return inputs, targets, outputs, loss_fn(outputs, targets)


num_epochs = 2
num_batches = len(train_data) // batch_size
for epoch in range(num_epochs):
    # optimisation pass over num_batches randomly sampled batches
    model.train()
    for batch_idx in tqdm(range(num_batches), desc=f'Epoch {epoch}'):
        optimizer.zero_grad()
        x, y, y_hat, loss_value = _loss_on_batch('train')
        loss_value.backward()
        optimizer.step()

    # validation estimate from a single randomly sampled batch, gradients disabled
    model.eval()
    with torch.no_grad():
        x, y, y_hat, loss_value = _loss_on_batch('val')
    print(f'Epoch {epoch} validation loss: {loss_value.item()}')


Epoch 0: 100%|██████████| 246/246 [00:00<00:00, 1135.83it/s]


Epoch 0 validation loss: 0.6810091137886047


Epoch 1: 100%|██████████| 246/246 [00:00<00:00, 1200.52it/s]

Epoch 1 validation loss: 0.6719878315925598



