The goal of this notebook is to build the data parsing and preprocessing utilities I need to produce good-quality inputs for my language model. In short, I take a text file, tokenize it, and chop it into (possibly overlapping) slices of tokens. Both the length of each slice and the amount of overlap between consecutive slices are configurable.

In [30]:
import spacy
from torchtext.vocab import GloVe
import torch
from torch import tensor, sin, cos

# Load the spaCy "en_core_web_sm" pipeline once; slice_text below uses it as the shared tokenizer.
tokenizer = spacy.load("en_core_web_sm")
# 300-dimensional GloVe vectors; the embedding helpers below hard-code this width (300).
glove = GloVe(dim=300)

In [46]:
def slice_text(text: str, slice_length, slice_offset) -> [spacy.tokens.span.Span]:
    """Tokenize `text` and cut it into windows of up to `slice_length` tokens.

    `slice_offset` is the number of tokens between the start of one slice and
    the start of the next: `slice_offset == slice_length` gives no overlap,
    `slice_offset == 1` gives maximum overlap.

    Windows that start near the end of the text are not padded, so the last
    slices can hold fewer than `slice_length` tokens.
    """
    doc = tokenizer(text)
    window_starts = range(0, len(doc), slice_offset)
    return [doc[start:start + slice_length] for start in window_starts]

In [47]:
# Smoke test with maximum overlap (slice_offset=1) and a slice_length longer than the text.
# The trailing slices come out shorter than slice_length because slicing stops at the last token.
slices = slice_text("Hello, world!", 5, 1)
slices

[Hello, world!, , world!, world!, !]

In [27]:
# Slice the full play into non-overlapping 50-token windows (slice_offset == slice_length).
# NOTE(review): the last expression displays every slice; consider showing only the first few.
with open('../data/much_ado_about_nothing_gut.txt', 'r', encoding='utf-8') as file:
    content = file.read()
slice_text(content, 50, 50)

[MUCH ADO ABOUT NOTHING
 
 by William Shakespeare
 
 
 
 
 DRAMATIS PERSONAE
 
 DON PEDRO, Prince of Arragon.
 DON JOHN, his bastard Brother.
 CLAUDIO, a young Lord of Florence.
 BENEDICK, a young Lord of Padua.
 LEONATO, Governor of,
 Messina.
 ANTONIO, his Brother.
 BALTHAZAR, Servant to Don Pedro.
 BORACHIO, follower of Don John.
 CONRADE, follower of Don John.
 DOGBERRY, a Constable.
 VERGES, a Headborough.
 FRIAR FRANCIS.
 A,
 Sexton.
 A Boy.
 
 HERO, Daughter to Leonato.
 BEATRICE, Niece to Leonato.
 MARGARET, Waiting-gentlewoman attending on Hero.
 URSULA, Waiting-gentlewoman attending on Hero.
 
 Messengers, Watch, Attendants, &c.
 ,
 SCENE. Messina.
 
 
 
 Act 1.
 
 Scene I. Before LEONATO'S House.
 
 [Enter LEONATO, HERO, BEATRICE and others, with a Messenger.]
 
 LEONATO.
 I learn in this letter that Don Pedro of Arragon comes this night,
 to Messina.
 
 MESSENGER.
 He is very near by this: he was not three leagues off when I left
 him.
 
 LEONATO.
 How many gentlemen have y

In [54]:
def positional_embedding(word, pos) -> torch.Tensor:
    """Return the GloVe vector for `word` plus a sinusoidal positional encoding.

    For each pair index i in [0, 150), component 2*i receives
    sin(pos / 10000**(2*i/300)) and component 2*i + 1 receives the matching cos.

    Args:
        word: token text looked up in the module-level `glove` table.
        pos: position of the token within its slice.

    Returns:
        A new 300-element tensor. The vector stored in `glove` is not modified.
    """
    model_dims = 300

    # Compute the angles in float64, then cast to float32 before sin/cos so the
    # precision matches a per-element computation from Python floats.
    pair_index = torch.arange(model_dims // 2, dtype=torch.float64)
    angles = (pos / (10000 ** (2 * pair_index / model_dims))).float()

    positional_encoding = torch.zeros(model_dims)
    positional_encoding[0::2] = torch.sin(angles)
    positional_encoding[1::2] = torch.cos(angles)

    # Out-of-place add on purpose: the lookup can hand back a row of the shared
    # vectors table, and an in-place `+=` would write the positional encoding
    # into that table, corrupting every later lookup of the same word.
    return glove[word] + positional_encoding

def encode_input_tokens(tokens, context_len) -> tensor:
    """Encode a token sequence as a (context_len, 300) tensor.

    Row k holds `positional_embedding(token.text, k)` for the k-th token.
    Rows past the last token stay zero, which right-pads short sequences.
    """
    encoded = torch.zeros(context_len, 300)
    position = 0
    for token in tokens:
        encoded[position] = positional_embedding(token.text, position)
        position += 1
    return encoded

def build_dictionary(file_path) -> (dict, dict):
    """Build word <-> id lookup tables from the tokens of a UTF-8 text file.

    Ids are assigned in sorted order of the unique token strings, so the same
    file always produces the same mapping. Iterating a bare set of strings
    would give an order that can change between interpreter runs.

    Args:
        file_path: path of the text file to read.

    Returns:
        (word_to_id, id_to_word): two dicts that are inverses of each other.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Reuse the module-level spaCy pipeline rather than loading the model again
    # on every call; this also keeps the vocabulary tokenized the same way as
    # slice_text.
    unique_words = sorted({str(token) for token in tokenizer(content)})
    word_to_id = {word: i for i, word in enumerate(unique_words)}
    id_to_word = dict(enumerate(unique_words))

    return word_to_id, id_to_word

In [65]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

class CompletionDataset(Dataset):
    """Pairs each row of `features` with the entry of `labels` at the same index."""

    def __init__(self, features, labels):
        self.features, self.labels = features, labels

    def __len__(self):
        # The dataset size is the length of the first axis of `features`.
        num_examples = self.features.shape[0]
        return num_examples

    def __getitem__(self, index):
        example = self.features[index]
        target = self.labels[index]
        return example, target

# Note: each slice holds the features AND the label. For a context length of 256, use slice length 257.
def build_dataset_from_text(text: str, slice_length, slice_offset, word_to_id_dict) -> CompletionDataset:
    """Turn raw text into a CompletionDataset of (context encoding, next-word id) pairs.

    The text is cut with `slice_text`. In every slice the last token is the
    label, looked up in `word_to_id_dict`, and the tokens before it are encoded
    with `encode_input_tokens` into a (slice_length - 1, 300) feature matrix.
    """
    context_len = slice_length - 1
    windows = slice_text(text, slice_length, slice_offset)
    num_windows = len(windows)

    features = torch.zeros(num_windows, context_len, 300)
    labels = torch.zeros(num_windows).long()
    for row, window in enumerate(windows):
        # The final token is the prediction target; everything before it is context.
        context, target = window[:-1], window[-1]
        labels[row] = word_to_id_dict[str(target)]
        features[row] = encode_input_tokens(context, context_len)

    return CompletionDataset(features, labels)

def build_dataset_from_file(filename, slice_length, slice_offset, word_to_id_dict):
    """Read `filename` as UTF-8 text and pass its contents to build_dataset_from_text."""
    with open(filename, mode='r', encoding='utf-8') as source:
        text = source.read()
    dataset = build_dataset_from_text(text, slice_length, slice_offset, word_to_id_dict)
    return dataset

In [56]:
# Build the vocabulary lookups (word -> id and id -> word) from the same play used for the dataset below.
word_to_id, id_to_word = build_dictionary('../data/much_ado_about_nothing_gut.txt')


In [66]:
# 257-token slices with no overlap: per the note on build_dataset_from_text, that is 256 context tokens plus 1 label token.
dataset = build_dataset_from_file('../data/much_ado_about_nothing_gut.txt', 257, 257, word_to_id)

In [68]:
# Number of (features, label) examples, which equals the number of slices produced.
len(dataset)

126