In [1]:
import torch
from torch import tensor, sin, cos
from math import sqrt
from torch.nn.functional import softmax
import spacy
from torchtext.vocab import GloVe

Set up our tokenizer and 3rd party embedding library

In [2]:
glove = GloVe(dim=300)
tokenizer = spacy.load("en_core_web_sm")

Define key helper functions used throughout training and inference

In [3]:
def par_attention(queries: tensor, keys: tensor, values: tensor, dim: int) -> tensor:
    raw_weights = torch.bmm(queries, keys.transpose(1, 2))

    mask = torch.tril(torch.ones_like(raw_weights), diagonal=0)
    raw_weights = raw_weights.masked_fill(mask == 0, float('-inf'))
    # print(f"raw_weights.shape:{raw_weights.shape}\nraw_weights: {raw_weights}")

    scale_factor = sqrt(dim)
    scaled_weights = softmax(raw_weights / scale_factor, dim=2)
    # print(f"scaled_weights.shape:{scaled_weights.shape}\nscaled_weights: {scaled_weights}")

    # now scaled weights is a matrix where each row represents the scaled weights produced based on a given query.
    # meanwhile values just has a value vector on each row.

    reshaped_scaled_weights = scaled_weights.view(scaled_weights.shape[0], scaled_weights.shape[1], scaled_weights.shape[2], 1)
    reshaped_values = values.view(values.shape[0], values.shape[1], 1, values.shape[2])

    scaled_values = reshaped_scaled_weights * reshaped_values

    contextualized_values = torch.sum(scaled_values, 2)
    return contextualized_values

def build_dictionary(file_path) -> (dict, dict):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    tokens = tokenizer(content)
    unique_words = set()
    for token in tokens:
        unique_words.add(str(token))
    word_to_id = {str(word): i for i, word in enumerate(unique_words)}
    id_to_word = {i: str(word) for i, word in enumerate(unique_words)}

    return word_to_id, id_to_word

def positional_embedding(word, pos) -> tensor:
    model_dims = 300

    positional_encoding = torch.tensor([0.0] * model_dims)
    for i in range(0, model_dims // 2):
        positional_encoding[2 * i] = sin(torch.tensor(pos / (10000 ** (2 * i / model_dims))))
        positional_encoding[2 * i + 1] = cos(torch.tensor(pos / (10000 ** (2 * i / model_dims))))

    embedding = glove[word]
    embedding += positional_encoding
    return embedding

def encode_input_string(str, context_len) -> tensor:
    tokenizer = spacy.load("en_core_web_sm")
    tokens = tokenizer(str)

    output = torch.zeros(size=[context_len, 300])
    for i, token in enumerate(tokens):
        output[i] = positional_embedding(token.text, i)

    return output

def encode_input_tokens(tokens, context_len) -> tensor:
    output = torch.zeros(size=[context_len, 300])
    for i, token in enumerate(tokens):
        output[i] = positional_embedding(token.text, i)

    return output

# slice_offset is the number of tokens separating the start of one slice from the start of the previous.
# slice_offset == slice_length means no overlap, slice_offset == 1 means maximum overlap.
def slice_text(text: str, slice_length, slice_offset) -> [spacy.tokens.span.Span]:
    slices = []
    tokens = tokenizer(text)

    for i in range(0, len(tokens), slice_offset):
        slices.append(tokens[i:i+slice_length])
    return slices

Define the architecture of the model, including all subcomponents

In [4]:
import torch.nn as nn

class AttentionHead(nn.Module):
    # For simplicity, I assume query, key, and value vectors have the same dimensionality
    def __init__(self, model_dim, vectors_dim):
        super().__init__()
        self.model_dim = model_dim
        self.vectors_dim = vectors_dim
        self.Q_proj = nn.Linear(model_dim, vectors_dim, bias=False)
        self.K_proj = nn.Linear(model_dim, vectors_dim, bias=False)
        self.V_proj = nn.Linear(model_dim, vectors_dim, bias=False)

    def forward(self, x):
        # each row of x is a vector representing the meaning of the token at the corresponding position with whatever context we've attained so far.
        Q = self.Q_proj(x)
        K = self.K_proj(x)
        V = self.V_proj(x)
        # print("Shape of Q matrix: ", Q.shape)
        # print("Shape of K matrix: ", K.shape)
        # print("Shape of V matrix: ", V.shape)
        output = par_attention(Q, K, V, self.vectors_dim)
        return output

class MultiHeadAttention(nn.Module):
    def __init__(self, model_dim, num_heads):
        super().__init__()
        self.att_heads = nn.ModuleList([AttentionHead(model_dim, model_dim // num_heads) for _ in range(num_heads)])
        self.proj = nn.Linear(model_dim, model_dim, bias=False)

    def forward(self, x):
        head_outputs = [head(x) for head in self.att_heads]
        x = torch.concat(head_outputs, dim=2)
        x = self.proj(x)
        return x
        
class TransformerLayer(nn.Module):
    def __init__(self, model_dim, num_heads, ff_hidden_dim, context_len):
        super().__init__()
        self.attention_block = MultiHeadAttention(model_dim, num_heads)
        self.norm1 = nn.LayerNorm(normalized_shape=[context_len, model_dim])
        self.ff1 = nn.Linear(model_dim, ff_hidden_dim)
        self.ff_relu = nn.ReLU()
        self.ff2 = nn.Linear(ff_hidden_dim, model_dim)
        self.norm2 = nn.LayerNorm(normalized_shape=[context_len, model_dim])

    def forward(self, x):
        x_res = x
        x = self.attention_block(x)
        x += x_res
        x = self.norm1(x)

        x_res = x
        x = self.ff1(x)
        x = self.ff_relu(x)
        x = self.ff2(x)
        x += x_res
        x = self.norm2(x)

        return x

class TransformerNetwork(nn.Module):
    def __init__(self, num_layers, model_dim, att_heads, ff_hidden_dim, context_len, output_dict_size):
        super().__init__()
        # self.trans_layers = nn.ModuleList([TransformerLayer(model_dim, att_heads, ff_hidden_dim, context_len) for _ in range(num_layers)])
        self.word_predictor = nn.Linear(model_dim * context_len, output_dict_size)
        print(f"word_predictor input dimension: {model_dim * context_len}\noutput dimension: {output_dict_size}")

    def forward(self, x):
        print(f"Received x of shape: {x.shape}")
        # for layer in self.trans_layers:
        #     x = layer.forward(x)
        x = x.view(x.shape[0], -1)
        print(f"Reshaped x to shape: {x.shape}")
        x = self.word_predictor(x)
        return x

Build a dictionary based on a file. This is used to limit the subset of tokens that the model is allowed to output, and by extension to reduce the size of the model. The limitation of this is that any time you want a new dictionary/set of allowed outputs you need to rebuild and retrain the model. This can obviously be improved on later, but for now I think it's a good idea to use it to train a bit faster.

In [5]:
word_to_id, id_to_word = build_dictionary('../data/much_ado_about_nothing_gut.txt')

In [6]:
# This approach kinda sucks ay? Cause it assumes I already know what the last token is... Whatever, will iterate on this lol
def encode_inputs(input_list, context_len) -> tensor:
    output = torch.zeros(size=[len(input_list), context_len, 300])
    for i, input in enumerate(input_list):
        output[i] = encode_input_string(input, context_len)
    return output

# def encode_outputs(output_tokens: [str]) -> tensor:
#     output_cats = torch.zeros(size=[len(output_tokens)]).long()
#     for i, token in enumerate(output_tokens):
#         output_cats[i] = word_to_id[token]
#     return output_cats

Tools to quickly build a dataset that can be fed into the model

In [7]:
from torch.utils.data import Dataset

class CompletionDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, index):
        return self.features[index], self.labels[index]

# Note: slices include features + label. So if you have context length 256, you can set slice length 257 and be fine.
def build_dataset_from_text(text: str, context_len, slice_length, slice_offset, word_to_id_dict, print_slices) -> CompletionDataset:    
    slices = slice_text(text, slice_length, slice_offset)

    if print_slices:
        for i, slice in enumerate(slices):
            print(f"Slice {i}:")
            print(slice)
            print("")

    features = torch.zeros(size=[len(slices), context_len, 300])
    labels = torch.zeros(size=[len(slices)]).long()
    for i, slice in enumerate(slices):
        last_token = slice[-1]
        labels[i] = word_to_id_dict[str(last_token)]
        encoding = encode_input_tokens(slice[:-1], context_len)
        features[i] = encoding
    
    dataset = CompletionDataset(features, labels)
    return dataset

def build_dataset_from_file(filename, context_len, slice_length, slice_offset, word_to_id_dict, print_slices):
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()
    return build_dataset_from_text(content, context_len, slice_length, slice_offset, word_to_id_dict, print_slices)

Initialize model. Output dict size is the size of the final layer.

In [25]:
dictionary_len = len(id_to_word)
context_len = 8
model = TransformerNetwork(num_layers=2, model_dim=300, att_heads=6, ff_hidden_dim=1200, context_len=context_len, output_dict_size=dictionary_len)
print(f"dictionary_len: {dictionary_len}")

word_predictor input dimension: 2400
output dimension: 3392
dictionary_len: 3392


Actually build the dataset:

In [26]:
max_slice_len = context_len + 1
dataset = build_dataset_from_file('../data/much_ado_about_nothing_gut.txt', context_len, max_slice_len, 1, word_to_id, False)
# dataset += build_dataset_from_file('../data/much_ado_about_nothing_gut.txt', context_len, max_slice_len // 4, max_slice_len * 2, word_to_id, False)
# dataset = build_dataset_from_file('../data/much_ado_about_nothing_gut.txt', context_len, max_slice_len, max_slice_len * 2, word_to_id, False)


Define model hyperparameters and set up data loader.

In [10]:
from torch.utils.data import DataLoader

loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# sub_dataset = dataset[:32]
train_loader = DataLoader(dataset=dataset, batch_size=32, shuffle=False)
# val_loader = DataLoader(dataset=val_dataset, batch_size=1, shuffle=False)

In [22]:
def train_one_epoch(do_validation: bool):
    model.train(True)
    torch.set_printoptions(profile="short")
    batches = 0
    avg_loss = 0
    for step, (features, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        preds = model(features)
        print(f"preds:{preds}\nlabels:{labels}")
        loss = loss_func(preds, labels)
        loss.backward()

        if step % 10 == 0:  # Print every 10 batches
            for name, param in model.named_parameters():
                if param.requires_grad:
                    print(f"Gradient data for {name}:", param.grad)
                    print(f"Checking if gradients are fully zeroed: {torch.all(param.grad == 0.0).item()}")
                    print(f"Shape: {param.grad.shape}")
                    print(f"Mean: {param.grad.mean()}")
                    print(f"Std: {param.grad.std()}")
                    print(f"Min: {param.grad.min()}")
                    print(f"Max: {param.grad.max()}")

        optimizer.step()

        print(f"Loss on batch {step}: {loss}")

        avg_loss += loss
        batches = step + 1
        break
    
    avg_loss = avg_loss / batches
    print(f"Average loss for training batches in this epoch: {avg_loss}")

    if do_validation:
        model.train(False)
        batches = 0
        avg_loss = 0
        for step, (features, labels) in enumerate(val_loader):
            preds = model(features)
            # print(f"preds:{preds}\nlabels:{labels}")
            loss = loss_func(preds, labels)
            
            print(f"Loss on step {step}: {loss}")

            avg_loss += loss
            batches = step + 1

        avg_loss = avg_loss / batches
        print(f"Average loss for validation batches in this epoch: {avg_loss}")


In [24]:
train_one_epoch(False)
# train_one_epoch()


Received x of shape: torch.Size([32, 64, 300])
Reshaped x to shape: torch.Size([32, 19200])
preds:tensor([[-0.37,  0.23,  ..., -0.39,  0.32],
        [-3.17, -0.43,  ..., -1.14,  1.83],
        ...,
        [12.15,  0.33,  ..., -3.39,  5.63],
        [-9.80,  1.16,  ..., -5.63,  7.86]], grad_fn=<AddmmBackward0>)
labels:tensor([2436, 1859, 3119, 2577, 2385,  710,  406, 3086, 2088,  163, 1048,  869,
        2392, 1223, 3065, 1241, 1494, 2944, 1241,  860,  601, 3176, 3025,  942,
         163,  894, 3235, 2186, 1241,  163, 2186, 3381])
Gradient data for word_predictor.weight: tensor([[-1.11e-42,  1.98e-42,  ...,  7.44e-43,  3.57e-42],
        [-2.04e-42,  3.62e-42,  ...,  1.36e-42,  6.53e-42],
        ...,
        [-1.10e-42,  1.95e-42,  ...,  7.31e-43,  3.51e-42],
        [-2.22e-42,  3.94e-42,  ...,  1.48e-42,  7.09e-42]])
Checking if gradients are fully zeroed: False
Shape: torch.Size([3392, 19200])
Mean: -4.685899563337814e-13
Std: 0.0010360482847318053
Min: -0.23651960492134094
Max: 0

In [13]:
def infer_completion(input_text: str, context_len):
    encoded_input = encode_inputs([input_text], context_len)
    
    model.train(False)
    pred = model(encoded_input)
    return id_to_word[torch.argmax(softmax(pred, dim=1), dim=1).item()]

In [14]:
infer_completion("Well, I am glad that all things sort so well Pedro", context_len)

Received x of shape: torch.Size([1, 64, 300])
Reshaped x to shape: torch.Size([1, 19200])


'twas'

In [15]:
param.grad

NameError: name 'param' is not defined