In [75]:
import torch
from torch import tensor, sin, cos
from math import sqrt
from torch.nn.functional import softmax
import spacy
from torchtext.vocab import GloVe

glove = GloVe(dim=300)

def par_attention(queries: tensor, keys: tensor, values: tensor, dim: int) -> tensor:
    raw_weights = torch.bmm(queries, keys.transpose(1, 2))

    mask = torch.tril(torch.ones_like(raw_weights), diagonal=0)
    raw_weights = raw_weights.masked_fill(mask == 0, float('-inf'))
    print(f"raw_weights.shape:{raw_weights.shape}\nraw_weights: {raw_weights}")

    scale_factor = sqrt(dim)
    scaled_weights = softmax(raw_weights / scale_factor, dim=2)
    print(f"scaled_weights.shape:{scaled_weights.shape}\nscaled_weights: {scaled_weights}")

    # now scaled weights is a matrix where each row represents the scaled weights produced based on a given query.
    # meanwhile values just has a value vector on each row.

    reshaped_scaled_weights = scaled_weights.view(scaled_weights.shape[0], scaled_weights.shape[1], scaled_weights.shape[2], 1)
    reshaped_values = values.view(1, values.shape[0], values.shape[1], values.shape[2])

    scaled_values = reshaped_scaled_weights * reshaped_values

    contextualized_values = torch.sum(scaled_values, 2)
    return contextualized_values

def build_dictionary(file_path) -> (dict, dict):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    tokenizer = spacy.load("en_core_web_sm")
    tokens = tokenizer(content)

    unique_words = set(tokens)
    word_to_id = {str(word): i for i, word in enumerate(unique_words)}
    id_to_word = {i: str(word) for i, word in enumerate(unique_words)}
    return word_to_id, id_to_word

def positional_embedding(word, pos) -> tensor:
    model_dims = 300

    positional_encoding = torch.tensor([0.0] * model_dims)
    for i in range(0, model_dims // 2):
        positional_encoding[2 * i] = sin(torch.tensor(pos / (10000 ** (2 * i / model_dims))))
        positional_encoding[2 * i + 1] = cos(torch.tensor(pos / (10000 ** (2 * i / model_dims))))

    embedding = glove[word]
    embedding += positional_encoding
    return embedding


def encode_input_string(str, context_len) -> tensor:
    tokenizer = spacy.load("en_core_web_sm")
    tokens = tokenizer(str)

    output = torch.zeros(size=[context_len, 300])
    for i, token in enumerate(tokens):
        output[i] = positional_embedding(token.text, i)

    return output

Goal for this notebook is to implement an attention block, and then to build out the rest of the transformer architecture. I'll start with a single-headed version, then build out multi-headedness (which shouldn't be a lot of additional work, I think). The original paper uses a model dimensionality of 512 and 8 heads, which each work on 512 / 8 = 64 dimensions. In my case my embedding function produces 300 dimension vectors, so I think I'll experiment with using 6 heads with 50 dimensions each.

An attention head includes:
1. Separate, learned linear projections for Q, K, V vectors. I think this is basically just a feed forward layer without a nonlinearity?
In any case, this linear projection reduces the dimensionality of the input 
2. The scaled dot product attention function.

Then, a multi-head attention block contains:
1. Some number of attention heads.
2. A concatenation step. This just takes the model_dim / h length vectors that are output from the attention head and concatenates them.
3. A learned linear projection. My intuition is that this projection "blends" the h concatenated vectors into a more meaningful and cohesive whole.

The whole transformer layer / block varies from encode, to decode, to decode-only. In the original paper's decode block, a transformer layer includes two separate attention blocks, one of which allows for queries to be drawn from the previous decoder layer while values and keys are drawn from the encoder. In a decode-only architecture I don't think there's any meaningful or useful analogy for this, so instead I'll be using only a single attention block per transformer layer.

With all that said, my transformer layers will include:
1. Masked multi-head attention - i.e. the attention block outlined above. The input is just some number of model_dim length embedding vectors, which comes from either the previous transformer layer or, for the first transformer layer, the positional embedding function. These are differentiated into Q, K, V vectors by the linear projections in the attention heads.
2. Residual connection defined as: LayerNorm(x + Sublayer(x))
3. Feed forward block. This is two feed forward layers, with a single ReLU in between. In the paper, the layers share a hidden dimension which is four times larger than the model dimension. I'll experiment with something similar.
4. Another residual connection.

The complete architecture:
1. Positional encoding function applied to input tokens/words. (update: in this case I'm not training any aspect of this as part of my model, so I think it makes sense to keep it outside of the model itself.)
2. Sequential transformer layers. In the paper, there are 6.
3. A linear layer that takes in all the vectors output by the final transformer layer, and has outputs for each possible next word.
4. Softmax function over outputs gives us probabilities for next word, the final output of the network.

Note to self: It might be good to build an API that lets the user specify a custom/different embedding function, but that's probably not a priority.

In [57]:
import torch.nn as nn

class AttentionHead(nn.Module):
    # For simplicity, I assume query, key, and value vectors have the same dimensionality
    def __init__(self, model_dim, vectors_dim):
        super().__init__()
        self.model_dim = model_dim
        self.vectors_dim = vectors_dim
        self.Q_proj = nn.Linear(model_dim, vectors_dim)
        self.K_proj = nn.Linear(model_dim, vectors_dim)
        self.V_proj = nn.Linear(model_dim, vectors_dim)

    def forward(self, x):
        # each row of x is a vector representing the meaning of the token at the corresponding position with whatever context we've attained so far.
        Q = self.Q_proj(x)
        K = self.K_proj(x)
        V = self.V_proj(x)
        print("Shape of Q matrix: ", Q.shape)
        print("Shape of K matrix: ", K.shape)
        print("Shape of V matrix: ", V.shape)
        output = par_attention(Q, K, V, self.vectors_dim)
        return output

class MultiHeadAttention(nn.Module):
    def __init__(self, model_dim, num_heads):
        super().__init__()
        self.att_heads = nn.ModuleList([AttentionHead(model_dim, model_dim // num_heads) for _ in range(num_heads)])
        self.proj = nn.Linear(model_dim, model_dim)

    def forward(self, x):
        head_outputs = [head(x) for head in self.att_heads]
        x = torch.concat(head_outputs, dim=2)
        x = self.proj(x)
        return x
        
class TransformerLayer(nn.Module):
    def __init__(self, model_dim, num_heads, ff_hidden_dim, context_len):
        super().__init__()
        self.attention_block = MultiHeadAttention(model_dim, num_heads)
        self.norm1 = nn.LayerNorm(normalized_shape=[context_len, model_dim])
        self.ff1 = nn.Linear(model_dim, ff_hidden_dim)
        self.ff_relu = nn.ReLU()
        self.ff2 = nn.Linear(ff_hidden_dim, model_dim)
        self.norm2 = nn.LayerNorm(normalized_shape=[context_len, model_dim])

    def forward(self, x):
        x_res = x
        x = self.attention_block(x)
        x += x_res
        x = self.norm1(x)

        x_res = x
        x = self.ff1(x)
        x = self.ff_relu(x)
        x = self.ff2(x)
        x += x_res
        x = self.norm2(x)

        return x


In [58]:

class TransformerNetwork(nn.Module):
    def __init__(self, num_layers, model_dim, att_heads, ff_hidden_dim, context_len, output_dict_size):
        super().__init__()
        self.trans_layers = nn.ModuleList([TransformerLayer(model_dim, att_heads, ff_hidden_dim, context_len) for _ in range(num_layers)])
        self.word_predictor = nn.Linear(model_dim * context_len, output_dict_size)
        print("model_dim * context_len = ", model_dim * context_len)

    def forward(self, x):
        for layer in self.trans_layers:
            x = layer.forward(x)
        print("Shape of x before view: ", x.shape)
        x = x.view(x.shape[0], -1)
        print("Shape of x after view: ", x.shape)
        x = self.word_predictor(x)
        return x



There's a lot of complexity I still need to figure out in terms of how I need to package this model. That includes the correct use of my input encoding functions, use of the output dictionary, and probably some debugging of the model itself. Gonna try to get the model to make a random (untrained) prediction on some dummy text.

In [59]:
# These parameters match what's described in "attention is all you need". Not sure what their dictionary structure/size is though.
# Also not sure how they handle context length...
# paper_model = TransformerNetwork(num_layers=6, model_dim=512, att_heads=8, ff_hidden_dim=2048, context_len=256, output_dict_size=1)

word_to_id, id_to_word = build_dictionary('../data/much_ado_about_nothing_gut.txt')
dictionary_len = len(id_to_word)
context_len = 256
basic_model = TransformerNetwork(num_layers=1, model_dim=300, att_heads=6, ff_hidden_dim=1200, context_len=context_len, output_dict_size=dictionary_len)

test_input = "The next word is"
encoded_input = encode_input_string(test_input, context_len)



model_dim * context_len =  76800


In [60]:
preds = basic_model(encoded_input.unsqueeze(0))

Shape of Q matrix:  torch.Size([1, 256, 50])
Shape of K matrix:  torch.Size([1, 256, 50])
Shape of V matrix:  torch.Size([1, 256, 50])
raw_weights.shape:torch.Size([1, 256, 256])
raw_weights: tensor([[[ 0.0903,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
         [ 0.4369,  0.5904,    -inf,  ...,    -inf,    -inf,    -inf],
         [-0.2810, -0.3996,  0.3438,  ...,    -inf,    -inf,    -inf],
         ...,
         [-0.1713, -0.1643, -0.1276,  ...,  0.0187,    -inf,    -inf],
         [-0.1713, -0.1643, -0.1276,  ...,  0.0187,  0.0187,    -inf],
         [-0.1713, -0.1643, -0.1276,  ...,  0.0187,  0.0187,  0.0187]]],
       grad_fn=<MaskedFillBackward0>)
scaled_weights.shape:torch.Size([1, 256, 256])
scaled_weights: tensor([[[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.4946, 0.5054, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.3251, 0.3197, 0.3552,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0.0038, 0.0038, 0.0039,  ..., 0.0039, 0.0000, 

In [81]:
print(preds)
print(preds.shape)
next_token_batched = torch.argmax(softmax(preds, dim=1), dim=1)
next_token = next_token_batched[0].item()
print("Behold!!! The CHOSEN SENTENCE will now be COMPLETED!!! OBSEEEEEEEERVE!!!!!!!")
print(test_input + id_to_word[next_token])

tensor([[-0.1929,  0.1562,  0.2769,  ...,  0.7094,  0.2269,  0.4125]],
       grad_fn=<AddmmBackward0>)
torch.Size([1, 32167])
Behold!!! The CHOSEN SENTENCE will now be COMPLETED!!! OBSEEEEEEEERVE!!!!!!!
The next word is



Bad news, the next word was a fucking newline character this whole time. I feel like that's actually kinda inauspicious 😭

In [86]:
print([id_to_word[next_token]])
print([id_to_word[next_token - 1]])
print([id_to_word[next_token + 1]])

['\n']
['punishment']
['faith']
