## Transformer Architecture (Decoder)
Building a transformer from PyTorch's layers, based on: https://github.com/StatQuest/decoder_transformer_from_scratch.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import TensorDataset, DataLoader
from torch.optim import Adam

### Building a Tokenizer

In [3]:
class Tokenizer():
    """Whitespace tokenizer backed by a fixed token -> id vocabulary.

    Args:
        vocab: Mapping from token string to integer id. The reverse mapping
            (id -> token) is derived from it, so ids should be unique.
    """

    def __init__(self, vocab):
        self.vocab = vocab
        # Inverse lookup used by decode(); if two tokens share an id the
        # later one in `vocab` wins.
        self.vocab_reverse = {token_id: token for token, token_id in vocab.items()}

    def __call__(self, sequence):
        """Split `sequence` on whitespace and return the list of token ids.

        Raises:
            KeyError: If a token is not present in the vocabulary.
        """
        return [self.vocab[token] for token in sequence.split()]

    def decode(self, sequence):
        """Map an iterable of token ids back to a space-joined string.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        return ' '.join(self.vocab_reverse[token_id] for token_id in sequence)

# Toy vocabulary: every token's id is its position in this list.
tokenizer = Tokenizer({
    token: index
    for index, token in enumerate(
        ['What', 'is', 'the', 'capital', 'of', 'France', 'Paris', '<EOS>']
    )
})

In [10]:
# Round-trip a sample prompt (text -> token ids -> text) as a sanity check.
prompt = 'What is the capital of France <EOS>'

prompt_ids = tokenizer(prompt)
sentence = tokenizer.decode(prompt_ids)

# One value per line: the ids first, then the decoded text.
print(prompt_ids)
print(sentence)

[0, 1, 2, 3, 4, 5, 7]
What is the capital of France <EOS>


### Creating the dataset

In [None]:
# Target sequence for the prompt defined above.
response_decoder = 'is the capital of France <EOS> Paris <EOS>'
response_ids = tokenizer(response_decoder)

# TensorDataset only accepts tensors, so the id lists are converted here.
# The extra leading dimension makes each sequence a single sample
# (shape: 1 x sequence_length).
inputs = torch.tensor([prompt_ids])
labels = torch.tensor([response_ids])

# NOTE(review): prompt_ids holds 7 tokens while response_ids holds 8. If the
# labels are meant to be the inputs shifted by one token, the input sequence
# presumably needs the answer token ('Paris') appended - confirm before training.
dataset = TensorDataset(inputs, labels)
data_loader = DataLoader(dataset)

### Creating the positional encoding

In [13]:
class PositionEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to token embeddings.

    Even embedding columns hold sin(pos / 10000^(i / d_model)) and odd columns
    hold the cosine of the same angle, where `i` is the even column index.

    Args:
        d_model: Size of each embedding vector. Both even and odd sizes work.
        max_len: Largest number of positions (rows) that can be encoded.
    """

    def __init__(self, d_model=2, max_len=6):
        super().__init__()

        position = torch.arange(start=0, end=max_len, step=1).float().unsqueeze(1)
        embedding_idx = torch.arange(start=0, end=d_model, step=2).float()

        # The angle is the same for a sine column and its cosine partner, so
        # compute it once: shape (max_len, ceil(d_model / 2)).
        angles = position / (torch.tensor(10000.0) ** (embedding_idx / d_model))

        pos_encoding = torch.zeros(max_len, d_model)
        pos_encoding[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so drop the last angle column for the cosine half.
        pos_encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # A buffer is not a trainable parameter, but it follows the module
        # across devices (.to()/.cuda()) and is saved in the state dict.
        self.register_buffer('pos_encoding', pos_encoding)

    def forward(self, word_embeddings):
        """Return `word_embeddings` plus the encoding of each row's position.

        The first dimension of `word_embeddings` is treated as the position
        axis and the last as the embedding axis.

        Raises:
            ValueError: If the input has more rows than `max_len`.
        """
        seq_len = word_embeddings.size(0)
        max_len = self.pos_encoding.size(0)
        if seq_len > max_len:
            # Fail loudly instead of relying on a shape error (or, when
            # max_len == 1, silently broadcasting a single position).
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {max_len}"
            )
        return word_embeddings + self.pos_encoding[:seq_len, :]


### Creating the attention layers

In [None]:
class Attention(nn.Module):
    """Single-head scaled dot-product attention with learned Q/K/V projections.

    Args:
        d_model: Size of each input and output embedding vector.
    """

    def __init__(self, d_model=2):
        super().__init__()

        # Bias-free linear projections for queries, keys and values.
        self.w_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.w_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.w_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)

        # Inputs are indexed as (row = token position, column = embedding dim).
        self.row_dim = 0
        self.col_dim = 1

    def forward(self, encodings_q, encodings_k, encodings_v, mask=None):
        """Compute attention of the queries over the keys/values.

        Args:
            encodings_q: Encodings projected into queries; dims 0 and 1 are
                used as (position, embedding).
            encodings_k: Encodings projected into keys, same layout.
            encodings_v: Encodings projected into values, same layout.
            mask: Optional boolean tensor; entries that are True are filled
                with -1e9 before the softmax so they get (near) zero weight.

        Returns:
            The attention-weighted combination of the projected values, one
            row per query.
        """
        q = self.w_q(encodings_q)
        k = self.w_k(encodings_k)
        v = self.w_v(encodings_v)

        # Similarity of every query with every key, scaled by sqrt(d_k) to
        # keep the softmax inputs in a reasonable range.
        q_kt = torch.matmul(q, k.transpose(dim0=self.row_dim, dim1=self.col_dim))
        q_kt = q_kt / torch.tensor(k.size(self.col_dim)**0.5)

        if mask is not None:
            q_kt = q_kt.masked_fill(mask, value=-1e9)

        # Normalise over the keys, then mix the values accordingly.
        attention_weights = F.softmax(q_kt, dim=self.col_dim)
        attention_output = torch.matmul(attention_weights, v)

        return attention_output


In [None]:
class DecoderTransformer()