# Build a mini-mini-GPT

# 1. Tokenization

#### I will be using the pre-trained tokenizer from the GPT-2 model here.

In [2]:
from transformers import GPT2TokenizerFast
from tokenizers.processors import TemplateProcessing

# Load the pretrained GPT-2 tokenizer.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Install a post-processor that appends the EOS (end-of-text) token to every
# encoded sequence. The processor is set through the public
# `backend_tokenizer` property instead of the private `_tokenizer` attribute.
tokenizer.backend_tokenizer.post_processor = TemplateProcessing(
    single=f"$A:0 {tokenizer.eos_token}:0",  # single text: "text + EOS"
    pair=f"$A:0 {tokenizer.eos_token}:0 $B:1 {tokenizer.eos_token}:1",  # pair: "A + EOS + B + EOS"
    special_tokens=[
        (tokenizer.eos_token, tokenizer.eos_token_id),
    ],
)

# GPT-2 has no padding token by default, so register "[PAD]" as one.
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("[PAD]")

# len(tokenizer) counts the added special token as well.
print(f"Tokenizer vocab size: {len(tokenizer)}")


  from .autonotebook import tqdm as notebook_tqdm


Tokenizer vocab size: 50258


Let's put the tokenizer to the test.

Encoding

In [7]:
# Sample sentence to encode.
# NOTE(review): `input` shadows the Python builtin of the same name; it is
# left unchanged here because other cells may read this notebook-level name.
input = "Hello, World!"

# add_special_tokens=False asks the tokenizer not to add special tokens
# (such as the EOS token configured above) to the returned ids.
tokens = tokenizer.encode(input, add_special_tokens=False)

print(f"The tokens for 'Hello, World!' are: {tokens}")

The tokens for 'Hello, World!' are: [15496, 11, 2159, 0]


Decoding list of token ids: [15496, 11, 2159, 0]

In [5]:
# Convert the token ids held in `tokens` back into a text string.
decoded_text = tokenizer.decode(tokens)

print(f"Decoded sequence: {decoded_text}")

Decoded sequence: Hello, World!


During training we pass texts through the model in batches, not one sample at a time. The problem is that different sample texts may have different lengths, which leads to different numbers of tokens and therefore to token lists of different lengths.
Tensors, however, have fixed shapes!

In [12]:
batch_text = ["Hello, I am Nik.", "Tokenizing is not as easy as I initially thought!"]

# Calling the tokenizer directly is the documented way to encode a batch of
# texts; `encode` is meant for a single sequence. "input_ids" holds one list
# of token ids per input text.
tokens = tokenizer(batch_text)["input_ids"]
print(f"Token IDs list: {tokens}")
print(f"Number of tokens for the first sentence: {len(tokens[0])}")
print(f"Number of tokens for the second sentence: {len(tokens[1])}")

Token IDs list: [[15496, 11, 314, 716, 11271, 13, 50256], [30642, 2890, 318, 407, 355, 2562, 355, 314, 7317, 1807, 0, 50256]]
Number of tokens for the first sentence: 7
Number of tokens for the second sentence: 12


As mentioned above, converting the list above to a PyTorch tensor would throw an error. That's why I am going to use padding and truncation.
Padding means that we append a special padding token to the shorter token lists until all lists have the same length.
Truncation means that we cut off the tokens of a sequence that exceed a chosen maximum length (`max_length`), so that no sequence is longer than that limit.

In [13]:
batch_text = ["Hello, I am Nik.", "Tokenizing is not as easy as I initially thought!"]

# Calling the tokenizer directly is the documented way to encode a batch of
# texts; `encode` is meant for a single sequence.
# - padding=True pads every sequence to the longest one in the batch,
# - truncation=True with max_length=6 cuts each sequence to at most 6 tokens,
# - return_tensors="pt" returns the ids as a PyTorch tensor.
tokens = tokenizer(
    batch_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=6,
)["input_ids"]
print(f"Token IDs list: {tokens}")
print(f"Number of tokens for the first sentence: {len(tokens[0])}")
print(f"Number of tokens for the second sentence: {len(tokens[1])}")

Token IDs list: tensor([[15496,    11,   314,   716, 11271, 50256],
        [30642,  2890,   318,   407,   355, 50256]])
Number of tokens for the first sentence: 6
Number of tokens for the second sentence: 6


#### We are now done with our tokenization part of the process

# Token Embeddings

The following code cell defines the embedding layer for token embeddings.
Arguments:
   - vocab_size - The size of our token vocabulary.
   - model_dim - The number of dimensions, i.e. how many entries the vector representing each token has.

In [15]:
import torch
from torch import nn

class TokenEmbedding(nn.Module):
    """Trainable lookup table that turns token ids into dense vectors.

    Args:
        vocab_size: Number of rows in the table (one per token id).
        model_dim: Length of the vector stored for each token id.
    """

    def __init__(self, vocab_size, model_dim):
        super().__init__()

        self.token_embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=model_dim,
        )

    def forward(self, x):
        """Look up the embedding vector for every token id in ``x``."""
        vectors = self.token_embeddings(x)
        return vectors

# Positional embeddings

The following code cell defines the embedding layer for positional embeddings.
Arguments:
   - max_length - The maximum number of positions (tokens per sequence) that the layer can embed.
   - model_dim - The number of dimensions, i.e. how many entries the vector representing each position has.

In [30]:
class PositionalEmbeddings(nn.Module):
    """Learned positional embeddings: one trainable vector per position index.

    Args:
        max_length: Number of positions in the lookup table; sequences longer
            than this cannot be embedded.
        model_dim: Length of the vector stored for each position.
    """

    def __init__(self, max_length: int, model_dim: int):
        super().__init__()
        self.positional_embeddings = nn.Embedding(max_length, model_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the position vectors for the sequence axis of ``x``.

        Args:
            x: Tensor whose second dimension (``x.shape[1]``) is the sequence
                length. Only its shape and device are used, not its values.

        Returns:
            Tensor of shape ``(1, seq_length, model_dim)``. The leading axis
            of size 1 lets it broadcast over the batch when added to token
            embeddings.
        """
        seq_length = x.shape[1]

        # Build the position indices on the input's own device rather than
        # reading a global device constant, so the module works on its own
        # and follows the input wherever it lives.
        positions = torch.arange(seq_length, device=x.device)

        positional_embeddings = self.positional_embeddings(positions)
        return positional_embeddings.unsqueeze(0)

In [32]:
# Demo: run one sentence through the token and positional embedding layers.
# NOTE(review): `input` shadows the Python builtin of the same name; it is
# left unchanged here because other cells may read this notebook-level name.
input = "I am Fisnik and this is my first Transformer!"

# Device string, kept as a notebook-level constant.
DEVICE="cpu"
# Set vocab size, model dimension and max_length
vocab_size = len(tokenizer)
model_dim = 256
max_length = 512

# Tokenize: padding="max_length" pads the ids up to `max_length`,
# truncation=True cuts anything longer, return_tensors="pt" returns a tensor.
tokens = tokenizer.encode(input, max_length=max_length, return_tensors="pt", padding="max_length", truncation=True)


# Initialize token embedding layer
embedding_layer = TokenEmbedding(vocab_size=vocab_size, model_dim=model_dim)

# Initialize the positional embedding layer
positional_layer = PositionalEmbeddings(max_length=max_length, model_dim=model_dim)

# Get the token embeddings
token_embeddings = embedding_layer(tokens)

# Get the positional embeddings (only the shape of `tokens` matters here)
positional_embeddings = positional_layer(tokens)

# Both shapes are printed so they can be compared before the two are added.
print(f"Shape of tensor for token embedding: {token_embeddings.shape}")
print(f"Shape of tensor for positional embedding: {positional_embeddings.shape}")

Shape of tensor for token embedding: torch.Size([1, 512, 256])
Shape of tensor for positional embedding: torch.Size([1, 512, 256])


This is a 3-dimensional tensor of size [1, 512, 256].
Dimensions:
    - 1 is there because we feed only one input sequence into the embedding layer.
    - 512 is the number of tokens in the sequence (mostly padding tokens, since the sentence itself is only about 15 tokens long).
    - 256 is the number of entries in each token embedding.



### Building the embedding layer which contains token and positional embedding.


In [None]:
class EmbeddingLayer(nn.Module):
    """Combine token and positional embeddings into one normalized tensor.

    Args:
        vocab_size: Vocabulary size passed to the token embedding table.
        model_dim: Embedding width shared by both tables and the layer norm.
        max_length: Number of positions passed to the positional table.
    """

    def __init__(self, vocab_size, model_dim, max_length):
        super().__init__()
        self.positional_embeddings = PositionalEmbeddings(
            max_length=max_length,
            model_dim=model_dim,
        )
        self.token_embeddings = TokenEmbedding(
            vocab_size=vocab_size,
            model_dim=model_dim,
        )

        # Normalizing the summed embeddings keeps the tensor stable.
        self.layer_norm = nn.LayerNorm(model_dim)

    def forward(self, x):
        """Embed ``x``, add the positional vectors, then apply layer norm."""
        summed = self.token_embeddings(x) + self.positional_embeddings(x)
        return self.layer_norm(summed)

# Decoder 