In [None]:
## Positional embeddings encode where each token sits in the sequence, giving the LLM the word-order information that token embeddings alone lack.
## Without them, the same token appearing at different places in the sentence would have the same vector.

In [None]:
## 1. Absolute: For each position in the input, a unique embedding is added to the token embedding to convey its location. The positional embedding has the same dimension as the token embedding, so input embedding = token embedding + positional embedding.

## 2. Relative: Emphasis is on relative positioning/distance of the tokens. The model learns 'how far apart' the tokens are rather than the exact position. So the advantage is the model can generalize to varying sequence lengths.

In [2]:
## The positional embeddings are also optimized during training with backprop

In [None]:
## Example sizes used throughout: vocabulary size and embedding dimension d
vocab_size = 50_257  # size of the tiktoken vocab
output_dim = 256     # embedding dimension d

In [5]:
import torch

## Seed the global RNG first: the embedding weights are randomly initialised,
## so without a seed every Restart & Run All would produce different values
torch.manual_seed(123)

## Token embedding table with one row of size output_dim per vocabulary entry (vocab_size x d)
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
token_embedding_layer

Embedding(50257, 256)

In [6]:
## Short story used as the toy corpus that is tokenized and batched by the dataloader.
## NOTE: the indentation of the continuation lines and the trailing spaces sit inside the
## triple-quoted literal, so they are part of the text that gets tokenized.
sample_text = """The old wizard lived in a tall tower. Every morning he would wake up early and look out his window. 
    From his window he could see the entire village below. The village was small but busy. 
    People walked through the streets carrying baskets. The baskets were filled with fresh bread and fruit.
    
    One day the wizard noticed something strange. A large dragon was flying toward the village. 
    The dragon was enormous and had bright red scales. The wizard knew he had to act quickly.
    He grabbed his magic wand from the wooden table. The wand was old but very powerful.
    
    The wizard pointed the wand at the dragon and spoke a magic spell. The spell created a bright light.
    The light surrounded the dragon and made it disappear. The village was safe once again.
    The people in the village cheered and thanked the brave wizard.
    
    After the adventure the wizard returned to his tower. He was tired but happy. 
    He had protected the village and its people. The wizard knew that tomorrow might bring new challenges.
    But for now he could rest peacefully in his tall tower."""

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

## Taken from Data Preprocessing directory
class TextDataset(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs for next-token prediction.

    The text is tokenized once. A window of ``max_length`` tokens is then slid
    over the token IDs in steps of ``stride``; each target is its input window
    shifted one token to the right.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token IDs.
        max_length: Number of tokens in every input/target window (>= 1).
        stride: Distance between the starts of consecutive windows (>= 1).

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not a positive number.

    Note:
        A text with fewer than ``max_length + 1`` tokens yields an empty dataset.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Fail fast: a non-positive window would silently produce empty or
        # wrapped-around slices, and a negative stride an empty dataset.
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop at len(token_ids) - max_length so the target window, which ends
        # one token later than the input window, still fits inside token_ids.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [None]:
import tiktoken

## Taken from Data Preprocessing directory
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from raw text.

    ``txt`` is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``TextDataset`` using ``max_length`` and ``stride``. The remaining
    arguments are passed straight through to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }

    windows = TextDataset(txt, tiktoken.get_encoding("gpt2"), max_length, stride)

    return DataLoader(windows, **loader_options)

In [11]:
## Batch size 8 with a context window of 4: each batch of token IDs is 8 x 4,
## so after embedding the input size will be 8 x 4 x 256 (b x n x d)
max_length = 4
dataloader = create_dataloader_v1(
    txt=sample_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,  # stride == window size, so consecutive windows do not overlap
    shuffle=False,      # keep text order so the first batch starts at the beginning of the text
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)  # only the first batch is needed here

print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[  464,  1468, 18731,  5615],
        [  287,   257,  7331, 10580],
        [   13,  3887,  3329,   339],
        [  561,  7765,   510,  1903],
        [  290,   804,   503,   465],
        [ 4324,    13,   220,   198],
        [  220,   220,   220,  3574],
        [  465,  4324,   339,   714]])

Inputs shape:
 torch.Size([8, 4])


In [12]:
## Look up an embedding vector for every token ID in the batch:
## (batch, context) token IDs -> (batch, context, d) embeddings
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [None]:
## Absolute positional encoding: a second embedding table with the same dimension d as the token embedding
## The context size is 4, so an input has at most 4 tokens and only those 4 positions need a row: shape is context size x d
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length,
    embedding_dim=output_dim,
)
pos_embedding_layer

Embedding(4, 256)

In [14]:
## Look up one row of the positional table for each position index 0 .. context_length - 1
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [15]:
## Add the positional embeddings to every sequence in the batch.
## pos_embeddings has one row per position and no batch dimension, so broadcasting reuses it for each sequence
input_embeddings = token_embeddings + pos_embeddings # Broadcasting: (8, 4, 256) + (4, 256) -> (8, 4, 256)
print(input_embeddings.shape)

torch.Size([8, 4, 256])
