In [1]:
!pip install tiktoken




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import tiktoken

In [3]:
enc = tiktoken.get_encoding("o200k_base")

In [4]:
enc.encode("Hello World.")

[13225, 5922, 13]

In [6]:
enc.n_vocab

200019

In [2]:
import torch

In [3]:
class Tockenizer:
    def __init__(self,model="o200k_base"):
        self.encoder = tiktoken.get_encoding(model)
        self.n_vocab = self.encoder.n_vocab
        
    def encode(self,sentence:str):
        return self.encoder.encode(sentence)
    
    def decode(self,tok):
        return self.encoder.decode(tok)

In [5]:
inputs=[
    "Hello, how are you?",
    "The quick brown fox jumps over the lazy dog."
]

tokenizer = Tockenizer('gpt2')

In [7]:
tokenizer.encode(inputs[0]),tokenizer.encode(inputs[1])

([15496, 11, 703, 389, 345, 30],
 [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13])

## Data Sampling:

In [9]:
enc_sample = tokenizer.encode(inputs[1])
enc_sample

[464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]

In [10]:
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print("x:",x)
print("y:\t",y)

x: [464, 2068, 7586, 21831]
y:	 [2068, 7586, 21831, 18045]


In [11]:
from torch.utils.data import Dataset, DataLoader

### LLM Dataset Class

In [42]:
class LLMDataset(Dataset):
    def __init__(self, text, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        
        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
        
        for i in range(0, len(token_ids)-max_length, stride):
            inp_chk = token_ids[i: i+max_length]
            target_chk = token_ids[i+1: i+max_length+1]
            self.input_ids.append(torch.Tensor(inp_chk).to(dtype=torch.int32))
            self.target_ids.append(torch.Tensor(target_chk))
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx],self.target_ids[idx]

In [43]:
text = """
        Narendra Damodardas Modi (born 17 September 1950) is an Indian politician who has served as the prime minister of India since 2014.
        Modi was the chief minister of Gujarat from 2001 to 2014 and is the member of parliament (MP) for Varanasi. He is a member of the Bharatiya 
        Janata Party (BJP) and of the Rashtriya Swayamsevak Sangh (RSS), a far-right Hindu nationalist paramilitary volunteer organisation. He is the longest-serving prime minister outside the Indian National Congress.
        Modi was born and raised in Vadnagar in northeastern Gujarat, where he completed his secondary education. He was introduced to the RSS at the age of eight. At the age of 18, he was married to Jashodaben Modi, 
        whom he abandoned soon after, only publicly acknowledging her four decades later when legally required to do so. Modi became a full-time worker for the RSS in Gujarat in 1971.
        The RSS assigned him to the BJP in 1985 and he rose through the party hierarchy, becoming general secretary in 1998. In 2001, Modi was appointed chief minister of Gujarat and elected to the legislative assembly soon after. 
        His administration is considered complicit in the 2002 Gujarat riots,[d] and has been criticised for its management of the crisis. According to official records, a little over 1,000 people were killed, three-quarters of whom were Muslim; 
        independent sources estimated 2,000 deaths, mostly Muslim.[13] A Special Investigation Team appointed by the Supreme Court of India in 2012 found no evidence to initiate prosecution proceedings against him.
        While his policies as chief minister were credited for encouraging economic growth, his administration was criticised for failing to significantly improve health, poverty and education indices in the state.
"""

In [44]:
torch.__version__

'2.6.0+cpu'

In [45]:
tokenizer = tiktoken.get_encoding("gpt2")
dataset = LLMDataset(text=text, tokenizer=tokenizer, max_length=6,stride=2)

In [46]:
len(dataset)

212

In [47]:
print("input:",dataset[0][0])
print("target:\t",dataset[0][1])

input: tensor([198, 220, 220, 220, 220, 220], dtype=torch.int32)
target:	 tensor([220., 220., 220., 220., 220., 220.])


### Create Data Loader Function

In [48]:
def create_data_loader(text, tokenizer, batch_size=64, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    dataset = LLMDataset(text=text, tokenizer=tokenizer, max_length=6,stride=2)
    dataloader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )
    return dataloader

In [55]:
max_length = 6
dataloader = create_data_loader(text=text,tokenizer=tokenizer,max_length=max_length,stride=2,shuffle=False,batch_size=4)

data_itr = iter(dataloader)
first_batch = next(data_itr)
inputs, targets = first_batch
print("Inputs:")
print(inputs)
print("Targets:")
print(targets)

Inputs:
tensor([[  198,   220,   220,   220,   220,   220],
        [  220,   220,   220,   220,   220,   220],
        [  220,   220,   220,   220, 28113,  5245],
        [  220,   220, 28113,  5245,   375,   446]], dtype=torch.int32)
Targets:
tensor([[  220.,   220.,   220.,   220.,   220.,   220.],
        [  220.,   220.,   220.,   220.,   220., 28113.],
        [  220.,   220.,   220., 28113.,  5245.,   375.],
        [  220., 28113.,  5245.,   375.,   446.,   292.]])


## Embedding Layers:

In [56]:
vocab_size = tokenizer.n_vocab
output_dim = 256

embedding_layer = torch.nn.Embedding(vocab_size,output_dim)

In [57]:
print(embedding_layer.weight.shape)

torch.Size([50257, 256])


In [58]:
inputs.shape

torch.Size([4, 6])

In [59]:
emb = embedding_layer(inputs)
emb.shape

torch.Size([4, 6, 256])

### Embeddings with positional information

In [60]:
max_length

6

In [65]:
vocab_size = tokenizer.n_vocab
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size,output_dim)

In [68]:
max_length = 6


dataloader = create_data_loader(text=text,tokenizer=tokenizer,max_length=max_length,stride=2,shuffle=False,batch_size=4)

data_itr = iter(dataloader)
first_batch = next(data_itr)
inputs, targets = first_batch
print("Inputs:")
print(inputs)
print("Inputs shape:")
print(inputs.shape)

Inputs:
tensor([[  198,   220,   220,   220,   220,   220],
        [  220,   220,   220,   220,   220,   220],
        [  220,   220,   220,   220, 28113,  5245],
        [  220,   220, 28113,  5245,   375,   446]], dtype=torch.int32)
Inputs shape:
torch.Size([4, 6])


In [70]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([4, 6, 256])


In [71]:
context_length = max_length

pos_embedding_layer = torch.nn.Embedding(context_length,output_dim)

In [73]:
pos_embedding_layer.weight.shape

torch.Size([6, 256])

In [74]:
positional_embeddings = pos_embedding_layer(torch.arange(max_length))
print(positional_embeddings)

tensor([[ 1.4086, -0.9019, -0.8384,  ...,  1.0736,  1.6007, -1.4017],
        [ 0.4155,  1.6523,  0.0712,  ..., -0.5711, -1.4070, -0.5012],
        [ 0.6352, -1.5864, -1.2704,  ..., -0.6178, -0.4244,  0.1902],
        [-1.7419, -1.8249,  0.6606,  ..., -0.4548, -0.2138, -0.4476],
        [-0.2468,  0.2067,  0.0863,  ...,  1.8337, -0.7942, -0.0705],
        [-1.8092, -1.4747, -1.0397,  ...,  1.8867, -0.0478,  1.1137]],
       grad_fn=<EmbeddingBackward0>)


In [75]:
input_embeddings = token_embeddings+positional_embeddings

In [76]:
input_embeddings.shape

torch.Size([4, 6, 256])

In [77]:
token_embeddings

tensor([[[ 0.5532, -0.2105, -1.1234,  ...,  0.5953, -0.1279, -1.4442],
         [ 1.9156, -1.1700,  1.8956,  ..., -1.0003, -0.2094,  1.2439],
         [ 1.9156, -1.1700,  1.8956,  ..., -1.0003, -0.2094,  1.2439],
         [ 1.9156, -1.1700,  1.8956,  ..., -1.0003, -0.2094,  1.2439],
         [ 1.9156, -1.1700,  1.8956,  ..., -1.0003, -0.2094,  1.2439],
         [ 1.9156, -1.1700,  1.8956,  ..., -1.0003, -0.2094,  1.2439]],

        [[ 1.9156, -1.1700,  1.8956,  ..., -1.0003, -0.2094,  1.2439],
         [ 1.9156, -1.1700,  1.8956,  ..., -1.0003, -0.2094,  1.2439],
         [ 1.9156, -1.1700,  1.8956,  ..., -1.0003, -0.2094,  1.2439],
         [ 1.9156, -1.1700,  1.8956,  ..., -1.0003, -0.2094,  1.2439],
         [ 1.9156, -1.1700,  1.8956,  ..., -1.0003, -0.2094,  1.2439],
         [ 1.9156, -1.1700,  1.8956,  ..., -1.0003, -0.2094,  1.2439]],

        [[ 1.9156, -1.1700,  1.8956,  ..., -1.0003, -0.2094,  1.2439],
         [ 1.9156, -1.1700,  1.8956,  ..., -1.0003, -0.2094,  1.2439],
  

In [78]:
positional_embeddings

tensor([[ 1.4086, -0.9019, -0.8384,  ...,  1.0736,  1.6007, -1.4017],
        [ 0.4155,  1.6523,  0.0712,  ..., -0.5711, -1.4070, -0.5012],
        [ 0.6352, -1.5864, -1.2704,  ..., -0.6178, -0.4244,  0.1902],
        [-1.7419, -1.8249,  0.6606,  ..., -0.4548, -0.2138, -0.4476],
        [-0.2468,  0.2067,  0.0863,  ...,  1.8337, -0.7942, -0.0705],
        [-1.8092, -1.4747, -1.0397,  ...,  1.8867, -0.0478,  1.1137]],
       grad_fn=<EmbeddingBackward0>)

In [79]:
input_embeddings

tensor([[[ 1.9618, -1.1125, -1.9619,  ...,  1.6689,  1.4728, -2.8459],
         [ 2.3311,  0.4823,  1.9668,  ..., -1.5714, -1.6164,  0.7428],
         [ 2.5507, -2.7564,  0.6252,  ..., -1.6181, -0.6338,  1.4341],
         [ 0.1736, -2.9949,  2.5562,  ..., -1.4551, -0.4231,  0.7963],
         [ 1.6687, -0.9633,  1.9819,  ...,  0.8334, -1.0036,  1.1734],
         [ 0.1064, -2.6446,  0.8559,  ...,  0.8864, -0.2571,  2.3576]],

        [[ 3.3241, -2.0719,  1.0572,  ...,  0.0733,  1.3913, -0.1578],
         [ 2.3311,  0.4823,  1.9668,  ..., -1.5714, -1.6164,  0.7428],
         [ 2.5507, -2.7564,  0.6252,  ..., -1.6181, -0.6338,  1.4341],
         [ 0.1736, -2.9949,  2.5562,  ..., -1.4551, -0.4231,  0.7963],
         [ 1.6687, -0.9633,  1.9819,  ...,  0.8334, -1.0036,  1.1734],
         [ 0.1064, -2.6446,  0.8559,  ...,  0.8864, -0.2571,  2.3576]],

        [[ 3.3241, -2.0719,  1.0572,  ...,  0.0733,  1.3913, -0.1578],
         [ 2.3311,  0.4823,  1.9668,  ..., -1.5714, -1.6164,  0.7428],
  