Reading in a story as text:


In [2]:
# Load the whole story into memory as a single string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    text = story_file.read()

# Sanity check: total number of characters, then the first 3 characters.
print(len(text))
print(text[:3])

20479
I H


Splitting the text using punctuation and spaces:

In [3]:
import re
# Split on , . ? ! ; " as well as "--" and whitespace. The capture group keeps
# the delimiters in the result; blank and whitespace-only pieces are dropped.
preprocessed = [
    piece
    for piece in re.split(r'([,.?!;"]|--|\s)', text)
    if piece.strip()
]

# Unique tokens in sorted order, followed by the two special tokens.
vocabset = sorted(set(preprocessed))
vocabset.extend(["<|endoftext|>", "<|unk|>"])

# Token string -> integer id, numbered by position in vocabset.
vocab = dict(zip(vocabset, range(len(vocabset))))
print(vocab)
print(len(vocab))



{'!': 0, '"': 1, "'": 2, "'Are": 3, "'It's": 4, "'coming'": 5, "'done'": 6, "'subject": 7, "'technique'": 8, "'way": 9, '(I': 10, '(Though': 11, ')': 12, ',': 13, '--': 14, '.': 15, ':': 16, ';': 17, '?': 18, 'A': 19, 'Ah': 20, 'Among': 21, 'And': 22, 'Arrt': 23, 'As': 24, 'At': 25, 'Be': 26, 'Begin': 27, 'Burlington': 28, 'But': 29, 'By': 30, 'Carlo': 31, 'Chicago': 32, 'Claude': 33, 'Come': 34, 'Croft': 35, 'Croft)': 36, 'Destroyed': 37, 'Devonshire': 38, "Don't": 39, 'Dubarry_': 40, 'Emperors': 41, 'Florence': 42, 'For': 43, 'Gallery': 44, 'Gideon': 45, 'Gisburn': 46, "Gisburn's": 47, 'Gisburns': 48, 'Grafton': 49, 'Greek': 50, 'Grindle': 51, "Grindle's": 52, 'Grindle:': 53, 'Grindles': 54, 'HAD': 55, 'Had': 56, 'Hang': 57, 'Has': 58, 'He': 59, 'Her': 60, 'Hermia': 61, "Hermia's": 62, 'His': 63, 'How': 64, 'I': 65, "I'd": 66, "I'll": 67, "I've": 68, 'If': 69, 'In': 70, 'It': 71, "It's": 72, 'Jack': 73, "Jack's": 74, 'Jove': 75, 'Just': 76, 'Lord': 77, 'Made': 78, 'Miss': 79, "Money'

Implementing a tokenizer with/without special context tokens, split on words:

In [4]:
#Without special context tokens
class simpletokenizer:
    """Word-level tokenizer without any handling for unknown words.

    Text is split on the punctuation marks , . ? ! ; " as well as on "--" and
    on whitespace; every non-blank piece is one token.

    Args:
        vocab: dict mapping each token string to its integer id.
    """

    def __init__(self,vocab):
        self.word2idx = vocab
        # Invert the mapping with the ids stored in vocab itself. Numbering the
        # keys with enumerate() only matches when the ids happen to be
        # 0..n-1 in insertion order.
        self.idx2word = {idx: word for word, idx in vocab.items()}

    def encode(self,text):
        """Split text into tokens and return their ids as a list.

        Raises:
            KeyError: if any token is missing from the vocabulary.
        """
        pieces = re.split(r'([,.?!;"]|--|\s)', text)
        return [self.word2idx[piece] for piece in pieces if piece.strip() != '']

    def decode(self,token_ids):
        """Turn a sequence of token ids back into a single string.

        Tokens are joined with single spaces, then the space in front of
        , . ? ! ; " and "--" is removed.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        joined = ' '.join(self.idx2word[token_id] for token_id in token_ids)
        return re.sub(r'\s([,.?!;"]|--)', r'\1', joined)

In [5]:
test = "Hello I always rather cheap."
print(test)
obj = simpletokenizer(vocab)

# A word that is not in the vocabulary makes encode() raise KeyError. The error
# is caught and displayed so that the notebook still runs from top to bottom.
try:
    encoded = obj.encode(test)
    print(obj.decode(encoded))
except KeyError as missing_token:
    print(f"KeyError: {missing_token}")

Hello I always rather cheap.


KeyError: 'Hello'

In [6]:
#With special context tokens
class simpletokenizerv2:
    """Word-level tokenizer that maps unknown words to the "<|unk|>" token.

    Text is split on the punctuation marks , . ? ! ; " as well as on "--" and
    on whitespace; every non-blank piece is one token.

    Args:
        vocab: dict mapping each token string to its integer id. It needs an
            entry for "<|unk|>" for unknown words to be encodable.
    """

    def __init__(self,vocab):
        self.word2idx = vocab
        # Invert the mapping with the ids stored in vocab itself. Numbering the
        # keys with enumerate() only matches when the ids happen to be
        # 0..n-1 in insertion order.
        self.idx2word = {idx: word for word, idx in vocab.items()}

    def encode(self,text):
        """Split text into tokens and return their ids as a list.

        Tokens missing from the vocabulary are replaced by "<|unk|>". The
        final token list is printed before the ids are returned.

        Raises:
            KeyError: if an unknown token occurs and the vocabulary has no
                "<|unk|>" entry.
        """
        pieces = re.split(r'([,.?!;"]|--|\s)', text)
        tokens = [
            piece if piece in self.word2idx else "<|unk|>"
            for piece in pieces
            if piece.strip() != ''
        ]
        # Printing is kept because the notebook shows the token list as output.
        print(tokens)
        return [self.word2idx[token] for token in tokens]

    def decode(self,token_ids):
        """Turn a sequence of token ids back into a single string.

        Tokens are joined with single spaces, then the space in front of
        , . ? ! ; " and "--" is removed.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        joined = ' '.join(self.idx2word[token_id] for token_id in token_ids)
        return re.sub(r'\s([,.?!;"]|--)', r'\1', joined)

In [7]:
test1 = "Hello, I always rather cheap."
test2 = "Hello, I always rather cheap.how are you?"

# Join the two texts with the end-of-text marker between them.
test = " <|endoftext|> ".join((test1, test2))
print(test)

obj = simpletokenizerv2(vocab)

# Round trip: text -> token ids -> text.
encoded = obj.encode(test)
print(encoded)
print(obj.decode(encoded))

Hello, I always rather cheap. <|endoftext|> Hello, I always rather cheap.how are you?
['<|unk|>', ',', 'I', 'always', 'rather', 'cheap', '.', '<|endoftext|>', '<|unk|>', ',', 'I', 'always', 'rather', 'cheap', '.', 'how', 'are', 'you', '?']
[1181, 13, 65, 184, 862, 290, 15, 1180, 1181, 13, 65, 184, 862, 290, 15, 596, 203, 1174, 18]
<|unk|>, I always rather cheap. <|endoftext|> <|unk|>, I always rather cheap. how are you?


Implementing Byte Pair encoding:

In [8]:
import tiktoken
# Load tiktoken's "gpt2" encoding; the following cells use it to encode and decode text.
tokenizer = tiktoken.get_encoding("gpt2")

In [9]:
text = "Hey, how are you? <|endoftext|> I am fine, thankyou! <|endoftext|> "

# The ids are stored as token_ids rather than int: binding the name int would
# hide the built-in int() for every cell that runs afterwards.
# allowed_special lets the "<|endoftext|>" marker be encoded as a special token.
token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(token_ids)

# Decode the ids back to text to check the round trip.
strs = tokenizer.decode(token_ids)
print(strs)

[10814, 11, 703, 389, 345, 30, 220, 50256, 314, 716, 3734, 11, 621, 2584, 280, 0, 220, 50256, 220]
Hey, how are you? <|endoftext|> I am fine, thankyou! <|endoftext|> 


<b>Implementing input-output pairs:</b>

In [10]:
with open("the-verdict.txt","r",encoding="utf-8") as f:
    text = f.read()

# The ids are stored as token_ids rather than int: binding the name int would
# hide the built-in int() for every cell that runs afterwards.
token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(token_ids)
strs = tokenizer.decode(token_ids)
print(tokenizer.decode(token_ids[:1]))

# The target sequence is the input sequence shifted one token to the right.
context_len = 4
print(token_ids[:context_len]," -> ",token_ids[1:context_len+1])

# Each prefix of the window predicts the token that follows it: first as ids...
for i in range(1,context_len+1):
    print(token_ids[:i], "->", token_ids[i])
# ...then the same pairs decoded back to text.
for i in range(1,context_len+1):
    print(tokenizer.decode(token_ids[:i]), "->", tokenizer.decode([token_ids[i]]))

[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016, 257, 922, 5891, 1576, 438, 568, 340, 373, 645, 1049, 5975, 284, 502, 284, 3285, 326, 11, 287, 262, 6001, 286, 465, 13476, 11, 339, 550, 5710, 465, 12036, 11, 6405, 257, 5527, 27075, 11, 290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686, 41976, 13, 357, 10915, 314, 2138, 1807, 340, 561, 423, 587, 10598, 393, 28537, 2014, 198, 198, 1, 464, 6001, 286, 465, 13476, 1, 438, 5562, 373, 644, 262, 1466, 1444, 340, 13, 314, 460, 3285, 9074, 13, 46606, 536, 5469, 438, 14363, 938, 4842, 1650, 353, 438, 2934, 489, 3255, 465, 48422, 540, 450, 67, 3299, 13, 366, 5189, 1781, 340, 338, 1016, 284, 3758, 262, 1988, 286, 616, 4286, 705, 1014, 510, 26, 475, 314, 836, 470, 892, 286, 326, 11, 1770, 13, 8759, 2763, 438, 1169, 2994, 284, 943, 17034, 318, 477, 314, 892, 286, 526, 383, 1573, 11, 319, 9074, 13, 536, 5469, 338, 11914, 11, 33096, 663, 4808, 3808, 62, 355, 996, 484, 547, 12548, 287, 281, 13079, 410, 12523, 286, 

<b>Implementing a data loader:</b>

In [11]:
from torch.utils.data import Dataset, DataLoader
import torch
class LLMdataset(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. A window of context_len ids is then moved over
    the ids in steps of stride; each window is an input and the same window
    shifted one position to the right is its target.

    Args:
        text: the raw text to tokenize.
        tokenizer: object whose encode(text, allowed_special=...) returns a
            list of token ids.
        context_len: number of token ids in each input and target sequence.
        stride: distance between the starts of consecutive windows.
    """

    def __init__(self, text,tokenizer, context_len, stride=1):
        self.tokenizer = tokenizer
        self.context_len = context_len
        self.stride = stride

        # Tokenize the entire text
        self.data = tokenizer.encode(text,allowed_special={"<|endoftext|>"})
        print(f"Data has {len(self.data)} tokens")

        # A window may only start where a full target (shifted by one) still fits.
        window_starts = range(0, len(self.data) - context_len, stride)
        self.inputs = [
            torch.tensor(self.data[start:start + context_len])
            for start in window_starts
        ]
        self.outputs = [
            torch.tensor(self.data[start + 1:start + context_len + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of (input, target) pairs."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the (input, target) tensor pair stored at position idx."""
        return self.inputs[idx], self.outputs[idx]

In [12]:
def create_dataloader(text,max_length=256, stride=128,
                      batch_size=4,num_workers=0,shuffle=True,drop_last=True):
    """Build a DataLoader over sliding windows of text.

    Args:
        text: raw text to tokenize with tiktoken's "gpt2" encoding.
        max_length: number of token ids per window (passed to LLMdataset as
            its context length).
        stride: distance between the starts of consecutive windows.
        batch_size, num_workers, shuffle, drop_last: forwarded unchanged to
            torch's DataLoader.

    Returns:
        A DataLoader over the LLMdataset built from text.
    """
    bpe_tokenizer = tiktoken.get_encoding("gpt2")
    windows = LLMdataset(text, bpe_tokenizer, max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [13]:
# One window per batch, four token ids per window, advancing one token at a
# time, with shuffling and last-batch dropping switched off.
dataloader = create_dataloader(
    text,
    max_length=4,
    stride=1,
    batch_size=1,
    shuffle=False,
    drop_last=False,
)

# Pull the first batch and show it.
data_iter = iter(dataloader)
first = next(data_iter)
print(first)

Data has 5145 tokens
[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


<b>Create Token Embeddings</b>


In [14]:
# Seed PyTorch's random number generator so the randomly initialised embedding
# weights (and the outputs printed from them) are the same on every run.
torch.manual_seed(123)

# Five example token ids to look up.
inputs = torch.tensor([0, 4, 1, 2, 3])

# A small embedding table: one row of `dimension` values per vocabulary entry.
vocab_size = 10
dimension = 8
embedding = torch.nn.Embedding(vocab_size, dimension)
print(embedding.weight)



Parameter containing:
tensor([[-0.2965,  1.2780,  2.4283, -1.4127,  0.5424, -1.0617, -0.0074,  0.5978],
        [-0.9931, -1.1884, -0.5199,  0.5980, -0.1496,  2.6983, -0.3765, -0.1157],
        [-1.5002, -0.5749,  0.0181,  0.7397,  0.5949,  0.1703,  1.1525,  0.5213],
        [-0.2541,  0.8859,  0.7441, -0.1554, -0.3047,  0.2454, -0.2689, -0.6790],
        [ 0.4554, -1.1907,  0.1251, -1.1796,  1.2104, -2.0776,  0.4003,  0.6311],
        [ 1.0912, -1.9986, -1.4711, -0.7748, -1.7422,  0.8521, -0.5340,  1.3872],
        [ 0.4734,  1.2624,  0.3709,  1.0673,  1.5333, -1.4959,  0.0737,  0.3684],
        [-1.0046,  1.1609,  1.0909,  1.3772, -0.9053,  0.4834, -1.0634,  0.6178],
        [ 0.9583, -0.5808, -1.0443, -0.9404,  0.3085,  0.3809,  0.8629, -0.9471],
        [ 0.3132, -0.3409, -0.3210, -0.8398, -0.4127,  0.4744, -0.7856,  2.3760]],
       requires_grad=True)


In [15]:
# Looking up a single id returns the matching row of the weight matrix.
print(embedding(torch.tensor([3])))

# inputs is already a tensor, so it is passed as-is. Wrapping it in
# torch.tensor() again copy-constructs it and makes PyTorch emit a UserWarning.
print(embedding(inputs))

tensor([[-0.2541,  0.8859,  0.7441, -0.1554, -0.3047,  0.2454, -0.2689, -0.6790]],
       grad_fn=<EmbeddingBackward0>)
tensor([[-0.2965,  1.2780,  2.4283, -1.4127,  0.5424, -1.0617, -0.0074,  0.5978],
        [ 0.4554, -1.1907,  0.1251, -1.1796,  1.2104, -2.0776,  0.4003,  0.6311],
        [-0.9931, -1.1884, -0.5199,  0.5980, -0.1496,  2.6983, -0.3765, -0.1157],
        [-1.5002, -0.5749,  0.0181,  0.7397,  0.5949,  0.1703,  1.1525,  0.5213],
        [-0.2541,  0.8859,  0.7441, -0.1554, -0.3047,  0.2454, -0.2689, -0.6790]],
       grad_fn=<EmbeddingBackward0>)


  print(embedding(torch.tensor(inputs)))


Positional embedding

In [16]:
# Token-embedding table at full size.
# NOTE(review): 50257 presumably matches the vocabulary size of the "gpt2"
# tokenizer used in this notebook - confirm.
vocab_size = 50257
dimension = 768
embedding = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=dimension,
)

In [18]:
# Eight windows per batch, four token ids per window. The stride equals the
# window length, so consecutive windows do not overlap.
dataloader = create_dataloader(
    text,
    max_length=4,
    stride=4,
    batch_size=8,
    shuffle=False,
    drop_last=False,
)

# Take the first batch: input ids and the matching target ids.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print(inputs)


Data has 5145 tokens
tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])


In [19]:
# Look up one embedding vector for every token id in the batch.
embeddings = embedding(inputs)
# The result gains a trailing axis holding the embedding vector of each token.
print(embeddings.shape)

torch.Size([8, 4, 768])


In [23]:
# One learnable vector per position in the context window. The window length is
# read from the current batch instead of being repeated as a literal, so the
# positional table always lines up with the token embeddings of that batch.
context_length = inputs.shape[1]
positional_embedding = torch.nn.Embedding(context_length, dimension)
print(positional_embedding.weight)

# Look up the vectors for positions 0 .. context_length - 1.
pos = positional_embedding(torch.arange(context_length))
print(pos.shape)
print(pos)

Parameter containing:
tensor([[ 0.7695,  1.6696,  1.1275,  ...,  0.5413,  1.1727, -0.4522],
        [ 0.1084,  1.2868,  0.4365,  ..., -1.6991, -0.4487, -0.3466],
        [-1.6368, -0.3985, -0.5356,  ...,  0.5510, -0.1162,  1.5623],
        [ 0.7553,  0.2174,  1.2904,  ..., -0.1918, -0.4561, -0.7560]],
       requires_grad=True)
torch.Size([4, 768])
tensor([[ 0.7695,  1.6696,  1.1275,  ...,  0.5413,  1.1727, -0.4522],
        [ 0.1084,  1.2868,  0.4365,  ..., -1.6991, -0.4487, -0.3466],
        [-1.6368, -0.3985, -0.5356,  ...,  0.5510, -0.1162,  1.5623],
        [ 0.7553,  0.2174,  1.2904,  ..., -0.1918, -0.4561, -0.7560]],
       grad_fn=<EmbeddingBackward0>)


In [25]:
# Add the positional vectors to the token embeddings. pos has no batch axis, so
# broadcasting adds the same positional vectors to every row of the batch.
input_embed = embeddings + pos
print(input_embed.shape)
print(input_embed)

torch.Size([8, 4, 768])
tensor([[[-0.0725,  1.7926,  2.1710,  ...,  0.6457,  1.1043, -1.2078],
         [ 2.2568,  0.7519, -0.8106,  ..., -3.7642, -0.4602,  2.6354],
         [-4.1703, -1.0925,  1.3008,  ...,  1.0022, -1.8426,  0.2147],
         [-0.4530,  0.3145,  3.0678,  ...,  0.2089, -1.5309, -2.3279]],

        [[ 0.0811,  1.3332,  0.5077,  ...,  1.8239,  1.4285, -1.1819],
         [ 0.8817,  0.2406,  2.1487,  ..., -1.4660,  0.4997,  0.5951],
         [-2.2158,  0.4552, -0.3413,  ...,  2.3552, -0.4816,  2.0598],
         [-0.1213, -0.4268,  2.0248,  ..., -1.3299, -1.4588,  0.3232]],

        [[-0.0577,  2.1267,  2.3452,  ..., -0.5126,  0.9244,  1.9218],
         [-0.8555,  2.0447,  0.2403,  ..., -1.0716, -1.6743, -0.5790],
         [-2.5309,  2.0089, -0.3707,  ..., -1.0829, -1.2492,  0.9279],
         [-1.2705,  2.3691,  1.6905,  ...,  0.6511, -0.5430, -2.0906]],

        ...,

        [[ 1.3923,  1.6801,  0.4625,  ...,  1.6814,  2.2632,  0.0963],
         [ 0.1182,  1.2817,  0.91