In [4]:
#Load raw text we want to work with
#The Verdict by Edith Wharton is a public domain short story
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader



In [3]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.1 MB[0m [31m5.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.1/1.1 MB[0m [31m18.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [7]:
with open("/content/Urdu.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 1393
میرے چلتے پیچھے پیچھے ایک دن، ایک چھوٹے سے گاؤں میں، ایک لڑکا نے اپنے دوستوں کے ساتھ ایک ماحولیاتی 


In [9]:
#The following regular expression will split on whitespaces
import re
text = "میرے چلتے پیچھے پیچھے ایک دن، ایک چھوٹے سے گاؤں میں، ایک لڑکا نے اپنے دوستوں کے ساتھ ایک ماحولیاتی"
result = re.split(r'(\s)', text)

print(result)

['میرے', ' ', 'چلتے', ' ', 'پیچھے', ' ', 'پیچھے', ' ', 'ایک', ' ', 'دن،', ' ', 'ایک', ' ', 'چھوٹے', ' ', 'سے', ' ', 'گاؤں', ' ', 'میں،', ' ', 'ایک', ' ', 'لڑکا', ' ', 'نے', ' ', 'اپنے', ' ', 'دوستوں', ' ', 'کے', ' ', 'ساتھ', ' ', 'ایک', ' ', 'ماحولیاتی']


In [10]:
#We don't only want to split on whitespaces but also commas and periods, so let's modify the regular expression to do that as well

result = re.split(r'([,.]|\s)', text)

print(result)

['میرے', ' ', 'چلتے', ' ', 'پیچھے', ' ', 'پیچھے', ' ', 'ایک', ' ', 'دن،', ' ', 'ایک', ' ', 'چھوٹے', ' ', 'سے', ' ', 'گاؤں', ' ', 'میں،', ' ', 'ایک', ' ', 'لڑکا', ' ', 'نے', ' ', 'اپنے', ' ', 'دوستوں', ' ', 'کے', ' ', 'ساتھ', ' ', 'ایک', ' ', 'ماحولیاتی']


In [12]:
#let's also handle other types of punctuation, such as periods, question marks, and so on
text = "ہیلو!، دنیا۔ ہاں، یہ ایک ٹیسٹ ہے?۔"
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item.strip() for item in result if item.strip()]
print(result)

['ہیلو', '!', '،', 'دنیا۔', 'ہاں،', 'یہ', 'ایک', 'ٹیسٹ', 'ہے', '?', '۔']


In [13]:
preprocessed = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['میرے', 'چلتے', 'پیچھے', 'پیچھے', 'ایک', 'دن،', 'ایک', 'چھوٹے', 'سے', 'گاؤں', 'میں،', 'ایک', 'لڑکا', 'نے', 'اپنے', 'دوستوں', 'کے', 'ساتھ', 'ایک', 'ماحولیاتی', 'سیاحت', 'کی', 'خواب', 'میں', 'بھرپور', 'جویا۔', 'گاؤں', 'کے', 'باہر', 'جنگلاتی']


In [14]:
#Let's calculate the total number of tokens
print(len(preprocessed))

307


In [15]:
#From these tokens, we can now build a vocabulary that consists of all the unique tokens
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

140


In [16]:
vocab = {token:integer for integer,token in enumerate(all_words)}

In [37]:
#Below are the first 50 entries in this vocabulary:
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 140:
        break

('آخری', 0)
('اب', 1)
('احترام', 2)
('احساس', 3)
('احساسات', 4)
('اس', 5)
('ان', 6)
('انہوں', 7)
('اور', 8)
('اُن', 9)
('اُنہوں', 10)
('اُٹھایا۔', 11)
('اپنے', 12)
('ایک', 13)
('بابون', 14)
('باہر', 15)
('بعد،', 16)
('بنائی', 17)
('بڑھ', 18)
('بڑھتی', 19)
('بھرپور', 20)
('بہتر', 21)
('تو', 22)
('تک', 23)
('تھا', 24)
('تھا،', 25)
('تھا۔', 26)
('تھی', 27)
('تھی۔', 28)
('جا', 29)
('جاری', 30)
('جاننے', 31)
('جانوروں', 32)
('جب', 33)
('جذبہ', 34)
('جنگل', 35)
('جنگلاتی', 36)
('جو', 37)
('جویا۔', 38)
('جگہوں', 39)
('حریم', 40)
('خواب', 41)
('خواہش', 42)
('خوبصورت', 43)
('خوبصورتی', 44)
('خوشی', 45)
('دعویٰ', 46)
('دل', 47)
('دن،', 48)
('دوستوں', 49)
('دیا،', 50)
('دینے', 51)
('دیکھا', 52)
('دیکھتے', 53)
('دیکھیں', 54)
('راستہ', 55)
('رخ', 56)
('رکھتے', 57)
('رہے۔', 58)
('زمانے', 59)
('زندگی', 60)
('ساتھ', 61)
('سفر', 62)
('سمجھایا', 63)
('سکتے،', 64)
('سیاحت', 65)
('سے', 66)
('شروع', 67)
('شریک', 68)
('طرف', 69)
('طریقے', 70)
('عجیب', 71)
('علاقے', 72)
('فیصلہ', 73)
('قائم', 74)
('قدیم', 75

In [25]:
import re

class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        preprocessed = re.findall(r'\S+|\n', text)
        ids = [self.str_to_int[s] for s in preprocessed if s in self.str_to_int]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text




In [26]:
tokenizer = SimpleTokenizerV1(vocab)

text = "اور وہ اب اپنے گاوں کو اور اس کے لوگوں کو اور بہتر طریقے سے جاننے کی خواہش رکھتے ہیں"
ids = tokenizer.encode(text)
print(ids)

[8, 105, 1, 12, 132, 122, 8, 5, 128, 80, 122, 8, 21, 70, 66, 31, 125, 42, 57]


In [27]:
#We can use the tokenizer to encode (that is, tokenize) texts into integers
#These integers can then be embedded (later) as input of/for the LLM

tokenizer = SimpleTokenizerV1(vocab)

text = """"اور وہ اب اپنے گاوں کو اور اس کے لوگوں کو اور بہتر طریقے سے جاننے کی خواہش رکھتے ہیں"""
ids = tokenizer.encode(text)
print(ids)

[105, 1, 12, 132, 122, 8, 5, 128, 80, 122, 8, 21, 70, 66, 31, 125, 42, 57]


In [28]:
#We can decode the integers back into text

tokenizer.decode(ids)

'وہ اب اپنے گاوں کو اور اس کے لوگوں کو اور بہتر طریقے سے جاننے کی خواہش رکھتے'

In [29]:
tokenizer.decode(tokenizer.encode(text))

'وہ اب اپنے گاوں کو اور اس کے لوگوں کو اور بہتر طریقے سے جاننے کی خواہش رکھتے'

GPT-2 does not use an <UNK> token for out-of-vocabulary words; instead, GPT-2 uses a byte-pair encoding (BPE) tokenizer, which breaks down words into subword units.


In [33]:
tokenizer = SimpleTokenizerV1(vocab)

text = "کھیلنے"

tokenizer.encode(text)
for i in range(120, 130):
    print(tokenizer.decode([i]))

کر،
کرنے
کو
کھیلنے
کہ
کی
کیا،
کیا۔
کے
گئی


(1)The above produces an error because the word "Hello" is not contained in the vocabulary
(2)To deal with such cases, we can add special tokens like "<|unk|>" to the vocabulary to represent unknown words
(3)let's add another token called "<|endoftext|>" which is used in GPT-2 training to denote the end of a text (and it's also used between concatenated text, like if our training datasets consists of multiple articles, books, etc.)

In [34]:
preprocessed = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]

all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}

In [35]:
len(vocab.items())

142

In [38]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('ہوتے', 137)
('ہیں۔', 138)
('ہے۔', 139)
('<|endoftext|>', 140)
('<|unk|>', 141)


We also need to adjust the tokenizer accordingly so that it knows when and how to use the new <unk> token

In [39]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if item in self.str_to_int
                        else "<|unk|>" for item in preprocessed]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text


Let's try to tokenize text with the modified tokenizer:

In [45]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = "اُن کے دوستوں نے اُن کو سمجھایا کہ وہ جانوروں کی حریم میں متشددی سے نہیں جا سکتے، تو اُنہوں نے معافی مانگی اور گاؤں کی طرف رخ کیا۔؟"
text2 = "محل کی روشنی دار تراسوں میں"

text = " <|endoftext|> ".join((text1, text2))

print(text)

اُن کے دوستوں نے اُن کو سمجھایا کہ وہ جانوروں کی حریم میں متشددی سے نہیں جا سکتے، تو اُنہوں نے معافی مانگی اور گاؤں کی طرف رخ کیا۔؟ <|endoftext|> محل کی روشنی دار تراسوں میں۔


In [46]:
tokenizer.encode(text)

[9,
 128,
 49,
 104,
 9,
 122,
 63,
 124,
 105,
 32,
 125,
 40,
 100,
 87,
 66,
 102,
 29,
 64,
 22,
 10,
 104,
 94,
 86,
 8,
 131,
 125,
 69,
 56,
 141,
 140,
 141,
 125,
 141,
 141,
 141,
 141]

In [47]:
tokenizer.decode(tokenizer.encode(text))

'اُن کے دوستوں نے اُن کو سمجھایا کہ وہ جانوروں کی حریم میں متشددی سے نہیں جا سکتے، تو اُنہوں نے معافی مانگی اور گاؤں کی طرف رخ <|unk|> <|endoftext|> <|unk|> کی <|unk|> <|unk|> <|unk|> <|unk|>'

In [48]:

# pip install tiktoken
import importlib
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.7.0


In [49]:
tokenizer = tiktoken.get_encoding("gpt2")

In [52]:

text = "اُن کے دوستوں نے اُن کو سمجھایا کہ وہ جانوروں کی حریم میں متشددی سے نہیں جا سکتے، تو اُنہوں نے معافی مانگی اور گاؤں کی طرف رخ کیا۔ <|endoftext|>محل کی روشنی دار تراسوں میں."

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[12919, 149, 237, 23338, 220, 150, 102, 151, 240, 17550, 107, 30335, 45692, 41486, 30335, 150, 118, 18923, 228, 151, 240, 220, 12919, 149, 237, 23338, 220, 150, 102, 30335, 17550, 111, 25405, 148, 105, 150, 122, 12919, 151, 234, 12919, 220, 150, 102, 151, 223, 42092, 151, 223, 17550, 105, 12919, 23338, 30335, 26897, 30335, 150, 118, 220, 150, 102, 151, 234, 17550, 255, 26897, 151, 234, 25405, 47048, 151, 234, 150, 118, 47048, 41486, 148, 112, 38843, 38843, 151, 234, 17550, 111, 151, 240, 18923, 228, 151, 223, 151, 234, 150, 118, 17550, 105, 12919, 17550, 111, 150, 102, 41486, 151, 240, 148, 234, 17550, 103, 30335, 220, 12919, 149, 237, 23338, 151, 223, 30335, 150, 118, 18923, 228, 151, 240, 47048, 44690, 12919, 149, 223, 151, 234, 47048, 12919, 23338, 150, 107, 151, 234, 220, 12919, 30335, 26897, 220, 150, 107, 34247, 97, 150, 118, 220, 150, 102, 151, 234, 17550, 115, 26897, 149, 223, 17550, 109, 148, 106, 220, 150, 102, 151, 234, 12919, 151, 242, 220, 50256, 25405, 148, 255, 13862, 22

In [53]:

strings = tokenizer.decode(integers)

print(strings)

اُن کے دوستوں نے اُن کو سمجھایا کہ وہ جانوروں کی حریم میں متشددی سے نہیں جا سکتے، تو اُنہوں نے معافی مانگی اور گاؤں کی طرف رخ کیا۔ <|endoftext|>محل کی روشنی دار تراسوں میں.


In [55]:
with open("/content/Urdu.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

1813


For each text chunk, we want the inputs and targets
Since we want the model to predict the next word, the targets are the inputs shifted by one position to the right

In [75]:

enc_sample = enc_text[130:]

In [76]:

context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:      {y}")

x: [30335, 13862, 151, 234]
y:      [13862, 151, 234, 34247]


One by one, the prediction would look like as follows:

In [77]:

for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, "---->", desired)

[30335] ----> 13862
[30335, 13862] ----> 151
[30335, 13862, 151] ----> 234
[30335, 13862, 151, 234] ----> 34247


In [80]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

و ----> ل
ول ----> �
ول� ----> �
ولی ----> ا�


In [60]:
import torch
print("PyTorch version:", torch.__version__)

PyTorch version: 2.3.0+cu121


Create dataset and dataloader that extract chunks from the input text dataset

In [81]:
from torch.utils.data import Dataset, DataLoader

In [82]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [83]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):

    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=0
    )

    return dataloader

Let's test the dataloader with a batch size of 1 for an LLM with a context size of 4:

In [84]:
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[25405,   151,   234, 26897]]), tensor([[  151,   234, 26897,   151]])]


In [85]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[  151,   234, 26897,   151]]), tensor([[  234, 26897,   151,   240]])]


we increase the stride here so that we don't have overlaps between the batches, since more overlap could lead to increased overfitting

In [86]:

dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[25405,   151,   234, 26897],
        [  151,   240,   220,   150],
        [  228, 13862, 41486,   151],
        [  240, 18923,   122,   151],
        [  234,   150,   228,   150],
        [  122,   151,   240, 18923],
        [  122,   151,   234,   150],
        [  228,   150,   122,   151]])

Targets:
 tensor([[  151,   234, 26897,   151],
        [  240,   220,   150,   228],
        [13862, 41486,   151,   240],
        [18923,   122,   151,   234],
        [  150,   228,   150,   122],
        [  151,   240, 18923,   122],
        [  151,   234,   150,   228],
        [  150,   122,   151,   240]])


The data is already almost ready for an LLM
But lastly let us embed the tokens in a continuous vector representation using an embedding layer
Usually, these embedding layers are part of the LLM itself and are updated (trained) during model training

In [87]:
#Suppose we have the following four input examples with input ids 5, 1, 3, and 2 (after tokenization):
input_ids = torch.tensor([2, 3, 5, 1])

For the sake of simplicity, suppose we have a small vocabulary of only 6 words and we want to create embeddings of size 3:

In [88]:
#This would result in a 6x3 weight matrix:
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [89]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [90]:
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [92]:
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


The BytePair encoder has a vocabulary size of 50,257:
Suppose we want to encode the input tokens into a 256-dimensional vector representation:

In [93]:

vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

If we sample data from the dataloader, we embed the tokens in each batch into a 256-dimensional vector
If we have a batch size of 8 with 4 tokens each, this results in a 8 x 4 x 256 tensor:

In [94]:
max_length = 4
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [95]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[25405,   151,   234, 26897],
        [  151,   240,   220,   150],
        [  228, 13862, 41486,   151],
        [  240, 18923,   122,   151],
        [  234,   150,   228,   150],
        [  122,   151,   240, 18923],
        [  122,   151,   234,   150],
        [  228,   150,   122,   151]])

Inputs shape:
 torch.Size([8, 4])


In [96]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


GPT-2 uses absolute position embeddings, so we just create another embedding layer:

In [97]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [98]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


To create the input embeddings used in an LLM, we simply add the token and the positional embeddings:

In [99]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
