<!-- Reads first 100 characters -->

In [2]:
with open(r"D:\Machine Learning\LLM from Scratch\the-verdict.txt","r",encoding="utf-8") as f:
    raw_text = f.read()
print("Total numbers of character:",len(raw_text))
print(raw_text[:99])

Total numbers of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


<!-- #Our Goal is to tokenize these 20479 character short story into indivisual words and special characters that we can then turn them into embeddings for LLM trining -->

In [3]:
import re

text = "Hello World. This, is a test."
result = re.split(r'(\s)',text) # (\s) to skip the witespaces

print(result)

['Hello', ' ', 'World.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


<!-- We will also modify the reg expression splits on whitespaces(\s) and commas and periods  -->

In [4]:
result = re.split(r'([,.]|\s)',text)

print(result)

['Hello', ' ', 'World', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


<!-- A small remaining issue is that the list still includes whitespace characters. Optionally, we
can remove these redundant characters safely as follows: -->

In [5]:
result = [item for item in result if item.strip()]
print(result)

['Hello', 'World', '.', 'This', ',', 'is', 'a', 'test', '.']


<!-- The tokenization scheme we devised above works well on the simple sample text. Let's
modify it a bit further so that it can also handle other types of punctuation, such as
question marks, quotation marks, and the double-dashes we have seen earlier in the first
100 characters of Edith Wharton's short story, along with additional special characters: -->

In [6]:
text = "Hello, world. Is this-- a test?"
result = re.split(r'(--|[,.:?_!"()\[\]\s])', text)
result = [item.split() for item in result if item.strip()]
print(result)

[['Hello'], [','], ['world'], ['.'], ['Is'], ['this'], ['--'], ['a'], ['test'], ['?']]


<!-- Now we will apply the above tokenizer we built to preprocess the book we had in raw_text -->

In [17]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)',raw_text)
preprocessed = [item.strip() for item in preprocessed if item.split()]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [8]:
print(len(preprocessed)) 

4690


<!--Step 2 -> Create the Token ID's -->

In [9]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1130


<!-- After determining the vocabulary size is 1130 we create vocabulary and assign Token ID -->

In [10]:
vocab = {token:integer for integer,token in enumerate(all_words)}

In [11]:
for i, item in enumerate(vocab.items()):
    print(item)
    if i>= 50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


<!-- when we want to convert the outputs of an LLM from numbers back into
text we also need a way to turn token IDs into text.For this we can create an inverse
version of the vocabulary that maps token IDs back to corresponding text tokens -->

In [12]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self,text):
        preprocessed = preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)',text)
        preprocessed = [item.strip() for item in preprocessed if item.split()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    def decode(self,ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        #Replace spaces before specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])',r'\1', text)
        return text

<!-- Let's instantiate a new tokenizer object from the SimpleTokenizerV1 class and tokenize a text or a passage -->

In [13]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""

ids = tokenizer.encode(text)

print(ids)


[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [14]:
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

<!-- The problem is that the word Hello was not used in the The Verdict short story
Hence it is not contained in the vocabulary
This highlights the need to consider large and diverse training sets to extend the vocabulary when working on LLMs

 -->

In [15]:
text = "Hello, do you like tea?"
print(tokenizer.encode(text))

KeyError: 'Hello'

<!--We will modify this tokenizer to handle unknown
words.
In particular, we will modify the vocabulary and tokenizer we implemented in the
previous section, SimpleTokenizerV2, to support two new tokens, <|unk|> and
<|endoftext|> -->

In [18]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["|endoftext|","|unk|"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}

In [19]:
len(vocab.items())

1132

#Printing the last two Entries

In [21]:
class SimpleTokenizerV2:
    def __init__(self,vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}
    def encode(self,text):
        preprocessed = preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)',text)
        preprocessed = [item.strip() for item in preprocessed if item.split()]
        preprocessed = [
            item if item in self.str_to_int
            else "|unk|" for item in preprocessed
        ]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    def decode(self,ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        #Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])',r'\1', text)
        return text

In [23]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit morning terraces of the palace"

text = "<|endoftext|>".join([text1,text2])

print(text)

Hello, do you like tea?<|endoftext|>In the sunlit morning terraces of the palace


In [25]:
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 1131, 988, 956, 1131, 984, 722, 988, 1131]

#[BOS] (beginning of sequence): This token marks the start of a text. It
signifies to the LLM where a piece of content begins.

#[EOS] (end of sequence): This token is positioned at the end of a text,
and is especially useful when concatenating multiple unrelated texts,
similar to <|endoftext|>. For instance, when combining two different
Wikipedia articles or books, the [EOS] token indicates where one article
ends and the next one begins.

#[PAD] (padding): When training LLMs with batch sizes larger than one,
the batch might contain texts of varying lengths. To ensure all texts have
the same length, the shorter texts are extended or "padded" using the
[PAD] token, up to the length of the longest text in the batch.

In [None]:
# The tokenizer used for GPT models also doesnt use an <|unk|> token for breakin down words into subword 
# It uses Byte pair encoding not <|unk|>