In [7]:
sample_text = """The old wizard lived in a tall tower. Every morning he would wake up early and look out his window. 
    From his window he could see the entire village below. The village was small but busy. 
    People walked through the streets carrying baskets. The baskets were filled with fresh bread and fruit.
    
    One day the wizard noticed something strange. A large dragon was flying toward the village. 
    The dragon was enormous and had bright red scales. The wizard knew he had to act quickly.
    He grabbed his magic wand from the wooden table. The wand was old but very powerful.
    
    The wizard pointed the wand at the dragon and spoke a magic spell. The spell created a bright light.
    The light surrounded the dragon and made it disappear. The village was safe once again.
    The people in the village cheered and thanked the brave wizard.
    
    After the adventure the wizard returned to his tower. He was tired but happy. 
    He had protected the village and its people. The wizard knew that tomorrow might bring new challenges.
    But for now he could rest peacefully in his tall tower."""

print(len(sample_text)) # Total number of characters in the sample text
print(sample_text[:99]) # First 99 characters (the slice end is exclusive), i.e. the first two sentences

1098
The old wizard lived in a tall tower. Every morning he would wake up early and look out his window.


In [8]:
import re

# Split on every whitespace character. Because the pattern is a capturing
# group, each separator is kept in the output list as its own item.
text = "Hello, world. This, is a test."
result = re.compile(r'(\s)').split(text)

print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [9]:
# Split on commas, periods and whitespace; the capturing group keeps every
# delimiter as its own list item.
result = re.compile(r'([,.]|\s)').split(text)
print(result) ## Still contains empty strings and whitespace-only items

# Drop the items that are empty or consist only of whitespace
result = list(filter(str.strip, result))
print(result)

# Design note for a simple tokenizer: whether whitespace should be encoded as
# separate tokens or simply removed depends on the application and its
# requirements. Removing whitespace reduces the memory and computing
# requirements. Keeping it can be useful when training models that are
# sensitive to the exact structure of the text (for example Python code, which
# is sensitive to indentation and spacing).

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']
['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [10]:
text = "Hello, world. Is this-- a test?"

## Split on punctuation, double dashes and whitespace, strip every piece and
## keep only the pieces that are non-empty after stripping
result = [
    piece
    for piece in map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', text))
    if piece
]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [None]:
## Applying this tokenizer to the sample text
preprocessed = [
    token
    for token in map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', sample_text))
    if token
]
print(preprocessed[:30])
print(len(preprocessed)) ## Number of tokens in the sample text

['The', 'old', 'wizard', 'lived', 'in', 'a', 'tall', 'tower', '.', 'Every', 'morning', 'he', 'would', 'wake', 'up', 'early', 'and', 'look', 'out', 'his', 'window', '.', 'From', 'his', 'window', 'he', 'could', 'see', 'the', 'entire']
209


In [None]:
## Converting tokens to IDs
## The vocabulary is the sorted list of all unique tokens
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)

print(vocab_size) ## Number of unique tokens
print(all_words[:5])

107
['.', 'A', 'After', 'But', 'Every']


In [None]:
## Every unique token gets a unique integer, called its token ID
vocab = dict(zip(all_words, range(len(all_words)))) ## Dictionary mapping each unique token to its token ID
## These IDs are the input to the LLM; a decoder is needed to convert the token IDs the LLM outputs back into tokens
print(vocab['The'])
print(vocab['he'])

9
44


In [18]:
## Show the first 16 (token, token ID) pairs of the vocabulary
for i, item in zip(range(16), vocab.items()):
    print(item)

('.', 0)
('A', 1)
('After', 2)
('But', 3)
('Every', 4)
('From', 5)
('He', 6)
('One', 7)
('People', 8)
('The', 9)
('a', 10)
('act', 11)
('adventure', 12)
('again', 13)
('and', 14)
('at', 15)


In [None]:
class SimpleTokenizer:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, on the punctuation characters
    , . : ; ? _ ! " ( ) ' and on double dashes ("--"). Every resulting
    token must be present in the vocabulary.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse mapping.

        Args:
            vocab: dict mapping each token string to its integer token ID.
        """
        self.str_to_int = vocab ## Token to ID mapping
        self.int_to_str = {i:s for s,i in vocab.items()} ## ID to token mapping for decoder

    def encode(self, text):
        """Convert text into a list of token IDs.

        Args:
            text: the string to tokenize.

        Returns:
            List of integer token IDs, one per token, in order.

        Raises:
            KeyError: if a token of the text is not in the vocabulary.
        """
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text) ## Split the text into tokens

        preprocessed = [
            item.strip() for item in preprocessed if item.strip() ## Drop empty / whitespace-only pieces
        ]
        ids = [self.str_to_int[s] for s in preprocessed] ## Look up the token ID of every token
        return ids

    def decode(self, ids):
        """Convert a list of token IDs back into text.

        Tokens are joined with single spaces, then the whitespace in front of
        the punctuation characters , . : ; ? ! " ( ) ' is removed.

        Args:
            ids: iterable of integer token IDs.

        Returns:
            The reconstructed string.

        Raises:
            KeyError: if an ID is not in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids]) ## ID to token and joined with a space

        ## ':' and ';' are included because encode() splits them off as their
        ## own tokens; without them "a : b" would not be restored to "a: b"
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [20]:
## Build a tokenizer from the vocab dictionary; the bare name on the last line displays the object's repr
tokenizer = SimpleTokenizer(vocab)
tokenizer

<__main__.SimpleTokenizer at 0x106a813d0>

In [22]:
## Round trip: encode two sentences taken from the sample text, then decode the IDs back to text
text = """One day the wizard noticed something strange. A large dragon was flying toward the village."""
ids = tokenizer.encode(text)
print(ids)

print(tokenizer.decode(ids))


[7, 29, 87, 104, 59, 77, 80, 0, 1, 50, 31, 100, 36, 92, 87, 96, 0]
One day the wizard noticed something strange. A large dragon was flying toward the village.


In [None]:
## For words not in the vocab

text = "Hello, do you like tea?"
try:
    print(tokenizer.encode(text))
except KeyError as err:
    ## "Hello" is not a key of the vocab, so the lookup inside encode() fails.
    ## The error is caught and displayed so the notebook still runs top to bottom.
    print(f"KeyError: {err}")

## Hence, we need a large and diverse vocab to train our LLM

KeyError: 'Hello'

In [30]:
## Special context tokens for the situations the plain vocab cannot handle
# <|unk|> represents words that are not in the vocab
# <|endoftext|> is added at the end of different text sources to separate them (for example one Wikipedia article and one LinkedIn post). This helps the LLM understand the text better

all_tokens = sorted(set(preprocessed))
all_tokens += ["<|endoftext|>", "<|unk|>"] ## Append the 2 special tokens after the sorted unique tokens

vocab = dict(zip(all_tokens, range(len(all_tokens)))) ## Create the vocab: token -> token ID
print(vocab['<|unk|>'])
print(vocab['<|endoftext|>'])
print(len(vocab))

108
107
109


In [31]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>"."""

    def __init__(self, vocab):
        """Keep the token -> ID mapping and derive the ID -> token mapping.

        Args:
            vocab: dict mapping each token string to its integer token ID.
        """
        self.str_to_int = vocab
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """Split text into tokens and return their token IDs.

        Tokens that are not keys of the vocabulary are looked up as
        "<|unk|>" instead.
        """
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        ids = []
        for piece in map(str.strip, pieces):
            if not piece:
                # Empty or whitespace-only pieces are not tokens
                continue
            # Fall back to the unknown token for anything outside the vocab
            token = piece if piece in self.str_to_int else "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Join the tokens of the given IDs with spaces and remove the
        whitespace in front of the punctuation characters , . : ; ? ! " ( ) '
        """
        tokens = [self.int_to_str[i] for i in ids]
        joined = " ".join(tokens)
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)

In [32]:
## Rebind `tokenizer` to the V2 tokenizer, built from the vocab that includes the special tokens
tokenizer = SimpleTokenizerV2(vocab)
tokenizer

<__main__.SimpleTokenizerV2 at 0x106e192b0>

In [37]:
## Join two independent texts with the <|endoftext|> separator and encode them together
text1 = "Hello, does the wizard like tea?"
text2 = "He had protected the village and its people."
text = " <|endoftext|> ".join((text1, text2))
print(text)

ids = tokenizer.encode(text)
print(ids)
## Tokens missing from the vocab come back as <|unk|>; this includes "," and "?", which never occur in the sample text
print(tokenizer.decode(ids))

Hello, does the wizard like tea? <|endoftext|> He had protected the village and its people.
[108, 108, 108, 87, 104, 108, 108, 108, 107, 6, 42, 68, 87, 96, 14, 48, 65, 0]
<|unk|> <|unk|> <|unk|> the wizard <|unk|> <|unk|> <|unk|> <|endoftext|> He had protected the village and its people.


In [None]:
## Other special tokens exist as well: BOS (beginning of sequence), EOS (end of sequence) and PAD (padding). However, GPT uses only the <|endoftext|> token rather than these, and it uses BPE (byte pair encoding) instead of an <|unk|> token to handle unknown words