In [None]:
## Problems with word-based tokenization:
# 1. It is unclear how to handle Out of Vocab (OOV) words, i.e. words that are not in the vocabulary
# 2. Related word forms, e.g. 'boy' and 'boys', are treated as separate, unrelated tokens, so their similarity in meaning is lost

In [None]:
## Problems with character-based tokenization:
# 1. (Advantage, not a problem) Has a very small vocab, so it solves the OOV problem
# 2. Meaning associated with words is completely lost
# 3. Tokenized sequence of IDs is much longer than the original text

In [None]:
## BPE is Sub-word tokenization
# Rule 1: Do not split frequently used words into smaller sub-words and represent them as a single token
# Rule 2: Split rare words into smaller, meaningful sub-words (can even be broken down to char level for very rare words)
# 'boy' will not be split but 'boys' will be split into ['boy', 's']
# Advantages:
# 1. Helps model learn that different words with same root word are similar in meaning
# 2. Words like 'tokenization' and 'modernization' share the suffix 'ization', which helps the model understand that they are used in the same syntactic situations

In [None]:
## BPE is traditionally a compression algorithm in which the most common pair of consecutive bytes is replaced with a byte that doesn't occur in the data. The same is done for tokens
# Start from character-level tokens (and a special </w> token marking the end of a word) and their counts, then repeatedly combine the most frequently occurring pair of adjacent tokens into a new combined token, removing tokens when their count drops to 0. This is done for a fixed number of iterations or until we reach a desired number of tokens

# https://www.youtube.com/watch?v=fKd8s29e-l4&list=PLPTV0NXA_ZSgsLAr8YCgCwhPIJNNtexWu&index=9

In [3]:
# Import the submodule explicitly: a bare `import importlib` does not guarantee
# that `importlib.metadata` is loaded, so attribute access on it only works if
# some other package happened to import it first (fragile on a fresh kernel).
# `import importlib.metadata` still binds the top-level name `importlib`.
import importlib.metadata

import tiktoken

# Report the installed tiktoken version so the recorded outputs can be reproduced.
print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.9.0


In [4]:
# Load the pre-built encoding registered under the name "gpt2"; later cells
# reuse this `tokenizer` object for both encoding and decoding.
tokenizer = tiktoken.get_encoding("gpt2")
# Bare last expression so the notebook displays the encoding object's repr.
tokenizer

<Encoding 'gpt2'>

In [None]:
# Sample text containing the special "<|endoftext|>" marker and a made-up word.
# NOTE: the two adjacent string literals are concatenated with no separator, so
# the text actually contains "terracesof" (no space) -- this is visible in the
# decoded output further down.
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

# someunknownPlace would have been mapped to <unk> by a word-level tokenizer, but sub-word tokenization captures it accurately

# allowed_special opts in to treating "<|endoftext|>" as a special token
# instead of ordinary text (it appears as the single ID 50256 in the output).
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [6]:
# Decode the token IDs from the previous cell back into text to check that the
# encode/decode round trip reproduces the input.
strings = tokenizer.decode(integers)

print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [7]:
## For unknown tokens
# Encode a made-up string: rather than producing an unknown-token placeholder,
# the tokenizer splits it into several smaller pieces (six IDs in the output).
# NOTE: this overwrites `integers` and `strings` from the earlier cells.
integers = tokenizer.encode("Akwirw ier")
print(integers)

# Decoding the pieces reproduces the original string exactly.
strings = tokenizer.decode(integers)
print(strings)

[33901, 86, 343, 86, 220, 959]
Akwirw ier
