# WordLevel Tokenizer
* Instantiate the model with a whitespace split pre-tokenizer
* Train the model on the training sentences only (with unknown, classification, separation, pad, and mask special tokens)
* Tokenize the first sentence
* Print out the output tokens
* Print out the output token IDs
* Tokenize "you", "you're", and "you've" to show the effects of the tokenizer on words with a shared root but differing suffixes

In [3]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import WhitespaceSplit

# Toy corpus: the second sentence contains "you", "you're" and "you've" so the
# treatment of words with a shared root can be inspected after training.
data = [
    "the quick brown fox jumps over the lazy dog",
    "you you're and you've are all different words",
]

# Whole-word vocabulary; anything outside it is encoded as "[UNK]".
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
# Split on whitespace only, so punctuation stays attached to its word.
tokenizer.pre_tokenizer = WhitespaceSplit()

# Reserve the special tokens; "[UNK]" must match the model's unk_token above.
trainer = WordLevelTrainer(
    show_progress=True,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
tokenizer.train_from_iterator(data, trainer=trainer, length=len(data))

#### Tokenize a sentence identical to a training sample

In [16]:
# Sentence copied verbatim from the training data: every word is already in
# the vocabulary, so each one maps to a single token and ID.
output = tokenizer.encode("the quick brown fox jumps over the lazy dog")

# Tokens on the first line, their IDs on the second.
print(output.tokens, output.ids, sep="\n")

['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
[5, 16, 9, 12, 13, 15, 5, 14, 11]


#### Tokenize a sentence containing words with shared roots but differing suffixes

In [17]:
# With whitespace-only splitting the apostrophes stay inside the words, so
# "you", "you're" and "you've" are three unrelated vocabulary entries.
output = tokenizer.encode("you you're you've")

# Tokens on the first line, their IDs on the second.
print(output.tokens, output.ids, sep="\n")

['you', "you're", "you've"]
[18, 19, 20]


#### Tokenize a sentence with an unknown word (doesn't appear in training data)

In [18]:
# "you'll" never occurred in the training data; a word-level vocabulary has no
# sub-word fallback, so the whole word is replaced by "[UNK]".
output = tokenizer.encode("you'll")

# Tokens on the first line, their IDs on the second.
print(output.tokens, output.ids, sep="\n")

['[UNK]']
[0]


# WordPiece Tokenizer
* Instantiate the model with a `Whitespace` pre-tokenizer (splits on whitespace and at punctuation)
* Train the model on the training sentences only (with unknown, classification, separation, pad, and mask special tokens)
* Tokenize the first sentence
* Print out the output tokens
* Print out the output token IDs
* Tokenize "you", "you're", and "you've" to show the effects of the tokenizer on words with a shared root but differing suffixes

In [1]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

# Toy corpus: the second sentence contains "you", "you're" and "you've" so the
# treatment of words with a shared root can be inspected after training.
data = [
    "the quick brown fox jumps over the lazy dog",
    "you you're and you've are all different words",
]

# WordPiece sub-word model; "[UNK]" is the fallback token.
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
# Whitespace (unlike WhitespaceSplit) also separates punctuation from words,
# which is why "you're" later comes out as "you", "'", "re".
tokenizer.pre_tokenizer = Whitespace()

# Reserve the special tokens; "[UNK]" must match the model's unk_token above.
trainer = WordPieceTrainer(
    show_progress=True,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
tokenizer.train_from_iterator(data, trainer=trainer, length=len(data))






#### Tokenize a sentence identical to a training sample

In [2]:
# Sentence copied verbatim from the training data: each word was merged into
# a single vocabulary entry during training and comes back as one token.
output = tokenizer.encode("the quick brown fox jumps over the lazy dog")

# Tokens on the first line, their IDs on the second.
print(output.tokens, output.ids, sep="\n")

['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
[59, 98, 95, 88, 97, 91, 59, 90, 87]


#### Tokenize a sentence containing words with shared roots but differing suffixes

In [3]:
# The pre-tokenizer splits at the apostrophe, so all three words share the
# same "you" token and differ only in the pieces that follow it.
output = tokenizer.encode("you you're you've")

# Tokens on the first line, their IDs on the second.
print(output.tokens, output.ids, sep="\n")

['you', 'you', "'", 're', 'you', "'", 've']
[56, 56, 5, 71, 56, 5, 72]


#### Tokenize a sentence with an unknown word (doesn't appear in training data)

In [4]:
# "you'll" was not in the training data: "you" and "'" are reused, while the
# unseen "ll" is spelled out as "l" plus the continuation piece "##l".
output = tokenizer.encode("you'll")

# Tokens on the first line, their IDs on the second.
print(output.tokens, output.ids, sep="\n")

['you', "'", 'l', '##l']
[56, 5, 17, 54]


In [5]:
# A completely unseen word: instead of "[UNK]" it is decomposed into the known
# piece "an" followed by single-character "##" continuation pieces.
output = tokenizer.encode("anything")

# Tokens on the first line, their IDs on the second.
print(output.tokens, output.ids, sep="\n")

['an', '##y', '##t', '##h', '##i', '##n', '##g']
[61, 49, 37, 53, 32, 36, 39]


# Byte Pair Encoding Tokenizer
* Instantiate the model with a `Whitespace` pre-tokenizer (splits on whitespace and at punctuation)
* Train the model on the training sentences only (with unknown, classification, separation, pad, and mask special tokens)
* Tokenize the first sentence
* Print out the output tokens
* Print out the output token IDs
* Tokenize "you", "you're", and "you've" to show the effects of the tokenizer on words with a shared root but differing suffixes

In [7]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Toy corpus: the second sentence contains "you", "you're" and "you've" so the
# treatment of words with a shared root can be inspected after training.
data = [
    "the quick brown fox jumps over the lazy dog",
    "you you're and you've are all different words",
]

# Byte Pair Encoding sub-word model; "[UNK]" is the fallback token.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# Whitespace (unlike WhitespaceSplit) also separates punctuation from words,
# which is why "you're" later comes out as "you", "'", "re".
tokenizer.pre_tokenizer = Whitespace()

# Reserve the special tokens; "[UNK]" must match the model's unk_token above.
trainer = BpeTrainer(
    show_progress=True,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
tokenizer.train_from_iterator(data, trainer=trainer, length=len(data))






#### Tokenize a sentence identical to a training sample

In [8]:
# Sentence copied verbatim from the training data: each word was merged into
# a single vocabulary entry during training and comes back as one token.
output = tokenizer.encode("the quick brown fox jumps over the lazy dog")

# Tokens on the first line, their IDs on the second.
print(output.tokens, output.ids, sep="\n")

['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
[36, 70, 72, 66, 74, 69, 36, 68, 64]


#### Tokenize a sentence containing words with shared roots but differing suffixes

In [10]:
# The pre-tokenizer splits at the apostrophe, so all three words share the
# same "you" token and differ only in the pieces that follow it.
output = tokenizer.encode("you you're you've")

# Tokens on the first line, their IDs on the second.
print(output.tokens, output.ids, sep="\n")

['you', 'you', "'", 're', 'you', "'", 've']
[34, 34, 5, 33, 34, 5, 37]


#### Tokenize a sentence with an unknown word (doesn't appear in training data)

In [4]:
# "you'll" was not in the training data: "you" and "'" are reused, while the
# unseen "ll" falls back to two separate single-character "l" tokens.
output = tokenizer.encode("you'll")

# Tokens on the first line, their IDs on the second.
print(output.tokens, output.ids, sep="\n")

['you', "'", 'l', 'l']
[34, 5, 17, 17]


In [11]:
# A completely unseen word: instead of "[UNK]" it is decomposed into the
# learned merge "an" followed by single-character tokens.
output = tokenizer.encode("anything")

# Tokens on the first line, their IDs on the second.
print(output.tokens, output.ids, sep="\n")

['an', 'y', 't', 'h', 'i', 'n', 'g']
[39, 30, 25, 13, 14, 19, 12]


# Byte-level Byte Pair Encoding Tokenizer
* Instantiate the model with a `ByteLevel` pre-tokenizer
* Train the model on the training sentences only (with unknown, classification, separation, pad, and mask special tokens)
* Tokenize the first sentence
* Print out the output tokens
* Print out the output token IDs
* Tokenize "you", "you're", and "you've" to show the effects of the tokenizer on words with a shared root but differing suffixes

In [1]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel

# Toy corpus: the second sentence contains "you", "you're" and "you've" so the
# treatment of words with a shared root can be inspected after training.
data = [
    "the quick brown fox jumps over the lazy dog",
    "you you're and you've are all different words",
]

# Same BPE model as the plain variant; only the pre-tokenizer differs.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# ByteLevel pre-tokenization: word-initial tokens come back with a leading
# "Ġ" marker in the outputs of the cells that follow.
tokenizer.pre_tokenizer = ByteLevel()

# Reserve the special tokens; "[UNK]" must match the model's unk_token above.
# NOTE(review): the trainer is not given initial_alphabet=ByteLevel.alphabet(),
# so the vocabulary presumably only covers byte symbols seen in `data` and
# unseen characters could still map to "[UNK]" -- confirm before reusing this
# setup on real text (adding it would change the IDs recorded below).
trainer = BpeTrainer(
    show_progress=True,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
tokenizer.train_from_iterator(data, trainer=trainer, length=len(data))






#### Tokenize a sentence identical to a training sample

In [2]:
# Sentence copied verbatim from the training data: each word comes back as a
# single token, prefixed with the "Ġ" marker (the first word included).
output = tokenizer.encode("the quick brown fox jumps over the lazy dog")

# Tokens on the first line, their IDs on the second.
print(output.tokens, output.ids, sep="\n")

['Ġthe', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġjumps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog']
[42, 86, 81, 82, 83, 85, 42, 84, 77]


#### Tokenize a sentence containing words with shared roots but differing suffixes

In [3]:
# All three words share the same "Ġyou" token; here the apostrophe stays
# attached to the suffix, giving the single tokens "'re" and "'ve".
output = tokenizer.encode("you you're you've")

# Tokens on the first line, their IDs on the second.
print(output.tokens, output.ids, sep="\n")

['Ġyou', 'Ġyou', "'re", 'Ġyou', "'ve"]
[37, 37, 43, 37, 44]


#### Tokenize a sentence with an unknown word (doesn't appear in training data)

In [4]:
# "you'll" was not in the training data: "Ġyou" is reused, and the unseen
# suffix is covered by the two tokens "'" and "ll" rather than "[UNK]".
output = tokenizer.encode("you'll")

# Tokens on the first line, their IDs on the second.
print(output.tokens, output.ids, sep="\n")

['Ġyou', "'", 'll']
[37, 5, 55]


In [5]:
# A completely unseen word: instead of "[UNK]" it is decomposed into "Ġa"
# followed by one token per remaining character.
output = tokenizer.encode("anything")

# Tokens on the first line, their IDs on the second.
print(output.tokens, output.ids, sep="\n")

['Ġa', 'n', 'y', 't', 'h', 'i', 'n', 'g']
[36, 19, 30, 25, 13, 14, 19, 12]
