# Sub_word_tokenizers
Copyright 2023, Denis Rothman

Subword tokenizers other than BPE and WordPiece.

In [3]:
!pip install transformers -qq

In [7]:
!pip install sentencepiece -qq

# Unigram Language Model Tokenization

In [5]:
from tokenizers import Tokenizer
from tokenizers.models import Unigram
from tokenizers.trainers import UnigramTrainer
from tokenizers.pre_tokenizers import Whitespace

# Toy training corpus: five short sentences about tokenization.
corpus = [
    "Subword tokenizers break text sequences into subwords.",
    "This sentence is another part of the corpus.",
    "Tokenization is the process of breaking text down into smaller units.",
    "These smaller units can be words, subwords, or even individual characters.",
    "Transformer models often use subword tokenization.",
]

# The trainer does not depend on the tokenizer, so it is configured first.
# vocab_size is the vocabulary size requested from the trainer.
trainer = UnigramTrainer(vocab_size=5000)

# Build the tokenizer around a Unigram model created with an empty vocabulary.
unigram_model = Unigram([])
tokenizer = Tokenizer(unigram_model)

# Attach the Whitespace pre-tokenizer to the tokenizer.
tokenizer.pre_tokenizer = Whitespace()

# Train on the in-memory corpus with the trainer configured above.
tokenizer.train_from_iterator(corpus, trainer=trainer)

# Encode the first corpus sentence and display its tokens.
output = tokenizer.encode(
    "Subword tokenizers break text sequences into subwords."
)
print(output.tokens)

['S', 'ubword', 'tokeniz', 'er', 's', 'break', 'te', 'x', 't', 'se', 'q', 'u', 'ence', 's', 'in', 'to', 'subword', 's', '.']


# SentencePiece tokenization

In [13]:
import sentencepiece as spm
import random

# Seed sentences; the training file below is built by sampling from these.
basic_corpus = [
    "Subword tokenizers break text sequences into subwords.",
    "This sentence is another part of the corpus.",
    "Tokenization is the process of breaking text down into smaller units.",
    "These smaller units can be words, subwords, or even individual characters.",
    "Transformer models often use subword tokenization."
]

# Generate a larger corpus by sampling sentences from the basic corpus.
# A dedicated, seeded generator makes the sample (and therefore the training
# file) identical on every Restart & Run All, and leaves the global `random`
# state untouched.
CORPUS_SEED = 42
corpus_rng = random.Random(CORPUS_SEED)
corpus = [corpus_rng.choice(basic_corpus) for _ in range(10000)]

# Write the corpus to a text file, one sentence per line. The encoding is
# passed explicitly so the file does not depend on the platform default.
with open('large_corpus.txt', 'w', encoding='utf-8') as f:
    f.writelines(sentence + '\n' for sentence in corpus)

# Train the SentencePiece model; model_prefix='m' is why 'm.model' is loaded below.
# NOTE(review): vocab_size=88 is unexplained in the notebook; presumably it was
# chosen to fit this very small corpus — confirm before changing it.
spm.SentencePieceTrainer.train(input='large_corpus.txt', model_prefix='m', vocab_size=88)

# Load the trained model
sp = spm.SentencePieceProcessor()
sp.load('m.model')

# Tokenize the original sentence into subword pieces and show them
tokens = sp.encode_as_pieces("Subword tokenizers break text sequences into subwords.")
print(tokens)

['▁', 'S', 'ubword', '▁tokeniz', 'ers', '▁break', '▁', 'te', 'x', 't', '▁se', 'q', 'u', 'ence', 's', '▁in', 'to', '▁subwords', '.']
