## Training SOTA tokenizer models using HuggingFace `tokenizers` package


1. Byte Pair Encoding (BPE) Algorithm
2. WordPiece Algorithm
3. Unigram Algorithm

In [22]:
!pip install tokenizers



## Importing packages

In [23]:
## importing the tokenizer, the four models, their trainers and the whitespace pre-tokenizer
from tokenizers import Tokenizer
from tokenizers.models import BPE, Unigram, WordLevel, WordPiece
from tokenizers.trainers import BpeTrainer, WordLevelTrainer, \
                                WordPieceTrainer, UnigramTrainer
from tokenizers.pre_tokenizers import Whitespace


#### Download the data to train the model.

In [24]:
!wget http://www.gutenberg.org/cache/epub/16457/pg16457.txt

--2022-03-15 16:01:16--  http://www.gutenberg.org/cache/epub/16457/pg16457.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.gutenberg.org/cache/epub/16457/pg16457.txt [following]
--2022-03-15 16:01:17--  https://www.gutenberg.org/cache/epub/16457/pg16457.txt
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 617622 (603K) [text/plain]
Saving to: ‘pg16457.txt.2’


2022-03-15 16:01:17 (6.90 MB/s) - ‘pg16457.txt.2’ saved [617622/617622]



In [25]:
!wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
!unzip wikitext-103-raw-v1.zip

--2022-03-15 16:01:20--  https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.107.126
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.107.126|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 191984949 (183M) [application/zip]
Saving to: ‘wikitext-103-raw-v1.zip.2’


2022-03-15 16:01:25 (37.8 MB/s) - ‘wikitext-103-raw-v1.zip.2’ saved [191984949/191984949]

Archive:  wikitext-103-raw-v1.zip
replace wikitext-103-raw/wiki.test.raw? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: wikitext-103-raw/wiki.test.raw  
replace wikitext-103-raw/wiki.valid.raw? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: wikitext-103-raw/wiki.valid.raw  
replace wikitext-103-raw/wiki.train.raw? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: wikitext-103-raw/wiki.train.raw  


## Define the 3-step process: prepare the tokenizer and trainer, train, tokenize

In [26]:
# token for unknown words
unk_token = "<UNK>"
# special tokens; the unknown token is listed first
spl_tokens = [unk_token, "<SEP>", "<MASK>", "<CLS>"]

def prepare_tokenizer_trainer(alg):
    """
    Build an untrained tokenizer together with the trainer that matches it.

    `alg` picks the model: 'BPE', 'UNI' (Unigram) or 'WPC' (WordPiece); every
    other value selects the word-level model. The unknown token and the
    special tokens are read from the module-level `unk_token` / `spl_tokens`.

    Returns a `(tokenizer, trainer)` tuple; the tokenizer already has the
    `Whitespace` pre-tokenizer attached.
    """
    # Arguments shared by every algorithm except Unigram.
    model_kwargs = {'unk_token': unk_token}
    trainer_kwargs = {'special_tokens': spl_tokens}

    if alg == 'BPE':
        model_cls, trainer_cls = BPE, BpeTrainer
    elif alg == 'UNI':
        model_cls, trainer_cls = Unigram, UnigramTrainer
        # Unigram receives its unknown token through the trainer, not the model.
        model_kwargs = {}
        trainer_kwargs = {'unk_token': unk_token, 'special_tokens': spl_tokens}
    elif alg == 'WPC':
        model_cls, trainer_cls = WordPiece, WordPieceTrainer
    else:
        model_cls, trainer_cls = WordLevel, WordLevelTrainer

    tokenizer = Tokenizer(model_cls(**model_kwargs))
    trainer = trainer_cls(**trainer_kwargs)

    tokenizer.pre_tokenizer = Whitespace()
    return tokenizer, trainer


def train_tokenizer(files, alg='WLV', save_path="./tokenizer-trained.json"):
    """
    Train a tokenizer on `files` and return it after a save/load round trip.

    Args:
        files: list of paths to the text files used for training.
        alg: algorithm code forwarded to `prepare_tokenizer_trainer`.
        save_path: file the trained tokenizer is saved to and re-loaded from.
            The default keeps the previous fixed location; pass a distinct
            path per algorithm to keep several trained tokenizers on disk
            instead of overwriting one file.

    Returns:
        The tokenizer loaded back from `save_path`.
    """
    tokenizer, trainer = prepare_tokenizer_trainer(alg)
    tokenizer.train(files, trainer)  # training the tokenizer
    tokenizer.save(save_path)
    # Reload from disk so the returned object is what was actually persisted.
    return Tokenizer.from_file(save_path)

def tokenize(input_string, tokenizer):
    """
    Encode `input_string` with `tokenizer` and return whatever `encode` yields.
    """
    return tokenizer.encode(input_string)


## Training each model on the Gutenberg and WikiText datasets

In [27]:
gutenberg_dataset = ['pg16457.txt']
wikitext_dataset = [f"./wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]

# The sample sentence is identical for every run, so it is built once.
input_string = "This is a deep learning tokenization tutorial. Tokenization is the first step in a deep learning NLP pipeline. We will be comparing the tokens generated by each tokenization model. Excited much?!😍"

# alg -> tokens of the most recently processed corpus.
tokens_dict = {}
# corpus name -> alg -> tokens. Keyed by corpus as well, so the Gutenberg
# results are not lost when the WikiText run stores its own.
tokens_by_corpus = {}

for corpus, files in [("gutenberg", gutenberg_dataset), ("wikitext", wikitext_dataset)]:
    print(f"========Using vocabulary from corpus {files}=======")
    for alg in ['BPE', 'WPC']:
        trained_tokenizer = train_tokenizer(files, alg)
        output = tokenize(input_string, trained_tokenizer)
        tokens_dict[alg] = output.tokens
        tokens_by_corpus.setdefault(corpus, {})[alg] = output.tokens
        print("Using ", alg, " Algorithm")
        print(output.tokens, "-> length of tokens :", len(output.tokens))


Using  BPE  Algorithm
['This', 'is', 'a', 'deep', 'learning', 'to', 'ken', 'ization', 't', 'ut', 'or', 'ial', '.', 'T', 'ok', 'en', 'ization', 'is', 'the', 'first', 'step', 'in', 'a', 'deep', 'learning', 'N', 'L', 'P', 'pi', 'pe', 'line', '.', 'We', 'will', 'be', 'comparing', 'the', 'to', 'k', 'ens', 'generated', 'by', 'each', 'to', 'ken', 'ization', 'model', '.', 'Ex', 'c', 'ited', 'much', '?', '!', '<UNK>'] -> length of tokens : 55
Using  WPC  Algorithm
['This', 'is', 'a', 'deep', 'learning', 'to', '##ken', '##ization', 't', '##ut', '##oria', '##l', '.', 'To', '##ken', '##ization', 'is', 'the', 'first', 'step', 'in', 'a', 'deep', 'learning', 'N', '##L', '##P', 'pip', '##el', '##ine', '.', 'We', 'will', 'be', 'comparing', 'the', 'to', '##ken', '##s', 'generated', 'by', 'each', 'to', '##ken', '##ization', 'model', '.', 'Ex', '##ci', '##ted', 'much', '<UNK>'] -> length of tokens : 52
Using  BPE  Algorithm
['This', 'is', 'a', 'deep', 'learning', 'to', 'ken', 'ization', 'tut', 'orial', '.

## Comparing the BPE, Unigram and WordPiece tokens

In [28]:

# Sample sentence that every algorithm is asked to tokenize.
input_string = "This is a deep learning tokenization tutorial. Tokenization is the first step in a deep learning NLP pipeline. We will be comparing the tokens generated by each tokenization model. Excited much?!😍"

# Algorithm code -> tokens for the sample sentence, with every tokenizer
# trained on the WikiText files.
tokens_dict = dict()
for alg in ('BPE', 'UNI', 'WPC'):
    trained_tokenizer = train_tokenizer(wikitext_dataset, alg)
    output = tokenize(input_string, trained_tokenizer)
    tokens_dict[alg] = output.tokens

In [29]:
import pandas as pd

# A DataFrame built from a dict of lists needs all lists to be equally long,
# so every column is padded with '<PAD>' up to the longest one. Padding only
# some of the columns would fail as soon as a different algorithm produces
# the most tokens.
max_len = max(len(tokens) for tokens in tokens_dict.values())
tokens_dict = {
    alg: tokens + ['<PAD>'] * (max_len - len(tokens))
    for alg, tokens in tokens_dict.items()
}

df = pd.DataFrame(tokens_dict)

In [30]:
# Show the first 47 rows of the side-by-side token table.
# NOTE(review): 47 presumably equals the number of non-<PAD> BPE tokens in the
# recorded run (68 rows, 21 of them <PAD>) -- confirm before changing the corpus.
df.head(n=47)

Unnamed: 0,BPE,UNI,WPC
0,This,This,This
1,is,i,is
2,a,s,a
3,deep,a,deep
4,learning,deep,learning
5,to,learn,to
6,ken,ing,##ken
7,ization,t,##ization
8,tut,o,tut
9,orial,ken,##orial


In [31]:
# Per-column summary of the token table, covering every column.
df.describe(include="all")

Unnamed: 0,BPE,UNI,WPC
count,68,68,68
unique,37,41,37
top,<PAD>,o,<PAD>
freq,21,5,20


In [32]:
# Values in the Unigram column that never occur in the BPE column.
set(df['UNI']).difference(df['BPE'])

{'L',
 'N',
 'T',
 'W',
 'com',
 'd',
 'e',
 'generate',
 'i',
 'ing',
 'learn',
 'line',
 'o',
 'p',
 'par',
 'rial',
 's',
 't',
 'u',
 '😍'}

In [33]:
# Values in the Unigram column that never occur in the WordPiece column.
set(df['UNI']).difference(df['WPC'])

{'!',
 '?',
 'Ex',
 'L',
 'N',
 'P',
 'T',
 'W',
 'cited',
 'com',
 'd',
 'e',
 'generate',
 'i',
 'ing',
 'ization',
 'ken',
 'learn',
 'line',
 'o',
 'p',
 'par',
 'rial',
 's',
 't',
 'u',
 '😍'}

In [34]:
# Values in the WordPiece column that never occur in the Unigram column
# (this includes the '<PAD>' filler whenever the Unigram column has none).
set(df['WPC']).difference(df['UNI'])


{'##P',
 '##eni',
 '##ited',
 '##ization',
 '##ken',
 '##on',
 '##orial',
 '##s',
 '##ti',
 '##za',
 '<PAD>',
 '<UNK>',
 'Exc',
 'NL',
 'Tok',
 'We',
 'comparing',
 'generated',
 'is',
 'learning',
 'pipeline',
 'to',
 'tut'}