## Training SOTA tokenizer models using HuggingFace `tokenizers` package


1. Byte Pair Encoding (BPE) Algorithm
2. WordPiece Algorithm

In [12]:
!pip install tokenizers



## Importing packages

In [13]:
## importing the tokenizer and subword BPE trainer
from tokenizers import Tokenizer
from tokenizers.models import BPE, Unigram, WordLevel, WordPiece
from tokenizers.trainers import BpeTrainer, WordLevelTrainer, \
                                WordPieceTrainer, UnigramTrainer
from tokenizers.pre_tokenizers import Whitespace


#### Download the data to train the model.

In [14]:
!wget http://www.gutenberg.org/cache/epub/16457/pg16457.txt

--2022-03-15 16:16:14--  http://www.gutenberg.org/cache/epub/16457/pg16457.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.gutenberg.org/cache/epub/16457/pg16457.txt [following]
--2022-03-15 16:16:15--  https://www.gutenberg.org/cache/epub/16457/pg16457.txt
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 617622 (603K) [text/plain]
Saving to: ‘pg16457.txt.3’


2022-03-15 16:16:16 (659 KB/s) - ‘pg16457.txt.3’ saved [617622/617622]



In [15]:
!wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
!unzip wikitext-103-raw-v1.zip

--2022-03-15 16:16:20--  https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.232.96
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.232.96|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 191984949 (183M) [application/zip]
Saving to: ‘wikitext-103-raw-v1.zip.3’


2022-03-15 16:16:36 (12.2 MB/s) - ‘wikitext-103-raw-v1.zip.3’ saved [191984949/191984949]

Archive:  wikitext-103-raw-v1.zip
replace wikitext-103-raw/wiki.test.raw? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: wikitext-103-raw/wiki.test.raw  
replace wikitext-103-raw/wiki.valid.raw? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: wikitext-103-raw/wiki.valid.raw  
replace wikitext-103-raw/wiki.train.raw? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: wikitext-103-raw/wiki.train.raw  


## Define the 3-step process

In [16]:
unk_token = "<UNK>"  # token for unknown words
spl_tokens = ["<UNK>", "<SEP>", "<MASK>", "<CLS>"]  # special tokens

def prepare_tokenizer_trainer(alg):
    """
    Prepares the tokenizer and trainer with unknown & special tokens.
    """
    if alg == 'BPE':
        tokenizer = Tokenizer(BPE(unk_token = unk_token))
        trainer = BpeTrainer(special_tokens = spl_tokens)
    elif alg == 'UNI':
        tokenizer = Tokenizer(Unigram())
        trainer = UnigramTrainer(unk_token= unk_token, special_tokens = spl_tokens)
    elif alg == 'WPC':
        tokenizer = Tokenizer(WordPiece(unk_token = unk_token))
        trainer = WordPieceTrainer(special_tokens = spl_tokens)
    else:
        tokenizer = Tokenizer(WordLevel(unk_token = unk_token))
        trainer = WordLevelTrainer(special_tokens = spl_tokens)
    
    tokenizer.pre_tokenizer = Whitespace()
    return tokenizer, trainer


def train_tokenizer(files, alg='WLV'):
    """
    Takes the files and trains the tokenizer.
    """
    tokenizer, trainer = prepare_tokenizer_trainer(alg)
    tokenizer.train(files, trainer) # training the tokenzier
    tokenizer.save("./tokenizer-trained.json")
    tokenizer = Tokenizer.from_file("./tokenizer-trained.json")
    return tokenizer

def tokenize(input_string, tokenizer):
    """
    Tokenizes the input string using the tokenizer provided.
    """
    output = tokenizer.encode(input_string)
    return output


## Convert a list to string

In [7]:
def listToString(s): 
    
    # initialize an empty string
    str1 = "" 
    
    # traverse in the string  
    for ele in s: 
        str1 += ele  
    
    # return string  
    return str1 

## Gutenberg Text

In [17]:
with open('pg16457.txt') as f:
    gutenberg_txt = f.readlines()
gutenberg_txt = listToString(gutenberg_txt)

## Wikitext Text

In [18]:
with open("./wikitext-103-raw/wiki.test.raw") as f:
    wikitext_test = f.readlines()
wikitext_test = listToString(wikitext_test)

with open("./wikitext-103-raw/wiki.train.raw") as f:
    wikitext_train = f.readlines()
wikitext_train = listToString(wikitext_train)

with open("./wikitext-103-raw/wiki.valid.raw") as f:
    wikitext_valid = f.readlines()
wikitext_valid = listToString(wikitext_valid)

wikitext_txt = wikitext_test + wikitext_train + wikitext_valid

## Training each model on **gutenberg** **dataset** (Based on gutenberg tokenization)

In [19]:
gutenberg_dataset = ['pg16457.txt']
wikitext_dataset = [f"./wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]

tokens_dict = {}

for files in [gutenberg_dataset]:
    print(f"========Using vocabulary from corpus {files}=======")
    for alg in ['BPE', 'WPC']:
        trained_tokenizer = train_tokenizer(files, alg)
        input_string = gutenberg_txt
        output = tokenize(input_string, trained_tokenizer)
        tokens_dict[alg] = output.tokens
        print("Using ", alg, " Algorithm")
        print(output.tokens, "-> length of tokens :", len(output.tokens))


Using  BPE  Algorithm
Using  WPC  Algorithm


## Training each model on **wikitext dataset** (Based on gutenberg tokenization)

In [11]:
gutenberg_dataset = ['pg16457.txt']
wikitext_dataset = [f"./wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]

tokens_dict = {}

for files in [gutenberg_dataset]:
    print(f"========Using vocabulary from corpus {files}=======")
    for alg in ['BPE', 'WPC']:
        trained_tokenizer = train_tokenizer(files, alg)
        input_string = wikitext_test
        output = tokenize(input_string, trained_tokenizer)
        tokens_dict[alg] = output.tokens
        print("Using ", alg, " Algorithm")
        print(output.tokens, "-> length of tokens :", len(output.tokens))

Using  BPE  Algorithm


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

