## Tokenizer exploration
Megatron has a bunch of tokenizer options. Let's get a sense of what each one does.

In [17]:
import sys
sys.path.append("..")
import data_utils
from data_utils import tokenization

In [9]:
import torch

In [25]:
# Arguments for building a dataset. The values were captured from a pdb
# session; the path points at a file on the original author's machine.
kwargs = dict(
    path_='/Users/ben/data/tiny.json',
    binarize_sent=False,
    delim=',',
    drop_unlabeled=False,
    label_key='label',
    lazy=True,
    loose=True,
    process_fn=None,
    text_key='text',
)

# Calling get_dataset directly failed with a JSON decode error, so the lazy
# loader is built by hand instead.
# data_utils.get_dataset(kwargs['path_'], **kwargs)
text = data_utils.lazy_array_loader(
    kwargs['path_'],
    data_type='data',
    map_fn=None,
)
print(text)

<data_utils.lazy_loader.lazy_array_loader object at 0x1a2b61f3c8>


In [26]:
# Arguments for the BERT WordPiece tokenizer. `corpus` is the lazy_array_loader
# created earlier in the notebook (bound to the global `text`).
# NOTE(review): 'cache_dir' lives inside the nested 'kwargs' entry, so it is
# forwarded as part of a single keyword named `kwargs`, not as a top-level
# `cache_dir` argument -- confirm this is intended.
kwargs = dict(
    tokenizer_type='BertWordPieceTokenizer',
    corpus=text,
    model_path='tokenizer.model',
    vocab_size=30522,
    model_type='bert-large-uncased',
    pad_token=0,
    character_coverage=1.0,
    command_tokens=None,
    type_tokens=None,
    kwargs=dict(
        ds_type='BERT',
        cache_dir='temp_cache_dir',
        max_preds_per_seq=80,
    ),
    tokenizer_class='BertWordPieceTokenizer',
)
bert_tokenizer = data_utils.tokenization.BertWordPieceTokenizer(
    kwargs['model_type'],
    **kwargs
)

loading BertWordPieceTokenizer ( bert-large-uncased ) from cache_dir  None


100%|██████████| 231508/231508 [00:00<00:00, 847799.93B/s]

loaded bert-large-uncased





In [44]:
# Report the tokenizer's token count and type-token count.
print(bert_tokenizer.num_tokens, bert_tokenizer.num_type_tokens)
# EncodeAsTokens: inspect the resulting Tokenization's text, tokenization and
# asIds flag.
t = bert_tokenizer.EncodeAsTokens('lorem ipsum')
print(t.text, t.tokenization, t.asIds)
# EncodeAsIds: same three fields, for comparison with the token-level result.
# `t` is deliberately rebound; only the most recent Tokenization is kept.
t = bert_tokenizer.EncodeAsIds('lorem ipsum hello!')
print(t.text, t.tokenization, t.asIds)

30522 2
lorem ipsum ['lore', '##m', 'ip', '##sum'] False
lorem ipsum hello! [19544, 2213, 12997, 17421, 7592, 999] True


In [48]:
%load_ext autoreload
%autoreload 2
from data_utils import bpe_encoder
e = bpe_encoder.get_encoder('117M')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [63]:
# Round-trip a sample string through the BPE encoder and inspect table sizes.
toks = e.encode('cool beans!')
print(toks)
# These are the sizes of the encoder's `bpe_ranks` and `byte_decoder` tables,
# not the vocabulary size; the vocabulary itself is `e.encoder`.
print(len(e.bpe_ranks), len(e.byte_decoder))
# Decoding the ids should give back the original string.
print(e.decode(toks))

[24494, 16567, 0]
50000 256
cool beans!


In [64]:
# What is BERT doing under the hood for EncodeAsIds? Reproduce it step by step.
from data_utils.tokenization import Tokenization
processed_text = 'hello world'
# Step 1: split the text into tokens with the wrapped text tokenizer.
tokens = bert_tokenizer.text_tokenizer.tokenize(processed_text)
print(tokens)
# Step 2: map each token to its vocabulary id.
Ids = bert_tokenizer.text_tokenizer.convert_tokens_to_ids(tokens)
print(Ids)
# Step 3: wrap the ids in a Tokenization. The third argument is the original
# input text. No process_fn is applied here, so it equals `processed_text`.
# Do not pass the notebook global `text`: that name is bound to the
# lazy_array_loader corpus, not to this string.
tok = Tokenization(Ids, processed_text, processed_text)
print(tok, tok.tokenization)
# Look up the token string for a single id.
print(bert_tokenizer.IdToToken(7592))

['hello', 'world']
[7592, 2088]
<data_utils.tokenization.Tokenization object at 0x1a2d90eda0> [7592, 2088]
hello


In [82]:
# Peek inside the vocab containers of both tokenizers. The commented-out lines
# are the assignments that populate each attribute in the BERT tokenizer.
# self._tokens = list(self.text_tokenizer.vocab.keys())
print(len(bert_tokenizer._tokens), bert_tokenizer._tokens[:5])

# self._vocab = {k:v for k,v in self.text_tokenizer.vocab.items()}
print(len(bert_tokenizer._vocab), list(bert_tokenizer._vocab.items())[:5])

# self._text_token_vocab = {k:v for k,v in self.text_tokenizer.vocab.items()}

# BPE counterpart: `e.encoder` is a mapping; show its size and first ten items.
print(len(e.encoder), list(e.encoder.items())[:10])


30522 ['[PAD]', '[unused0]', '[unused1]', '[unused2]', '[unused3]']
30522 [('[PAD]', 0), ('[unused0]', 1), ('[unused1]', 2), ('[unused2]', 3), ('[unused3]', 4)]
50257 [('!', 0), ('"', 1), ('#', 2), ('$', 3), ('%', 4), ('&', 5), ("'", 6), ('(', 7), (')', 8), ('*', 9)]


In [85]:
from data_utils.tokenization import Tokenization, CommandToken, TypeToken

class BytePairTokenizer(data_utils.tokenization.TextTokenizer):
    """
    Byte-pair-encoding tokenizer that wraps an encoder from `bpe_encoder`.

    Only the Id-based paths are implemented: `EncodeAsIds` and `DecodeIds`
    delegate to the wrapped encoder's `encode` / `decode`. The token-level
    conversions (`EncodeAsTokens`, `IdToToken`, `TokenToId`) are stubs that
    raise NotImplementedError.
    """
    def __init__(self, tokenizer_model_type=None, cache_dir=None, encoder=None, **kwargs):
        """
        Args:
            tokenizer_model_type: not used by this class.
            cache_dir: not used by this class.
            encoder: encoder object exposing `encode`, `decode` and an
                `encoder` mapping (token -> Id). When falsy,
                `bpe_encoder.get_encoder('117M')` is loaded instead.
            **kwargs: accepted and ignored.
        """
        self.encoder = encoder if encoder else bpe_encoder.get_encoder('117M')
        # every entry of the encoder's vocabulary is a text token
        self.num_tokens = len(self.encoder.encoder)
        self.num_text_tokens = self.num_tokens

        # Probably don't need stuff below
        # no command tokens are defined, so the three lookup maps are empty
        self._command_tokens = [
        ]
        self.num_command_tokens = len(self._command_tokens)

        self.command_name_map = {tok.name: tok for tok in self._command_tokens}
        self.command_token_map = {tok.token: tok for tok in self._command_tokens}
        self.command_id_map = {tok.Id: tok for tok in self._command_tokens}

        # set type tokens
        self.type_tokens = [
            TypeToken('str0', '<str0>', 0),
            TypeToken('str1', '<str1>', 1),
        ]
        self.num_type_tokens = len(self.type_tokens)

        self.type_name_map = {tok.name: tok for tok in self.type_tokens}
        self.type_token_map = {tok.token: tok for tok in self.type_tokens}
        self.type_id_map = {tok.Id: tok for tok in self.type_tokens}

        # parse tokens and vocabs from the encoder
        # `_vocab` aliases the encoder's own mapping (no copy is made)
        self._tokens = list(self.encoder.encoder.keys())
        self._vocab = self.encoder.encoder

        self._text_tokens = list(self._tokens)
        self._text_token_vocab = self._vocab

        # NOTE: the two *_vocab dicts below invert the Id -> token-object maps,
        # so their keys are the token objects themselves, not token strings.
        self._command_token_tokens = list(self.command_token_map.keys())
        self._command_token_vocab = {t:Id for Id,t in self.command_id_map.items()}

        self._token_types = list(self.type_token_map.keys())
        self._token_type_vocab = {t:Id for Id, t in self.type_id_map.items()}

    def EncodeAsIds(self, text, process_fn=None):
        """
        Convert text to BPE Ids.

        `process_fn`, when given, is applied to `text` before encoding.
        Returns a Tokenization built from the Ids, the processed text and the
        original text.
        """
        processed_text = text
        if process_fn is not None:
            processed_text = process_fn(processed_text)
        Ids = self.encoder.encode(processed_text)
        return Tokenization(Ids, processed_text, text)

    def EncodeAsTokens(self, text, process_fn=None):
        """convert text to BPE tokens (not implemented)"""
        raise NotImplementedError()

    def IdToToken(self, Id, type_token=False):
        """convert Id to BPE token (not implemented)"""
        raise NotImplementedError()

    def TokenToId(self, token, type_token=False):
        """convert BPE token to Id (not implemented)"""
        raise NotImplementedError()

    def DecodeIds(self, Ids, type_token=False):
        """
        Decode BPE Ids back into a text string via the wrapped encoder.

        `Ids` may be a Tokenization, in which case its `tokenization` is used.
        `type_token` is not used.
        """
        if isinstance(Ids, Tokenization):
            Ids = Ids.tokenization
        return self.encoder.decode(Ids)

    def DecodeTokens(self, Tokens, type_token=False):
        """
        Join tokens into a single space-separated string.

        `Tokens` may be a Tokenization, in which case its `tokenization` is
        used. `type_token` is not used.
        """
        # Unwrap via `Tokens`: the previous code tested and assigned an
        # undefined local `Ids`, which raised UnboundLocalError on every call.
        if isinstance(Tokens, Tokenization):
            Tokens = Tokens.tokenization
        return ' '.join(Tokens)
    
# Wrap the already-loaded BPE encoder `e`; the remaining entries of the
# `kwargs` dict are forwarded as extra keyword arguments.
byte_pair_tokenizer = BytePairTokenizer(
    tokenizer_model_type=kwargs['model_type'],
    encoder=e,
    **kwargs
)
# Smoke test: display the Ids produced for a short string.
byte_pair_tokenizer.EncodeAsIds('hello world!').tokenization

[31373, 995, 0]