In [None]:
from datasets import load_dataset

In [None]:
dataset=load_dataset('wikitext',name='wikitext-2-raw-v1',split='train')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
dataset

Dataset({
    features: ['text'],
    num_rows: 36718
})

In [None]:
def training_corpus():
  for i in range(0,len(dataset),1000):
    yield dataset[i:i+1000]['text']

WordPiece tokenizer

In [None]:
from tokenizers import (decoders,models,normalizers,pre_tokenizers,processors,trainers,Tokenizer,)

In [None]:
tokenizer=Tokenizer(models.WordPiece(unk_token='[UNK]'))

In [None]:
tokenizer.normalizer=normalizers.Sequence(
    [normalizers.NFD(),normalizers.Lowercase(),normalizers.StripAccents()]
)

In [None]:
tokenizer.normalizer.normalize_str('Héllò hôw are ü?')

'hello how are u?'

In [None]:
tokenizer.pre_tokenizer=pre_tokenizers.Sequence(
    [pre_tokenizers.WhitespaceSplit(),pre_tokenizers.Punctuation()])

In [None]:
tokenizer.pre_tokenizer.pre_tokenize_str("where's my money richard")

[('where', (0, 5)),
 ("'", (5, 6)),
 ('s', (6, 7)),
 ('my', (8, 10)),
 ('money', (11, 16)),
 ('richard', (17, 24))]

In [None]:
sp_tokens=['[UNK]','[PAD]','[CLS]','[SEP]','[MASK]']
trainer=trainers.WordPieceTrainer(vocab_size=25000,special_tokens=sp_tokens)

In [None]:
tokenizer.train_from_iterator(training_corpus(),trainer=trainer)

In [None]:
encoding=tokenizer.encode('tokenize this fast, faster, fastest')

In [None]:
encoding.tokens

['tok', '##eni', '##ze', 'this', 'fast', ',', 'faster', ',', 'fastest']

In [None]:
encoding.offsets

[(0, 3),
 (3, 6),
 (6, 8),
 (9, 13),
 (14, 18),
 (18, 19),
 (20, 26),
 (26, 27),
 (28, 35)]

In [None]:
encoding.word_ids

[0, 0, 0, 1, 2, 3, 4, 5, 6]

In [None]:
cls=tokenizer.token_to_id('[CLS]')
sep=tokenizer.token_to_id('[SEP]')
cls,sep

(2, 3)

In [None]:
tokenizer.post_processor=processors.TemplateProcessing(
    single=f'[CLS]:0 $A:0 [SEP]:1',
    pair=f'[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1',
    special_tokens=[('[CLS]',cls),('[SEP]',sep)]
)

In [None]:
encoding=tokenizer.encode('tokeinze this fast', 'faster, fastest')

In [None]:
encoding.tokens

['[CLS]',
 'tok',
 '##ei',
 '##n',
 '##ze',
 'this',
 'fast',
 '[SEP]',
 'faster',
 ',',
 'fastest',
 '[SEP]']

In [None]:
encoding.type_ids

[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]

In [None]:
encoding.word_ids

[None, 0, 0, 0, 0, 1, 2, None, 0, 1, 2, None]

In [None]:
encoding.ids

[2, 24300, 20612, 793, 3325, 1511, 4884, 3, 10629, 16, 10729, 3]

In [None]:
tokenizer.decoder=decoders.WordPiece(prefix='##')

In [None]:
tokenizer.decode(encoding.ids)

'tokeinze this fast faster, fastest'

In [None]:
tokenizer.save('tokenizer.json')

In [None]:
new=Tokenizer.from_file('tokenizer.json')

In [None]:
from transformers import PreTrainedTokenizerFast

In [None]:
wraped=PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token='[UNK]',
    pad_token='[PAD]',
    cls_token='[CLS]',
    sep_token='[SEP]',
    mask_token='[MASK]'
)

In [None]:
wraped.save_pretrained('./wordpiecetokenizer')

('./wordpiecetokenizer/tokenizer_config.json',
 './wordpiecetokenizer/special_tokens_map.json',
 './wordpiecetokenizer/tokenizer.json')

Byte Pair Encoding

In [None]:
tokenizer=Tokenizer(models.BPE())

In [None]:
tokenizer.pre_tokenizer=pre_tokenizers.ByteLevel(add_prefix_space=False)

In [None]:
tokenizer.pre_tokenizer.pre_tokenize_str("don't go gently-into the night")

[('don', (0, 3)),
 ("'t", (3, 5)),
 ('Ġgo', (5, 8)),
 ('Ġgently', (8, 15)),
 ('-', (15, 16)),
 ('into', (16, 20)),
 ('Ġthe', (20, 24)),
 ('Ġnight', (24, 30))]

In [None]:
trainer=trainers.BpeTrainer(vocab_size=25000,special_tokens=['<|endoftext|>'])

In [None]:
tokenizer.train_from_iterator(training_corpus(),trainer=trainer)

In [None]:
encoding=tokenizer.encode("let's get some shit-done")

In [None]:
encoding.tokens

['let', "'", 's', 'Ġget', 'Ġsome', 'Ġsh', 'it', '-', 'd', 'one']

In [None]:
encoding.ids

[1410, 7, 83, 2085, 766, 392, 211, 13, 68, 588]

In [None]:
encoding.offsets

[(0, 3),
 (3, 4),
 (4, 5),
 (5, 9),
 (9, 14),
 (14, 17),
 (17, 19),
 (19, 20),
 (20, 21),
 (21, 24)]

In [None]:
tokenizer.post_processor=processors.ByteLevel(trim_offsets=False)

In [None]:
sent="let's get some shit-done"
start,end=encoding.offsets[5]
sent[start:end]

' sh'

In [None]:
tokenizer.decoder=decoders.ByteLevel()

In [None]:
tokenizer.decode(encoding.ids)

"let's get some shit-done"

In [None]:
wraped2=PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token='<|endoftext|>',
    eos_token='<|endoftext|>'
)

In [None]:
wraped2.save_pretrained('./bpetokenizer')

('./bpetokenizer/tokenizer_config.json',
 './bpetokenizer/special_tokens_map.json',
 './bpetokenizer/tokenizer.json')

Unigram tokenizer

In [None]:
tokenizer=Tokenizer(models.Unigram())

In [None]:
from tokenizers import Regex

In [None]:
tokenizer.normalizer=normalizers.Sequence(
    [
        normalizers.Replace("``",'"'),
        normalizers.Replace("''",'"'),
        normalizers.NFKD(),
        normalizers.StripAccents(),
        normalizers.Replace(Regex(" {2,}")," ")
    ]
)

In [None]:
tokenizer.pretokenizer=pre_tokenizers.Metaspace()

In [None]:
spl=['<cls','<sep>','<unk>','<pad>','<mask>','<s>','</s>']
trainer=trainers.UnigramTrainer(
    vocab_size=25000,special_tokens=spl,unk_token='<unk>'
)

In [None]:
tokenizer.train_from_iterator(training_corpus(),trainer=trainer)

In [None]:
encoding=tokenizer.encode('my captain, oh captain')
encoding.tokens

['my ', 'cap', 'ta', 'in', ', ', 'oh', ' ', 'cap', 'ta', 'in']

In [None]:
cls=tokenizer.token_to_id("<cls>")
sep=tokenizer.token_to_id('<sep>')
cls,sep

(None, 1)

In [64]:
tokenizer.post_processor=processors.TemplateProcessing(
    single='$A:0 <sep>:0 <cls>:2',
    pair='$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2',
    special_tokens=[('<sep>',0),('<cls>',1)],
)

In [65]:
tokenizer.decoder=decoders.Metaspace()

In [66]:
wraped3=PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token='<s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    cls_token='<cls>',
    sep_token='<sep>',
    mask_token='<mask>',
    padding_side='left'
)

In [67]:
wraped3.save_pretrained('./xlnettokenizer')

('./xlnettokenizer/tokenizer_config.json',
 './xlnettokenizer/special_tokens_map.json',
 './xlnettokenizer/tokenizer.json')

In [68]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [72]:
wraped.save_pretrained("tokenizers/basic_wordpiece")
wraped2.save_pretrained("tokenizers/basic_bpe")
wraped3.save_pretrained("tokenizers/basic_xlnet")

('tokenizers/basic_xlnet/tokenizer_config.json',
 'tokenizers/basic_xlnet/special_tokens_map.json',
 'tokenizers/basic_xlnet/tokenizer.json')

In [73]:
from huggingface_hub import HfApi

api = HfApi()
repo_id = "ByteMeHarder-404/tokenizers"

# Create repo if not exists
api.create_repo(repo_id=repo_id, private=False, exist_ok=True)

# Push entire folder
api.upload_folder(
    folder_path="tokenizers",
    repo_id=repo_id,
    commit_message="Upload WordPiece, BPE, and XLNet tokenizers"
)


CommitInfo(commit_url='https://huggingface.co/ByteMeHarder-404/tokenizers/commit/4713739cb1443d76b539206aedb1f8c158d70f8b', commit_message='Upload WordPiece, BPE, and XLNet tokenizers', commit_description='', oid='4713739cb1443d76b539206aedb1f8c158d70f8b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ByteMeHarder-404/tokenizers', endpoint='https://huggingface.co', repo_type='model', repo_id='ByteMeHarder-404/tokenizers'), pr_revision=None, pr_num=None)