In [1]:
from pathlib import Path

import pandas as pd
from tokenizers import Tokenizer
from tokenizers import normalizers
from tokenizers.models import BPE
from tokenizers.normalizers import NFD, StripAccents, Lowercase
from tokenizers.pre_tokenizers import Whitespace, Digits, Sequence
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer

In [2]:
# Reserved tokens handed to the trainer. Order matters: the recorded
# vocabulary shows them receiving ids 0..7 in exactly this order.
special_tokens = [
    "[BLANK]",
    "[BOS]",
    "[EOS]",
    "[UNK]",
    "[CLS]",
    "[SEP]",
    "[PAD]",
    "[MASK]",
]

In [3]:
## Text normalization: strip accents and convert everything to lowercase
normalizer = normalizers.Sequence(
    [
        NFD(),           # Unicode NFD decomposition, applied before accent stripping
        StripAccents(),  # remove the accent marks
        Lowercase(),     # fold to lowercase
    ]
)

In [4]:
# BPE tokenizer with "[UNK]" as the unknown token. Text goes through the
# shared normalizer, then the Whitespace pre-tokenizer, then Digits with
# individual_digits=True.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.normalizer = normalizer
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits(individual_digits=True)])

# Trainer configuration.
# NOTE(review): vocab_size is 35, yet the vocabulary recorded further down has
# 107 entries, all special tokens or single characters. Presumably the initial
# character alphabet already exceeds the target, so no merges are learned and
# min_frequency has no effect. Confirm whether a character-level vocabulary is
# intended (if not, consider `limit_alphabet` or a larger vocab_size).
trainer = BpeTrainer(
    special_tokens=special_tokens,
    vocab_size=35,
    min_frequency=5,
)

In [5]:
# Load the Common Voice English training split (tab-separated).
# NOTE(review): pandas' default quote handling is in effect and the trained
# vocabulary contains '"', so sentences include double quotes. Verify no rows
# are mis-parsed (quoting=csv.QUOTE_NONE may be needed) - TODO confirm.
train_file = pd.read_csv(
    'data/external/cv-corpus-8.0-2022-01-19/en/train.tsv',
    sep='\t',
)

In [6]:
# Training corpus: the 'sentence' column with repeated and missing entries removed.
train_sentences = (
    train_file['sentence']
    .drop_duplicates()
    .dropna()
)

In [7]:
# Fit the tokenizer's vocabulary on the prepared sentences using the trainer
# configured above.
tokenizer.train_from_iterator(train_sentences, trainer=trainer)






In [8]:
# Wrap every single-sequence encoding as "[BOS] <tokens> [EOS]".
# The marker ids are looked up from the trained tokenizer, so this cell has to
# run after training.
tokenizer.post_processor = TemplateProcessing(
    single="[BOS] $A [EOS]",
    special_tokens=[
        (marker, tokenizer.token_to_id(marker)) for marker in ("[BOS]", "[EOS]")
    ],
)

In [9]:
# Persist the trained tokenizer (model, normalizer, pre-tokenizer and
# post-processor) as a single JSON file.
tokenizer_path = 'data/tokenizer/trained_tokenizer.json'

# Create the target directory first: saving into a missing directory fails,
# and that would throw away the result of the training cell on a fresh checkout.
Path(tokenizer_path).parent.mkdir(parents=True, exist_ok=True)

tokenizer.save(tokenizer_path)

In [10]:
# Number of entries in the trained token -> id mapping (special tokens included).
len(tokenizer.get_vocab())

107

In [11]:
# Show the full token -> id mapping to inspect which symbols made it into the
# vocabulary. The dict is displayed in no particular id order.
tokenizer.get_vocab()

{'л': 81,
 'w': 50,
 '€': 95,
 '大': 100,
 'm': 40,
 '先': 99,
 '#': 10,
 'α': 71,
 'ł': 67,
 '”': 92,
 'в': 76,
 'н': 82,
 '~': 54,
 '尚': 101,
 'j': 37,
 't': 47,
 'r': 45,
 'е': 77,
 '’': 90,
 '%': 11,
 'p': 43,
 '≡': 97,
 ')': 15,
 '&': 12,
 's': 46,
 'x': 51,
 '´': 58,
 'e': 32,
 '[CLS]': 4,
 '(': 14,
 '-': 17,
 '=': 22,
 '/': 19,
 'a': 28,
 'ð': 63,
 'а': 75,
 'ь': 83,
 '¡': 55,
 'я': 84,
 '[EOS]': 2,
 'z': 53,
 'c': 30,
 'v': 49,
 '生': 103,
 'к': 80,
 '阪': 105,
 '[SEP]': 5,
 'þ': 65,
 'u': 48,
 'n': 41,
 '§': 56,
 'g': 34,
 'd': 31,
 'h': 35,
 '[BOS]': 1,
 '"': 9,
 '·': 59,
 'ø': 64,
 'ע': 86,
 '[UNK]': 3,
 '[BLANK]': 0,
 'נ': 85,
 'κ': 72,
 '‘': 89,
 '都': 104,
 '!': 8,
 ';': 21,
 '[MASK]': 7,
 '–': 87,
 'ß': 61,
 '“': 91,
 '—': 88,
 'œ': 68,
 ':': 20,
 'f': 33,
 'и': 79,
 '…': 94,
 'ﬂ': 106,
 '_': 26,
 ']': 25,
 '[PAD]': 6,
 'ı': 66,
 '[': 24,
 'b': 29,
 'ʻ': 70,
 'χ': 74,
 'y': 52,
 'i': 36,
 '«': 57,
 '„': 93,
 '京': 98,
 '`': 27,
 '时': 102,
 'π': 73,
 'k': 38,
 '→': 96,
 '»': 60

In [21]:
# Look up the id assigned to the single character 'f'.
# NOTE(review): this cell's execution count (21) is out of sequence with the
# cells above, and its recorded output (59) disagrees with the vocabulary
# displayed earlier, where 'f' maps to 33 and 59 belongs to '·'. The output
# looks stale - restart the kernel and run all cells to refresh it.
tokenizer.token_to_id('f')

59