In [1]:
import csv

import pandas as pd

from tokenizers import Tokenizer
from tokenizers import normalizers
from tokenizers.models import BPE
from tokenizers.normalizers import NFD, StripAccents, Lowercase, BertNormalizer
from tokenizers.pre_tokenizers import Whitespace, Digits, Sequence
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer

In [2]:
# Reserved tokens handed to the BPE trainer. Their ids follow this list order
# (the vocabulary dump further down shows "[SEP]" -> 5 and "[MASK]" -> 7).
special_tokens = [
    "[BLANK]",
    "[BOS]",
    "[EOS]",
    "[UNK]",
    "[CLS]",
    "[SEP]",
    "[PAD]",
    "[MASK]",
]

In [3]:
# Text normalisation applied to every input before pre-tokenisation:
#   1. BertNormalizer with accent stripping switched on,
#   2. a final NFD Unicode decomposition pass.
# Alternative tried earlier and kept for reference:
#   normalizers.Sequence([NFD(), StripAccents(), Lowercase()])
normalizer = normalizers.Sequence(
    [
        BertNormalizer(strip_accents=True),
        NFD(),
    ]
)

In [4]:
# Untrained BPE tokenizer; "[UNK]" is registered as the model's unknown token.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.normalizer = normalizer
# Split on whitespace/punctuation first, then break numbers into single digits.
tokenizer.pre_tokenizer = Sequence(
    [
        Whitespace(),
        Digits(individual_digits=True),
    ]
)

# Training budget: a 500-entry vocabulary (the size check later in the notebook
# reports exactly 500, special tokens included) and a minimum frequency of 5.
trainer = BpeTrainer(
    special_tokens=special_tokens,
    vocab_size=500,
    min_frequency=5,
)

In [5]:
# Load the Common Voice training split (tab-separated, one clip per row).
#
# The TSV has no quoting convention, but sentences may contain literal double
# quotes. With pandas' default quoting a field that starts with '"' is parsed
# as a quoted field: the quote characters are stripped and, if the quote is
# unbalanced, following tabs/newlines are swallowed so columns or rows get
# merged. QUOTE_NONE reads every field verbatim.
train_file = pd.read_csv(
    'data/external/cv-corpus-8.0-2022-01-19/en/train.tsv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
)

In [6]:
# Tokenizer training corpus: the 'sentence' column with repeated sentences
# collapsed to their first occurrence and missing values removed.
train_sentences = train_file['sentence'].drop_duplicates().dropna()

In [7]:
# Train the tokenizer on the cleaned sentences using the trainer configured
# above.
tokenizer.train_from_iterator(
    train_sentences,
    trainer=trainer,
)






In [8]:
# Wrap every single-sequence encoding as "[BOS] <tokens> [EOS]". The ids are
# looked up from the trained tokenizer rather than hard-coded.
tokenizer.post_processor = TemplateProcessing(
    single="[BOS] $A [EOS]",
    special_tokens=[
        (marker, tokenizer.token_to_id(marker)) for marker in ("[BOS]", "[EOS]")
    ],
)

In [9]:
# Persist the trained tokenizer to a single JSON file.
# NOTE(review): presumably data/tokenizer/ must already exist — confirm.
tokenizer.save(
    'data/tokenizer/trained_tokenizer.json',
)

In [10]:
# Sanity check: total vocabulary size, added/special tokens included.
tokenizer.get_vocab_size(with_added_tokens=True)

500

In [11]:
# Inspect the learned token -> id mapping (special tokens included).
tokenizer.get_vocab(with_added_tokens=True)

{'i': 36,
 'я': 84,
 'ational': 468,
 'bec': 430,
 'of': 124,
 'g': 34,
 'ak': 294,
 'after': 325,
 'first': 352,
 ':': 20,
 '(': 14,
 'y': 52,
 'ct': 166,
 'pro': 184,
 '[MASK]': 7,
 'z': 53,
 'am': 138,
 'cor': 318,
 'gre': 404,
 'city': 470,
 'igh': 220,
 'es': 121,
 '[SEP]': 5,
 'coun': 323,
 'some': 327,
 'æ': 62,
 'ass': 313,
 'both': 497,
 'ors': 443,
 'there': 298,
 'ld': 229,
 'q': 44,
 'cted': 487,
 'vers': 360,
 'ok': 339,
 'ign': 346,
 'us': 164,
 'loc': 316,
 'fl': 408,
 'many': 379,
 'gen': 399,
 'ke': 211,
 'him': 424,
 'uring': 455,
 'been': 345,
 'ble': 457,
 'ited': 496,
 've': 148,
 'ber': 268,
 'ad': 143,
 'co': 142,
 'ı': 66,
 'em': 277,
 'mus': 493,
 'have': 293,
 'ick': 447,
 '´': 58,
 'his': 177,
 'only': 427,
 '”': 92,
 'call': 477,
 'during': 488,
 'ate': 216,
 'art': 409,
 'ong': 257,
 'ge': 167,
 'ous': 269,
 'thr': 474,
 'ted': 276,
 'cur': 398,
 'this': 200,
 'known': 433,
 'ild': 364,
 'lar': 462,
 'mem': 463,
 '时': 102,
 'dis': 314,
 'ap': 187,
 'ts': 18

In [12]:
# Spot check: look up the id assigned to the single-character token "f".
tokenizer.token_to_id("f")

33

In [1]:
### Save sentences for training the LM

In [None]:
# Write the LM training corpus as plain text, one sentence per line.
#
# Series.to_csv is a CSV writer, not a text writer: with its default quoting,
# any sentence containing a double quote is emitted wrapped in quotes with the
# inner quotes doubled, and missing sentences become blank lines. Writing the
# lines directly keeps each sentence verbatim; missing values are skipped.
with open('data/internal/train_lm.txt', 'w', encoding='utf-8') as lm_file:
    lm_file.writelines(
        f'{sentence}\n' for sentence in train_file['sentence'].dropna().astype(str)
    )