-
Notifications
You must be signed in to change notification settings - Fork 3
/
train_tokenizer.py
35 lines (26 loc) · 1.39 KB
/
train_tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import sentencepiece as spm

# Which SentencePiece tokenizer to train: "base" or "large".
# The two sizes differ only in which Korean corpus they read and which
# model prefix they write; every other training option is shared.
tokenizer_size = "base"

# Fail fast on an unrecognized size instead of silently falling through
# to the "large" configuration (the original else-branch accepted ANY
# non-"base" value, so a typo would train the wrong tokenizer).
if tokenizer_size not in ("base", "large"):
    raise ValueError(
        f"tokenizer_size must be 'base' or 'large', got {tokenizer_size!r}"
    )

# Single training call: corpus path and model prefix are derived from the
# selected size, removing the duplicated branch bodies.
spm.SentencePieceTrainer.train(
    input=f"./ko_corpus_{tokenizer_size}.txt",      # raw corpus, one sentence per line
    model_prefix=f"ko_tokenizer_{tokenizer_size}",  # writes <prefix>.model / <prefix>.vocab
    vocab_size=32000,
    pad_id=3,                        # enable a PAD piece (SentencePiece default is -1 = disabled)
    unk_piece="[UNK]",
    pad_piece="[PAD]",
    # BERT-style special tokens reserved as user-defined symbols.
    user_defined_symbols=["[SEP]", "[CLS]", "[MASK]", "[X_SEP]"],
    input_sentence_size=0,           # 0 = use the entire corpus (no sentence subsampling)
    shuffle_input_sentence=True,
    train_extremely_large_corpus=True,
)