In [1]:
#here we look at tokenisation of chars in detail, 

#we see that models like gpt-2 work worse in other languages apart from english, firstly because of
#the fact that it was trained on a majority of english text, but also that the tokenisation
#for other languages are more in number, that means that for the same sentence in english and
#say korean, the number of tokens would be more in korean, that means that during the attention
#stage, it becomes bloated as it has the remain in the same context window, but have many more 
#tokens to work with

#same goes with python, as there are too many spaces present, hence the number of tokens becomes
#bloated up 

In [2]:
#what we did earlier was just encoding each of the chars, so there a token was a char, but now
#we need to group chars in some fashion to get tokens, and then encode them 

In [3]:
list("he".encode("utf-16"))   #here we encode the string "he" in utf-16, and we see that the
#utf-16 encoding uses 2-4 bytes per char, and here we get
#6 bytes, so we can see that the tokenisation is done in a different way

[255, 254, 104, 0, 101, 0]

In [4]:
"e".encode("utf-8")

b'e'

In [5]:
len(list("he".encode("utf-16")))  #takes up more space, as there
#is more padding, hence there are more tokens generated from this 
#as compared to utf-8

6

In [6]:
len(list("he".encode("utf-8")))

2

In [7]:
list("he".encode("utf-8"))  #uses only 2 bytes for these
#two chars, indicating that the encoding is denser as compared to utf-16

[104, 101]

In [8]:
#as we increase the utf encoding frmo 8 to 16 to 32, the number
#of padded 0s incerease, as utf-8 uses 1 to 4 bytes, utf-16 uses 2 to 4 bytes
#and utf-32 uses 4 bytes, so the number of padded 0s increase

In [9]:
list('a'.encode('utf-8'))  #here we see that the encoding for a single char

[97]

In [10]:
text = "ÔºµÔΩéÔΩâÔΩÉÔΩèÔΩÑÔΩÖ! üÖ§üÖùüÖòüÖíüÖûüÖìüÖî‚ÄΩ üá∫‚Äåüá≥‚ÄåüáÆ‚Äåüá®‚Äåüá¥‚Äåüá©‚Äåüá™! üòÑ The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to ‚Äúsupport Unicode‚Äù in our software (whatever that means‚Äîlike using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don‚Äôt blame programmers for still finding the whole thing mysterious, even 30 years after Unicode‚Äôs inception."

#making tokens from the text, using just the utf-8 encoding
tokens = list(text.encode("utf-8"))  #raw bytes
tokens = list(map(int, tokens))  #each byte is converted to an int
print('-----------------')
print(text)
print(f'text itself has: {len(text)} chars')
print('-----------------')
print(tokens)
print("after utf-8 encoding, the number of tokens are: ", len(tokens))

-----------------
ÔºµÔΩéÔΩâÔΩÉÔΩèÔΩÑÔΩÖ! üÖ§üÖùüÖòüÖíüÖûüÖìüÖî‚ÄΩ üá∫‚Äåüá≥‚ÄåüáÆ‚Äåüá®‚Äåüá¥‚Äåüá©‚Äåüá™! üòÑ The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to ‚Äúsupport Unicode‚Äù in our software (whatever that means‚Äîlike using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don‚Äôt blame programmers for still finding the whole thing mysterious, even 30 years after Unicode‚Äôs inception.
text itself has: 533 chars
-----------------
[239, 188, 181, 239, 189, 142, 239, 189, 137, 239, 189, 131, 239, 189, 143, 239, 189, 132, 239, 189, 133, 33, 32, 240, 159, 133, 164, 240, 159, 133, 157, 240, 159, 133, 152, 240, 159, 133, 146, 240, 159, 133, 158, 240, 159, 133, 147, 240, 159, 133, 148, 226, 128, 189, 32, 240, 159, 135, 186, 226, 128, 140, 240, 159, 13

In [11]:
#so we see that the number of tokens after just utf-8 encoding
#is more than the chars too

In [12]:
#implementing BPE, where we first find the freq of pairs of 
#bytes occuring
def get_stats(tokens):
    counts = {}
    for pair in zip(tokens, tokens[1:]):
        counts[pair] = counts.get(pair, 0) + 1
        #btw .get is a safer approach to get the value of a key
        #as if a key is not present, it returns a default value
        #which is 0 in this case
    return counts

stats = get_stats(tokens)
sorted(((v, k) for k, v in stats.items()), reverse=True)

[(20, (101, 32)),
 (15, (240, 159)),
 (12, (226, 128)),
 (12, (105, 110)),
 (10, (115, 32)),
 (10, (97, 110)),
 (10, (32, 97)),
 (9, (32, 116)),
 (8, (116, 104)),
 (7, (159, 135)),
 (7, (159, 133)),
 (7, (97, 114)),
 (6, (239, 189)),
 (6, (140, 240)),
 (6, (128, 140)),
 (6, (116, 32)),
 (6, (114, 32)),
 (6, (111, 114)),
 (6, (110, 103)),
 (6, (110, 100)),
 (6, (109, 101)),
 (6, (104, 101)),
 (6, (101, 114)),
 (6, (32, 105)),
 (5, (117, 115)),
 (5, (115, 116)),
 (5, (110, 32)),
 (5, (100, 101)),
 (5, (44, 32)),
 (5, (32, 115)),
 (4, (116, 105)),
 (4, (116, 101)),
 (4, (115, 44)),
 (4, (114, 105)),
 (4, (111, 117)),
 (4, (111, 100)),
 (4, (110, 116)),
 (4, (110, 105)),
 (4, (105, 99)),
 (4, (104, 97)),
 (4, (103, 32)),
 (4, (101, 97)),
 (4, (100, 32)),
 (4, (99, 111)),
 (4, (97, 109)),
 (4, (85, 110)),
 (4, (32, 119)),
 (4, (32, 111)),
 (4, (32, 102)),
 (4, (32, 85)),
 (3, (118, 101)),
 (3, (116, 115)),
 (3, (116, 114)),
 (3, (116, 111)),
 (3, (114, 116)),
 (3, (114, 115)),
 (3, (114, 10

In [13]:
#now we repalce the most common pair of bytes which is (101, 32) with a new token of 
#256
def merge(tokens, pair, new_index):
    new_tokens = []
    i = 0
    while i < len(tokens):
        if i < len(tokens) - 1 and tokens[i] == pair[0] and tokens[i+1] == pair[1]:
            new_tokens.append(new_index)  #append the new token which represents both
            #of them now
            i += 2 #skip over that pair now
        else:
            new_tokens.append(tokens[i])  #just append the token as it is
            i += 1 # just move to the next one
    return new_tokens

In [14]:
top_pair = max(stats, key=stats.get)

In [15]:
tokens2 = merge(tokens, top_pair, 256)
print(len(tokens2))  #lesser by 20 now

596


In [16]:
#now we repeat this on tokens2, and the number of times we do it for is a hyperparameter that
#is upto us to tune, the more this value is, lesser is the total number of tokens and more is the
#context per token
  
vocab_size = 276  #desired vocab size, unique ones
num_merges = vocab_size - 256  
tokens_copy = list(tokens)

merges = {}  #just to keep a track of what merges were made during different iterations, (int, int) -> (int)

for i in range(num_merges):
    stats = get_stats(tokens_copy)
    top_pair = max(stats, key=stats.get)
    print(f"merging {top_pair} into {256 + i}")
    tokens_copy = merge(tokens_copy, top_pair, 256 + i)
    merges[top_pair] = 256 + i

merging (101, 32) into 256
merging (240, 159) into 257
merging (226, 128) into 258
merging (105, 110) into 259
merging (115, 32) into 260
merging (97, 110) into 261
merging (116, 104) into 262
merging (257, 133) into 263
merging (257, 135) into 264
merging (97, 114) into 265
merging (239, 189) into 266
merging (258, 140) into 267
merging (267, 264) into 268
merging (101, 114) into 269
merging (111, 114) into 270
merging (116, 32) into 271
merging (259, 103) into 272
merging (115, 116) into 273
merging (261, 100) into 274
merging (32, 262) into 275


In [17]:
len(tokens_copy)  #in this case it has 451 tokens, but the unique ones are 276, as intended

451

### decoding

In [18]:
vocab = {idx: bytes([idx]) for idx in range(256)}  #maps each number to its byte representation
#in hexadecimal
vocab

{0: b'\x00',
 1: b'\x01',
 2: b'\x02',
 3: b'\x03',
 4: b'\x04',
 5: b'\x05',
 6: b'\x06',
 7: b'\x07',
 8: b'\x08',
 9: b'\t',
 10: b'\n',
 11: b'\x0b',
 12: b'\x0c',
 13: b'\r',
 14: b'\x0e',
 15: b'\x0f',
 16: b'\x10',
 17: b'\x11',
 18: b'\x12',
 19: b'\x13',
 20: b'\x14',
 21: b'\x15',
 22: b'\x16',
 23: b'\x17',
 24: b'\x18',
 25: b'\x19',
 26: b'\x1a',
 27: b'\x1b',
 28: b'\x1c',
 29: b'\x1d',
 30: b'\x1e',
 31: b'\x1f',
 32: b' ',
 33: b'!',
 34: b'"',
 35: b'#',
 36: b'$',
 37: b'%',
 38: b'&',
 39: b"'",
 40: b'(',
 41: b')',
 42: b'*',
 43: b'+',
 44: b',',
 45: b'-',
 46: b'.',
 47: b'/',
 48: b'0',
 49: b'1',
 50: b'2',
 51: b'3',
 52: b'4',
 53: b'5',
 54: b'6',
 55: b'7',
 56: b'8',
 57: b'9',
 58: b':',
 59: b';',
 60: b'<',
 61: b'=',
 62: b'>',
 63: b'?',
 64: b'@',
 65: b'A',
 66: b'B',
 67: b'C',
 68: b'D',
 69: b'E',
 70: b'F',
 71: b'G',
 72: b'H',
 73: b'I',
 74: b'J',
 75: b'K',
 76: b'L',
 77: b'M',
 78: b'N',
 79: b'O',
 80: b'P',
 81: b'Q',
 82: b'R',
 83: b'

In [19]:
merges.items()

dict_items([((101, 32), 256), ((240, 159), 257), ((226, 128), 258), ((105, 110), 259), ((115, 32), 260), ((97, 110), 261), ((116, 104), 262), ((257, 133), 263), ((257, 135), 264), ((97, 114), 265), ((239, 189), 266), ((258, 140), 267), ((267, 264), 268), ((101, 114), 269), ((111, 114), 270), ((116, 32), 271), ((259, 103), 272), ((115, 116), 273), ((261, 100), 274), ((32, 262), 275)])

In [20]:
for (p0, p1), idx in merges.items():
    vocab[idx] = vocab[p0] + vocab[p1] #getting the byte representation for the merges and other 
    #unmerged tokens(they are left as is)
    #so initially we have the normal byte representations of all the 275 numbers but since from 256 onwards
    #we have merged tokens, we change the byte representation of those tokens to the byte representation of the
    #pair of tokens that were merged to get that token, and that is given by the line 
    #vocab[idx] = vocab[p0] + vocab[p1]

In [21]:
vocab  #now all the numbers from 256 onwards have the byte
# representation of the pair of tokens that were merged

{0: b'\x00',
 1: b'\x01',
 2: b'\x02',
 3: b'\x03',
 4: b'\x04',
 5: b'\x05',
 6: b'\x06',
 7: b'\x07',
 8: b'\x08',
 9: b'\t',
 10: b'\n',
 11: b'\x0b',
 12: b'\x0c',
 13: b'\r',
 14: b'\x0e',
 15: b'\x0f',
 16: b'\x10',
 17: b'\x11',
 18: b'\x12',
 19: b'\x13',
 20: b'\x14',
 21: b'\x15',
 22: b'\x16',
 23: b'\x17',
 24: b'\x18',
 25: b'\x19',
 26: b'\x1a',
 27: b'\x1b',
 28: b'\x1c',
 29: b'\x1d',
 30: b'\x1e',
 31: b'\x1f',
 32: b' ',
 33: b'!',
 34: b'"',
 35: b'#',
 36: b'$',
 37: b'%',
 38: b'&',
 39: b"'",
 40: b'(',
 41: b')',
 42: b'*',
 43: b'+',
 44: b',',
 45: b'-',
 46: b'.',
 47: b'/',
 48: b'0',
 49: b'1',
 50: b'2',
 51: b'3',
 52: b'4',
 53: b'5',
 54: b'6',
 55: b'7',
 56: b'8',
 57: b'9',
 58: b':',
 59: b';',
 60: b'<',
 61: b'=',
 62: b'>',
 63: b'?',
 64: b'@',
 65: b'A',
 66: b'B',
 67: b'C',
 68: b'D',
 69: b'E',
 70: b'F',
 71: b'G',
 72: b'H',
 73: b'I',
 74: b'J',
 75: b'K',
 76: b'L',
 77: b'M',
 78: b'N',
 79: b'O',
 80: b'P',
 81: b'Q',
 82: b'R',
 83: b'

In [22]:
tokens = b"".join(vocab[idx] for idx in tokens)
tokens

b'\xef\xbc\xb5\xef\xbd\x8e\xef\xbd\x89\xef\xbd\x83\xef\xbd\x8f\xef\xbd\x84\xef\xbd\x85! \xf0\x9f\x85\xa4\xf0\x9f\x85\x9d\xf0\x9f\x85\x98\xf0\x9f\x85\x92\xf0\x9f\x85\x9e\xf0\x9f\x85\x93\xf0\x9f\x85\x94\xe2\x80\xbd \xf0\x9f\x87\xba\xe2\x80\x8c\xf0\x9f\x87\xb3\xe2\x80\x8c\xf0\x9f\x87\xae\xe2\x80\x8c\xf0\x9f\x87\xa8\xe2\x80\x8c\xf0\x9f\x87\xb4\xe2\x80\x8c\xf0\x9f\x87\xa9\xe2\x80\x8c\xf0\x9f\x87\xaa! \xf0\x9f\x98\x84 The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to \xe2\x80\x9csupport Unicode\xe2\x80\x9d in our software (whatever that means\xe2\x80\x94like using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don\xe2\x80\x99t blame programmers for still finding the whole thing mysterious, even 30 years after Unicode\xe2\x80\x99s inception.'

In [23]:
tokens.decode("utf-8", errors="replace")

'ÔºµÔΩéÔΩâÔΩÉÔΩèÔΩÑÔΩÖ! üÖ§üÖùüÖòüÖíüÖûüÖìüÖî‚ÄΩ üá∫\u200cüá≥\u200cüáÆ\u200cüá®\u200cüá¥\u200cüá©\u200cüá™! üòÑ The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to ‚Äúsupport Unicode‚Äù in our software (whatever that means‚Äîlike using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don‚Äôt blame programmers for still finding the whole thing mysterious, even 30 years after Unicode‚Äôs inception.'

In [24]:
def decode(ids):
    tokens = b"".join(vocab[idx] for idx in ids)
    text = tokens.decode("utf-8", errors='replace')  #to replace unconforming bits with special char
    return text

In [25]:
print(decode([97, 97])) 

aa


### encoding

In [26]:
def encode(text):
    tokens = list(text.encode("utf-8"))
    i = 256
    while len(tokens) >= 2:   #to ensure that a pair atleast exists
        stats = get_stats(tokens)
        top_pair = max(stats, key=stats.get)
        if stats[top_pair] == 1:
            break
        tokens = merge(tokens, top_pair, i)
        print(f"merging {top_pair} into {i}")
        i += 1
    return tokens

In [27]:
t = encode("hello world")
t

[104, 101, 108, 108, 111, 32, 119, 111, 114, 108, 100]

In [28]:
d = decode(t)
d

'hello world'

In [29]:
#done!!

In [30]:
t1 = encode("h")

In [31]:
t1  #handle a single char case too 

[104]

#### now looking at tokensisations of actual implemented GPTs

##### forced splits using regex patterns(GPT series)

In [32]:
#gpt2 first split up the text into chunks and then applied a form of BPE on each of the chunks
import regex as re

gpt2pattern = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
#\p{N} matches anu numeric char in any script
#\p{L} matches any letter in any script apart from spaces and numbers
#(?!\S) matches a whitespace character that is not followed by a non-whitespace character, 
#used as a negative lookahead assertion, where it asserts that the current position is not followed by a non-whitespace character

print(re.findall(gpt2pattern, "hello've world123 !!!"))

#this also does not handle upper cases the same manner which can be a hinderance

['hello', "'ve", ' world', '123', ' !!!']


In [33]:
#this is the approach used by tiktoken which GPT4 uses, another approach exists called
#sentencepiece where it runs BPE on the unicode points directly 

In [35]:
#now using sentencepiece module to tokenise the text, with this training and inference 
#can be done regarding the tokenisation, unlike tiktoken which can just do inference
import sentencepiece as spm

# write a toy.txt file with some random text
with open("toy.txt", "w", encoding="utf-8") as f:
  f.write("SentencePiece is an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation systems where the vocabulary size is predetermined prior to the neural model training. SentencePiece implements subword units (e.g., byte-pair-encoding (BPE) [Sennrich et al.]) and unigram language model [Kudo.]) with the extension of direct training from raw sentences. SentencePiece allows us to make a purely end-to-end system that does not depend on language-specific pre/postprocessing.")

In [36]:
# train a sentencepiece model on it
# the settings here are (best effort) those used for training Llama 2
import os

options = dict(
  # input spec
  input="toy.txt",
  input_format="text",
  # output spec
  model_prefix="tok400", # output filename prefix
  # algorithm spec
  # BPE alg
  model_type="bpe",
  vocab_size=400,
  # normalization
  normalization_rule_name="identity", # ew, turn off normalization
  remove_extra_whitespaces=False,
  input_sentence_size=200000000, # max number of training sentences
  max_sentence_length=4192, # max number of bytes per sentence
  seed_sentencepiece_size=1000000,
  shuffle_input_sentence=True,
  # rare word treatment
  character_coverage=0.99995,
  byte_fallback=True,
  # merge rules
  split_digits=True,
  split_by_unicode_script=True,
  split_by_whitespace=True,
  split_by_number=True,
  max_sentencepiece_length=16,
  add_dummy_prefix=True, #adds _ at the start of the token, so as to say treat
  #"world" in "world" and "world" in "hello world" the same way
  allow_whitespace_only_pieces=True,
  # special tokens
  unk_id=0, # the UNK token MUST exist
  bos_id=1, # the others are optional, set to -1 to turn off
  eos_id=2,
  pad_id=-1,
  # systems
  num_threads=os.cpu_count(), # use ~all system resources
)

spm.SentencePieceTrainer.train(**options)


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: toy.txt
  input_format: text
  model_prefix: tok400
  model_type: BPE
  vocab_size: 400
  self_test_sample_size: 0
  character_coverage: 0.99995
  input_sentence_size: 200000000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 8
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 1
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 1
  required_chars: 
  byte_fallback: 1
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ‚Åá 
  enable_differential_privacy: 0
  dif

In [38]:
sp = spm.SentencePieceProcessor()
sp.load('tok400.model')
vocab = [[sp.id_to_piece(idx), idx] for idx in range(sp.get_piece_size())]
vocab

[['<unk>', 0],
 ['<s>', 1],
 ['</s>', 2],
 ['<0x00>', 3],
 ['<0x01>', 4],
 ['<0x02>', 5],
 ['<0x03>', 6],
 ['<0x04>', 7],
 ['<0x05>', 8],
 ['<0x06>', 9],
 ['<0x07>', 10],
 ['<0x08>', 11],
 ['<0x09>', 12],
 ['<0x0A>', 13],
 ['<0x0B>', 14],
 ['<0x0C>', 15],
 ['<0x0D>', 16],
 ['<0x0E>', 17],
 ['<0x0F>', 18],
 ['<0x10>', 19],
 ['<0x11>', 20],
 ['<0x12>', 21],
 ['<0x13>', 22],
 ['<0x14>', 23],
 ['<0x15>', 24],
 ['<0x16>', 25],
 ['<0x17>', 26],
 ['<0x18>', 27],
 ['<0x19>', 28],
 ['<0x1A>', 29],
 ['<0x1B>', 30],
 ['<0x1C>', 31],
 ['<0x1D>', 32],
 ['<0x1E>', 33],
 ['<0x1F>', 34],
 ['<0x20>', 35],
 ['<0x21>', 36],
 ['<0x22>', 37],
 ['<0x23>', 38],
 ['<0x24>', 39],
 ['<0x25>', 40],
 ['<0x26>', 41],
 ['<0x27>', 42],
 ['<0x28>', 43],
 ['<0x29>', 44],
 ['<0x2A>', 45],
 ['<0x2B>', 46],
 ['<0x2C>', 47],
 ['<0x2D>', 48],
 ['<0x2E>', 49],
 ['<0x2F>', 50],
 ['<0x30>', 51],
 ['<0x31>', 52],
 ['<0x32>', 53],
 ['<0x33>', 54],
 ['<0x34>', 55],
 ['<0x35>', 56],
 ['<0x36>', 57],
 ['<0x37>', 58],
 ['<0x38>', 5

In [40]:
ids = sp.encode("hello ÏïàÎÖïÌïòÏÑ∏Ïöî")
print(ids)
#the korean chars were not a part of the training set, but still due to byte_fallback, they
#are represented by their byte representations, and this is a speciality of sentencepiece

[362, 378, 361, 372, 358, 362, 239, 152, 139, 238, 136, 152, 240, 152, 155, 239, 135, 187, 239, 157, 151]


In [41]:
print([sp.id_to_piece(idx) for idx in ids])

['‚ñÅ', 'h', 'e', 'l', 'lo', '‚ñÅ', '<0xEC>', '<0x95>', '<0x88>', '<0xEB>', '<0x85>', '<0x95>', '<0xED>', '<0x95>', '<0x98>', '<0xEC>', '<0x84>', '<0xB8>', '<0xEC>', '<0x9A>', '<0x94>']


the protocols used in Llama 2 model's tokensiation is

```
    normalizer_spec {
    name: "identity"
    precompiled_charsmap: ""
    add_dummy_prefix: true
    remove_extra_whitespaces: false
    normalization_rule_tsv: ""
    }

    trainer_spec {
    input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
    model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2"
    model_type: BPE
    vocab_size: 32000
    self_test_sample_size: 0
    input_format: "text"
    character_coverage: 0.99995
    input_sentence_size: 200000000
    seed_sentencepiece_size: 1000000
    shrinking_factor: 0.75
    num_threads: 80
    num_sub_iterations: 2
    max_sentence_length: 4192
    shuffle_input_sentence: true
    max_sentencepiece_length: 16
    split_by_unicode_script: true
    split_by_whitespace: true
    split_by_number: true
    treat_whitespace_as_suffix: false
    split_digits: true
    allow_whitespace_only_pieces: true
    vocabulary_output_piece_score: true
    hard_vocab_limit: true
    use_all_vocab: false
    byte_fallback: true
    required_chars: ""
    unk_id: 0
    bos_id: 1
    eos_id: 2
    pad_id: -1
    unk_surface: " \342\201\207 "
    unk_piece: "<unk>"
    bos_piece: "<s>"
    eos_piece: "</s>"
    pad_piece: "<pad>"
    train_extremely_large_corpus: false
    enable_differential_privacy: false
    differential_privacy_noise_level: 0.0
    differential_privacy_clipping_threshold: 0
}
```