# Sentencepiece
 
 + Commonly used because (unlike tiktoken) it can efficiently both train BPE tokenizers and run inference with them.
 
 + It is used in both Llama and Mistral series.


 + **The big difference**: 
   
   * sentencepiece runs BPE on the Unicode code points directly! 
   
   * It then has a `character_coverage` option that determines which very rare code points (those that appear only a few times in the training data) are treated as rare,
   
   * and it either maps those rare code points onto an UNK token or, if `byte_fallback` is turned on, encodes them as UTF-8 and tokenizes the raw bytes instead.

# TLDR:

   * tiktoken encodes the text to UTF-8 first and then runs BPE on the bytes


   * sentencepiece runs BPE on the code points and optionally falls back to UTF-8 bytes for rare code points (rarity is determined by the `character_coverage` hyperparameter),
   
   
   * and those fallback bytes then get translated to byte tokens.


In [1]:
import sentencepiece as spm
import os

# Keyword arguments for spm.SentencePieceTrainer.train: a small BPE tokenizer
# (vocab_size=400) trained on random.txt and written out under the "tok400"
# filename prefix.
options = {
    # input spec
    "input": "random.txt",
    "input_format": "text",
    # output spec
    "model_prefix": "tok400",  # output filename prefix
    # algorithm spec
    # BPE alg
    "model_type": "bpe",
    "vocab_size": 400,
    # normalization
    "normalization_rule_name": "identity",  # turn off text normalization
    "remove_extra_whitespaces": False,
    "input_sentence_size": 200000000,  # max number of training sentences
    "max_sentence_length": 4192,  # max number of bytes per sentence
    "seed_sentencepiece_size": 1000000,
    "shuffle_input_sentence": True,
    # rare word treatment
    "character_coverage": 0.99995,
    "byte_fallback": True,
    # merge rules
    "split_digits": True,
    "split_by_unicode_script": True,
    "split_by_whitespace": True,
    "split_by_number": True,
    "max_sentencepiece_length": 16,
    "add_dummy_prefix": True,
    "allow_whitespace_only_pieces": True,
    # special tokens
    "unk_id": 0,  # the UNK token MUST exist
    "bos_id": 1,  # the others are optional, set to -1 to turn off
    "eos_id": 2,
    "pad_id": -1,
    # systems
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to a single thread instead of passing None to the trainer.
    "num_threads": os.cpu_count() or 1,  # use ~all system resources
}

# Train the tokenizer with the settings above.
spm.SentencePieceTrainer.train(**options)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: random.txt
  input_format: text
  model_prefix: tok400
  model_type: BPE
  vocab_size: 400
  self_test_sample_size: 0
  character_coverage: 0.99995
  input_sentence_size: 200000000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 4
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 1
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 1
  required_chars: 
  byte_fallback: 1
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  di

In [2]:
# Load the trained tokenizer model from disk.
sp = spm.SentencePieceProcessor()
sp.load('tok400.model')

# Collect every piece in id order, each wrapped in its own single-item list.
vocab = [[piece] for piece in map(sp.IdToPiece, range(sp.GetPieceSize()))]
vocab

[['<unk>'],
 ['<s>'],
 ['</s>'],
 ['<0x00>'],
 ['<0x01>'],
 ['<0x02>'],
 ['<0x03>'],
 ['<0x04>'],
 ['<0x05>'],
 ['<0x06>'],
 ['<0x07>'],
 ['<0x08>'],
 ['<0x09>'],
 ['<0x0A>'],
 ['<0x0B>'],
 ['<0x0C>'],
 ['<0x0D>'],
 ['<0x0E>'],
 ['<0x0F>'],
 ['<0x10>'],
 ['<0x11>'],
 ['<0x12>'],
 ['<0x13>'],
 ['<0x14>'],
 ['<0x15>'],
 ['<0x16>'],
 ['<0x17>'],
 ['<0x18>'],
 ['<0x19>'],
 ['<0x1A>'],
 ['<0x1B>'],
 ['<0x1C>'],
 ['<0x1D>'],
 ['<0x1E>'],
 ['<0x1F>'],
 ['<0x20>'],
 ['<0x21>'],
 ['<0x22>'],
 ['<0x23>'],
 ['<0x24>'],
 ['<0x25>'],
 ['<0x26>'],
 ['<0x27>'],
 ['<0x28>'],
 ['<0x29>'],
 ['<0x2A>'],
 ['<0x2B>'],
 ['<0x2C>'],
 ['<0x2D>'],
 ['<0x2E>'],
 ['<0x2F>'],
 ['<0x30>'],
 ['<0x31>'],
 ['<0x32>'],
 ['<0x33>'],
 ['<0x34>'],
 ['<0x35>'],
 ['<0x36>'],
 ['<0x37>'],
 ['<0x38>'],
 ['<0x39>'],
 ['<0x3A>'],
 ['<0x3B>'],
 ['<0x3C>'],
 ['<0x3D>'],
 ['<0x3E>'],
 ['<0x3F>'],
 ['<0x40>'],
 ['<0x41>'],
 ['<0x42>'],
 ['<0x43>'],
 ['<0x44>'],
 ['<0x45>'],
 ['<0x46>'],
 ['<0x47>'],
 ['<0x48>'],
 ['<0x49>'],
 ['<0

# If we set byte_fallback to False [instead of byte_fallback=True,]

  * The whole Korean part of the string would be encoded as 0,

  * because all of these tokens are UNK [unknown tokens].

  * These Korean characters did not appear in our training dataset, so they are set to 0/UNK because byte_fallback is False.

In [3]:
# Encode a sample string that mixes English with Korean text; the resulting
# `ids` list is mapped back to pieces afterwards.
ids = sp.encode("hello 안녕하세요")
print(ids)

[358, 266, 367, 284, 358, 239, 152, 139, 238, 136, 152, 240, 152, 155, 239, 135, 187, 239, 157, 151]


In [4]:
# Show the piece string behind each encoded id.
print(list(map(sp.IdToPiece, ids)))

['▁', 'he', 'l', 'lo', '▁', '<0xEC>', '<0x95>', '<0x88>', '<0xEB>', '<0x85>', '<0x95>', '<0xED>', '<0x95>', '<0x98>', '<0xEC>', '<0x84>', '<0xB8>', '<0xEC>', '<0x9A>', '<0x94>']
