In [6]:
import sentencepiece as spm

# Train a SentencePiece model on `amharic.txt`; this writes `m.model` and `m.vocab`.
# `m.vocab` is for reference only -- it is not used during segmentation.
spm.SentencePieceTrainer.train('--input=amharic.txt --model_prefix=m --vocab_size=2000')

# Build the segmenter directly from the trained model file (m.model).
sp = spm.SentencePieceProcessor(model_file='m.model')

# Encode one sample sentence two ways: text => pieces, and text => ids.
sample_sentence = 'በስተቀር የትኛውም አውሬ አያደርገውም'
print(sp.encode_as_pieces(sample_sentence))
print(sp.encode_as_ids(sample_sentence))

['▁በስተቀር', '▁የትኛው', 'ም', '▁አውሬ', '▁አያ', 'ደርገው', 'ም']
[1038, 1421, 5, 1343, 422, 898, 5]


## Train SentencePiece for Amharic using BPE (Byte Pair Encoding)

In [12]:

# Train a BPE model on amharic.txt with a 16000-piece vocabulary.
# The trained model is read back from `am-bpe.model` below.
spm.SentencePieceTrainer.train(
    input="amharic.txt",
    model_prefix='am-bpe',
    vocab_size=16000,
    model_type="bpe",
    character_coverage=1.0,
)

# Load the trained model into a fresh processor.
model = spm.SentencePieceProcessor()
model.load('am-bpe.model')

# Last expression of the cell: the sample sentence segmented into string pieces.
model.encode_as_pieces('በስተቀር የትኛውም አውሬ አያደርገውም')


['▁በስተቀር', '▁የትኛውም', '▁አውሬ', '▁አያ', 'ደርገው', 'ም']

## Train SentencePiece for Amharic using the Unigram model

In [18]:
# Train a unigram model on amharic.txt with a 1000-piece vocabulary.
# The trained model is read back from `am-unigram.model` below.
spm.SentencePieceTrainer.train(
    input="amharic.txt",
    model_prefix='am-unigram',
    vocab_size=1000,
    model_type="unigram",
    character_coverage=1.0,
)
model = spm.SentencePieceProcessor(model_file='am-unigram.model')

# Also point the shared `sp` processor (created in the first cell) at the unigram model.
# This must come BEFORE the encode call: a notebook cell displays only its last
# expression, and with `sp.load(...)` last the cell showed load()'s return value
# instead of the segmentation.
sp.load('am-unigram.model')

# Last expression of the cell: the sample sentence segmented into string pieces.
model.encode('በስተቀር የትኛውም አውሬ አያደርገውም', out_type=str)

['▁በስተቀር', '▁የ', 'ት', 'ኛ', 'ው', 'ም', '▁አ', 'ው', 'ሬ', '▁አ', 'ያ', 'ደርገው', 'ም']

## Train SentencePiece for Amharic using the word-level model

In [19]:
# Train a word-level model (model_type="word") on amharic.txt with a 16000-entry vocabulary.
# The trained model is read back from `am-word.model` below.
spm.SentencePieceTrainer.train(
    input="amharic.txt",
    model_prefix='am-word',
    vocab_size=16000,
    model_type="word",
    character_coverage=1.0,
)

# Load the trained model into a fresh processor.
model = spm.SentencePieceProcessor()
model.load('am-word.model')

# Last expression of the cell: the sample sentence segmented into string pieces.
model.encode_as_pieces('በስተቀር የትኛውም አውሬ አያደርገውም')


['▁በስተቀር', '▁የትኛውም', '▁አውሬ', '▁አያደርገውም']