In [1]:
import pandas as pd
import sentencepiece as spm
from collections import Counter

In [2]:
from tokenizers import SentencePieceBPETokenizer

In [3]:
def read_articles():
    text = ""
    for i in range(13, 25):
        dt = pd.read_csv(f"dataMans/LUIMA-SERIPOS_20{i}_RUS_MANS.csv")
        mans = dt.mans.tolist()
        for item in mans:
            text += item.replace("\n", " ").strip() + "<eos>"
    return text

In [4]:
def read_vocab():
    vocab_1 = pd.read_csv("dataMans/DICTIONARY_MANS_RUS_new.csv")
    vocab_2 = pd.read_csv("dataMans/DICTIONARY_MANS_RUS_2.csv")
    vocab_2 = vocab_2.dropna(subset=["rus"])
    rus_translate = vocab_2.rus.tolist()
    mans_translate = vocab_2.mans.tolist()
    note_translate = vocab_2.note.tolist()
    for i in range(len(rus_translate)):
        if "гл. прист." in rus_translate[i]:
            # print(i)
            rus_translate[i] = rus_translate[i].replace("гл. прист.", note_translate[i])
            note_translate[i] = ""
        if "гл. приставка" in rus_translate[i]:
            rus_translate[i] = rus_translate[i].replace(
                "гл. приставка", note_translate[i]
            )
            note_translate[i] = ""
        if "гл.прист." in rus_translate[i]:
            # print(i)
            rus_translate[i] = rus_translate[i].replace("гл.прист.", note_translate[i])
            note_translate[i] = ""
    for i in range(len(rus_translate)):
        for n in ["III.", "II.", "I.", "I", "II", "III"]:
            # print(i)
            if n in rus_translate[i]:
                # print(i)
                rus_translate[i] = rus_translate[i].replace(n, "")
            if not isinstance(note_translate[i], float):
                # print(note_translate[i])
                if n in note_translate[i]:
                    note_translate[i] = note_translate[i].replace(n, "")
        rus_translate[i] = rus_translate[i].strip()
        if not isinstance(note_translate[i], float):
            note_translate[i] = note_translate[i].strip()
    bad = [
        "4)",
        ") 2)",
        "),2)",
        ") ,2)",
        ") 3)",
        ");2)",
        "); 2)",
        "2)",
        "),1)",
        ");",
        ")",
    ]
    for i in range(len(note_translate)):
        if isinstance(note_translate[i], float):
            continue
        clean_last = False
        for n in bad:
            if n in note_translate[i]:
                # print(note_translate[i])
                note_translate[i] = note_translate[i].replace(n, ",")
                clean_last = True
        if clean_last:
            # print(note_translate[i])
            res = note_translate[i].split(",")[:-1]
            note_translate[i] = []
            for elem in res:
                if elem != " " or elem != "":
                    note_translate[i].append(elem)
            note_translate[i] = (", ".join(note_translate[i])).strip()
            if note_translate[i][-1] == ",":
                note_translate[i] = note_translate[i][:-1]

    vocab_2.rus = rus_translate
    vocab_2.mans = mans_translate
    vocab_2.note = note_translate

    rus_translate = vocab_1.rus.tolist()
    mans_translate = vocab_1.mans.tolist()
    note_translate = vocab_1.note.tolist()

    rus_translate[47] = "один"
    note_translate[47] = "акв хум - один мужчина"

    rus_translate[903] = "ещё"
    note_translate[903] = "неа̄сюм иӈыт ёхты - отец ещё не приехал"

    vocab_1.rus = rus_translate
    vocab_1.mans = mans_translate
    vocab_1.note = note_translate

    vocab = pd.concat([vocab_1, vocab_2])

    rus_translate = vocab.rus.tolist()
    mans_translate = vocab.mans.tolist()
    note_translate = vocab.note.tolist()

    words = {}
    for i in range(len(rus_translate)):
        s = f"{mans_translate[i]}@@@{rus_translate[i]}"
        if s not in words:
            words[s] = []
        if isinstance(note_translate[i], float):
            continue
        words[s].append(note_translate[i])

    rus_translate = []
    mans_translate = []
    note_translate = []
    for key in words:
        m, r = key.split("@@@")
        mans_translate.append(m)
    return "<eos>".join(mans_translate)
    

def read_other():
    text = ""
    df_komi = pd.read_csv("dataMans/komi-rus.csv")
    komi = df_komi["Коми язык"].tolist()[:10000]
    text += "<eos>".join(komi)
    df_udmurt = pd.read_csv("dataMans/udmurt-rus.csv")
    udm = df_udmurt.udm.tolist()[:10000]
    text += "<eos>".join(udm)
    df_mhr = pd.read_csv("dataMans/mhr-rus.csv")
    df_mhr = df_mhr.dropna(subset=["mhr"])
    mhr = df_mhr.mhr.tolist()[:10000]
    text += "<eos>".join(mhr)

    df_text = pd.read_csv("dataMans/READ_LITER.csv")
    text += "<eos>".join(df_text.mans.tolist())

    df_text = pd.read_csv("dataMans/Gospel_Mark_RUS_MANS.csv")
    text += "<eos>".join(df_text.mans.tolist())

    df_text = pd.read_csv("dataMans/Book_of_John_RUS_MANS.csv")
    text += "<eos>".join(df_text.mans.tolist())

    df_text = pd.read_csv("dataMans/Bible_UDM_RUS.csv")
    df_text = df_text.dropna(subset=["udm"])
    text += "<eos>".join(df_text.udm.tolist())

    df_text = pd.read_csv("dataMans/train.csv")
    text += "<eos>".join(df_text.target.tolist())

    df_text = pd.read_csv("dataMans/val.csv")
    text += "<eos>".join(df_text.target.tolist())

    df_text = pd.read_csv("dataMans/test.csv")
    text += "<eos>".join(df_text.target.tolist())
    return text

In [5]:
text = read_articles()
text += read_vocab() + "<eos>"
text += read_other()

In [6]:
chars_cnt = Counter(text)
required_chars = ''.join([
    k for k, v in chars_cnt.most_common() 
    if v >= 3 and k not in ' '
])

In [7]:
required_chars

'атынсломврикуе̄пэāгхь,.яздōйoes<>щёӯӈ-Тӧш\xadбАМючС\xa0ъӥКжИ\t«Н»:ХВОП–1цРф20ēЛ;ӣ9ДЕ?ЮӟГЯ!УЭ5—і34Бӵ876ӱ)Ш(Ф"”ӝЗҥӮЁЩЧ[]ЙЦ+aĀЖVIЫ_CӢDO%І…rЬtӞ№Ӧiun/\nlm*ŌXhcp¬wdӜyg“Ӵk\uf512xbRSӰAv̈TPMӤЪ„f@NEBĒQzWLFGU°JKH‑ӇӑZ\\jöū‒~#Y=ӆqá’\'$ў|әé−Ҥ'

In [8]:
file = open('data_all.txt', 'w') 
file.write(text) 
file.close() 

In [9]:
SPM_PREFIX = 'spm_man'
# special_tokens = ["<UNK>"]
spm.SentencePieceTrainer.Train(
    input='data_all.txt',
    model_prefix=SPM_PREFIX,
    vocab_size=16000,  # 16K
    character_coverage = 1,
    num_threads=20,
    train_extremely_large_corpus=False,
    add_dummy_prefix=False,
    max_sentencepiece_length=128,
    max_sentence_length=4192*4,
    # special_tokens=special_tokens,
    pad_id=0,
    eos_id=1,
    unk_id=2,
    bos_id=-1,
    required_chars=required_chars,
)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: data_all.txt
  input_format: 
  model_prefix: spm_man
  model_type: UNIGRAM
  vocab_size: 16000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 16768
  num_threads: 20
  num_sub_iterations: 2
  max_sentencepiece_length: 128
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: атынсломврикуе̄пэāгхь,.яздōйoes<>щёӯӈ-Тӧш­бАМючС ъӥКжИ	«Н»:ХВОП–1цРф20ēЛ;ӣ9ДЕ?ЮӟГЯ!УЭ5—і34Бӵ876ӱ)Ш(Ф"”ӝЗҥӮЁЩЧ[]ЙЦ+aĀЖVIЫ_CӢDO%І…rЬtӞ№Ӧiun/
lm*ŌXhcp¬wdӜyg“ӴkxbRSӰAv̈TPMӤЪ„f@NEBĒQzWLFGU°JKH‑ӇӑZ\jöū‒~#Y=ӆqá’'$ў|әé���Ҥ
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_voc

RuntimeError: Internal: src/trainer_interface.cc(662) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())] Vocabulary size too high (16000). Please set it to a value <= 4601.