In this notebook, I show how to fine-tune a NLLB-200 machine translation model for a new language.

The new language will be Mansi, and I will use a Tyvan-Russian parallel corpus as the training data.

I am running this notebook on Google Colab with a T4 GPU that has 15Gb of memory. If you run it elsewhere, you may want to adjust the batch size, so that there are no OOM errors, but the GPU is well utilized.

Installing dependencies:
* `transformers`, as a neural network framework
* `sentencepiece`, a backend for my tokenizer (the algorithm for converting a text into symbols from the model's vocabulary)
* `sacremoses`, a package required for text preprocessing with which NLLB models were pretrained.
* `sacrebleu`, a package for evaluating translation models

In [3]:
import locale
def gpe(x=None):
    return "UTF-8"
locale.getpreferredencoding = gpe

In [None]:
# pip install --upgrade protobuf

In [2]:
!pip install sentencepiece transformers datasets sacremoses sacrebleu matplotlib protobuf -q

[0m

In [4]:
import random
import re

import pandas as pd
import numpy as np

import re
import sys
import typing as tp
import unicodedata
from sacremoses import MosesPunctNormalizer


from transformers import NllbTokenizer
from tqdm.auto import tqdm, trange
from transformers import AutoModelForSeq2SeqLM
from transformers import NllbTokenizer
import gc
import random
import numpy as np
import torch
from tqdm.auto import tqdm, trange
from transformers.optimization import Adafactor
from transformers import get_constant_schedule_with_warmup

import transformers
transformers.__version__

'4.44.2'

# Preprocessing the data

In this section, I try to understand what is the training data that I have, and how suitable it is for fine-tuning a NLLB model.

In [5]:
def process_and_save_splits(data_path: str):
    """
    Process the input CSV file by renaming columns, splitting into train, val, test, 
    and saving each split with a custom file naming pattern.
    
    Args:
        data_path (str): Path to the input CSV file.
    
    Returns:
        tuple: Tuple containing train, val, and test DataFrames.
    """
    # –ß—Ç–µ–Ω–∏–µ —Ñ–∞–π–ª–∞
    df = pd.read_csv(data_path)
    print(f"data.shape unprocessed: {df.shape}")
    print(f"df.columns unprocessed: {df.columns}")

    # –ü–µ—Ä–µ–∏–º–µ–Ω–æ–≤–∞–Ω–∏–µ –∫–æ–ª–æ–Ω–æ–∫
    df = df.rename(columns={
        'Unnamed: 0': 'row_id',
        'target': 'mansi',
        'source': 'ru'
    })

    df['ind'] = df.index

    # –°–ø–ª–∏—Ç –¥–∞–Ω–Ω—ã—Ö
    np.random.seed(42)
    df['split'] = np.random.choice(['train', 'val', 'test'], size=len(df), p=[0.9, 0.05, 0.05])

    print(f"data.shape processed: {df.shape}")
    print(f"df.columns processed: {df.columns}")
    print(f"{df['split'].value_counts()}")

    # –†–∞–∑–¥–µ–ª–µ–Ω–∏–µ –Ω–∞ —Ç—Ä–∏ –¥–∞—Ç–∞—Ñ—Ä–µ–π–º–∞
    df_train = df[df.split == 'train'].copy()
    df_val = df[df.split == 'val'].copy()
    df_test = df[df.split == 'test'].copy()

    # –ì–µ–Ω–µ—Ä–∞—Ü–∏—è –∏–º–µ–Ω —Ñ–∞–π–ª–æ–≤
    train_file = data_path.replace(".csv", "_train_09.csv")
    val_file = data_path.replace(".csv", "_val_005.csv")
    test_file = data_path.replace(".csv", "_test_005.csv")

    # –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Ñ–∞–π–ª–æ–≤
    df_train.to_csv(train_file, index=False)
    df_val.to_csv(val_file, index=False)
    df_test.to_csv(test_file, index=False)

    return df_train, df_val, df_test

In [6]:
data_path = 'cleared_v1.1.csv'
df_train_mansi_v1, df_val_mansi_v1, df_test_mansi_v1 = process_and_save_splits(data_path)

df_train_mansi_v2, df_val_mansi_v2, df_test_mansi_v2 = process_and_save_splits('cleared_v2.csv')

data.shape unprocessed: (80879, 2)
df.columns unprocessed: Index(['target', 'source'], dtype='object')
data.shape processed: (80879, 4)
df.columns processed: Index(['mansi', 'ru', 'ind', 'split'], dtype='object')
split
train    72943
val       4025
test      3911
Name: count, dtype: int64
data.shape unprocessed: (79933, 2)
df.columns unprocessed: Index(['target', 'source'], dtype='object')
data.shape processed: (79933, 4)
df.columns processed: Index(['mansi', 'ru', 'ind', 'split'], dtype='object')
split
train    72086
val       3973
test      3874
Name: count, dtype: int64


# –¢–µ—Å—Ç—ã —Ç–æ–∫–µ–Ω–∞–π–∑–µ—Ä–∞

In [None]:
tokenizer = NllbTokenizer.from_pretrained('facebook/nllb-200-distilled-600M')

def word_tokenize(text):
    # a very naive word tokenizer for languages with English-like orthography
    return re.findall('(\w+|[^\w\s])', text)

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

In [None]:
smpl = df_train_mansi.sample(10000, random_state=1)

smpl['rus_toks'] = smpl["ru"].apply(tokenizer.tokenize)
smpl['mansi_toks'] = smpl["mansi"].apply(tokenizer.tokenize)

smpl['rus_words'] = smpl["ru"].apply(word_tokenize)
smpl['mansi_words'] = smpl["mansi"].apply(word_tokenize)

In [None]:
smpl.sample(5)[['mansi', 'mansi_words', 'mansi_toks', 'ru', 'rus_words', 'rus_toks']]

In [None]:
stats = smpl[['rus_toks', 'mansi_toks', 'rus_words', 'mansi_words']].applymap(len).describe()
stats

In [None]:
print(stats.rus_toks['mean'] / stats.rus_words['mean'])
print(stats.mansi_toks['mean'] / stats.mansi_words['mean'])

In [None]:
print(tokenizer.unk_token, tokenizer.unk_token_id)

In [None]:
texts_with_unk = [text for text in tqdm(mansi_df.mansi) if tokenizer.unk_token_id in tokenizer(text).input_ids]
print(len(texts_with_unk))

In [None]:
s = random.sample(texts_with_unk, 5)
s

In [None]:
# this code is adapted from  the Stopes repo of the NLLB team
# https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/monolingual_line_processor.py#L21

mpn = MosesPunctNormalizer(lang="en")
mpn.substitutions = [
    (re.compile(r), sub) for r, sub in mpn.substitutions
]


def get_non_printing_char_replacer(replace_by: str = " ") -> tp.Callable[[str], str]:
    non_printable_map = {
        ord(c): replace_by
        for c in (chr(i) for i in range(sys.maxunicode + 1))
        # same as \p{C} in perl
        # see https://www.unicode.org/reports/tr44/#General_Category_Values
        if unicodedata.category(c) in {"C", "Cc", "Cf", "Cs", "Co", "Cn"}
    }

    def replace_non_printing_char(line) -> str:
        return line.translate(non_printable_map)

    return replace_non_printing_char

replace_nonprint = get_non_printing_char_replacer(" ")

def preproc(text):
    clean = mpn.normalize(text)
    clean = replace_nonprint(clean)
    # replace ùìïùîØùîûùî´ùî†ùî¢ùî∞ùî†ùîû by Francesca
    clean = unicodedata.normalize("NFKC", clean)
    return clean

In [None]:
texts_with_unk_normed = [text for text in tqdm(texts_with_unk) if tokenizer.unk_token_id in tokenizer(preproc(text)).input_ids]
print(len(texts_with_unk_normed))

In [None]:
texts_with_unk_normed[:4]

In [66]:
# –ü—Ä–∏–º–µ—Ä–Ω—ã–π —Å–ø–∏—Å–æ–∫ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π –¥–ª—è –ø—Ä–æ–≤–µ—Ä–∫–∏
sentences = [
    "–ü—Ä–∏–º–µ—Ä –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è —Å —Å–∏–º–≤–æ–ª–æ–º ”à.",
    "–ï—â–µ –æ–¥–Ω–æ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–µ —Å —Å–∏–º–≤–æ–ª–æ–º —ëÃÑ.",
    "–≠—Ç–æ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–µ –±–µ–∑ –º–∞–Ω—Å–∏–π—Å–∫–∏—Ö —Å–∏–º–≤–æ–ª–æ–≤.",
    "–î–∞–Ω–Ω–æ–µ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–µ —Å–æ–¥–µ—Ä–∂–∏—Ç –Ω–µ–æ–±—ã—á–Ω—ã–µ —Å–∏–º–≤–æ–ª—ã: ”ï, ”°.",
    "–¢–µ—Å—Ç–æ–≤–æ–µ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–µ –±–µ–∑ –æ—à–∏–±–æ–∫.",
    "”á–∞”à–µ—Ü –∏ ”á–∏–∫–∞ –ø–æ–¥–æ–±–Ω—ã —Ç–∏–ø–∏—á–Ω—ã–º —Å–ª–æ–≤–∞–º –º–∞–Ω—Å–∏–π—Å–∫–æ–≥–æ —è–∑—ã–∫–∞.",
    "–°–∏–º–≤–æ–ª —ãÃÑ –º–æ–∂–µ—Ç –±—ã—Ç—å –ø—Ä–æ–±–ª–µ–º–Ω—ã–º –¥–ª—è —Ç–æ–∫–µ–Ω–∞–π–∑–µ—Ä–∞.",
    "–ß—Ç–æ –¥–µ–ª–∞—Ç—å —Å —Å–∏–º–≤–æ–ª–æ–º ”ß?",
    "–≠—Ç–æ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–µ –¥–æ–ª–∂–Ω–æ —Ä–∞–±–æ—Ç–∞—Ç—å –Ω–æ—Ä–º–∞–ª—å–Ω–æ.",
    "–ü—Ä–æ–≤–µ—Ä–∫–∞ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏ —Å–∏–º–≤–æ–ª–∞ ”±."
]

# –ü—Ä–æ–≤–µ—Ä–∫–∞ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏ –∏ –ø–æ–∏—Å–∫–∞ –Ω–µ–∏–∑–≤–µ—Å—Ç–Ω—ã—Ö —Ç–æ–∫–µ–Ω–æ–≤
def check_tokenization(tokenizer, sentences):
    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence)
        input_ids = tokenizer(sentence).input_ids
        if tokenizer.unk_token_id in input_ids:
            unknown_tokens = [token for token, token_id in zip(tokens, input_ids) if token_id == tokenizer.unk_token_id]
            print(f"Sentence: {sentence}")
            print(f"Unknown tokens: {unknown_tokens}\n")
        else:
            print(f"Sentence: {sentence} is tokenized correctly.")

# –ó–∞–º–µ–Ω–∏—Ç–µ –Ω–∞ –≤–∞—à —Ç–æ–∫–µ–Ω–∞–π–∑–µ—Ä NLLB
# –ü—Ä–∏–º–µ—Ä: tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
# tokenizer = ...  # –£–∫–∞–∂–∏—Ç–µ –≤–∞—à —Ç–æ–∫–µ–Ω–∞–π–∑–µ—Ä

# –ó–∞–ø—É—Å–∫ –ø—Ä–æ–≤–µ—Ä–∫–∏
check_tokenization(tokenizer, sentences)


Sentence: –ü—Ä–∏–º–µ—Ä –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è —Å —Å–∏–º–≤–æ–ª–æ–º ”à.
Unknown tokens: ['.']

Sentence: –ï—â–µ –æ–¥–Ω–æ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–µ —Å —Å–∏–º–≤–æ–ª–æ–º —ëÃÑ. is tokenized correctly.
Sentence: –≠—Ç–æ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–µ –±–µ–∑ –º–∞–Ω—Å–∏–π—Å–∫–∏—Ö —Å–∏–º–≤–æ–ª–æ–≤. is tokenized correctly.
Sentence: –î–∞–Ω–Ω–æ–µ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–µ —Å–æ–¥–µ—Ä–∂–∏—Ç –Ω–µ–æ–±—ã—á–Ω—ã–µ —Å–∏–º–≤–æ–ª—ã: ”ï, ”°.
Unknown tokens: [',', '.']

Sentence: –¢–µ—Å—Ç–æ–≤–æ–µ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–µ –±–µ–∑ –æ—à–∏–±–æ–∫. is tokenized correctly.
Sentence: ”á–∞”à–µ—Ü –∏ ”á–∏–∫–∞ –ø–æ–¥–æ–±–Ω—ã —Ç–∏–ø–∏—á–Ω—ã–º —Å–ª–æ–≤–∞–º –º–∞–Ω—Å–∏–π—Å–∫–æ–≥–æ —è–∑—ã–∫–∞.
Unknown tokens: ['–∞', '–µ', '–∏']

Sentence: –°–∏–º–≤–æ–ª —ãÃÑ –º–æ–∂–µ—Ç –±—ã—Ç—å –ø—Ä–æ–±–ª–µ–º–Ω—ã–º –¥–ª—è —Ç–æ–∫–µ–Ω–∞–π–∑–µ—Ä–∞. is tokenized correctly.
Sentence: –ß—Ç–æ –¥–µ–ª–∞—Ç—å —Å —Å–∏–º–≤–æ–ª–æ–º ”ß?
Unknown tokens: ['?']

Sentence: –≠—Ç–æ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–µ –¥–æ–ª–∂–Ω–æ —Ä–∞–±–æ—Ç–∞—Ç—å –Ω–æ—Ä–º–∞–ª—å–Ω–æ. is tokenized correctly.
Sente

# Expanding the vocabulary

In [7]:
# this code is adapted from  the Stopes repo of the NLLB team
# https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/monolingual_line_processor.py#L21

mpn = MosesPunctNormalizer(lang="en")
mpn.substitutions = [
    (re.compile(r), sub) for r, sub in mpn.substitutions
]


def get_non_printing_char_replacer(replace_by: str = " ") -> tp.Callable[[str], str]:
    non_printable_map = {
        ord(c): replace_by
        for c in (chr(i) for i in range(sys.maxunicode + 1))
        # same as \p{C} in perl
        # see https://www.unicode.org/reports/tr44/#General_Category_Values
        if unicodedata.category(c) in {"C", "Cc", "Cf", "Cs", "Co", "Cn"}
    }

    def replace_non_printing_char(line) -> str:
        return line.translate(non_printable_map)

    return replace_non_printing_char

replace_nonprint = get_non_printing_char_replacer(" ")

def preproc(text):
    clean = mpn.normalize(text)
    clean = replace_nonprint(clean)
    # replace ùìïùîØùîûùî´ùî†ùî¢ùî∞ùî†ùîû by Francesca
    clean = unicodedata.normalize("NFKC", clean)
    return clean

In [None]:
def cleanup():
    """Try to free GPU memory."""
    gc.collect()
    torch.cuda.empty_cache()

cleanup()

In [None]:
from collections import Counter
import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM
from transformers.models.nllb.tokenization_nllb import FAIRSEQ_LANGUAGE_CODES
import json
import os
import shutil


def update_nllb_tokenizer(
    old_tokenizer: NllbTokenizer,
    new_spm_path: str,
    new_lang_codes: list[str],
) -> NllbTokenizer:
    """
    Create a new tokenizer for NLLB, with an updated sentencepiece model and some new language codes.
    In order to get rid of the old (and wrong) added token encoders/decoders, we save the tokenizer to disk and remove those files.
    :param old_tokenizer: the original tokenizer
    :param new_spm_path: path to the file with the sentncepiece model
    :param new_lang_codes: list of the new codes to add to the tokenizer
    :return: the new NllbTokenizer
    """
    TKN_DIR = "old_tokenizer"  # todo: make it a temp dir
    old_tokenizer.save_pretrained(TKN_DIR)

    with open(f"{TKN_DIR}/tokenizer_config.json", "r") as f:
        cfg = json.load(f)
    cfg["added_tokens_decoder"] = {
        k: v
        for k, v in cfg["added_tokens_decoder"].items()
        if k in ["0", "1", "2", "3"]
    }
    cfg["additional_special_tokens"] = []
    with open(f"{TKN_DIR}/tokenizer_config.json", "w") as f:
        json.dump(cfg, f, indent=2)
    # os.remove(f"{TKN_DIR}/tokenizer.json") # this one does not exist
    # this contains added tokens: language codes and mask
    os.remove(f"{TKN_DIR}/added_tokens.json")
    os.remove(f"{TKN_DIR}/special_tokens_map.json")
    os.remove(f"{TKN_DIR}/sentencepiece.bpe.model")
    shutil.copy(new_spm_path, f"{TKN_DIR}/sentencepiece.bpe.model")

    new_tokenizer = NllbTokenizer.from_pretrained(
        TKN_DIR,
        additional_special_tokens=sorted(FAIRSEQ_LANGUAGE_CODES + new_lang_codes),
    )
    return new_tokenizer


print("Creating corpus and counting chars in it")
all_texts = df_train_mansi_v2["mansi"].dropna().tolist()
all_text_normalized = [preproc(t) for t in tqdm(all_texts)]

chars_cnt = Counter(c for t in all_text_normalized for c in t)
required_chars = ''.join([
    k for k, v in chars_cnt.most_common() 
    if v >= 3 and k not in ' '
])

all_texts_file = 'mansi_texts_plain.txt'
SPM_PREFIX = 'spm_mansi_16k'
with open(all_texts_file, 'w') as f:
    for i, text in enumerate(all_texts):
        print(text, file=f)

print("Tokenizer training")
spm.SentencePieceTrainer.train(
    input=all_texts_file,
    model_prefix=SPM_PREFIX,
    vocab_size=2**14,  # 16K
    character_coverage=1,
    num_threads=16,
    train_extremely_large_corpus=False,
    add_dummy_prefix=False,
    max_sentencepiece_length=128,
    max_sentence_length=4192*4,
    pad_id=0,
    eos_id=1,
    unk_id=2,
    bos_id=-1,
    required_chars=required_chars,
)


print("Adding missing tokens to NLLB tokenizer and saving result")
tokenizer = NllbTokenizer.from_pretrained('facebook/nllb-200-distilled-600M')
sp_trained = spm.SentencePieceProcessor(model_file=f'{SPM_PREFIX}.model')
added_spm = sp_pb2_model.ModelProto()
added_spm.ParseFromString(sp_trained.serialized_model_proto())
old_spm = sp_pb2_model.ModelProto()
old_spm.ParseFromString(tokenizer.sp_model.serialized_model_proto())

nllb_tokens_set = {p.piece for p in old_spm.pieces}
prev_min_score = old_spm.pieces[-1].score
for p in added_spm.pieces:
    piece = p.piece
    if p.type != 1:
        continue
    if piece not in nllb_tokens_set:
        new_p = sp_pb2_model.ModelProto().SentencePiece()
        new_p.piece = piece
        new_p.score = p.score + prev_min_score
        old_spm.pieces.append(new_p)

NEW_SPM_NAME = 'spm_nllb_mansi_268k.model'
with open(NEW_SPM_NAME, 'wb') as f:
    f.write(old_spm.SerializeToString())


print("Reloading NLLB tokenizer and resizing model")
model_name = 'facebook/nllb-200-distilled-600M'
tokenizer_old = NllbTokenizer.from_pretrained(model_name)
# tokenizer = NllbTokenizer.from_pretrained(model_name, vocab_file=NEW_SPM_NAME)

tokenizer = update_nllb_tokenizer(tokenizer_old, NEW_SPM_NAME, new_lang_codes=["mansi_Cyrl"])

# Checking tokenizer updates
print(f"Tokenizer length after adding 'mansi_Cyrl': {len(tokenizer)}")


# Loading and resizing the model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

# Re-initializing the new embeddings
added_vocab = set(tokenizer.get_vocab()).difference(set(tokenizer_old.get_vocab()))
for t in tqdm(added_vocab):
    tt = tokenizer_old(t, add_special_tokens=False).input_ids
    if len(tt) == 0:
        tt = [tokenizer_old.unk_token_id]
    idx = tokenizer.convert_tokens_to_ids(t)
    model.model.shared.weight.data[idx] = model.model.shared.weight.data[tt].mean(0)

In [9]:
print(f"len(tokenizer): {len(tokenizer)}")
print(f"mansi_Cyrl id: {tokenizer.convert_tokens_to_ids('mansi_Cyrl')}")
print(f"mansi_Cyrl token: {tokenizer.decode(tokenizer.convert_tokens_to_ids('mansi_Cyrl'))}")

mask_id = tokenizer.convert_tokens_to_ids("<mask>")
print(f"ID of <mask>: {mask_id}")


len(tokenizer): 270673
mansi_Cyrl id: 270587
mansi_Cyrl token: mansi_Cyrl
ID of <mask>: 270469


## optional checks

In [126]:
def check_truncation(df, trunk_len=128):
    long_sentences_count = 0
    for sentence in df['ru']:
        tokens = tokenizer(sentence, truncation=False)['input_ids']
        if len(tokens) > max_length:
            long_sentences_count += 1
    print(f"–û–±—Ä–µ–∂—É—Ç—Å—è ru: {long_sentences_count} –∏–∑ {len(df)}")

    long_sentences_count = 0
    for sentence in df['mansi']:
        tokens = tokenizer(sentence, truncation=False)['input_ids']
        if len(tokens) > max_length:
            long_sentences_count += 1
    print(f"–û–±—Ä–µ–∂—É—Ç—Å—è mansi: {long_sentences_count} –∏–∑ {len(df)}")

check_truncation(df_test_mansi)

–û–±—Ä–µ–∂—É—Ç—Å—è ru: 0 –∏–∑ 1963
–û–±—Ä–µ–∂—É—Ç—Å—è mansi: 0 –∏–∑ 1963


In [127]:
check_truncation(df_train_mansi)

–û–±—Ä–µ–∂—É—Ç—Å—è ru: 14 –∏–∑ 76968
–û–±—Ä–µ–∂—É—Ç—Å—è mansi: 14 –∏–∑ 76968


# The training loop

In [10]:
import random
import torch
import gc
import numpy as np
from transformers.optimization import get_constant_schedule_with_warmup
from transformers import Adafactor

def train_model(
    model,
    tokenizer,
    train_df,
    model_save_path,
    batch_size=16, 
    max_length=128,
    warmup_steps=1000,
    training_steps=57000,
    learning_rate=1e-4,
    weight_decay=1e-3,
    clip_threshold=1.0,
    langs=[('ru', 'rus_Cyrl'), ('mansi', 'mansi_Cyrl')],
):
    """Train the model and return the list of losses."""
    
    def cleanup():
        """Try to free GPU memory."""
        gc.collect()
        torch.cuda.empty_cache()

    def get_batch_pairs(batch_size, data=train_df):
        (l1, long1), (l2, long2) = random.sample(langs, 2)
        xx, yy = [], []
        for _ in range(batch_size):
            item = data.iloc[random.randint(0, len(data)-1)]
            xx.append(preproc(item[l1]))
            yy.append(preproc(item[l2]))
        return xx, yy, long1, long2

    # Initialize GPU and optimizer
    cleanup()
    model.cuda()

    optimizer = Adafactor(
        [p for p in model.parameters() if p.requires_grad],
        scale_parameter=False,
        relative_step=False,
        lr=learning_rate,
        clip_threshold=clip_threshold,
        weight_decay=weight_decay,
    )

    scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)
    
    losses = []
    model.train()
    x, y, loss = None, None, None

    tq = trange(0, training_steps)

    for i in tq:
        xx, yy, lang1, lang2 = get_batch_pairs(batch_size)
        try:
            tokenizer.src_lang = lang1
            x = tokenizer(xx, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
            tokenizer.src_lang = lang2
            y = tokenizer(yy, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
            y.input_ids[y.input_ids == tokenizer.pad_token_id] = -100

            loss = model(**x, labels=y.input_ids).loss
            loss.backward()
            losses.append(loss.item())

            optimizer.step()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()

        except RuntimeError as e:
            optimizer.zero_grad(set_to_none=True)
            x, y, loss = None, None, None
            cleanup()
            print('error', max(len(s) for s in xx + yy), e)
            continue

        if i % 500 == 0:
            print(f'Step {i}: Average Loss (last 500 steps): {np.mean(losses[-500:])}')

        if i % 1000 == 0 and i > 0:
            model.save_pretrained(model_save_path)
            tokenizer.save_pretrained(model_save_path)

    return model, tokenizer, losses


In [None]:
model3, tokenizer3, losses3 = train_model(model, tokenizer, df_train_mansi_v1, 'nllb-rus-mansi-v3', 
                                               training_steps=57001)

  0%|          | 0/57001 [00:00<?, ?it/s]

Step 0: Average Loss (last 500 steps): 9.058518409729004
Step 500: Average Loss (last 500 steps): 5.915882295131683


Non-default generation parameters: {'max_length': 200}


Step 1000: Average Loss (last 500 steps): 4.208458316802979


In [None]:
pd.Series(losses3).ewm(100).mean().plot();

In [None]:
model3_1, tokenizer3_1, losses3_1 = train_model(model3, tokenizer3, df_train_mansi_v1, 'nllb-rus-mansi-v3_1_80k_steps',
                                               training_steps=23001)

In [None]:
pd.Series(losses3_1).ewm(100).mean().plot();

In [None]:
model3_2, tokenizer3_2, losses3_2 = train_model(model3_1, tokenizer3_1, df_train_mansi, 'nllb-rus-mansi-v3_1_100k_steps',
                                               training_steps=20001)

In [None]:
pd.Series(model3_2).ewm(100).mean().plot();

## on v2 data

In [None]:
model4, tokenizer4, losses4 = train_model(model, tokenizer, df_train_mansi_v2, 'nllb-rus-mansi-v4', 
                                               training_steps=57001)

In [None]:
model4_1, tokenizer4_1, losses4_1 = train_model(model4, tokenizer4, df_train_mansi_v2, 'nllb-rus-mansi-v4_1_80k_steps',
                                               training_steps=23001)

In [None]:
model4_2, tokenizer4_2, losses4_2 = train_model(model4_1, tokenizer4_1, df_train_mansi_v2, 'nllb-rus-mansi-v3_1_100k_steps',
                                               training_steps=20001)

# Using the model

In [101]:
# need adopt
def batched_translate(texts, batch_size=16, **kwargs):
    """Translate texts in batches of similar length"""
    idxs, texts2 = zip(*sorted(enumerate(texts), key=lambda p: len(p[1]), reverse=True))
    results = []
    for i in trange(0, len(texts2), batch_size):
        results.extend(translate(texts2[i: i+batch_size], **kwargs))
    return [p for i, p in sorted(zip(idxs, results))]


bleu_calc = sacrebleu.BLEU()
chrf_calc = sacrebleu.CHRF(word_order=2)  # this metric is called ChrF++\

In [219]:
import sacrebleu
from tqdm.auto import tqdm
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer

def evaluate_model(
    model,
    tokenizer,
    df,
    lang_1_code='mansi_Cyrl',
    lang_2_code='rus_Cyrl',
    column_lang_1="mansi",
    column_lang_2="ru",
    num_beams=4,
    a=32,
    b=3,
    max_input_length=1024,
):
    """
    Evaluates the model on the provided dataset, translating between two languages,
    and calculates BLEU and chrF metrics.
    
    Args:
        model: Pretrained sequence-to-sequence model.
        tokenizer: Corresponding tokenizer for the model.
        df: DataFrame with columns containing text in two languages.
        lang_1_code: Source language code for translation (default: 'mansi_Cyrl').
        lang_2_code: Target language code for translation (default: 'rus_Cyrl').
        column_lang_1: Column name in df corresponding to source language 1 (default: 'mansi').
        column_lang_2: Column name in df corresponding to source language 2 (default: 'ru').
        num_beams: Number of beams for beam search (default: 4).
        a: Multiplier for max_new_tokens (default: 32).
        b: Multiplier factor for max_new_tokens relative to input length (default: 3).
        max_input_length: Maximum input token length (default: 1024).
    
    Returns:
        df_t: DataFrame with added columns for translations and metric scores printed.
    """
    
    def translate(text, src_lang=lang_1_code, tgt_lang=lang_2_code):
        tokenizer.src_lang = src_lang
        tokenizer.tgt_lang = tgt_lang
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_input_length)
        result = model.generate(
            **inputs.to(model.device),
            forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
            max_new_tokens=int(a + b * inputs.input_ids.shape[1]),
            num_beams=num_beams
        )
        return tokenizer.batch_decode(result, skip_special_tokens=True)

    # Prepare BLEU and chrF calculators
    bleu_calc = sacrebleu.metrics.BLEU()
    chrf_calc = sacrebleu.metrics.CHRF()

    # Copy the DataFrame to avoid modification of the original one
    df_t = df.copy()

    # Translate from lang_1_code (e.g., Mansi) to lang_2_code (e.g., Russian)
    df_t[f'{column_lang_2}_translated'] = [translate(t, lang_1_code, lang_2_code)[0] for t in tqdm(df_t[column_lang_1])]
    
    # Translate from lang_2_code (e.g., Russian) to lang_1_code (e.g., Mansi)
    df_t[f'{column_lang_1}_translated'] = [translate(t, lang_2_code, lang_1_code)[0] for t in tqdm(df_t[column_lang_2])]

    # Convert references to list of lists
    references_lang_2 = [[ref] for ref in df_t[column_lang_2].tolist()]
    references_lang_1 = [[ref] for ref in df_t[column_lang_1].tolist()]

    # Calculate and print metrics for both translation directions
    print("Metrics for translation from", lang_1_code, "to", lang_2_code)
    print(bleu_calc.corpus_score(df_t[f'{column_lang_2}_translated'].tolist(), references_lang_2))
    print(chrf_calc.corpus_score(df_t[f'{column_lang_2}_translated'].tolist(), references_lang_2))
    
    print("\nMetrics for translation from", lang_2_code, "to", lang_1_code)
    print(bleu_calc.corpus_score(df_t[f'{column_lang_1}_translated'].tolist(), references_lang_1))
    print(chrf_calc.corpus_score(df_t[f'{column_lang_1}_translated'].tolist(), references_lang_1))

    return df_t


In [175]:
df_with_metrics = evaluate_model(
    model2,
    tokenizer2,
    df_test_mansi,
    lang_1_code='mansi_Cyrl',
    lang_2_code='rus_Cyrl',
    num_beams=4
)

  0%|          | 0/1963 [00:00<?, ?it/s]

  0%|          | 0/1963 [00:00<?, ?it/s]

Metrics for translation from mansi_Cyrl to rus_Cyrl
BLEU = 10.13 83.3/9.1/5.0/2.8 (BP = 1.000 ratio = 1.000 hyp_len = 12 ref_len = 12)
chrF2 = 34.08

Metrics for translation from rus_Cyrl to mansi_Cyrl
BLEU = 9.82 71.4/8.3/5.0/3.1 (BP = 1.000 ratio = 1.000 hyp_len = 7 ref_len = 7)
chrF2 = 27.65


In [177]:
df_with_metrics.head(2)

Unnamed: 0,mansi,ru,ind,split,lang_2_translated,lang_1_translated
1,"–•–∞ÃÑ–π—Ç—ã–º–∞—Ç—ç —Ç”Ø—Ä –≤–∞ÃÑ—Ç–∞–Ω —ë—Ö—Ç—ã—Å, –≤–∏—Ç –≤–∞ÃÑ—Ç–∞–Ω —Ö–∞ÃÑ–π—Ç—ã—Å.","–ë–µ–≥–∞—è –∫ –±–µ—Ä–µ–≥—É –æ–∑–µ—Ä–∞ –ø—Ä–∏—à–ª–∞, –∫ –≤–æ–¥–µ –ø–æ–¥–±–µ–∂–∞–ª–∞.",1,test,"–ë–µ–∂–∏—Ç, –Ω–∞ –±–µ—Ä–µ–≥ –æ–∑–µ—Ä–∞ –ø—Ä–∏–±–µ–∂–∞–ª, –Ω–∞ –±–µ—Ä–µ–≥ –≤–æ–¥—ã ...","–¢”Ø—Ä –≤–∞ÃÑ—Ç–∞–Ω —Ö–∞ÃÑ–π—Ç—ã—Å,."
11,"–•–∞ÃÑ–π—Ç—ã–º —Ç–∞ –Ω–æ–º—Å—ã, –æ—Å –º–∞–Ω—Ä—ã–≥ —Å–∞—Ö–∏–º —Ç–∞—Ç–µ–º –º–∞—Ä—É–º—ã...","–ë–µ–∂–∏—Ç –∏ –¥—É–º–∞–µ—Ç, –ø–æ—á–µ–º—É –∂–µ —à—É–±–∞ —Ç–∞–∫–∞—è –º–∞–ª–∞—è —Å—Ç–∞–ª–∞.",11,test,"–ë–µ–∂–∏—Ç –∏ –¥—É–º–∞–µ—Ç, –ø–æ—á–µ–º—É —à—É–±–∫–∞ —Ç–∞–∫ —É—Å—Ç–∞–ª–∞.","–•–∞ÃÑ–π—Ç—ã–º —Ç–∞ –Ω–æ–º—Å—ã, –º–∞–Ω—Ä—ã–≥ —Å–∞—Ö–∏ —Ç–∞–∫–µ–º–Ω–∏–≥ ƒì."


In [187]:
df_with_metrics = evaluate_model(
    model2_1,
    tokenizer2_1,
    df_test_mansi,
    lang_1_code='mansi_Cyrl',
    lang_2_code='rus_Cyrl',
    num_beams=4
)

  0%|          | 0/1963 [00:00<?, ?it/s]

  0%|          | 0/1963 [00:00<?, ?it/s]

Metrics for translation from mansi_Cyrl to rus_Cyrl
BLEU = 14.37 81.8/30.0/5.6/3.1 (BP = 1.000 ratio = 1.000 hyp_len = 11 ref_len = 11)


  0%|          | 0/10 [2:18:35<?, ?it/s]


chrF2 = 21.41

Metrics for translation from rus_Cyrl to mansi_Cyrl
BLEU = 35.93 100.0/40.0/25.0/16.7 (BP = 1.000 ratio = 1.000 hyp_len = 6 ref_len = 6)
chrF2 = 29.57


In [188]:
df_with_metrics = evaluate_model(
    model2_1,
    tokenizer2_1,
    df_dev_mansi,
    lang_1_code='mansi_Cyrl',
    lang_2_code='rus_Cyrl',
    num_beams=4
)

  0%|          | 0/1948 [00:00<?, ?it/s]

  0%|          | 0/1948 [00:00<?, ?it/s]

Metrics for translation from mansi_Cyrl to rus_Cyrl
BLEU = 100.00 100.0/100.0/100.0/100.0 (BP = 1.000 ratio = 1.000 hyp_len = 5 ref_len = 5)
chrF2 = 100.00

Metrics for translation from rus_Cyrl to mansi_Cyrl
BLEU = 0.00 33.3/25.0/25.0/0.0 (BP = 1.000 ratio = 1.000 hyp_len = 3 ref_len = 3)
chrF2 = 38.63


In [223]:
import pandas as pd

df_all_test = pd.concat([df_test_mansi, df_dev_mansi], axis=0, ignore_index=True)
print(df_all_test.shape)
df_all_test.head(2)

(3911, 4)


Unnamed: 0,mansi,ru,ind,split
0,"–•–∞ÃÑ–π—Ç—ã–º–∞—Ç—ç —Ç”Ø—Ä –≤–∞ÃÑ—Ç–∞–Ω —ë—Ö—Ç—ã—Å, –≤–∏—Ç –≤–∞ÃÑ—Ç–∞–Ω —Ö–∞ÃÑ–π—Ç—ã—Å.","–ë–µ–≥–∞—è –∫ –±–µ—Ä–µ–≥—É –æ–∑–µ—Ä–∞ –ø—Ä–∏—à–ª–∞, –∫ –≤–æ–¥–µ –ø–æ–¥–±–µ–∂–∞–ª–∞.",1,test
1,"–•–∞ÃÑ–π—Ç—ã–º —Ç–∞ –Ω–æ–º—Å—ã, –æ—Å –º–∞–Ω—Ä—ã–≥ —Å–∞—Ö–∏–º —Ç–∞—Ç–µ–º –º–∞—Ä—É–º—ã...","–ë–µ–∂–∏—Ç –∏ –¥—É–º–∞–µ—Ç, –ø–æ—á–µ–º—É –∂–µ —à—É–±–∞ —Ç–∞–∫–∞—è –º–∞–ª–∞—è —Å—Ç–∞–ª–∞.",11,test


In [224]:
df_with_metrics = evaluate_model(
    model2_1,
    tokenizer2_1,
    df_all_test,
    lang_1_code='mansi_Cyrl',
    lang_2_code='rus_Cyrl',
    num_beams=4
)

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

Metrics for translation from mansi_Cyrl to rus_Cyrl
BLEU = 21.93 91.7/45.5/10.0/5.6 (BP = 1.000 ratio = 1.000 hyp_len = 12 ref_len = 12)
chrF2 = 44.58

Metrics for translation from rus_Cyrl to mansi_Cyrl
BLEU = 17.97 50.0/20.0/12.5/8.3 (BP = 1.000 ratio = 1.000 hyp_len = 6 ref_len = 6)
chrF2 = 18.69


In [194]:
df_with_metrics = evaluate_model(
    model2_1,
    tokenizer2_1,
    df_dev_mansi[:10],
    lang_1_code='mansi_Cyrl',
    lang_2_code='rus_Cyrl',
    num_beams=4
)
df_with_metrics

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

Metrics for translation from mansi_Cyrl to rus_Cyrl
BLEU = 42.73 80.0/50.0/33.3/25.0 (BP = 1.000 ratio = 1.000 hyp_len = 5 ref_len = 5)
chrF2 = 88.62

Metrics for translation from rus_Cyrl to mansi_Cyrl
BLEU = 0.00 100.0/25.0/25.0/0.0 (BP = 0.717 ratio = 0.750 hyp_len = 3 ref_len = 4)
chrF2 = 43.87


Unnamed: 0,mansi,ru,ind,split,ru_translated,mansi_translated
69,–ûÃÑ–≤–ª–∞—Ö –ø–∞–ª—è–≥–µ —Ö–æ—Å–∞ÃÑ–≥.,–£ –û–≤–ª–∞—Ö–∞ –¥–ª–∏–Ω–Ω—ã–µ —É—à–∏.,69,dev,–£ –ûÃÑ–≤–ª–∞—Ö–∞ –¥–ª–∏–Ω–Ω—ã–µ —É—à–∏.,–ûÃÑ–≤–ª–∞—Ö –ø–∞–ª—å.
154,–ü—ãÃÑ–≥—Ä–∏—Å—å –±–∏–±–ª–∏–æ—Ç–µ–∫–∞–Ω –º–∏–Ω–∏.,–ú–∞–ª—å—á–∏–∫ –∏–¥—ë—Ç –≤ –±–∏–±–ª–∏–æ—Ç–µ–∫—É.,154,dev,–ú–∞–ª—å—á–∏–∫ –ø–æ–π–¥–µ—Ç –≤ –±–∏–±–ª–∏–æ—Ç–µ–∫—É.,–ü—ã–≥—Ä–∏—Å—å –ª–æ–≤–∏–Ω—å—Ç–∞–Ω –∫–æ–ª–Ω –º–∏–Ω–∏.
261,–ú–∞–ª—Ç—ã–ø–Ω—É–≤–µ –ø–∞–ª—å –≤–æ—Ä—ã—Ç —É–∏—Ç—ã–Ω,–ó–≤–µ—Ä—è–º —Å—Ç–∞–Ω–æ–≤–∏—Ç—å—Å—è —Ç–µ–ø–ª–µ–µ –≤ –ª–µ—Å—É.,261,dev,–ú–µ–¥–ª–µ–Ω–Ω–µ–µ —É—Ö–æ–º –∑–≤–µ—Ä–∏ –≤ –ª–µ—Å—É.,–í–æÃÑ—Ä”Ø–π—Ö—É–ª—ã—Ç–Ω —Äƒì–≥—ã”à–Ω—É–≤–µ–≥.
305,"–£–≤—Å—å—Ç–µ–Ω –∞–∫–≤—Ç–æ—Ö —Ç–∞ –∫–æ—Å —Ö–∞–Ω–∏—Å—å—Ç–∞—Å–ª—É–º, –ª–∞ÃÑ—Ç—ã”à –∞—Ç ...","–¢–≤–æ—é —Å—Ç–∞—Ä—à—É—é —Å–µ—Å—Ç—Ä–∏—á–∫—É —É—á–∏–ª-—É—á–∏–ª —è, –æ–Ω–∞ –Ω–µ –ø–æ—Å...",305,dev,"–Ø —Ç–∞–∫ —É—á–∏–ª —Ç–≤–æ–µ–≥–æ —Å—Ç–∞—Ä—à–µ–≥–æ –±—Ä–∞—Ç–∞, –Ω–æ –æ–Ω –Ω–µ –ø–æ—Å...","–£–≤—Å—å—Ç–µ–Ω —Ç–∞ –∫–æ—Å —Ö–∞–Ω—å—Å—å—Ç–∞—Å–ª—É–º, –∞—Ç —Ö”Ø–Ω—Ç–ª–∞—Å."
390,"""–ú–∞–Ω—ã—Ä –∞–∫–≤–∞–≥ —è–ª–∞—Å—çÃÑ–≥—ã–Ω?""","""–ß–µ–≥–æ –ø–æ–≤—Å—é–¥—É –±—Ä–æ–¥–∏—à—å?""",390,dev,"""–ê —á—Ç–æ —Ç—ã –≤—Å–µ –≤—Ä–µ–º—è —Ö–æ–¥–∏—à—å?""","""–ú–∞–Ω—ã—Ä —Ç—ã–≥–ª–µ-—Ç—É–≤–ª–µ —è–ª–∞—Å–∞—Å—ã–Ω?"""
441,–ú–æÃÑ–ª—Ö–æÃÑ—Ç–∞–ª –º–∞ÃÑ–Ω —Ü–∏—Ä–∫—ã—Ç —Å—è—Ä —Ç–∞–ª–∫–≤–∞ —Ö—É–º—Ä–∏—Å—å –≤–∞ÃÑ—Å...,–í—á–µ—Ä–∞ –º—ã –≤–∏–¥–µ–ª–∏ –≤ —Ü–∏—Ä–∫–µ –Ω–µ–æ–±—ã–∫–Ω–æ–≤–µ–Ω–Ω–æ –º–∞–ª–µ–Ω—å–∫–æ...,441,dev,–í—á–µ—Ä–∞ –≤ –Ω–∞—à–µ–º —Ü–∏—Ä–∫–µ —Å–æ–≤—Å–µ–º –º–∞–ª–æ –º—É–∂—á–∏–Ω –≤–∏–¥–µ–ª–∏ ...,–ú–æ–ª—Ö–æÃÑ—Ç–∞–ª —Ü–∏—Ä–∫—ã—Ç —Å—É–Ω—Å—ã–≥–ª–∞—Ö—Ç–∞—Å—É–≤ —çÃÑ–ª—É–º—Ö–æÃÑ–ª–∞—Å —Ç–∏...
446,"–ê–º –æÃÑ–ª–Ω—ç –∫–æ–ª–Ω–∞–∫—É–º —Ö–æ—Å—ã—Ç—ç –∞—Ç –ºƒì—Ç—Ä–∞, —Ç–∞–≤ –ø–∞ÃÑ”à—Ö–≤–∏...","–î–ª–∏–Ω–∞ –º–æ–µ–π –∫–æ–º–Ω–∞—Ç—ã –ø—è—Ç—å –º–µ—Ç—Ä–æ–≤, —à–∏—Ä–∏–Ω–∞ –µ—ë —Ç—Ä–∏ ...",446,dev,"–ú–æ—è –∫–æ–º–Ω–∞—Ç–∫–∞ –¥–ª–∏–Ω–∞ –µ–≥–æ –ø—è—Ç—å –º–µ—Ç—Ä–æ–≤, –¥–ª–∏–Ω–∞ –µ–≥–æ ...","–ö–∞–Ω–∏–º—É–º –ø–∞–ª—ã—Ç—ç –∞—Ç –º–µÃÑ—Ç—Ä–∞, –ø–∞ÃÑ”à—Ö–≤–∏—Ç—ç —Ö—É—Ä—É–º –º–µÃÑ—Ç..."
475,–úƒì–Ω –∞—â—ë–π–∫–∞–º–µ–Ω –º–∞ÃÑ—Ç—É–º –æÃÑ–π–∫–∞.,–ù–∞—à –¥–µ–¥—É—à–∫–∞ —Å—Ç–∞—Ä—ã–π.,475,dev,–ù–∞—à –¥–µ–¥—É—à–∫–∞ –ø–æ –æ—Ç—Ü–æ–≤—Å–∫–æ–π –ª–∏–Ω–∏–∏ –ø–æ–∂–∏–ª–æ–π –º—É–∂—á–∏–Ω–∞.,–ú–∞ÃÑ–Ω –∞—Å—ë–π–∫–∞–≤ –º–∞—Ç—É–º.
499,–ü–æ–Ω–æ–º–∞—Ä–∏ —Ä—É–º–∞–∫–µ–º —Ç–∞–∫–≤–∏ –ª—É–π–≥–∞—Ç—ã—è–Ω—ç.,–î—Ä—É–≥ –ø–æ–Ω–æ–º–∞—Ä—å —Å–∞–º –∏—Ö –ø–æ—ë—Ç.,499,dev,–¥—Ä—É–≥ –ø–æ–Ω–æ–º–∞—Ä—å —Å–∞–º –µ—ë –ø–æ—ë—Ç.,–ü–æ–Ω–æ–º–∞—Ä–∏ —Ä—É–º–∞–∫–µ–º —Ç–∞–∫–≤–∏ –ª—É–π–≥–∏—è–Ω—ç.
518,–û–º–∞–º –ª–∞ÃÑ–ø–∫–∞—Ç —èÃÑ”à–∫ —Ç–æÃÑ—Ä —ëÃÑ–≤—Ç—ã—Å.,–ú–∞–º–∞ –∫—É–ø–∏–ª–∞ –≤ –º–∞–≥–∞–∑–∏–Ω–µ –±–µ–ª—ã–π –ø–ª–∞—Ç–æ–∫.,518,dev,–ú–æ—è –º–∞–º–∞ –≤ –º–∞–≥–∞–∑–∏–Ω–µ –∫—É–ø–∏–ª–∞ –±–µ–ª—ã–π –ø–ª–∞—Ç–æ–∫.,–û–º–∞–º –ª–∞ÃÑ–ø–∫–∞—Ç –ø—É”à–∫—Ç–æÃÑ—Ä—ã–ª —ë–≤—Ç—ã—Å.


## check quality

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Metrics for translation from mansi_Cyrl to rus_Cyrl
BLEU = 22.96 66.7/40.0/12.5/8.3 (BP = 1.000 ratio = 1.000 hyp_len = 6 ref_len = 6)
chrF2 = 70.80

Metrics for translation from rus_Cyrl to mansi_Cyrl
BLEU = 42.73 80.0/50.0/33.3/25.0 (BP = 1.000 ratio = 1.000 hyp_len = 5 ref_len = 5)
chrF2 = 83.94


Unnamed: 0,mansi,ru,ind,split,ru_translated,mansi_translated
499,–ü–æ–Ω–æ–º–∞—Ä–∏ —Ä—É–º–∞–∫–µ–º —Ç–∞–∫–≤–∏ –ª—É–π–≥–∞—Ç—ã—è–Ω—ç.,–î—Ä—É–≥ –ø–æ–Ω–æ–º–∞—Ä—å —Å–∞–º –∏—Ö –ø–æ—ë—Ç.,499,dev,–¥—Ä—É–≥ –ø–æ–Ω–æ–º–∞—Ä—å —Å–∞–º –µ—ë –ø–æ—ë—Ç.,–ü–æ–Ω–æ–º–∞—Ä–∏ —Ä—É–º–∞–∫–µ–º —Ç–∞–∫–≤–∏ –ª—É–π–≥–∞–Ω—ç.


In [214]:
df_with_metrics['mansi_translated'].tolist()

['–ûÃÑ–≤–ª–∞—Ö –ø–∞–ª—å.',
 '–ü—ã–≥—Ä–∏—Å—å –ª–æ–≤–∏–Ω—å—Ç–∞–Ω –∫–æ–ª–Ω –º–∏–Ω–∏.',
 '–í–æÃÑ—Ä”Ø–π—Ö—É–ª—ã—Ç–Ω —Äƒì–≥—ã”à–Ω—É–≤–µ–≥.',
 '–£–≤—Å—å—Ç–µ–Ω —Ç–∞ –∫–æ—Å —Ö–∞–Ω—å—Å—å—Ç–∞—Å–ª—É–º, –∞—Ç —Ö”Ø–Ω—Ç–ª–∞—Å.',
 '"–ú–∞–Ω—ã—Ä —Ç—ã–≥–ª–µ-—Ç—É–≤–ª–µ —è–ª–∞—Å–∞—Å—ã–Ω?"',
 '–ú–æ–ª—Ö–æÃÑ—Ç–∞–ª —Ü–∏—Ä–∫—ã—Ç —Å—É–Ω—Å—ã–≥–ª–∞—Ö—Ç–∞—Å—É–≤ —çÃÑ–ª—É–º—Ö–æÃÑ–ª–∞—Å —Ç–∏ –ª–∏–ª–∏–ø—É—Ç.',
 '–ö–∞–Ω–∏–º—É–º –ø–∞–ª—ã—Ç—ç –∞—Ç –º–µÃÑ—Ç—Ä–∞, –ø–∞ÃÑ”à—Ö–≤–∏—Ç—ç —Ö—É—Ä—É–º –º–µÃÑ—Ç—Ä–∞, –∫–∞—Ä—Å—ã—Ç—ç –º–µÃÑ—Ç—Ä–∞.',
 '–ú–∞ÃÑ–Ω –∞—Å—ë–π–∫–∞–≤ –º–∞—Ç—É–º.',
 '–ü–æ–Ω–æ–º–∞—Ä–∏ —Ä—É–º–∞–∫–µ–º —Ç–∞–∫–≤–∏ –ª—É–π–≥–∏—è–Ω—ç.',
 '–û–º–∞–º –ª–∞ÃÑ–ø–∫–∞—Ç –ø—É”à–∫—Ç–æÃÑ—Ä—ã–ª —ë–≤—Ç—ã—Å.']

In [215]:
df_with_metrics["mansi"].tolist()

['–ûÃÑ–≤–ª–∞—Ö –ø–∞–ª—è–≥–µ —Ö–æ—Å–∞ÃÑ–≥.',
 '–ü—ãÃÑ–≥—Ä–∏—Å—å –±–∏–±–ª–∏–æ—Ç–µ–∫–∞–Ω –º–∏–Ω–∏.',
 '–ú–∞–ª—Ç—ã–ø–Ω—É–≤–µ  –ø–∞–ª—å –≤–æ—Ä—ã—Ç —É–∏—Ç—ã–Ω',
 '–£–≤—Å—å—Ç–µ–Ω –∞–∫–≤—Ç–æ—Ö —Ç–∞ –∫–æ—Å —Ö–∞–Ω–∏—Å—å—Ç–∞—Å–ª—É–º, –ª–∞ÃÑ—Ç—ã”à –∞—Ç —Ö”Ø–Ω—Ç–ª–∞—Å.',
 '"–ú–∞–Ω—ã—Ä –∞–∫–≤–∞–≥ —è–ª–∞—Å—çÃÑ–≥—ã–Ω?"',
 '–ú–æÃÑ–ª—Ö–æÃÑ—Ç–∞–ª –º–∞ÃÑ–Ω —Ü–∏—Ä–∫—ã—Ç —Å—è—Ä —Ç–∞–ª–∫–≤–∞ —Ö—É–º—Ä–∏—Å—å –≤–∞ÃÑ—Å—É–≤ ‚Äì —Ç—ã –ª–∏–ª–∏–ø—É—Ç –æÃÑ–ª—ã—Å.',
 '–ê–º –æÃÑ–ª–Ω—ç –∫–æ–ª–Ω–∞–∫—É–º —Ö–æ—Å—ã—Ç—ç –∞—Ç –ºƒì—Ç—Ä–∞, —Ç–∞–≤ –ø–∞ÃÑ”à—Ö–≤–∏—Ç—ç —Ö”Ø—Ä—É–º –ºƒì—Ç—Ä–∞, —Ç–∞–≤ –ª—éÃÑ–ª–∏—Ç–µ —Ö”Ø—Ä—É–º –ºƒì—Ç—Ä–∞.',
 '–úƒì–Ω –∞—â—ë–π–∫–∞–º–µ–Ω –º–∞ÃÑ—Ç—É–º –æÃÑ–π–∫–∞.',
 '–ü–æ–Ω–æ–º–∞—Ä–∏ —Ä—É–º–∞–∫–µ–º —Ç–∞–∫–≤–∏ –ª—É–π–≥–∞—Ç—ã—è–Ω—ç.',
 '–û–º–∞–º –ª–∞ÃÑ–ø–∫–∞—Ç —èÃÑ”à–∫ —Ç–æÃÑ—Ä —ëÃÑ–≤—Ç—ã—Å.']

In [216]:
bleu_calc = sacrebleu.metrics.BLEU()
print(bleu_calc.corpus_score(df_with_metrics['mansi_translated'].tolist(), [df_with_metrics["mansi"].tolist()]))

BLEU = 7.61 53.4/18.8/5.3/1.8 (BP = 0.772 ratio = 0.795 hyp_len = 58 ref_len = 73)


In [208]:
print(df_with_metrics['mansi_translated'][8:9].tolist())
print(df_with_metrics["mansi"][8:9].tolist())
bleu_calc = sacrebleu.metrics.BLEU()
print(bleu_calc.corpus_score(df_with_metrics['mansi_translated'][8:9].tolist(), [df_with_metrics["mansi"][8:9].tolist()]))

['–ü–æ–Ω–æ–º–∞—Ä–∏ —Ä—É–º–∞–∫–µ–º —Ç–∞–∫–≤–∏ –ª—É–π–≥–∏—è–Ω—ç.']
['–ü–æ–Ω–æ–º–∞—Ä–∏ —Ä—É–º–∞–∫–µ–º —Ç–∞–∫–≤–∏ –ª—É–π–≥–∞—Ç—ã—è–Ω—ç.']
BLEU = 42.73 80.0/50.0/33.3/25.0 (BP = 1.000 ratio = 1.000 hyp_len = 5 ref_len = 5)


In [205]:
"–ü–æ–Ω–æ–º–∞—Ä–∏ —Ä—É–º–∞–∫–µ–º —Ç–∞–∫–≤–∏ –ª—É–π–≥" == "–ü–æ–Ω–æ–º–∞—Ä–∏ —Ä—É–º–∞–∫–µ–º —Ç–∞–∫–≤–∏ –ª—É–π–≥"

True