In [1]:
import random

import numpy as np
import pandas as pd

import torch

# Ask PyTorch's CUDA caching allocator to release cached blocks it is not
# currently using. Mainly relevant when this cell is re-run in a kernel that
# has already done GPU work.
torch.cuda.empty_cache()

def seed_all(seed: int) -> None:
    """Seed every random number generator used by this notebook.

    Passes ``seed`` to NumPy's global generator, to PyTorch (the general
    entry point plus both CUDA seeding functions) and to Python's ``random``
    module, then puts cuDNN into deterministic mode with benchmarking off.

    Args:
        seed: Value handed unchanged to each seeding function.
    """
    seeders = (
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN flags: no autotuned kernel selection, deterministic algorithms only.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [2]:
# Fix the random seed
SEED = 42
seed_all(SEED)

In [3]:
from pathlib import Path

import pandas as pd

# Data root (relative to the kernel's working directory) and the cache
# directory nested inside it.
DATA_PATH = Path('../data/')
DATA_CACHE = DATA_PATH.joinpath('cache_dir/')

# Create both directories up front; existing directories are left untouched.
DATA_PATH.mkdir(parents=True, exist_ok=True)
DATA_CACHE.mkdir(parents=True, exist_ok=True)

# Allow up to 500 characters per cell when pandas renders text columns.
pd.set_option('display.max_colwidth', 500)

In [4]:
import os
import sys

# Absolute path of the directory two levels above the kernel's working
# directory. NOTE(review): presumably the repository root that contains the
# `app` package imported in the next cell — confirm against the repo layout.
project_path = os.path.abspath(os.path.join(os.getcwd(), "../.."))

# Register the path only once: an unconditional append would add a duplicate
# sys.path entry every time this cell is re-run in the same kernel.
if project_path not in sys.path:
    sys.path.append(project_path)

In [5]:
from app.models.translate_model import TranslationModel

In [6]:
# Instantiate the project's TranslationModel with the facebook/m2m100_418M
# checkpoint on CPU. The commented-out line keeps the 1.2B checkpoint name at
# hand as an alternative.
# NOTE(review): cache_dir is presumably where downloaded model files are
# stored — confirm in TranslationModel.
model = TranslationModel(
    model_name="facebook/m2m100_418M",
    # model_name="facebook/m2m100_1.2B",
    cache_dir=DATA_CACHE,
    device="cpu"
)

In [7]:
from pathlib import Path
import json

# Sample payload read from the working directory; the keys "texts",
# "source_lang" and "target_langs" are pulled out of it below.
long_text_path = Path("long_text.json")
data = json.loads(long_text_path.read_text(encoding="utf-8"))

texts, source_lang, target_langs = (
    data["texts"],
    data["source_lang"],
    data["target_langs"],
)
data

{'texts': ['–ì–ª–∞–≤–∞ 1. –ü—Ä–∏–∑—ã–≤ –∫ –±–∏—Ç–≤–µ\n\n–ú–∏—Ä –¥—Ä–µ–≤–Ω–æ—Å—Ç–∏ –±—ã–ª –ø–æ–ª–æ–Ω —á—É–¥–µ—Å –∏ —á—É–¥–æ–≤–∏—â. –í —Ç–µ –¥–Ω–∏ –Ω–∞ –∑–µ–º–ª–µ –≤–µ–ª–∏–∫–∏—Ö –ø—Ä–∞–≤–∏—Ç–µ–ª–µ–π –∏ –ª–µ–≥–µ–Ω–¥–∞—Ä–Ω—ã—Ö –≥–µ—Ä–æ–µ–≤, —Å—Ä–∞–∂–µ–Ω–∏—è –Ω–µ –æ–≥—Ä–∞–Ω–∏—á–∏–≤–∞–ª–∏—Å—å —Ç–æ–ª—å–∫–æ –æ—Ä—É–∂–∏–µ–º –∏ –∞—Ä–º–∏—è–º–∏. –û–Ω–∏ –±—ã–ª–∏ –∏—Å–ø—ã—Ç–∞–Ω–∏—è–º–∏ —Å–∏–ª—ã –¥—É—Ö–∞, –º—É–∂–µ—Å—Ç–≤–∞ –∏ —É–º–∞. –û–¥–Ω–∏–º –∏–∑ —Ç–∞–∫–∏—Ö –∏—Å–ø—ã—Ç–∞–Ω–∏–π —Å—Ç–∞–ª–∏ –≥–æ–Ω–∫–∏ –Ω–∞ –∫–æ–ª–µ—Å–Ω–∏—Ü–∞—Ö, —á—Ç–æ —Å—Ç–∞–ª–æ –Ω–µ–≤–µ—Ä–æ—è—Ç–Ω–æ –ø–æ–ø—É–ª—è—Ä–Ω—ã–º –≤ —Ä–∞–∑–ª–∏—á–Ω—ã—Ö —É–≥–æ–ª–∫–∞—Ö –º–∏—Ä–∞, –∏ –¥–∞–∂–µ –≤ –ò—É–¥–µ–µ, –≥–¥–µ —Å–æ–±—Ä–∞–ª–∏—Å—å –≤–µ–ª–∏–∫–∏–µ –≥–µ—Ä–æ–∏ –ë–∏–±–ª–∏–∏.\n\n–≠—Ç–∏ –≥–æ–Ω–∫–∏, –æ–¥–Ω–∞–∫–æ, –Ω–µ –±—ã–ª–∏ –ø—Ä–æ—Å—Ç—ã–º–∏. –û–Ω–∏ –Ω–µ –±—ã–ª–∏ –ª–∏—à—å –∑–∞–±–∞–≤–∞ —Ä–∞–¥–∏. –ö–∞–∂–¥–∞—è –∫–æ–ª–µ—Å–Ω–∏—Ü–∞ –æ–ª–∏—Ü–µ—Ç–≤–æ—Ä—è–ª–∞ –Ω–µ —Ç–æ–ª—å–∫–æ –≤–æ–µ–Ω–Ω—É—é –º–æ—â—å, –Ω–æ –∏ –≤–Ω—É—Ç—Ä–µ–Ω–Ω–∏–π –±–æ–π, –±–æ—Ä—å–±—É —Å —Å—É–¥—å–±

In [8]:
# Work with the first text of the payload and the first target language.
# NOTE(review): the later cells visible in this notebook pass 'en' literally
# rather than target_lang — confirm whether target_lang was meant to be used.
text = texts[0]
target_lang = target_langs[0]

In [9]:
# Split the sample text into blocks with the project's helper and display them.
# NOTE(review): max_tokens=128 and buffer=8 presumably bound the token length
# of each block with a safety margin — confirm in
# TranslationModel.split_text_to_blocks.
# NOTE(review): in the recorded output the chapter heading is merged with the
# sentence that follows it, although the input separates them with a blank
# line — check whether paragraph breaks should be treated as block boundaries.
blocks = model.split_text_to_blocks(
    text=text,
    source_lang=source_lang,
    max_tokens=128,
    buffer=8
)
blocks

['–ì–ª–∞–≤–∞ 1.',
 '–ü—Ä–∏–∑—ã–≤ –∫ –±–∏—Ç–≤–µ –ú–∏—Ä –¥—Ä–µ–≤–Ω–æ—Å—Ç–∏ –±—ã–ª –ø–æ–ª–æ–Ω —á—É–¥–µ—Å –∏ —á—É–¥–æ–≤–∏—â.',
 '–í —Ç–µ –¥–Ω–∏ –Ω–∞ –∑–µ–º–ª–µ –≤–µ–ª–∏–∫–∏—Ö –ø—Ä–∞–≤–∏—Ç–µ–ª–µ–π –∏ –ª–µ–≥–µ–Ω–¥–∞—Ä–Ω—ã—Ö –≥–µ—Ä–æ–µ–≤, —Å—Ä–∞–∂–µ–Ω–∏—è –Ω–µ –æ–≥—Ä–∞–Ω–∏—á–∏–≤–∞–ª–∏—Å—å —Ç–æ–ª—å–∫–æ –æ—Ä—É–∂–∏–µ–º –∏ –∞—Ä–º–∏—è–º–∏.',
 '–û–Ω–∏ –±—ã–ª–∏ –∏—Å–ø—ã—Ç–∞–Ω–∏—è–º–∏ —Å–∏–ª—ã –¥—É—Ö–∞, –º—É–∂–µ—Å—Ç–≤–∞ –∏ —É–º–∞.',
 '–û–¥–Ω–∏–º –∏–∑ —Ç–∞–∫–∏—Ö –∏—Å–ø—ã—Ç–∞–Ω–∏–π —Å—Ç–∞–ª–∏ –≥–æ–Ω–∫–∏ –Ω–∞ –∫–æ–ª–µ—Å–Ω–∏—Ü–∞—Ö, —á—Ç–æ —Å—Ç–∞–ª–æ –Ω–µ–≤–µ—Ä–æ—è—Ç–Ω–æ –ø–æ–ø—É–ª—è—Ä–Ω—ã–º –≤ —Ä–∞–∑–ª–∏—á–Ω—ã—Ö —É–≥–æ–ª–∫–∞—Ö –º–∏—Ä–∞, –∏ –¥–∞–∂–µ –≤ –ò—É–¥–µ–µ, –≥–¥–µ —Å–æ–±—Ä–∞–ª–∏—Å—å –≤–µ–ª–∏–∫–∏–µ –≥–µ—Ä–æ–∏ –ë–∏–±–ª–∏–∏.',
 '–≠—Ç–∏ –≥–æ–Ω–∫–∏, –æ–¥–Ω–∞–∫–æ, –Ω–µ –±—ã–ª–∏ –ø—Ä–æ—Å—Ç—ã–º–∏.',
 '–û–Ω–∏ –Ω–µ –±—ã–ª–∏ –ª–∏—à—å –∑–∞–±–∞–≤–∞ —Ä–∞–¥–∏.',
 '–ö–∞–∂–¥–∞—è –∫–æ–ª–µ—Å–Ω–∏—Ü–∞ –æ–ª–∏—Ü–µ—Ç–≤–æ—Ä—è–ª–∞ –Ω–µ —Ç–æ–ª—å–∫–æ –≤–æ–µ–Ω–Ω—É—é –º–æ—â—å, –Ω–æ –∏ –≤–Ω—É—Ç—Ä–µ–Ω–Ω–∏–π –±–æ–π, –±–æ—Ä—å–±—É —

In [10]:
# Show the block that the tokenize/translate cells below use as sample input.
blocks[11]

'–ú–æ–∏—Å–µ–π –∏ –µ–≥–æ –∫–æ–ª–µ—Å–Ω–∏—Ü–∞ –ú–æ–∏—Å–µ–π, –≤–µ–ª–∏–∫–∏–π –≤–æ–∂–¥—å –∏ –ø—Ä–æ—Ä–æ–∫, –±—ã–ª –Ω–∞ —Å—Ç–∞—Ä–æ—Å—Ç–∏ –ª–µ—Ç, –Ω–æ –≤ –µ–≥–æ –≥–ª–∞–∑–∞—Ö –≥–æ—Ä–µ–ª –æ–≥–æ–Ω—å –Ω–µ–ø–æ–∫–æ—Ä–Ω–æ–≥–æ –¥—É—Ö–∞.'

In [11]:
# Tokenize the sample block with the project's wrapper and display the shapes
# of the resulting input ids and attention mask.
# NOTE(review): max_length=128 presumably caps/truncates the input length —
# confirm in TranslationModel.tokenize.
model_inputs = model.tokenize(
    blocks[11], 
    source_lang, 
    max_length=128,
)
model_inputs['input_ids'].shape, model_inputs['attention_mask'].shape

(torch.Size([1, 45]), torch.Size([1, 45]))

In [12]:
# Decode the input ids back to text WITHOUT dropping special tokens, so the
# source-language tag and end-of-sequence marker in the encoded input are visible.
decoded_text = model.tokenizer.decode(
    model_inputs["input_ids"][0],
    skip_special_tokens=False
)
print(decoded_text)

__ru__ –ú–æ–∏—Å–µ–π –∏ –µ–≥–æ –∫–æ–ª–µ—Å–Ω–∏—Ü–∞ –ú–æ–∏—Å–µ–π, –≤–µ–ª–∏–∫–∏–π –≤–æ–∂–¥—å –∏ –ø—Ä–æ—Ä–æ–∫, –±—ã–ª –Ω–∞ —Å—Ç–∞—Ä–æ—Å—Ç–∏ –ª–µ—Ç, –Ω–æ –≤ –µ–≥–æ –≥–ª–∞–∑–∞—Ö –≥–æ—Ä–µ–ª –æ–≥–æ–Ω—å –Ω–µ–ø–æ–∫–æ—Ä–Ω–æ–≥–æ –¥—É—Ö–∞.</s>


In [13]:
# Call generate() on the wrapped model directly (bypassing the project's
# translate helpers), decoding with deterministic beam search.
#
# The sampling-only arguments top_k, top_p and temperature were removed: with
# do_sample=False they do not influence beam search, and recent transformers
# releases warn about that contradictory configuration (top_p=0.95 in
# particular). The decoded result is unaffected.
generated_tokens = model.model.generate(
    **model_inputs,
    # Force the first generated token to be the English language tag.
    forced_bos_token_id=model.tokenizer.lang_code_to_id['en'],
    max_length=256,
    num_beams=5,
    do_sample=False,
    repetition_penalty=1.2,
    early_stopping=True,
    # length_penalty=1.0,
    no_repeat_ngram_size=3,
)




In [14]:
# Decode the generated tokens
generated_text = model.tokenizer.decode(
    generated_tokens[0],  # Take the first element in case the output is multi-dimensional
    skip_special_tokens=True  # It is usually better to skip special tokens
)

generated_text

'Moses and his wheelchair Moses, the great leader and prophet, were old years old, but in his eyes a fire of unacceptable spirit burned.'

In [15]:
# Translate the same block through the model's private _translate helper,
# passing only the text and the language pair. The commented-out keyword
# arguments are optional overrides that are currently disabled.
# NOTE(review): the recorded output differs from the direct generate() call
# above, so _translate evidently applies different generation settings —
# confirm its defaults in TranslationModel.
translate = model._translate(
    text=blocks[11],
    source_lang=source_lang,
    target_lang='en',
    # repetition_penalty=1.0,
    # temperature=0.1,
    # input_max_length=128,
    # output_max_length=256,
    # num_beams=5,
    # early_stopping=False,
    # do_sample=True,
    # length_penalty=1.0,
)
translate

'Moses and his wardrobe Moses, the great leader and prophet, were old years old, but in his eyes a fire of the unrighteous spirit burned.'

In [16]:
# Same translation through the public translate() entry point; the recorded
# output is identical to the _translate() result for this block.
result = model.translate(
    text=blocks[11],
    source_lang=source_lang,
    target_lang='en'
)
result

'Moses and his wardrobe Moses, the great leader and prophet, were old years old, but in his eyes a fire of the unrighteous spirit burned.'

In [17]:
# Disabled example: translate every text into every target language with
# translate_batch and print a preview of each translation. Kept for reference.
# results = model.translate_batch(
#     texts=texts,
#     source_lang=source_lang,
#     target_langs=target_langs
# )

# for i, result in enumerate(results):
#     print(f"\n🔹 Текст {i+1}")
#     for lang, translation in result.items():
#         print(f"[{lang}] ({len(translation)} символов):\n{translation[:1000]}...")  # first 1000 characters
