In [1]:
import json
import os
import re
import warnings

import requests
from tqdm import tqdm

In [2]:
def load_jsonl(jsonl):
    """Read a JSON Lines file and return its records as a list.

    Args:
        jsonl: Path to a file holding one JSON document per line.

    Returns:
        list: The decoded value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8: the platform default encoding is not UTF-8
    # everywhere, and the data contains Cyrillic text.
    with open(jsonl, 'r', encoding='utf-8') as file:
        # Blank lines (e.g. an extra newline at the end of the file) are
        # skipped instead of crashing json.loads.
        return [json.loads(line) for line in file if line.strip()]
    
def load_json(file):
    """Read a single JSON document from a file.

    Args:
        file: Path to the JSON file.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Explicit UTF-8 so non-ASCII content is decoded the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(file, 'r', encoding='utf-8') as f:
        return json.load(f)

In [3]:
# Load the three dataset splits (same order as before: val, train, test).
validation = load_jsonl('data/val.jsonl')
train = load_jsonl('data/train.jsonl')
test = load_jsonl('data/test.jsonl')

In [10]:
# Flatten every split into three parallel-purpose lists of raw strings:
# passage texts, question texts and answer texts.
answers = []
questions = []
texts = []
for corpus in [*train, *validation, *test]:
    passage = corpus['passage']
    texts.append(passage['text'])
    for question in passage['questions']:
        questions.append(question['question'])
        answers.extend(answer['text'] for answer in question['answers'])

# Default Translate

In [5]:
def is_need_translation(s):
    """Tell whether a string contains Russian letters and so needs translating.

    Args:
        s: Text to inspect.

    Returns:
        bool: True if ``s`` contains at least one of а-я, А-Я, ё or Ё.
    """
    # 'ё' and 'Ё' lie outside the contiguous а-я / А-Я code-point ranges,
    # so they have to be listed explicitly.
    return bool(re.search('[а-яА-ЯёЁ]', s))

# is_need_translation('Хеллоу!') => True

In [7]:
def split_texts(texts, threshold=5, separator=''):
    """Greedily pack consecutive texts into groups that fit a character budget.

    A group is closed as soon as adding the next text would make the group,
    once joined with ``separator``, longer than ``threshold`` characters.

    Args:
        texts: Iterable of strings; their order is preserved.
        threshold: Maximum length of one group after joining with ``separator``.
        separator: String that will sit between the texts of a group; only its
            length is used here.

    Returns:
        list[list[str]]: The groups, in order. Empty input gives ``[]``.

    Raises:
        AssertionError: If a single text is longer than ``threshold``.
    """
    groups = []
    current = []
    # Length of `current` counting one separator after every text, so that
    # `used + len(next_text)` is exactly the joined length with the next text.
    used = 0
    separator_length = len(separator)
    for text in texts:
        # The message used to reference a misspelled variable, which turned
        # this check into a NameError instead of the intended AssertionError.
        assert len(text) <= threshold, 'Текст больше маски! (\'%s\' > %d)' % (text, threshold)
        if used + len(text) > threshold:
            groups.append(current)
            current = []
            used = 0
        used += len(text) + separator_length
        current.append(text)
    if current:
        groups.append(current)
    return groups

# split_texts(['Котик', 'это', 'вам', 'не', 'собака!'], 7) => [['Котик'], ['это', 'вам'], ['не'], ['собака!']]

In [8]:
def make_dictionary(text_groups, translator=None):
    """Build a ``{source text: translation}`` mapping, one translator call per group.

    Texts for which ``is_need_translation`` is false are mapped to themselves
    and never sent to ``translator``.

    Args:
        text_groups: Iterable of lists of strings (e.g. the output of
            ``split_texts``).
        translator: Callable that takes a list of strings and returns their
            translations; the results are paired with the inputs positionally.
            Must be supplied whenever any text needs translation.

    Returns:
        dict: The mapping built so far. If translating a group fails (for
        example when the service's global limit is exceeded), processing stops,
        a warning is emitted and the partial mapping is returned.
    """
    dictionary = {}
    for group in tqdm(text_groups):
        translatable, not_translatable = [], []
        for text in group:
            if is_need_translation(text):
                translatable.append(text)
            else:
                not_translatable.append(text)
        # Texts without Russian letters are kept verbatim.
        dictionary.update((text, text) for text in not_translatable)
        if translatable:
            try:
                dictionary.update(zip(translatable, translator(translatable)))
            except Exception as error:
                # Best effort on purpose: keep what was already translated.
                # Unlike a bare `except:`, this lets KeyboardInterrupt and
                # SystemExit propagate, and the failure is reported instead
                # of being silently swallowed.
                warnings.warn(
                    'Translation stopped early, returning a partial dictionary: %r' % (error,),
                    stacklevel=2,
                )
                return dictionary
    return dictionary

# make_dictionary([['Кот', 'it\'s'], ['собака!']], translator=yc_translate) => {"it's": "it's", 'Кот': 'Cat', 'собака!': 'the dog!'}

In [9]:
def save_dictionary(dictionary, file):
    """Write a dictionary to ``<file>.json`` as pretty-printed JSON.

    Args:
        dictionary: JSON-serializable mapping to store.
        file: Output path without the ``.json`` extension.
    """
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly; the locale default encoding may be unable
    # to represent Cyrillic text.
    with open(file + '.json', 'w', encoding='utf-8') as f:
        json.dump(dictionary, f, sort_keys=False, indent=4, ensure_ascii=False, separators=(',', ': '))

# Yandex Translate

**Note:** run `yc iam create-token` to get a new IAM token when the current one expires.

In [4]:
# Folder id sent as `folder_id` with every Yandex translate request.
yc_folder_id = 'b1g5geju4iflfiqq0fgi'
# The IAM token is a credential and must not be committed with the notebook:
# export it before starting the kernel, e.g.
#   export YC_IAM_TOKEN="$(yc iam create-token)"
yc_key = os.environ.get('YC_IAM_TOKEN', '')
if not yc_key:
    warnings.warn('YC_IAM_TOKEN is not set; Yandex Translate requests will be rejected.')
# Character budget of one request; passed to split_texts as `threshold`.
yc_treshold = 10000

In [6]:
def yc_translate(texts, timeout=60):
    """Translate Russian texts to English with the Yandex Cloud Translate v2 API.

    Args:
        texts: List of strings sent in a single request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook forever.

    Returns:
        list[str]: The ``text`` field of every entry in the response's
        ``translations`` list.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection failures or timeout.
    """
    payload = {
        'folder_id': yc_folder_id,
        'texts': texts,
        'targetLanguageCode': 'en',
        'sourceLanguageCode': 'ru',
    }
    response = requests.post(
        'https://translate.api.cloud.yandex.net/translate/v2/translate',
        json=payload,
        headers={'Content-Type': 'application/json', 'Authorization': 'Bearer ' + yc_key},
        timeout=timeout,
    )
    # Surface the HTTP error itself rather than an opaque KeyError on
    # 'translations' when the body is an error document.
    response.raise_for_status()
    return [translation['text'] for translation in response.json()['translations']]

# yc_translate(['Привет!']) => ['Hi!']

In [14]:
# Batch the questions under the Yandex budget, translate each batch,
# then store the mapping in data/yc_questions.json.
yc_question_groups = split_texts(questions, threshold=yc_treshold)
translated_quesitons = make_dictionary(yc_question_groups, translator=yc_translate)
save_dictionary(translated_quesitons, 'data/yc_questions')

100%|██████████| 28/28 [02:00<00:00,  4.31s/it]


In [19]:
# Batch the answers under the Yandex budget, translate each batch,
# then store the mapping in data/yc_answers.json.
yc_answer_groups = split_texts(answers, threshold=yc_treshold)
translated_answers = make_dictionary(yc_answer_groups, translator=yc_translate)
save_dictionary(translated_answers, 'data/yc_answers')

100%|██████████| 80/80 [07:37<00:00,  5.72s/it]


In [24]:
# Batch the passage texts under the Yandex budget, translate each batch,
# then store the mapping in data/yc_texts.json.
yc_text_groups = split_texts(texts, threshold=yc_treshold)
translated_texts = make_dictionary(yc_text_groups, translator=yc_translate)
save_dictionary(translated_texts, 'data/yc_texts')

100%|██████████| 138/138 [09:35<00:00,  4.17s/it]


# Google Translate

In [11]:
from libs.google_trans import google_translator as GoogleTranslator

In [12]:
# Module-level Google Translate client; gl_translate calls its .translate().
# NOTE(review): url_suffix="ru" presumably selects the regional Google
# endpoint — confirm against libs.google_trans.
translator = GoogleTranslator(url_suffix="ru")  

In [19]:
# Character budget of one Google request; passed to split_texts as `threshold`.
# NOTE(review): 4000 is presumably chosen to stay under the service's
# per-request limit — confirm.
gl_threshold = 4000
# Passed to split_texts as `separator`, so its length is reserved between
# the texts of one batch.
gl_separator = '\n'

In [20]:
def gl_translate(texts):
    """Translate Russian texts to English in a single Google Translate request.

    The texts are joined with ``gl_separator``, sent as one string, and the
    translated string is split back on the same separator.

    Args:
        texts: List of strings; assumed not to contain ``gl_separator``
            themselves.

    Returns:
        list[str]: The translated text split on ``gl_separator``.
        NOTE(review): nothing here guarantees that the number of pieces equals
        ``len(texts)`` — confirm the service preserves the separators.
    """
    # Join and split with the same separator that split_texts budgets for,
    # instead of a second hard-coded '\n' that could drift out of sync.
    # gl_separator must be non-empty: str.split('') raises ValueError.
    sentences = translator.translate(gl_separator.join(texts), lang_tgt='en', lang_src='ru')
    # The first element of every returned item is concatenated to form the
    # translated text (presumably the translated fragment — verify against
    # libs.google_trans).
    translated_text = ''.join(sentence[0] for sentence in sentences)
    return translated_text.split(gl_separator)

#gl_translate(['Привет!', 'Как дела?', 'У меня все хорошо!']) => ['Hello!', 'How are you?', "I'm all good!"]

In [21]:
# Batch the questions under the Google budget (separator included),
# translate each batch, then store the mapping in data/gl_questions.json.
gl_question_groups = split_texts(questions, threshold=gl_threshold, separator=gl_separator)
translated_quesitons = make_dictionary(gl_question_groups, translator=gl_translate)
save_dictionary(translated_quesitons, 'data/gl_questions')

100%|██████████| 71/71 [00:34<00:00,  2.04it/s]


In [19]:
# Batch the answers under the Google budget (separator included),
# translate each batch, then store the mapping in data/gl_answers.json.
gl_answer_groups = split_texts(answers, threshold=gl_threshold, separator=gl_separator)
translated_answers = make_dictionary(gl_answer_groups, translator=gl_translate)
save_dictionary(translated_answers, 'data/gl_answers')

100%|██████████| 80/80 [07:37<00:00,  5.72s/it]


In [24]:
# Batch the passage texts under the Google budget (separator included),
# translate each batch, then store the mapping in data/gl_texts.json.
gl_text_groups = split_texts(texts, threshold=gl_threshold, separator=gl_separator)
translated_texts = make_dictionary(gl_text_groups, translator=gl_translate)
save_dictionary(translated_texts, 'data/gl_texts')

100%|██████████| 138/138 [09:35<00:00,  4.17s/it]
