In [89]:
import openai
from openai import AzureOpenAI, AsyncAzureOpenAI
from tqdm.notebook import tqdm
import os
from dotenv import load_dotenv, find_dotenv
import requests
import base64

# Model name constants; any of them can be passed as the `model` argument
# of the generate/eval/improve helpers defined in this notebook.
GPT_3_5_TURBO = "gpt-3.5-turbo"
GPT_4_TURBO_PREVIEW = "gpt-4-turbo-preview"
GPT_4 = 'gpt-4'
# Used as the default `model` of every chat-completion helper in this notebook.
GPT_4o = 'gpt-4o'

def get_openai_api_key():
    """Load the .env file located by ``find_dotenv()`` and return ``OPENAI_API_KEY``.

    Returns:
        The value of the ``OPENAI_API_KEY`` environment variable, or ``None``
        when it is not set.
    """
    dotenv_path = find_dotenv()
    load_dotenv(dotenv_path)
    return os.environ.get("OPENAI_API_KEY")

# get_openai_api_key() also loads the .env file, so variables defined there
# are visible to the os.getenv() calls below.
OPENAI_API_KEY = get_openai_api_key()
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")

# Shared chat-completions client used by every helper in this notebook.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-07-01-preview",
    azure_endpoint=azure_endpoint,
)

In [90]:
# Default language triple. BASE_SYSTEM_PROMPT interpolates these names at the
# moment its cell runs; later cells rebind them for the Finnish -> English run.
source_lang, secondary_lang, target_lang = 'English', 'Russian', 'Finnish'

In [91]:
import json


# Dedicated prefix for this loader. The shared EVAL_FOLDER name is rebound
# further down this cell, so reading it at call time would silently point
# this function at the English-Finnish results instead.
RU_ENG_EVAL_FOLDER = '../../data/russian-english/cards/eval_results/ru_eng_'
def get_ru_finn_eval_results_from_file(file_name):
    """Load evaluation results stored as JSON under the ``ru_eng_`` prefix.

    NOTE(review): the function name says "ru_finn" while the folder is
    ``russian-english`` -- confirm which one is intended.

    Args:
        file_name: file name appended to ``RU_ENG_EVAL_FOLDER``.

    Returns:
        The object decoded from the JSON file.
    """
    file_path = RU_ENG_EVAL_FOLDER + file_name
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

EVAL_FOLDER = '../../data/english-finnish/cards/eval_results/eng_finn_'
def get_eng_finn_eval_results_from_file(file_name):
    """Read English-Finnish evaluation results from a JSON file.

    The path is the module-level ``EVAL_FOLDER`` prefix (looked up at call
    time) followed by ``file_name``.

    Returns:
        The object decoded from the JSON file.
    """
    with open(EVAL_FOLDER + file_name, 'r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

RU_ENG_OUTPUT_FOLDER = '../../data/russian-english/cards/test_cards/ru_eng_'
def get_ru_eng_cards_from_file(file_name):
    """Load Russian-English cards from a JSON file.

    Args:
        file_name: file name appended to ``RU_ENG_OUTPUT_FOLDER``.

    Returns:
        The object decoded from the JSON file.
    """
    # Use this loader's own prefix: the generic OUTPUT_FOLDER global points
    # at the English-Finnish test cards.
    file_path = RU_ENG_OUTPUT_FOLDER + file_name
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


RU_FINN_FOLDER_WITH_JSON = '../../data/russian-finnish/cards/curated_platform_cards/'
def get_ru_finn_cards_from_file(file_name):
    """Load curated Russian-Finnish cards, skipping cards flagged as deleted.

    A card is skipped whenever it has an ``isMarkedDeleted`` key, whatever
    the value stored under that key.

    Args:
        file_name: file name appended to ``RU_FINN_FOLDER_WITH_JSON``.

    Returns:
        List of the remaining cards in file order.
    """
    with open(RU_FINN_FOLDER_WITH_JSON + file_name, 'r', encoding='utf-8') as handle:
        all_cards = json.load(handle)
    return [entry for entry in all_cards if 'isMarkedDeleted' not in entry.keys()]


ENG_FINN_OUTPUT_FOLDER = '../../data/english-finnish/cards/test_cards/eng_finn_'
def get_eng_finn_cards_from_file(file_name):
    """Read English-Finnish test cards from a JSON file.

    Args:
        file_name: file name appended to ``ENG_FINN_OUTPUT_FOLDER``.

    Returns:
        The object decoded from the JSON file.
    """
    with open(ENG_FINN_OUTPUT_FOLDER + file_name, 'r', encoding='utf-8') as handle:
        cards = json.load(handle)
    return cards


OUTPUT_FOLDER = '../../data/english-finnish/cards/test_cards/eng_finn_'
def write_cards_to_file(file_name, cards):
    """Serialize ``cards`` as indented, non-ASCII-escaped JSON.

    The destination is the module-level ``OUTPUT_FOLDER`` prefix followed by
    ``file_name``; an existing file is overwritten.
    """
    destination = OUTPUT_FOLDER + file_name
    with open(destination, 'w', encoding='utf-8') as out:
        json.dump(cards, out, indent=2, ensure_ascii=False)
        
        
EVAL_FOLDER = '../../data/english-finnish/cards/eval_results/eng_finn_'
def write_eval_results_to_file(file_name, results):
    """Serialize ``results`` as indented, non-ASCII-escaped JSON.

    The destination is the module-level ``EVAL_FOLDER`` prefix followed by
    ``file_name``; an existing file is overwritten.
    """
    destination = EVAL_FOLDER + file_name
    with open(destination, 'w', encoding='utf-8') as out:
        json.dump(results, out, indent=2, ensure_ascii=False)

In [142]:
def generate_phrase_card(source_card, secondary_card, card_id, source_lang, target_lang, secondary_lang, model=GPT_4o):
    """Translate a word/phrase-only card into ``target_lang`` with one chat completion.

    Args:
        source_card: dict whose 'word' entry holds the phrase in ``source_lang``.
        secondary_card: dict whose 'word' entry holds the phrase in ``secondary_lang``.
        card_id: identifier copied onto the returned card.
        source_lang, target_lang, secondary_lang: language names interpolated
            into the prompts.
        model: value passed as ``model`` to the chat-completions call.

    Returns:
        dict with 'word' (the stripped model reply), 'sentence' (always '')
        and 'id'.
    """
    # Read the fields by key. Unpacking ``card.items()`` yields (key, value)
    # tuples, which would put text like "('word', ...)" into the prompt.
    source_word = source_card['word']
    secondary_word = secondary_card['word']

    system_prompt = f'''You are a multilingual assistant who is proficient in {source_lang}, {secondary_lang} and {target_lang}.'''

    user_prompt = f"""
    Translate the given {source_lang} word or phrase: '{source_word}' into clear and natural {target_lang}, reflecting its meaning and context as the primary focus. Use the {secondary_lang} equivalent: '{secondary_word}' as a supportive reference to help clarify or refine the exact context if needed. The goal is to create a {target_lang} translation that is accurate, fluent, and authentic to native speakers, avoiding overly complex or literal phrasing.

    Please provide the {target_lang} translation of the word or phrase and nothing else.

    Important formatting rules:
    1. Ensure punctuation consistency with the source language:
        - If the source word or phrase ends with a period, the translation should also end with a period unless there is another example in brackets
    2. Maintain consistent capitalization rules between source and target languages:
        - If the source word or phrase ends starts with lower case letter, the translation should also start with lower case letter
    3. Return the translation without any external quotation marks:
        - Do not wrap the result in quotes
    4. Return only the translation without any additions
    """

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt.strip()},
            {"role": "user", "content": user_prompt.strip()}
        ]
    )

    tr_word = response.choices[0].message.content.strip()

    # Phrase cards carry no example sentence.
    return {"word": tr_word, "sentence": "", "id": card_id}

In [145]:
def generate_full_card(source_card, secondary_card, card_id, source_lang, target_lang, secondary_lang, model=GPT_4o):
    """Translate a word plus its example sentence into ``target_lang``.

    The model is asked for a JSON object with ``translatedWord`` and
    ``translatedSentence``.

    Args:
        source_card: dict with 'word' and 'sentence' in ``source_lang``.
        secondary_card: dict with 'word' and 'sentence' in ``secondary_lang``.
        card_id: identifier copied onto the returned card.
        source_lang, target_lang, secondary_lang: language names interpolated
            into the prompts.
        model: value passed as ``model`` to the chat-completions call.

    Returns:
        dict with 'word', 'sentence' (both stripped) and 'id'.

    Raises:
        json.JSONDecodeError: if the reply is not valid JSON.
        KeyError: if the reply lacks one of the expected keys.
    """
    # Read the fields by key. Unpacking ``card.items()`` yields (key, value)
    # tuples, which would put text like "('word', ...)" into the prompt.
    source_word = source_card['word']
    source_sentence = source_card['sentence']
    secondary_word = secondary_card['word']
    secondary_sentence = secondary_card['sentence']

    system_prompt = f'''You are a multilingual assistant who is proficient in {source_lang}, {secondary_lang} and {target_lang}.'''

    user_prompt = f"""
    **Translate the given {source_lang} word or phrase along with its {secondary_lang} translation into {target_lang}, and then translate the provided {source_lang} sentence, incorporating the {target_lang} translation of the word or phrase. Use synonyms or related terms where necessary to convey the intended meaning and maintain naturalness in {target_lang}.**  

    Given word or phrase ({source_lang}): '{source_word}'  
    Given word or phrase ({secondary_lang}): '{secondary_word}'  

    Given sentence ({source_lang}): '{source_sentence}'  
    Given sentence ({secondary_lang}): '{secondary_sentence}'  

    ### Response structure:  

    Respond in JSON format with the following structure:
    {{
        "translatedWord": "Translated word in {target_lang}",
        "translatedSentence": "Translated sentence in {target_lang}"
    }}
    ```

    Important formatting rules:
    1. Ensure punctuation consistency with the source language:
        - If the source word or phrase ends with a period, the translation should also end with a period unless there is another example in brackets
    2. Maintain consistent capitalization rules between source and target languages:
        - If the source word or phrase ends starts with lower case letter, the translation should also start with lower case letter
    3. Return the translation without any external quotation marks:
        - Do not wrap the result in quotes
    """

    response = client.chat.completions.create(
        model=model,
        response_format={ "type": "json_object" },
        messages=[
            {"role": "system", "content": system_prompt.strip()},
            {"role": "user", "content": user_prompt.strip()}
        ]
    )

    response_dict = json.loads(response.choices[0].message.content.strip())

    return {
        "word": response_dict['translatedWord'].strip(),
        "sentence": response_dict['translatedSentence'].strip(),
        "id": card_id,
    }

In [146]:
def eval_phrase_card(source_card, secondary_card, target_card, card_id, source_lang, target_lang, secondary_lang, model=GPT_4o):
    """Ask the model to judge the translation of a word/phrase-only card.

    Args:
        source_card: dict whose 'word' entry is the phrase in ``source_lang``.
        secondary_card: dict whose 'word' entry is the phrase in ``secondary_lang``.
        target_card: dict whose 'word' entry is the translation under review.
        card_id: identifier stored under 'id' in the result.
        source_lang, target_lang, secondary_lang: language names interpolated
            into the prompts.
        model: value passed as ``model`` to the chat-completions call.

    Returns:
        The dict decoded from the model's JSON reply (the prompt requests a
        ``translationAccuracy`` object) with ``'id'`` added.

    Raises:
        json.JSONDecodeError: if the reply is not valid JSON.
    """
    # Read the fields by key. Unpacking ``card.items()`` yields (key, value)
    # tuples, which would put text like "('word', ...)" into the prompt.
    source_word = source_card['word']
    secondary_word = secondary_card['word']
    target_word = target_card['word']

    system_prompt = f'''You are a multilingual assistant who is proficient in {source_lang}, {secondary_lang} and {target_lang}.'''

    user_prompt = f"""
    **As an AI model, your task is to evaluate the correctness and naturalness of {target_lang} translations for given {source_lang} and {secondary_lang} words or phrases. Check if the {target_lang} translation accurately conveys the meaning and context of the {source_lang} and {secondary_lang} versions, and whether it sounds natural to a native speaker. Your evaluation does not need to suggest the best possible translation, only confirm that it is good enough and identify any issues if present.**  

    When suggesting corrections, provide only the final corrected {target_lang} translation. If no correction is needed, set `suggestedFix` to `null`.  

    Here are the words or phrases:  
    - Word or Phrase in {source_lang}: {source_word}  
    - Word or Phrase in {secondary_lang}: {secondary_word}  
    - Word or Phrase in {target_lang}: {target_word}  

    Respond in JSON format using the following structure:
    {{
      "translationAccuracy": {{
        "isCorrect": true/false,
        "explanation": "Detailed explanation if there is an issue or why it's correct",
        "suggestedFix": "Suggested correction if there is an issue or null if no correction is needed"
      }}
    }}
    """

    response = client.chat.completions.create(
      model=model,
      response_format={ "type": "json_object" },
      messages=[
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": user_prompt.strip()}
      ]
    )

    res = json.loads(response.choices[0].message.content.strip())
    res['id'] = card_id

    return res

In [147]:
def eval_full_card(source_card, secondary_card, target_card, card_id, source_lang, target_lang, secondary_lang, model=GPT_4o):
    """Ask the model to judge a translated word together with its sentence.

    Args:
        source_card: dict with 'word' and 'sentence' in ``source_lang``.
        secondary_card: dict with 'word' and 'sentence' in ``secondary_lang``.
        target_card: dict with the 'word' and 'sentence' under review.
        card_id: identifier stored under 'id' in the result.
        source_lang, target_lang, secondary_lang: language names interpolated
            into the prompts.
        model: value passed as ``model`` to the chat-completions call.

    Returns:
        The dict decoded from the model's JSON reply (the prompt requests
        ``sentenceCorrectness``, ``wordUsage``, ``wordTranslationAccuracy``
        and ``sentenceTranslationAccuracy``) with ``'id'`` added.

    Raises:
        json.JSONDecodeError: if the reply is not valid JSON.
    """
    # Read the fields by key. Unpacking ``card.items()`` yields (key, value)
    # tuples, which would put text like "('word', ...)" into the prompt.
    source_word = source_card['word']
    source_sentence = source_card['sentence']
    secondary_word = secondary_card['word']
    secondary_sentence = secondary_card['sentence']
    target_word = target_card['word']
    target_sentence = target_card['sentence']

    system_prompt = f'''You are a multilingual assistant who is proficient in {source_lang}, {secondary_lang} and {target_lang}.'''

    user_prompt = f"""
    **Evaluate the correctness of a {target_lang} word and sentence based on their translations from {source_lang} and {secondary_lang}. You will receive a word in {source_lang}, {secondary_lang}, and its translation in {target_lang}, as well as a sentence in {source_lang}, {secondary_lang}, and its translation in {target_lang}. Your task is to assess the quality of the {target_lang} sentence, the usage of the {target_lang} word in the sentence, and the accuracy of the translations from {source_lang} and {secondary_lang} to {target_lang}. For each evaluation point, provide a detailed explanation of your judgment and suggest fixes where applicable, either to the {target_lang} word, the {target_lang} sentence, or both.**  

    Please ensure that the {target_lang} sentence is grammatically correct and natural. Suggest a corrected version if necessary. Verify that the {target_lang} sentence contains the {target_lang} word in some form and suggest using synonyms or related terms if the word is missing. Prioritize naturalness and correctness. Ensure that the translations of both the word and sentence from {source_lang} and {secondary_lang} to {target_lang} are accurate and provide corrections if necessary.  

    Respond in JSON format with the following structure:  
    {{
        "sentenceCorrectness": {{
            "isCorrect": true/false,
            "explanation": "Detailed explanation if there is an issue or why it's correct.",
            "suggestedFix": "Suggested corrected sentence if there is an issue, or null if not applicable."
        }},
        "wordUsage": {{
            "isCorrect": true/false,
            "explanation": "Detailed explanation if there is an issue or why it's correct.",
            "suggestedFixSentence": "Suggested corrected sentence if the word usage is incorrect, or null if not applicable.",
            "suggestedFixWord": "Suggested corrected word if the word usage is incorrect, or null if not applicable."
        }},
        "wordTranslationAccuracy": {{
            "isCorrect": true/false,
            "explanation": "Detailed explanation if there is an issue or why it's correct.",
            "suggestedFix": "Suggested correction for translation issues, or null if not applicable."
        }},
        "sentenceTranslationAccuracy": {{
            "isCorrect": true/false,
            "explanation": "Detailed explanation if there is an issue or why it's correct.",
            "suggestedFix": "Suggested correction for translation issues, or null if not applicable."
        }}
    }}

    Here are the provided word and sentence in {source_lang}, {secondary_lang}, and {target_lang}:  

    - Word in {source_lang}: {source_word}  
    - Word in {secondary_lang}: {secondary_word}  
    - Word in {target_lang}: {target_word}  
    - Sentence in {source_lang}: {source_sentence}  
    - Sentence in {secondary_lang}: {secondary_sentence}  
    - Sentence in {target_lang}: {target_sentence}  

    Please adhere to this structure to ensure clear, actionable feedback for each evaluation point.
    """

    response = client.chat.completions.create(
      model=model,
      response_format={ "type": "json_object" },
      messages=[
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": user_prompt.strip()}
      ]
    )

    res = json.loads(response.choices[0].message.content.strip())
    res['id'] = card_id

    return res

In [148]:
# Default system prompt of the improve_* helpers. The language names are
# interpolated once, when this cell runs; rebinding source_lang / target_lang /
# secondary_lang afterwards does not change this string.
BASE_SYSTEM_PROMPT = (
    f'You are a multilingual assistant who is proficient in '
    f'{source_lang}, {secondary_lang} and {target_lang}.'
)

In [149]:
def improve_phrase_card(source_card, secondary_card, target_card, card_id, source_lang, target_lang, secondary_lang, eval_result, system_prompt=BASE_SYSTEM_PROMPT, model=GPT_4o):
    """Re-translate a word/phrase-only card using feedback from its evaluation.

    Args:
        source_card: dict whose 'word' entry is the phrase in ``source_lang``.
        secondary_card: dict whose 'word' entry is the phrase in ``secondary_lang``.
        target_card: dict whose 'word' entry is the current translation.
        card_id: identifier copied onto the returned card.
        source_lang, target_lang, secondary_lang: language names interpolated
            into the prompt.
        eval_result: dict with a ``translationAccuracy`` object; its
            ``explanation`` and ``suggestedFix`` entries are quoted in the prompt.
        system_prompt: system message sent with the request.
        model: value passed as ``model`` to the chat-completions call.

    Returns:
        dict with 'word' (the stripped model reply), 'sentence' (always '')
        and 'id'.
    """
    # Read the fields by key. Unpacking ``card.items()`` yields (key, value)
    # tuples, which would put text like "('word', ...)" into the prompt.
    source_word = source_card['word']
    secondary_word = secondary_card['word']
    target_word = target_card['word']

    # Look the feedback up by name: the evaluation comes from model-generated
    # JSON, so neither the key order nor the number of keys is guaranteed.
    accuracy = eval_result['translationAccuracy']
    explanation = accuracy.get('explanation')
    suggested_fix = accuracy.get('suggestedFix')

    user_prompt = f"""
    **Translate the given {source_lang} word or phrase: '{source_word}' into clear and natural {target_lang}, prioritizing its meaning and how it would sound most authentic and fluent to native speakers. Use the {secondary_lang} equivalent: '{secondary_word}' as a supportive reference if it helps refine the context or meaning. While accuracy is important, favor translations that fit naturally into everyday {target_lang}, even if they are not the most direct equivalents.**  

    Consider the following:  
    - Existing {target_lang} translation: '{target_word}'  
    - Issues identified: '{explanation}'  
    - Suggested improvement: '{suggested_fix}'  

    ### Instructions:  
    Based on the information provided, craft a {target_lang} translation that balances accuracy, naturalness, and context. You may adapt the word or phrase slightly to ensure it resonates well with native speakers and fits its intended use.  

    **Respond with only the final {target_lang} translation without any additional explanations, just word or phrase in {target_lang}.**

    Important formatting rules:
    1. Ensure punctuation consistency with the source language:
        - If the source word or phrase ends with a period, the translation should also end with a period unless there is another example in brackets
    2. Maintain consistent capitalization rules between source and target languages:
        - If the source word or phrase ends starts with lower case letter, the translation should also start with lower case letter
    3. Return the translation without any external quotation marks:
        - Do not wrap the result in quotes
    4. Return only the translation without any additions
    """

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt.strip()},
            {"role": "user", "content": user_prompt.strip()}
        ]
    )

    tr_word = response.choices[0].message.content.strip()

    # Phrase cards carry no example sentence.
    return {"word": tr_word, "sentence": "", "id": card_id}

In [150]:
def improve_full_card(source_card, secondary_card, target_card, card_id, source_lang, target_lang, secondary_lang, eval_result, system_prompt=BASE_SYSTEM_PROMPT, model=GPT_4o):
    """Re-translate a word and its sentence using feedback from the evaluation.

    Args:
        source_card: dict with 'word' and 'sentence' in ``source_lang``.
        secondary_card: dict with 'word' and 'sentence' in ``secondary_lang``.
        target_card: dict with the current 'word' and 'sentence' translations.
        card_id: identifier copied onto the returned card.
        source_lang, target_lang, secondary_lang: language names interpolated
            into the prompt.
        eval_result: dict with ``sentenceCorrectness``, ``wordUsage``,
            ``wordTranslationAccuracy`` and ``sentenceTranslationAccuracy``
            objects, whose fields are quoted in the prompt.
        system_prompt: system message sent with the request.
        model: value passed as ``model`` to the chat-completions call.

    Returns:
        dict with 'word', 'sentence' (both stripped) and 'id'.

    Raises:
        json.JSONDecodeError: if the reply is not valid JSON.
        KeyError: if ``eval_result`` or the reply lacks an expected key.
    """
    # Read the fields by key. Unpacking ``card.items()`` yields (key, value)
    # tuples, which would put text like "('word', ...)" into the prompt.
    source_word = source_card['word']
    source_sentence = source_card['sentence']
    secondary_word = secondary_card['word']
    secondary_sentence = secondary_card['sentence']
    target_word = target_card['word']
    target_sentence = target_card['sentence']

    user_prompt = f"""
    **Translate the given {source_lang} word or phrase into {target_lang} and use it within the provided {source_lang} sentence to create a natural and accurate {target_lang} translation. Use the {secondary_lang} word or sentence as additional context if needed. Focus on accurately conveying the meaning of the {source_lang} source while incorporating all feedback and suggestions from the evaluation results.**  

    When choosing the {target_lang} word or phrase, prioritize naturalness and fluency in the sentence over strict accuracy. While the translation should reflect the original meaning, it is acceptable to use a word or phrase that is not the most direct translation but still conveys the intended sense in a way that sounds natural and idiomatic in {target_lang}.  

    ### **Details to guide your translation:**  

    - **Word or phrase in {source_lang}:** '{source_word}'  
    - **Word or phrase in {secondary_lang}:** '{secondary_word}'  
    - **Sentence in {source_lang}:** '{source_sentence}'  
    - **Sentence in {secondary_lang}:** '{secondary_sentence}'  
    - **Existing {target_lang} translation of the word or phrase:** '{target_word}'  
    - **Existing {target_lang} translation of the sentence:** '{target_sentence}'  

    ### **Evaluation Results Summary:**  

    **{target_lang} Sentence Evaluation:**  
    - **Correctness:** {eval_result['sentenceCorrectness']['isCorrect']}  
    - **Explanation:** {eval_result['sentenceCorrectness']['explanation']}  
    - **Suggested Fix (if applicable):** {eval_result['sentenceCorrectness']['suggestedFix']}  

    **Word Usage Evaluation:**  
    - **Correctness:** {eval_result['wordUsage']['isCorrect']}  
    - **Explanation:** {eval_result['wordUsage']['explanation']}  
    - **Suggested Fix for Word (if applicable):** {eval_result['wordUsage']['suggestedFixWord']}  
    - **Suggested Fix for Sentence (if applicable):** {eval_result['wordUsage']['suggestedFixSentence']}  

    **Word Translation Accuracy Evaluation:**  
    - **Correctness:** {eval_result['wordTranslationAccuracy']['isCorrect']}  
    - **Explanation:** {eval_result['wordTranslationAccuracy']['explanation']}  
    - **Suggested Fix (if applicable):** {eval_result['wordTranslationAccuracy']['suggestedFix']}  

    **Sentence Translation Accuracy Evaluation:**  
    - **Correctness:** {eval_result['sentenceTranslationAccuracy']['isCorrect']}  
    - **Explanation:** {eval_result['sentenceTranslationAccuracy']['explanation']}  
    - **Suggested Fix (if applicable):** {eval_result['sentenceTranslationAccuracy']['suggestedFix']}  

    ### **Instructions:**  

    1. **Review the Evaluation Feedback:**  
    - Carefully consider all provided explanations and suggested fixes for the word or phrase, sentence, and overall translation accuracy.  

    2. **Translate the Word or Phrase:**  
    - Choose a translation that balances accuracy with naturalness.  
    - If the word usage is marked incorrect, incorporate the suggested fix or refine it further for better contextual alignment.  

    3. **Translate the {source_lang} Sentence:**  
    - Integrate the translated word or phrase naturally into the sentence.  
    - If the sentence translation is marked incorrect, incorporate the suggested fixes and adjust for fluency and clarity.  

    4. **Address Translation Accuracy Issues:**  
    - If any translation inaccuracies are identified, apply the suggested fixes or clarify the meaning while ensuring the translation sounds natural and idiomatic.  

    5. **Leverage Context:**  
    - Use the {secondary_lang} word or sentence as additional guidance where necessary.  

    6. **Produce a Polished Result:**  
    - Ensure the final translation conveys the intended meaning, aligns naturally with the sentence, and incorporates feedback from the evaluation results.  

    ### **Response structure:**  

    Respond in JSON format with the following structure:  
    {{
        "translatedWord": "Translated word in {target_lang}",
        "translatedSentence": "Translated sentence in {target_lang}"
    }}

    Important formatting rules:
    1. Ensure punctuation consistency with the source language:
        - If the source word or phrase ends with a period, the translation should also end with a period unless there is another example in brackets
    2. Maintain consistent capitalization rules between source and target languages:
        - If the source word or phrase ends starts with lower case letter, the translation should also start with lower case letter
    3. Return the translation without any external quotation marks (do not wrap the result in quotes)
    """

    response = client.chat.completions.create(
        model=model,
        response_format={ "type": "json_object" },
        messages=[
            {"role": "system", "content": system_prompt.strip()},
            {"role": "user", "content": user_prompt.strip()}
        ]
    )

    response_dict = json.loads(response.choices[0].message.content.strip())

    return {
        "word": response_dict['translatedWord'].strip(),
        "sentence": response_dict['translatedSentence'].strip(),
        "id": card_id,
    }

In [136]:
def is_phrase_card(card):
    """Return True when the card's 'sentence' entry is the empty string."""
    sentence = card['sentence']
    return sentence == ''

In [137]:
def check_eval(eval_result, is_phrase_card_):
    """Tell whether an evaluation result marks the card as acceptable.

    Args:
        eval_result: dict produced by one of the eval_* helpers.
        is_phrase_card_: True for word/phrase-only cards.

    Returns:
        For phrase cards, the ``isCorrect`` value of ``translationAccuracy``.
        Otherwise True only when all four per-aspect ``isCorrect`` values
        are truthy.
    """
    if is_phrase_card_:
        return eval_result['translationAccuracy']['isCorrect']

    aspects = (
        'wordTranslationAccuracy',
        'sentenceTranslationAccuracy',
        'sentenceCorrectness',
        'wordUsage',
    )
    # Build the full list first so that every aspect is looked up.
    verdicts = [eval_result[aspect]['isCorrect'] for aspect in aspects]
    return all(verdicts)

In [138]:
def regressor(source_card, secondary_card, target_cards, card_id, source_lang, target_lang, secondary_lang, n_models=3, model=GPT_4o):
    """Merge several candidate translations into one card with a chat completion.

    Args:
        source_card: dict with 'word' and 'sentence' in ``source_lang``.
        secondary_card: dict with 'word' and 'sentence' in ``secondary_lang``.
        target_cards: candidate cards, each with 'word' and 'sentence'.
        card_id: identifier copied onto the returned card.
        source_lang, target_lang, secondary_lang: language names interpolated
            into the prompts and into the JSON key names.
        n_models: maximum number of candidates (taken from the front of
            ``target_cards``) shown to the model.
        model: value passed as ``model`` to the chat-completions call.

    Returns:
        dict with 'word', 'sentence' (both stripped) and 'id'.

    Raises:
        json.JSONDecodeError: if the reply is not valid JSON.
        KeyError: if the reply lacks the ``{target_lang}Word`` or
            ``{target_lang}Sentence`` key.
    """
    # Read the fields by key. Unpacking ``card.items()`` yields (key, value)
    # tuples, which would put text like "('word', ...)" into the prompt.
    source_word = source_card['word']
    source_sentence = source_card['sentence']
    secondary_word = secondary_card['word']
    secondary_sentence = secondary_card['sentence']

    # Slice instead of indexing with range(n_models) so that receiving fewer
    # candidates than n_models does not raise IndexError.
    models_response = ''
    for position, candidate in enumerate(target_cards[:n_models], start=1):
        candidate_word = candidate['word']
        candidate_sentence = candidate['sentence']
        models_response += f"""{position}. `Model {position}` Response: {{ "{target_lang}Word": '{candidate_word}', "{target_lang}Sentence": '{candidate_sentence}'}}\n"""

    system_prompt = f'''Act as a translation evaluator and synthesizer. Assess model-generated translations for a {source_lang} word/phrase and its sentence, prioritizing accuracy, fluency, and contextual fit. Return the best translations in JSON format with refinements if necessary.'''

    user_prompt = f"""
**Instruction for the Regressor**:  
Evaluate the translations provided by different models for a given {source_lang} word or phrase and its accompanying sentence. Select or synthesize the best {target_lang} translation for the word and sentence based on:
1. **Accuracy**: Ensure the translation reflects the original {source_lang} meaning correctly.
2. **Naturalness**: The {target_lang} translations should be fluid and use appropriate synonyms or phrasing where needed.
3. **Consistency**: The translated sentence should appropriately incorporate the word's translation.

### Input:
Given word or phrase ({source_lang}): '{source_word}'  
Given word or phrase ({secondary_lang}): '{secondary_word}'  

Given sentence ({source_lang}): '{source_sentence}'  
Given sentence ({secondary_lang}): '{secondary_sentence}'  

Responses from models:
{models_response}

---

### Task:
Critically evaluate these responses, identify the best translations, and synthesize a single, high-quality translation. If needed, refine the translations to ensure accuracy and naturalness. Do not simply copy; improve where necessary.

### Response structure:
Respond in JSON format with the following structure:
{{
    "{target_lang}Word": "Best translated word in {target_lang}",
    "{target_lang}Sentence": "Best translated sentence in {target_lang}"
}}

---

### Key Considerations for Evaluation:
- **Accuracy**: Does the translation reflect the original meaning and nuances in {source_lang}?
- **Fluency**: Is the translation grammatically correct and natural in {target_lang}?
- **Contextual Fit**: Does the sentence correctly integrate the translation of the word or phrase?

Important formatting rules:
1. Ensure punctuation consistency with the source language:
    - If the source word or phrase ends with a period, the translation should also end with a period unless there is another example in brackets
2. Maintain consistent capitalization rules between source and target languages:
    - If the source word or phrase ends starts with lower case letter, the translation should also start with lower case letter
3. Return the translation without any external quotation marks (do not wrap the result in quotes)
    """

    response = client.chat.completions.create(
        model=model,
        response_format={ "type": "json_object" },
        messages=[
            {"role": "system", "content": system_prompt.strip()},
            {"role": "user", "content": user_prompt.strip()}
        ]
    )

    response_dict = json.loads(response.choices[0].message.content.strip())
    target_word = response_dict[f'{target_lang}Word']
    target_sentence = response_dict[f'{target_lang}Sentence']

    return {
        "word": target_word.strip(),
        "sentence": target_sentence.strip(),
        "id": card_id,
    }


In [139]:
# One system prompt per "improving agent"; meaxture_of_improving_agents sends
# each of them together with the same improvement request.
# NOTE(review): the language names are hard-coded (English -> Finnish with
# Russian as context) and do not follow the source_lang / target_lang arguments;
# the pipeline cells in this notebook run with source 'Finnish' and target
# 'English' -- confirm these prompts match the intended direction.
system_prompts = [
    "Act as a linguistic expert specializing in Russian, Finnish, and English translations. Evaluate the given English word and sentence translations in conjunction with the Russian context. Based on evaluation feedback, refine the word and sentence translations for accuracy, fluency, and naturalness, ensuring idiomatic Finnish usage. Provide the response in JSON format.",
    "Take on the role of a translator focusing on contextual accuracy. Use the provided English and Russian inputs, along with feedback evaluations, to create an Finnish translation that balances precise meaning with natural phrasing. Address inaccuracies and ensure the translation reads fluently and idiomatically. Respond in JSON format.",
    "Serve as a cultural localization specialist. Translate the provided English word and sentence into Finnish, using the Russian context and feedback to enhance naturalness and idiomatic expression. Focus on making the translation contextually relevant and fluent for an Finnish-speaking audience. Provide your output in JSON format."
]

In [103]:
def meaxture_of_improving_agents(source_card, secondary_card, target_card, card_id, source_lang, target_lang, secondary_lang, eval_result, n_models=3):
    """Produce several improved candidates and merge them with ``regressor``.

    One candidate is requested per entry of ``system_prompts`` (at most
    ``n_models`` of them), using ``improve_phrase_card`` for phrase-only
    source cards and ``improve_full_card`` otherwise.

    Args:
        source_card, secondary_card, target_card: cards forwarded to the
            improve_* helper and to ``regressor``.
        card_id: identifier forwarded to the helpers.
        source_lang, target_lang, secondary_lang: language names forwarded
            to the helpers.
        eval_result: evaluation feedback forwarded to the improve_* helper.
        n_models: upper bound on the number of candidates to generate.

    Returns:
        The card returned by ``regressor``.
    """
    improve = improve_phrase_card if is_phrase_card(source_card) else improve_full_card

    # Honour n_models when generating candidates so that the count handed to
    # regressor always equals the number of candidates actually produced.
    target_cards = [
        improve(source_card, secondary_card, target_card, card_id, source_lang, target_lang, secondary_lang, eval_result, system_prompt=s_p)
        for s_p in system_prompts[:n_models]
    ]

    return regressor(source_card, secondary_card, target_cards, card_id, source_lang, target_lang, secondary_lang, len(target_cards))

In [104]:
def card_handler_with_agents(source_card, secondary_card, card_id, source_lang, target_lang, secondary_lang, num_iter=5, num_agents=3):
    """Generate a target card, then evaluate and improve it until it passes.

    The phrase-only or full-card variant of each helper is chosen from
    ``is_phrase_card(source_card)``. After the first generation and
    evaluation, up to ``num_iter`` improve-then-evaluate rounds are run,
    stopping as soon as ``check_eval`` accepts the latest evaluation.

    Args:
        source_card, secondary_card: input cards forwarded to the helpers.
        card_id: identifier forwarded to the helpers.
        source_lang, target_lang, secondary_lang: language names forwarded
            to the helpers.
        num_iter: maximum number of improvement rounds.
        num_agents: accepted but not used by this function.

    Returns:
        Tuple ``(target_card, eval_result)`` holding the last card and the
        evaluation of that card.
    """
    phrase_only = is_phrase_card(source_card)

    # Pick the helper pair once instead of branching at every call site.
    generate = generate_phrase_card if phrase_only else generate_full_card
    evaluate = eval_phrase_card if phrase_only else eval_full_card

    target_card = generate(source_card, secondary_card, card_id, source_lang, target_lang, secondary_lang)
    eval_result = evaluate(source_card, secondary_card, target_card, card_id, source_lang, target_lang, secondary_lang)

    for _ in range(num_iter):
        if check_eval(eval_result, phrase_only):
            break

        target_card = meaxture_of_improving_agents(source_card, secondary_card, target_card, card_id, source_lang, target_lang, secondary_lang, eval_result)
        eval_result = evaluate(source_card, secondary_card, target_card, card_id, source_lang, target_lang, secondary_lang)

    return target_card, eval_result

In [105]:
import logging

# Configure root logging for this notebook: ERROR-level threshold, output
# appended to errors.txt, and only the message text written per record.
logging.basicConfig(
    filename='errors.txt',
    level=logging.ERROR,
    format='%(message)s',
    filemode='a'
)

def generate_source_target_cards_with_eval_results(source_cards, secondary_cards, source_lang, target_lang, secondary_lang, max_retries=5):
    """Run ``card_handler_with_agents`` over paired cards and collect the results.

    Source and secondary cards are paired positionally with ``zip``. Each
    card is retried on any exception, but only up to ``max_retries`` times:
    an unbounded retry loop never terminates when the failure is
    deterministic, and keeps issuing requests while it spins.

    Args:
        source_cards: cards with 'word', 'sentence' and 'id'.
        secondary_cards: cards paired positionally with ``source_cards``.
        source_lang, target_lang, secondary_lang: language names forwarded
            to ``card_handler_with_agents``.
        max_retries: attempts allowed per card before it is skipped.

    Returns:
        Tuple ``(source_target_cards, eval_results)`` of equal length and
        matching order. Cards that failed on every attempt are absent from
        both lists; each failure is logged with the card id.
    """
    source_target_cards = []
    eval_results = []

    for source_card, secondary_card in tqdm(list(zip(source_cards, secondary_cards))):
        card_id = source_card['id']

        for attempt in range(1, max_retries + 1):
            try:
                target_card, eval_res = card_handler_with_agents(source_card, secondary_card, card_id, source_lang, target_lang, secondary_lang)

                card = {
                    "wordFirstLang": source_card['word'],
                    "sentenceFirstLang": source_card['sentence'],
                    "wordSecondLang": target_card['word'],
                    "sentenceSecondLang": target_card['sentence'],
                    "id": card_id,
                }
            except Exception as e:
                # Include the card id so a failure can be traced to its input.
                logging.error("card %s: attempt %d/%d failed: %r", card_id, attempt, max_retries, e)
                continue

            source_target_cards.append(card)
            eval_results.append(eval_res)
            break
        else:
            # Reached only when no attempt succeeded.
            logging.error("card %s: skipped after %d failed attempts", card_id, max_retries)

    assert len(source_target_cards) == len(eval_results)

    return source_target_cards, eval_results

In [26]:
def example():
    """Run the pipeline on one curated file and print the share of accepted cards.

    The 'SecondLang' fields of the Russian-Finnish cards in
    'sm1_new_kap1.json' are used as the source and the 'FirstLang' fields as
    the secondary reference; the pipeline is called with the language names
    'Finnish', 'English' and 'Russian'. The printed value is the fraction of
    final evaluations accepted by ``check_eval``.
    """
    ru_finn_cards = get_ru_finn_cards_from_file('sm1_new_kap1.json')

    source_cards = []
    secondary_cards = []
    for card in ru_finn_cards:
        source_cards.append({'word': card["wordSecondLang"], 'sentence': card["sentenceSecondLang"], 'id': card['id']})
        secondary_cards.append({'word': card["wordFirstLang"], 'sentence': card["sentenceFirstLang"], 'id': card['id']})

    source_target_cards, eval_results = generate_source_target_cards_with_eval_results(
        source_cards, secondary_cards, 'Finnish', 'English', 'Russian'
    )

    # A generated card with an empty source sentence is a phrase card.
    accepted = [
        check_eval(eval_res, card['sentenceFirstLang'] == '')
        for card, eval_res in zip(source_target_cards, eval_results)
    ]

    print(sum(accepted) / len(accepted))

In [None]:
# Run the single-file example defined above; it goes through the full
# generate/evaluate/improve pipeline and therefore calls the chat client.
example()

  0%|          | 0/151 [00:00<?, ?it/s]

In [25]:
def get_bad_cards_from_files(file_names):
    """Collect card pairs whose second-language word or sentence differs between the two sets.

    For each file name the Russian-Finnish and English-Finnish cards are loaded
    and compared position by position.

    Returns:
        Tuple ``(bad_ru_finn_cards, bad_eng_finn_cards)`` of parallel lists.
    """
    mismatched_ru = []
    mismatched_eng = []

    for name in file_names:
        ru_cards = get_ru_finn_cards_from_file(name)
        eng_cards = get_eng_finn_cards_from_file(name)

        for eng_card, ru_card in zip(eng_cards, ru_cards):
            agrees = (
                eng_card['wordSecondLang'] == ru_card['wordSecondLang']
                and eng_card['sentenceSecondLang'] == ru_card['sentenceSecondLang']
            )
            if agrees:
                continue
            mismatched_eng.append(eng_card)
            mismatched_ru.append(ru_card)

    return mismatched_ru, mismatched_eng

In [21]:
# Card file names processed by the cells below, listed in order.
file_names = [
    *(f'sm1_new_kap{i}.json' for i in range(1, 10)),
    *(f'sm2_new_kap{i}.json' for i in range(1, 9)),
    'sm2_new_puhekieli.json',
    *(f'sm3_kap{i}.json' for i in range(1, 9)),
    *(f'sm4_kap{i}.json' for i in range(1, 6)),
]

In [26]:
# Card pairs whose second-language (Finnish) word or sentence differs between the two card sets.
bad_ru_finn_cards, bad_eng_finn_cards = get_bad_cards_from_files(file_names)

In [47]:
# The second-language side of each mismatching card is the source card;
# the first-language side is passed along as the secondary card.
source_cards = [
    dict(word=card["wordSecondLang"], sentence=card["sentenceSecondLang"], id=card['id'])
    for card in bad_ru_finn_cards
]
secondary_cards = [
    dict(word=card["wordFirstLang"], sentence=card["sentenceFirstLang"], id=card['id'])
    for card in bad_ru_finn_cards
]

source_lang = 'Finnish'
target_lang = 'English'
secondary_lang = 'Russian'

source_target_cards, eval_results = generate_source_target_cards_with_eval_results(
    source_cards, secondary_cards, source_lang, target_lang, secondary_lang
)

  0%|          | 0/471 [00:00<?, ?it/s]

In [None]:
# One check_eval result per regenerated card, in the order of source_target_cards.
results = []

for ru_eng, eval_res in list(zip(source_target_cards, eval_results)):
    # The second argument is True when the card has an empty first-language sentence.
    results.append(check_eval(eval_res, (ru_eng['sentenceFirstLang'] == '')))

# Mean of the check results; shown as the cell output.
sum(results) / len(results)

0.8535031847133758

In [49]:
# IDs of the regenerated cards; read as a module-level global by
# check_all_cards_from_files and update_eng_finn_cards.
ids = [card['id'] for card in source_target_cards]

In [50]:
def check_eval_old(eval_result, is_phrase_card):
    """Decide whether an evaluation result counts as passing.

    Args:
        eval_result: Mapping of criterion name to a dict with an ``'isCorrect'`` entry.
        is_phrase_card: When true, only ``'translationAccuracy'`` is consulted.

    Returns:
        The ``'isCorrect'`` value of ``'translationAccuracy'`` for phrase cards;
        otherwise ``True`` only if all four word/sentence criteria are truthy.
    """
    if is_phrase_card:
        return eval_result['translationAccuracy']['isCorrect']

    criteria = (
        'wordTranslationAccuracy',
        'sentenceTranslationAccuracy',
        'englishSentenceCorrectness',
        'wordUsage',
    )
    # Built as a list so every criterion is looked up, even after a falsy one.
    verdicts = [eval_result[name]['isCorrect'] for name in criteria]
    return all(verdicts)

In [51]:
def check_all_cards_from_files(file_names):
    """Return the mean check result over all English-Finnish cards in the given files.

    Cards whose id is in the global ``ids`` are scored with ``check_eval`` on the
    first entry of the global ``eval_results`` that has the same id (and are
    skipped if none matches). All other cards are scored with ``check_eval_old``
    on the evaluation stored next to them.
    """
    outcomes = []

    for name in file_names:
        stored_cards = get_eng_finn_cards_from_file(name)
        stored_evals = get_eng_finn_eval_results_from_file(name)

        for stored_card, stored_eval in zip(stored_cards, stored_evals):
            phrase_only = stored_card['sentenceFirstLang'] == ''

            if stored_card['id'] not in ids:
                outcomes.append(check_eval_old(stored_eval, phrase_only))
                continue

            fresh_eval = next(
                (candidate for candidate in eval_results if candidate['id'] == stored_card['id']),
                None,
            )
            if fresh_eval is not None:
                outcomes.append(check_eval(fresh_eval, phrase_only))

    return sum(outcomes) / len(outcomes)

In [52]:
check_all_cards_from_files(file_names)

0.9842196623007733

In [66]:
def swap_cards(card_list):
    """Return copies of the cards with the first- and second-language fields exchanged.

    The input cards are left untouched, so calling this twice on the same list
    (for example by re-running a notebook cell) gives the same result instead
    of swapping the fields back.

    Args:
        card_list: Cards with ``wordFirstLang``, ``sentenceFirstLang``,
            ``wordSecondLang`` and ``sentenceSecondLang`` keys.

    Returns:
        A new list of new dicts; keys other than the four above are copied as-is.
    """
    swapped_cards = []
    for card in card_list:
        swapped = dict(card)  # shallow copy keeps key order and extra fields such as 'id'
        swapped['wordFirstLang'] = card['wordSecondLang']
        swapped['sentenceFirstLang'] = card['sentenceSecondLang']
        swapped['wordSecondLang'] = card['wordFirstLang']
        swapped['sentenceSecondLang'] = card['sentenceFirstLang']
        swapped_cards.append(swapped)
    return swapped_cards

In [67]:
# Exchange the first-/second-language fields of the regenerated cards.
final_cards = swap_cards(source_target_cards)

In [69]:
def update_eng_finn_cards(file_name):
    """Merge regenerated cards into one English-Finnish file and write the result.

    Stored cards whose id is in the global ``ids`` are replaced by the first
    pair from ``final_cards`` / ``eval_results`` with the same card id (and are
    dropped if no such pair exists). All other cards and their stored
    evaluations are kept. Both lists are then passed to ``write_cards_to_file``
    and ``write_eval_results_to_file``.
    """
    stored_cards = get_eng_finn_cards_from_file(file_name)
    stored_evals = get_eng_finn_eval_results_from_file(file_name)

    merged_cards, merged_evals = [], []

    for stored_card, stored_eval in zip(stored_cards, stored_evals):
        if stored_card['id'] not in ids:
            merged_cards.append(stored_card)
            merged_evals.append(stored_eval)
            continue

        replacement = next(
            (pair for pair in zip(final_cards, eval_results) if pair[0]['id'] == stored_card['id']),
            None,
        )
        if replacement is not None:
            new_card, new_eval = replacement
            merged_cards.append(new_card)
            merged_evals.append(new_eval)

    write_cards_to_file(file_name, merged_cards)
    write_eval_results_to_file(file_name, merged_evals)

In [70]:
# Apply the merge to every card file; each call ends by invoking
# write_cards_to_file and write_eval_results_to_file for that file name.
for file_name in file_names:
    update_eng_finn_cards(file_name)

In [73]:
# Sanity check after the update: no mismatching card pairs are expected any more.
should_be_empty, _ = get_bad_cards_from_files(file_names)

In [74]:
should_be_empty

[]

In [75]:
# Load the test card set from disk; the trailing bare name displays it as the cell output.
test_cards = []
with open('./test_cards.json', 'r', encoding='utf-8') as file:
    test_cards = json.load(file)
test_cards

[{'id': 6130,
  'translations': {'ru': {'word': 'турнир',
    'sentence': 'Турнир проводится в субботу в школьном зале.'},
   'fi': {'word': 'turnaus',
    'sentence': 'Turnaus järjestetään lauantaina koulun salissa.'},
   'en': {'word': 'tournament',
    'sentence': 'The tournament is held on Saturday in the school hall.'}}},
 {'id': 6141,
  'translations': {'ru': {'word': 'матч',
    'sentence': 'Матч начинается в 18:00.'},
   'fi': {'word': 'ottelu', 'sentence': 'Ottelu alkaa klo 18.'},
   'en': {'word': 'match', 'sentence': 'The match starts at 6:00 PM.'}}},
 {'id': 6182,
  'translations': {'ru': {'word': 'соревнование',
    'sentence': 'Соревнование начинается ровно в десять часов.'},
   'fi': {'word': 'kilpailu (kisa)',
    'sentence': 'Kisa alkaa tasan kello kymmenen.'},
   'en': {'word': 'competition',
    'sentence': "The competition begins precisely at ten o'clock."}}},
 {'id': 6192,
  'translations': {'ru': {'word': 'отправление',
    'sentence': 'Отправление завтра в девять

In [77]:
def create_cards_list(base_cards, source_key, secondary_key):
    """Pick two translations out of every base card and tag them with the card id.

    Note: the translation dicts are not copied. The ``'id'`` key is written into
    the dicts stored under ``item['translations']``, so ``base_cards`` itself is
    modified and the returned cards are the very same objects.

    Args:
        base_cards: Items with an ``'id'`` and a ``'translations'`` mapping.
        source_key: Key in ``'translations'`` for the source cards.
        secondary_key: Key in ``'translations'`` for the secondary cards.

    Returns:
        Tuple ``(source_cards, secondary_cards)`` of parallel lists.
    """

    def tagged(item, key):
        # Same order as before: look up the translation first, then stamp the id on it.
        entry = item['translations'][key]
        entry['id'] = item['id']
        return entry

    source_cards, secondary_cards = [], []

    for item in base_cards:
        source_entry = tagged(item, source_key)
        secondary_entry = tagged(item, secondary_key)

        source_cards.append(source_entry)
        secondary_cards.append(secondary_entry)

    return source_cards, secondary_cards


In [121]:
# Source cards are taken from the 'en' translations, secondary cards from 'ru'.
test_source_cards, test_secondary_cards = create_cards_list(test_cards, 'en', 'ru')

In [98]:
# NOTE(review): the language labels here are ('Finnish', 'Kazakh', 'English'), while the
# preceding cell builds the lists from the 'en' and 'ru' translations. The execution
# counts suggest this cell ran against differently-built lists; confirm before re-running.
test_source_target_cards, test_eval_results = generate_source_target_cards_with_eval_results(test_source_cards, test_secondary_cards, 'Finnish', 'Kazakh', 'English')

  0%|          | 0/13 [00:00<?, ?it/s]

In [99]:
test_source_target_cards

[{'wordFirstLang': 'turnaus',
  'sentenceFirstLang': 'Turnaus järjestetään lauantaina koulun salissa.',
  'wordSecondLang': 'турнир/жарыс',
  'sentenceSecondLang': 'Турнир сенбі күні мектеп залында өткізіледі.',
  'id': 6130},
 {'wordFirstLang': 'ottelu',
  'sentenceFirstLang': 'Ottelu alkaa klo 18.',
  'wordSecondLang': 'сөз, матч',
  'sentenceSecondLang': 'Матч кешкі сағат 6-да басталады.',
  'id': 6141},
 {'wordFirstLang': 'kilpailu (kisa)',
  'sentenceFirstLang': 'Kisa alkaa tasan kello kymmenen.',
  'wordSecondLang': 'жарыс',
  'sentenceSecondLang': 'Жарыс дәл сағат онде басталады.',
  'id': 6182},
 {'wordFirstLang': 'lähtö',
  'sentenceFirstLang': 'Lähtö on huomenna kello yhdeksän.',
  'wordSecondLang': 'кету',
  'sentenceSecondLang': 'Кету ертең сағат тоғызда.',
  'id': 6192},
 {'wordFirstLang': 'häviö (tappio)',
  'sentenceFirstLang': 'Joukkue kärsi kirvelevän häviön eilisessä ottelussa.',
  'wordSecondLang': 'жеңіліс',
  'sentenceSecondLang': 'Команда кеше өткен матчта ащы жең

In [122]:
# Generate Kazakh cards with English as the source language and Russian as the secondary one.
test_source_target_cards, test_eval_results = generate_source_target_cards_with_eval_results(test_source_cards, test_secondary_cards, 'English', 'Kazakh', 'Russian')

  0%|          | 0/13 [00:00<?, ?it/s]

In [123]:
test_source_target_cards

[{'wordFirstLang': 'tournament',
  'sentenceFirstLang': 'The tournament is held on Saturday in the school hall.',
  'wordSecondLang': 'турнир',
  'sentenceSecondLang': 'Турнир сенбі күні мектеп залында өтеді.',
  'id': 6130},
 {'wordFirstLang': 'match',
  'sentenceFirstLang': 'The match starts at 6:00 PM.',
  'wordSecondLang': 'матч',
  'sentenceSecondLang': 'Матч кешкі сағат 18:00-де басталады.',
  'id': 6141},
 {'wordFirstLang': 'competition',
  'sentenceFirstLang': "The competition begins precisely at ten o'clock.",
  'wordSecondLang': 'жарыс',
  'sentenceSecondLang': 'Жарыс дәл сағат онда басталады.',
  'id': 6182},
 {'wordFirstLang': 'departure',
  'sentenceFirstLang': "The departure is tomorrow at nine o'clock.",
  'wordSecondLang': 'кету',
  'sentenceSecondLang': 'Кету ертең сағат тоғызда.',
  'id': 6192},
 {'wordFirstLang': 'defeat',
  'sentenceFirstLang': "The team suffered a bitter defeat in yesterday's match.",
  'wordSecondLang': 'жеңіліс',
  'sentenceSecondLang': 'Кеше мат

In [124]:
# Source cards are taken from the 'en' translations, secondary cards from 'fi'.
nl_source_cards, nl_secondary_cards = create_cards_list(test_cards, 'en', 'fi')

In [125]:
# Generate Dutch cards with English as the source language and Finnish as the secondary one.
nl_source_target_cards, nl_eval_results = generate_source_target_cards_with_eval_results(nl_source_cards, nl_secondary_cards, 'English', 'Dutch', 'Finnish')

  0%|          | 0/13 [00:00<?, ?it/s]

In [126]:
nl_source_target_cards

[{'wordFirstLang': 'tournament',
  'sentenceFirstLang': 'The tournament is held on Saturday in the school hall.',
  'wordSecondLang': 'toernooi',
  'sentenceSecondLang': 'Het toernooi wordt gehouden op zaterdag in de schoolzaal.',
  'id': 6130},
 {'wordFirstLang': 'match',
  'sentenceFirstLang': 'The match starts at 6:00 PM.',
  'wordSecondLang': 'wedstrijd',
  'sentenceSecondLang': 'De wedstrijd begint om 18:00 uur.',
  'id': 6141},
 {'wordFirstLang': 'competition',
  'sentenceFirstLang': "The competition begins precisely at ten o'clock.",
  'wordSecondLang': 'wedstrijd',
  'sentenceSecondLang': 'De wedstrijd begint precies om tien uur.',
  'id': 6182},
 {'wordFirstLang': 'departure',
  'sentenceFirstLang': "The departure is tomorrow at nine o'clock.",
  'wordSecondLang': 'vertrek',
  'sentenceSecondLang': 'Het vertrek is morgen om negen uur.',
  'id': 6192},
 {'wordFirstLang': 'defeat',
  'sentenceFirstLang': "The team suffered a bitter defeat in yesterday's match.",
  'wordSecondLan

In [128]:
# Keep only the generated (second-language) word and sentence of each card.
kz_cards = [
    dict(word=card["wordSecondLang"], sentence=card["sentenceSecondLang"])
    for card in test_source_target_cards
]
nl_cards = [
    dict(word=card["wordSecondLang"], sentence=card["sentenceSecondLang"])
    for card in nl_source_target_cards
]

In [131]:
# Attach the generated Kazakh and Dutch translations to each test card, tagged with
# the card id. The cards are paired by position. Previously `kz` and `nl` were bound
# but never stored, so indexing ['kz'] / ['nl'] failed on cards lacking those entries.
for kz, nl, final_set in zip(kz_cards, nl_cards, test_cards):
    final_set['translations']['kz'] = {**kz, 'id': final_set['id']}
    final_set['translations']['nl'] = {**nl, 'id': final_set['id']}

In [132]:
test_cards

[{'id': 6130,
  'translations': {'ru': {'word': 'турнир',
    'sentence': 'Турнир проводится в субботу в школьном зале.',
    'id': 6130},
   'fi': {'word': 'turnaus',
    'sentence': 'Turnaus järjestetään lauantaina koulun salissa.',
    'id': 6130},
   'en': {'word': 'tournament',
    'sentence': 'The tournament is held on Saturday in the school hall.',
    'id': 6130},
   'kz': {'word': 'турнир',
    'sentence': 'Турнир сенбі күні мектеп залында өтеді.',
    'id': 6130},
   'nl': {'word': 'toernooi',
    'sentence': 'Het toernooi wordt gehouden op zaterdag in de schoolzaal.',
    'id': 6130}}},
 {'id': 6141,
  'translations': {'ru': {'word': 'матч',
    'sentence': 'Матч начинается в 18:00.',
    'id': 6141},
   'fi': {'word': 'ottelu', 'sentence': 'Ottelu alkaa klo 18.', 'id': 6141},
   'en': {'word': 'match',
    'sentence': 'The match starts at 6:00 PM.',
    'id': 6141},
   'kz': {'word': 'матч',
    'sentence': 'Матч кешкі сағат 18:00-де басталады.',
    'id': 6141},
   'nl': {

In [133]:
# Write test_cards back to the path it was loaded from, overwriting that file.
# ensure_ascii=False keeps non-ASCII text readable instead of \u escapes.
with open('./test_cards.json', 'w', encoding='utf-8') as f:
    json.dump(test_cards, f, ensure_ascii=False, indent=2)

In [85]:
import os
import json
from pathlib import Path

def get_english_card_ids(file_names=None):
    """Return the set of card ids found in the English-Finnish card files.

    Args:
        file_names: Optional iterable of card file names. When omitted, the
            notebook's standard file list is used.

    Returns:
        A set with the ``'id'`` of every card loaded via
        ``get_eng_finn_cards_from_file``.
    """
    if file_names is None:
        file_names = [
            *(f'sm1_new_kap{i}.json' for i in range(1, 10)),
            *(f'sm2_new_kap{i}.json' for i in range(1, 9)),
            'sm2_new_puhekieli.json',
            *(f'sm3_kap{i}.json' for i in range(1, 9)),
            *(f'sm4_kap{i}.json' for i in range(1, 6)),
        ]

    all_ids = set()
    for file_name in file_names:
        all_ids.update(card['id'] for card in get_eng_finn_cards_from_file(file_name))

    return all_ids

def get_russian_card_ids(file_names=None):
    """Return the set of card ids found in the Russian-Finnish card files.

    Args:
        file_names: Optional iterable of card file names. When omitted, the
            notebook's standard file list is used.

    Returns:
        A set with the ``'id'`` of every card loaded via
        ``get_ru_finn_cards_from_file``.
    """
    if file_names is None:
        file_names = [
            *(f'sm1_new_kap{i}.json' for i in range(1, 10)),
            *(f'sm2_new_kap{i}.json' for i in range(1, 9)),
            'sm2_new_puhekieli.json',
            *(f'sm3_kap{i}.json' for i in range(1, 9)),
            *(f'sm4_kap{i}.json' for i in range(1, 6)),
        ]

    all_ids = set()
    for file_name in file_names:
        all_ids.update(card['id'] for card in get_ru_finn_cards_from_file(file_name))

    return all_ids

def print_missing_ids():
    """Print both card counts and the ids present in the English set but not the Russian one.

    Only one direction is checked: ids that exist in the Russian-Finnish set
    but not in the English-Finnish set are not reported.
    """
    english_ids = get_english_card_ids()
    russian_ids = get_russian_card_ids()

    only_in_english = sorted(english_ids - russian_ids)

    print(f"Всего англо-финских карточек: {len(english_ids)}")
    print(f"Всего русско-финских карточек: {len(russian_ids)}")
    print(f"Отсутствующих карточек: {len(only_in_english)}")
    print("\nСписок отсутствующих ID:")
    for card_id in only_in_english:
        print(card_id)

In [86]:
print_missing_ids()

Всего англо-финских карточек: 6337
Всего русско-финских карточек: 6337
Отсутствующих карточек: 0

Список отсутствующих ID:


In [87]:
def check_ru_fi_case_and_punctuation_consistency(file_names=None):
    """Report capitalisation and trailing-dot mismatches between Russian and Finnish card sides.

    For every card, the first character's case and the presence of a final
    ``'.'`` are compared for the word pair and the sentence pair. Each mismatch
    is printed and counted, and three summary lines are printed at the end.

    Args:
        file_names: Optional iterable of card file names. When omitted, the
            notebook's standard file list is used.
    """
    if file_names is None:
        file_names = [
            *(f'sm1_new_kap{i}.json' for i in range(1, 10)),
            *(f'sm2_new_kap{i}.json' for i in range(1, 9)),
            'sm2_new_puhekieli.json',
            *(f'sm3_kap{i}.json' for i in range(1, 9)),
            *(f'sm4_kap{i}.json' for i in range(1, 6)),
        ]

    case_inconsistencies = 0
    dot_inconsistencies = 0

    for file_name in file_names:
        cards = get_ru_finn_cards_from_file(file_name)

        for card in cards:
            fi_word = card['wordSecondLang']
            ru_word = card['wordFirstLang']
            fi_sentence = card['sentenceSecondLang']
            ru_sentence = card['sentenceFirstLang']

            try:
                if ru_word[0].isupper() != fi_word[0].isupper():
                    case_inconsistencies += 1
                    print(f"Несогласованность регистра в слове, карточка {card['id']}:")
                    print(f"RU: {ru_word}")
                    print(f"FI: {fi_word}\n")
            except IndexError:
                # An empty word has no first character to compare; only warn.
                print(f"Warning: {card['id']}:")

            if ru_word.endswith('.') != fi_word.endswith('.'):
                dot_inconsistencies += 1
                print(f"Несогласованность точки в слове, карточка {card['id']}:")
                print(f"RU: {ru_word}")
                print(f"FI: {fi_word}\n")

            # Both sides must be non-empty before indexing [0]; an empty Finnish
            # sentence next to a non-empty Russian one used to raise IndexError.
            if ru_sentence != '' and fi_sentence != '':
                if ru_sentence[0].isupper() != fi_sentence[0].isupper():
                    case_inconsistencies += 1
                    print(f"Несогласованность регистра в предложении, карточка {card['id']}:")
                    print(f"RU: {ru_sentence}")
                    print(f"FI: {fi_sentence}\n")

            if ru_sentence.endswith('.') != fi_sentence.endswith('.'):
                dot_inconsistencies += 1
                print(f"Несогласованность точки в предложении, карточка {card['id']}:")
                print(f"RU: {ru_sentence}")
                print(f"FI: {fi_sentence}\n")

    print(f"Несогласованность регистра: {case_inconsistencies} карточек")
    print(f"Несогласованность точки в конце: {dot_inconsistencies} карточек")
    print(f"Всего несогласованностей: {case_inconsistencies + dot_inconsistencies}")

check_ru_fi_case_and_punctuation_consistency()



Несогласованность регистра в слове, карточка 1407:
RU: +30 градусов. Жарко.
FI: On +30 astetta. On kuuma.

Несогласованность регистра в слове, карточка 1409:
RU: +20 градусов. Тепло.
FI: On +20 astetta. On lämmin.

Несогласованность регистра в слове, карточка 1410:
RU: +2 градуса. Прохладно.
FI: On +2 astetta. On viileä.

Несогласованность регистра в слове, карточка 1411:
RU: -15 градусов. холодно.
FI: On -15 astetta. On kylmä.

Несогласованность точки в предложении, карточка 1747:
RU: Она любит его. (Он любит её.)
FI: Hän rakastaa häntä.

Несогласованность регистра в предложении, карточка 1934:
RU: 9 часов ровно.
FI: Kello on tasan yhdeksän.

Несогласованность регистра в слове, карточка 2198:
RU: в Вантаа
FI: Vantaalla

Несогласованность регистра в слове, карточка 2199:
RU: в Тампере
FI: Tampereella

Несогласованность регистра в слове, карточка 2200:
RU: в Рованиеми
FI: Rovaniemellä

Несогласованность регистра в слове, карточка 2201:
RU: в Иматре
FI: Imatralla

Несогласованность регис

In [120]:
def check_en_fi_case_and_punctuation_consistency(file_names=None):
    """Report capitalisation and trailing-dot mismatches between English and Finnish card sides.

    For every card, the first character's case and the presence of a final
    ``'.'`` are compared for the word pair and the sentence pair. Each mismatch
    is printed and counted, and three summary lines are printed at the end.

    Args:
        file_names: Optional iterable of card file names. When omitted, the
            notebook's standard file list is used.

    Returns:
        List of ids of cards with at least one mismatch or an empty word.
    """
    if file_names is None:
        file_names = [
            *(f'sm1_new_kap{i}.json' for i in range(1, 10)),
            *(f'sm2_new_kap{i}.json' for i in range(1, 9)),
            'sm2_new_puhekieli.json',
            *(f'sm3_kap{i}.json' for i in range(1, 9)),
            *(f'sm4_kap{i}.json' for i in range(1, 6)),
        ]

    case_inconsistencies = 0
    dot_inconsistencies = 0

    bad_cards_ids = []
    for file_name in file_names:
        cards = get_eng_finn_cards_from_file(file_name)

        for card in cards:
            fi_word = card['wordSecondLang']
            en_word = card['wordFirstLang']
            fi_sentence = card['sentenceSecondLang']
            en_sentence = card['sentenceFirstLang']

            bad_card = False

            try:
                if en_word[0].isupper() != fi_word[0].isupper():
                    case_inconsistencies += 1
                    bad_card = True
                    print(f"Несогласованность регистра в слове, карточка {card['id']}:")
                    print(f"EN: {en_word}")
                    print(f"FI: {fi_word}\n")
            except IndexError:
                # An empty word has no first character to compare; flag the card and warn.
                bad_card = True
                print(f"Warning: {card['id']}:")

            if en_word.endswith('.') != fi_word.endswith('.'):
                dot_inconsistencies += 1
                bad_card = True
                print(f"Несогласованность точки в слове, карточка {card['id']}:")
                print(f"EN: {en_word}")
                print(f"FI: {fi_word}\n")

            if en_sentence != '' and fi_sentence != '':
                if en_sentence[0].isupper() != fi_sentence[0].isupper():
                    case_inconsistencies += 1
                    bad_card = True
                    print(f"Несогласованность регистра в предложении, карточка {card['id']}:")
                    print(f"EN: {en_sentence}")
                    print(f"FI: {fi_sentence}\n")

            if en_sentence.endswith('.') != fi_sentence.endswith('.'):
                dot_inconsistencies += 1
                bad_card = True
                print(f"Несогласованность точки в предложении, карточка {card['id']}:")
                print(f"EN: {en_sentence}")
                print(f"FI: {fi_sentence}\n")

            if bad_card:
                bad_cards_ids.append(card['id'])

    print(f"Несогласованность регистра: {case_inconsistencies} карточек")
    print(f"Несогласованность точки в конце: {dot_inconsistencies} карточек")
    print(f"Всего несогласованностей: {case_inconsistencies + dot_inconsistencies}")

    return bad_cards_ids


In [121]:
# IDs of English-Finnish cards flagged by the case/punctuation check.
en_fi_bad_cards_ids = check_en_fi_case_and_punctuation_consistency()

Несогласованность регистра в слове, карточка 1005:
EN: I
FI: minä

Несогласованность регистра в слове, карточка 1018:
EN: Last name
FI: sukunimi

Несогласованность точки в слове, карточка 1039:
EN: Yes.
FI: Joo

Несогласованность регистра в слове, карточка 1042:
EN: Monday
FI: maanantai

Несогласованность регистра в слове, карточка 1043:
EN: Tuesday
FI: tiistai

Несогласованность регистра в слове, карточка 1044:
EN: Wednesday
FI: keskiviikko

Несогласованность регистра в слове, карточка 1045:
EN: Thursday
FI: torstai

Несогласованность регистра в слове, карточка 1046:
EN: Friday
FI: perjantai

Несогласованность регистра в слове, карточка 1047:
EN: Saturday
FI: lauantai

Несогласованность регистра в слове, карточка 1048:
EN: Sunday
FI: sunnuntai

Несогласованность регистра в слове, карточка 1049:
EN: Weekend
FI: viikonloppu

Несогласованность регистра в слове, карточка 1053:
EN: On Tuesday
FI: tiistaina

Несогласованность регистра в слове, карточка 1054:
EN: On Wednesday
FI: keskiviikko

In [122]:
def get_ru_finn_cards_by_ids(ids, file_names=None):
    """Return the Russian-Finnish cards whose id is among ``ids``.

    Args:
        ids: Iterable of hashable card ids to keep.
        file_names: Optional iterable of card file names. When omitted, the
            notebook's standard file list is used.

    Returns:
        List of matching cards, in file order and then card order.
    """
    if file_names is None:
        file_names = [
            *(f'sm1_new_kap{i}.json' for i in range(1, 10)),
            *(f'sm2_new_kap{i}.json' for i in range(1, 9)),
            'sm2_new_puhekieli.json',
            *(f'sm3_kap{i}.json' for i in range(1, 9)),
            *(f'sm4_kap{i}.json' for i in range(1, 6)),
        ]

    # A set gives constant-time membership tests instead of scanning the id list per card.
    wanted_ids = set(ids)

    return [
        card
        for file_name in file_names
        for card in get_ru_finn_cards_from_file(file_name)
        if card['id'] in wanted_ids
    ]

# Russian-Finnish cards with the same ids as the flagged English-Finnish cards.
bad_ru_finn_cards = get_ru_finn_cards_by_ids(en_fi_bad_cards_ids)

In [124]:
len(bad_ru_finn_cards)

600

In [140]:
# Regenerate only the first 50 flagged cards: the second-language side is the
# source card, the first-language side is passed along as the secondary card.
source_cards = [
    dict(word=card["wordSecondLang"], sentence=card["sentenceSecondLang"], id=card['id'])
    for card in bad_ru_finn_cards[:50]
]
secondary_cards = [
    dict(word=card["wordFirstLang"], sentence=card["sentenceFirstLang"], id=card['id'])
    for card in bad_ru_finn_cards[:50]
]

source_lang = 'Finnish'
target_lang = 'English'
secondary_lang = 'Russian'

source_target_cards, eval_results = generate_source_target_cards_with_eval_results(
    source_cards, secondary_cards, source_lang, target_lang, secondary_lang
)

  0%|          | 0/50 [00:00<?, ?it/s]

In [141]:
source_target_cards

[{'wordFirstLang': 'minä',
  'sentenceFirstLang': 'Minä olen opiskelija.',
  'wordSecondLang': 'I',
  'sentenceSecondLang': 'I am a student.',
  'id': 1005},
 {'wordFirstLang': 'sukunimi',
  'sentenceFirstLang': 'Minun sukunimeni on Ivanova.',
  'wordSecondLang': 'last name',
  'sentenceSecondLang': 'My last name is Ivanova.',
  'id': 1018},
 {'wordFirstLang': 'Joo',
  'sentenceFirstLang': '',
  'wordSecondLang': 'yes',
  'sentenceSecondLang': '',
  'id': 1039},
 {'wordFirstLang': 'maanantai',
  'sentenceFirstLang': '',
  'wordSecondLang': 'monday',
  'sentenceSecondLang': '',
  'id': 1042},
 {'wordFirstLang': 'tiistai',
  'sentenceFirstLang': '',
  'wordSecondLang': 'Tuesday',
  'sentenceSecondLang': '',
  'id': 1043},
 {'wordFirstLang': 'keskiviikko',
  'sentenceFirstLang': '',
  'wordSecondLang': 'Wednesday',
  'sentenceSecondLang': '',
  'id': 1044},
 {'wordFirstLang': 'torstai',
  'sentenceFirstLang': '',
  'wordSecondLang': 'thursday',
  'sentenceSecondLang': '',
  'id': 1045},
 

In [128]:
source_target_cards

[{'wordFirstLang': 'seitsemän',
  'sentenceFirstLang': '',
  'wordSecondLang': 'seven',
  'sentenceSecondLang': '',
  'id': 1075},
 {'wordFirstLang': 'kahdeksan',
  'sentenceFirstLang': '',
  'wordSecondLang': 'eight',
  'sentenceSecondLang': '',
  'id': 1076},
 {'wordFirstLang': 'yhdeksän',
  'sentenceFirstLang': '',
  'wordSecondLang': 'nine',
  'sentenceSecondLang': '',
  'id': 1077},
 {'wordFirstLang': 'kymmenen',
  'sentenceFirstLang': '',
  'wordSecondLang': 'ten',
  'sentenceSecondLang': '',
  'id': 1078},
 {'wordFirstLang': 'yksitoista',
  'sentenceFirstLang': '',
  'wordSecondLang': 'eleven',
  'sentenceSecondLang': '',
  'id': 1079},
 {'wordFirstLang': 'kaksitoista',
  'sentenceFirstLang': '',
  'wordSecondLang': 'twelve',
  'sentenceSecondLang': '',
  'id': 1080},
 {'wordFirstLang': 'kolmekymmentä',
  'sentenceFirstLang': '',
  'wordSecondLang': 'thirty',
  'sentenceSecondLang': '',
  'id': 1081},
 {'wordFirstLang': 'neljätoista',
  'sentenceFirstLang': '',
  'wordSecondLang