In [1]:
import openai
from openai import AzureOpenAI, AsyncAzureOpenAI
from tqdm.notebook import tqdm
import os
from dotenv import load_dotenv, find_dotenv
import requests
import base64

# Model name constants. GPT_4o is the default `model` argument of the
# evaluation helpers defined further down in this notebook.
GPT_3_5_TURBO = 'gpt-3.5-turbo'
GPT_4_TURBO_PREVIEW = 'gpt-4-turbo-preview'
GPT_4 = 'gpt-4'
GPT_4o = 'gpt-4o'

def get_openai_api_key():
    """Load the nearest .env file and return the OPENAI_API_KEY value.

    Returns:
        The value of the OPENAI_API_KEY environment variable, or None when
        it is not set.
    """
    # Loading the .env file is a side effect callers rely on: later
    # os.getenv() lookups can then see variables declared there.
    load_dotenv(find_dotenv())
    return os.environ.get("OPENAI_API_KEY")

# get_openai_api_key() also loads the .env file, so it must run before the
# Azure settings are read. OPENAI_API_KEY itself is not passed to the Azure client.
OPENAI_API_KEY = get_openai_api_key()
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")

# Shared Azure OpenAI client used by the evaluation helpers in this notebook.
client = AzureOpenAI(
    azure_endpoint=azure_endpoint,
    api_version="2024-07-01-preview",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
)

In [2]:
import json


# Path prefix (directory plus the 'ru_eng_' file-name prefix) for evaluation results.
EVAL_FOLDER = '../../data/russian-english/cards/eval_results/ru_eng_'
def get_eval_results_from_file(file_name):
    """Read evaluation results back from a JSON file.

    Args:
        file_name: Suffix appended directly to EVAL_FOLDER (plain string
            concatenation, no path separator is inserted).

    Returns:
        The deserialized JSON content of the file.
    """
    with open(EVAL_FOLDER + file_name, mode='r', encoding='utf-8') as source:
        return json.load(source)


# Path prefix (directory plus the 'ru_eng_' file-name prefix) for Russian-English test cards.
RU_ENG_OUTPUT_FOLDER = '../../data/russian-english/cards/test_cards/ru_eng_'
def get_ru_eng_cards_from_file(file_name):
    """Load Russian-English cards from a JSON file.

    Args:
        file_name: Suffix appended directly to RU_ENG_OUTPUT_FOLDER (plain
            string concatenation, no path separator is inserted).

    Returns:
        The deserialized JSON content of the file.
    """
    # Use this function's own constant: the previous code read OUTPUT_FOLDER,
    # which is only defined further down the notebook.
    file_path = RU_ENG_OUTPUT_FOLDER + file_name
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


# Directory holding the curated Russian-Finnish card files.
RU_FINN_FOLDER_WITH_JSON = '../../data/russian-finnish/cards/curated_platform_cards/'
def get_ru_finn_cards_from_file(file_name):
    """Load Russian-Finnish cards, skipping those flagged as deleted.

    Args:
        file_name: Name of the JSON file inside RU_FINN_FOLDER_WITH_JSON.

    Returns:
        A list of the cards that have no 'isMarkedDeleted' key. A card is
        dropped whenever the key is present, whatever its value.
    """
    with open(RU_FINN_FOLDER_WITH_JSON + file_name, mode='r', encoding='utf-8') as source:
        all_cards = json.load(source)
    return [entry for entry in all_cards if 'isMarkedDeleted' not in entry.keys()]


# Path prefix (directory plus the 'ru_eng_' file-name prefix) for written card files.
OUTPUT_FOLDER = '../../data/russian-english/cards/test_cards/ru_eng_'
def write_cards_to_file(file_name, cards):
    """Serialize cards as pretty-printed UTF-8 JSON.

    Args:
        file_name: Suffix appended directly to OUTPUT_FOLDER.
        cards: JSON-serializable object to write.
    """
    destination = OUTPUT_FOLDER + file_name
    with open(destination, mode='w', encoding='utf-8') as sink:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        json.dump(cards, sink, indent=2, ensure_ascii=False)
        
        
# Same value as the EVAL_FOLDER assigned earlier in this cell; kept so this
# definition remains self-contained.
EVAL_FOLDER = '../../data/russian-english/cards/eval_results/ru_eng_'
def write_eval_results_to_file(file_name, results):
    """Serialize evaluation results as pretty-printed UTF-8 JSON.

    Args:
        file_name: Suffix appended directly to EVAL_FOLDER.
        results: JSON-serializable object to write.
    """
    destination = EVAL_FOLDER + file_name
    with open(destination, mode='w', encoding='utf-8') as sink:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        json.dump(results, sink, indent=2, ensure_ascii=False)

In [3]:
def eval_phrase_card(card, first_lang='Russian', second_lang='Finnish', model=GPT_4o):
    """Ask the chat model to judge a word/phrase translation.

    Only the source and translated word/phrase are sent to the model; the
    two sentence values unpacked from the card are not used here.

    Args:
        card: Mapping whose ``values()`` yield exactly five items, unpacked
            positionally as (source word, source sentence, translated word,
            translated sentence, id). NOTE(review): this relies on the key
            order of the card dict — confirm against the card schema.
        first_lang: Name of the source language inserted into the prompt.
        second_lang: Name of the target language inserted into the prompt.
        model: Model name passed to ``client.chat.completions.create``.

    Returns:
        The model's JSON reply parsed into a dict, with an added 'id' key
        holding the card's last value. The prompt asks for a
        'translationAccuracy' object, but the reply is not validated here.

    Raises:
        ValueError: If the card does not have exactly five values.
        json.JSONDecodeError: If the reply content is not valid JSON.
    """
    # Positional unpack: fails unless the card has exactly five values.
    src_word, src_sentence, tr_word, tr_sentence, tr_id = card.values()

    system_prompt = f'''You are a multilingual assistant who is proficient in {first_lang} and {second_lang}.'''

    user_prompt = f"""
**Task**: Evaluate the correctness and naturalness of {second_lang} translations for a given {first_lang} word or phrase. Confirm the following:

The {second_lang} translation accurately conveys the meaning and context of the {first_lang} version.
The {second_lang} translation sounds natural to a native speaker.

**Guidelines**:

The translated word or phrase does not need to be the most common or best possible translation of the {first_lang} version. As long as it retains the original meaning and sounds natural and correct, it is acceptable.
Your evaluation should focus on whether the translation is acceptable or identify any issues if present.
If corrections are needed, provide only the final corrected {second_lang} translation.
If no correction is needed, set suggestedFix to null.

**Input Example**:

Word or Phrase in {first_lang}: {src_word}
Word or Phrase in {second_lang}: {tr_word}

**Output Format**: Respond in JSON using the structure below:
 
{{
  "translationAccuracy": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct",
    "suggestedFix": "Suggested correction if there is an issue or null if no correction is needed"
  }}
}}

**Notes**:

The {second_lang} translation does not have to be the most commonly used or the best translation, as long as it preserves the meaning of the {first_lang} word or phrase.
Prioritize naturalness, correctness, and clarity when evaluating the translation.
Ensure all explanations are detailed, clear, and actionable.
    """

    # Request a JSON object reply so the content can be parsed below.
    response = client.chat.completions.create(
      model=model,
      response_format={ "type": "json_object" },
      messages=[
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": user_prompt.strip()}
      ]
    )

    # Only the first choice is used; the card id is attached so the result
    # can be matched back to its card.
    res = json.loads(response.choices[0].message.content.strip())
    res['id'] = tr_id

    return res

In [4]:
def eval_full_card(card, first_lang='Russian', second_lang='Finnish', model=GPT_4o):
    """Ask the chat model to judge a card's word and sentence translations.

    Args:
        card: Mapping whose ``values()`` yield exactly five items, unpacked
            positionally as (source word, source sentence, translated word,
            translated sentence, id). NOTE(review): this relies on the key
            order of the card dict — confirm against the card schema.
        first_lang: Name of the source language inserted into the prompt.
        second_lang: Name of the target language inserted into the prompt.
        model: Model name passed to ``client.chat.completions.create``.

    Returns:
        The model's JSON reply parsed into a dict, with an added 'id' key
        holding the card's last value. The prompt asks for the keys
        'englishSentenceCorrectness', 'wordUsage', 'wordTranslationAccuracy'
        and 'sentenceTranslationAccuracy', but the reply is not validated
        here. The 'englishSentenceCorrectness' key name is fixed in the
        prompt regardless of ``second_lang``, and ``check_eval`` reads it
        under that name.

    Raises:
        ValueError: If the card does not have exactly five values.
        json.JSONDecodeError: If the reply content is not valid JSON.
    """
    # Positional unpack: fails unless the card has exactly five values.
    src_word, src_sentence, tr_word, tr_sentence, tr_id = card.values()

    system_prompt = f'''You are a multilingual assistant who is proficient in {first_lang} and {second_lang}.'''

    user_prompt = f"""
**Task**: Evaluate the correctness and naturalness of an {second_lang} word and sentence based on their translations from {first_lang}. Assess the following:

The quality and naturalness of the {second_lang} sentence.
The accuracy of the word’s translation and its usage in the sentence.
The overall accuracy of the translations for both the word and sentence from {first_lang} to {second_lang}.

**Evaluation Points**:

Verify that the {second_lang} sentence is grammatically correct, natural, and conveys the meaning of the {first_lang} sentence. The sentence does not need to be an exact translation as long as it preserves the original meaning and sounds natural.
Confirm that the {second_lang} word reflects the meaning of the {first_lang} word. It is acceptable if the word appears in a different form (e.g., conjugated, pluralized) or is replaced by a synonym, as long as it aligns with the intended meaning and fits naturally within the sentence.
Prioritize naturalness, clarity, and correctness when suggesting fixes.

**Output**: Respond in JSON format as follows:
{{
    "englishSentenceCorrectness": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct.",
    "suggestedFix": "Suggested corrected sentence if there is an issue, or null if not applicable."
    }},
    "wordUsage": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct.",
    "suggestedFixSentence": "Suggested corrected sentence if the word usage is incorrect, or null if not applicable.",
    "suggestedFixWord": "Suggested corrected word if the word usage is incorrect, or null if not applicable."
    }},
    "wordTranslationAccuracy": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct.",
    "suggestedFix": "Suggested correction for translation issues, or null if not applicable."
    }},
    "sentenceTranslationAccuracy": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct.",
    "suggestedFix": "Suggested correction for translation issues, or null if not applicable."
    }}
}}

**Input Example**:

Word in {first_lang}: {src_word}
Word in {second_lang}: {tr_word}
Sentence in {first_lang}: {src_sentence}
Sentence in {second_lang}: {tr_sentence}

**Notes**:

The word does not have to appear in its given form as long as it fits naturally and conveys the intended meaning.
The sentence does not have to be an exact translation of the {first_lang} sentence; preserving the meaning and sounding natural are the main priorities.
Provide detailed explanations and actionable corrections wherever applicable.
    """

    # Request a JSON object reply so the content can be parsed below.
    response = client.chat.completions.create(
      model=model,
      response_format={ "type": "json_object" },
      messages=[
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": user_prompt.strip()}
      ]
    )

    # Only the first choice is used; the card id is attached so the result
    # can be matched back to its card.
    res = json.loads(response.choices[0].message.content.strip())
    res['id'] = tr_id

    return res

In [5]:
def check_eval(eval_result, is_phrase_card):
    """Collapse an evaluation result into a single pass/fail verdict.

    Args:
        eval_result: Dict produced by one of the evaluation helpers.
        is_phrase_card: True when the result comes from a phrase-only card.

    Returns:
        For phrase cards, the 'isCorrect' value of 'translationAccuracy' as
        stored. Otherwise True only when all four sections of a full-card
        evaluation are marked correct.
    """
    if not is_phrase_card:
        # Every section is looked up before combining, so a missing key
        # raises KeyError even if an earlier section already failed.
        verdicts = [
            eval_result[section]['isCorrect']
            for section in (
                'wordTranslationAccuracy',
                'sentenceTranslationAccuracy',
                'englishSentenceCorrectness',
                'wordUsage',
            )
        ]
        return all(verdicts)

    return eval_result['translationAccuracy']['isCorrect']

In [6]:
# Card files for chapters 1 through 9: sm1_new_kap1.json ... sm1_new_kap9.json
file_names = ['sm1_new_kap{}.json'.format(chapter) for chapter in range(1, 10)]

In [7]:
# The first file is evaluated on its own in the following cells; the
# remaining files are processed later in a loop over file_names[1:].
first_file = file_names[0]
first_file

'sm1_new_kap1.json'

In [8]:
# Cards of the first file; the loader already drops entries that carry an
# 'isMarkedDeleted' key.
ru_finn_cards = get_ru_finn_cards_from_file(first_file)

In [9]:
def is_phrase_card(card):
    """Tell whether a card has no source-language sentence.

    Args:
        card: Mapping with a 'sentenceFirstLang' entry.

    Returns:
        True when 'sentenceFirstLang' equals the empty string, else False.
    """
    first_lang_sentence = card['sentenceFirstLang']
    return first_lang_sentence == ''

In [10]:
# Evaluate every card of the first file: eval_res collects the raw model
# verdicts, eval_acc the pass/fail flag derived from each of them.
eval_res, eval_acc = [], []
for card in tqdm(ru_finn_cards):
    # Classify once and reuse the answer for both the evaluator choice and
    # the verdict check.
    phrase_only = is_phrase_card(card)
    cur_eval = eval_phrase_card(card) if phrase_only else eval_full_card(card)

    eval_res.append(cur_eval)
    eval_acc.append(check_eval(cur_eval, phrase_only))

  0%|          | 0/151 [00:00<?, ?it/s]

In [11]:
# Share of cards in the first file whose evaluation passed.
first_file_accuracy = sum(eval_acc) / len(eval_acc)
print(first_file, ' accuracy ', first_file_accuracy)

sm1_new_kap1.json  accuracy  0.9337748344370861


In [12]:
# Evaluate the remaining files (the first one was handled separately above).
# Per-file results are kept in these dicts so that one file's evaluations are
# not lost when eval_res / eval_acc are reset for the next file.
eval_res_by_file = {}
eval_acc_by_file = {}
for file_name in tqdm(file_names[1:]):
    ru_finn_cards = get_ru_finn_cards_from_file(file_name)

    eval_res = []
    eval_acc = []
    for card in tqdm(ru_finn_cards):
        # Classify once and reuse the answer for both the evaluator choice
        # and the verdict check.
        phrase_only = is_phrase_card(card)
        cur_eval = eval_phrase_card(card) if phrase_only else eval_full_card(card)

        eval_res.append(cur_eval)
        eval_acc.append(check_eval(cur_eval, phrase_only))

    eval_res_by_file[file_name] = eval_res
    eval_acc_by_file[file_name] = eval_acc

    # A file may contain no usable cards (e.g. all marked deleted); avoid
    # dividing by zero in that case.
    if eval_acc:
        print(file_name, ' accuracy ', sum(eval_acc) / len(eval_acc))
    else:
        print(file_name, ' accuracy ', 'n/a (no cards)')

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/192 [00:00<?, ?it/s]

sm1_new_kap2.json  accuracy  0.7760416666666666


  0%|          | 0/238 [00:00<?, ?it/s]

sm1_new_kap3.json  accuracy  0.9033613445378151


  0%|          | 0/267 [00:00<?, ?it/s]

sm1_new_kap4.json  accuracy  0.8838951310861424


  0%|          | 0/153 [00:00<?, ?it/s]

sm1_new_kap5.json  accuracy  0.9084967320261438


  0%|          | 0/239 [00:00<?, ?it/s]

sm1_new_kap6.json  accuracy  0.9163179916317992


  0%|          | 0/184 [00:00<?, ?it/s]

sm1_new_kap7.json  accuracy  0.9130434782608695


  0%|          | 0/260 [00:00<?, ?it/s]

sm1_new_kap8.json  accuracy  0.8846153846153846


  0%|          | 0/226 [00:00<?, ?it/s]

sm1_new_kap9.json  accuracy  0.9026548672566371
