In [4]:
import base64
import os

import numpy as np
import openai
import requests
from dotenv import load_dotenv, find_dotenv
from openai import AzureOpenAI, AsyncAzureOpenAI
from tqdm.notebook import tqdm

# Model name constants. GPT_4o is the default `model` argument of
# eval_phrase_card / eval_full_card; the other three are not referenced in the
# cells visible here.
# NOTE(review): `client` is an AzureOpenAI client, so these presumably have to
# match Azure deployment names — confirm.
GPT_3_5_TURBO = "gpt-3.5-turbo"
GPT_4_TURBO_PREVIEW = "gpt-4-turbo-preview"
GPT_4 = 'gpt-4'
GPT_4o = 'gpt-4o'

def get_openai_api_key():
    """Load the .env file located by ``find_dotenv()`` and return OPENAI_API_KEY.

    Loading the .env file is a side effect the rest of the notebook relies on:
    the Azure settings are read from the environment right after this call.

    Returns:
        The value of the ``OPENAI_API_KEY`` environment variable, or ``None``
        when it is not set.
    """
    dotenv_path = find_dotenv()
    load_dotenv(dotenv_path)
    return os.environ.get("OPENAI_API_KEY")

# get_openai_api_key() also loads the .env file, so it must run before the
# Azure variables are read below. OPENAI_API_KEY itself is not passed to the
# Azure client.
OPENAI_API_KEY = get_openai_api_key()

azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_api_key = os.getenv("AZURE_OPENAI_API_KEY")

# Shared client used by eval_phrase_card / eval_full_card.
client = AzureOpenAI(
    api_key=azure_api_key,
    api_version="2024-07-01-preview",
    azure_endpoint=azure_endpoint,
)

In [5]:
import json


# Path *prefix* (note the trailing ``ru_eng_``), not a directory: file names are
# appended to it directly.
EVAL_FOLDER = '../../data/russian-english/cards/eval_results/ru_eng_'
def get_eval_results_from_file(file_name):
    """Read stored evaluation results from ``EVAL_FOLDER + file_name``.

    Args:
        file_name: Suffix appended to ``EVAL_FOLDER`` to form the path.

    Returns:
        The parsed JSON content of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(EVAL_FOLDER + file_name, mode='r', encoding='utf-8') as fh:
        return json.load(fh)


# Path *prefix* (note the trailing ``ru_eng_``), not a directory: file names are
# appended to it directly.
RU_ENG_OUTPUT_FOLDER = '../../data/russian-english/cards/test_cards/ru_eng_'
def get_ru_eng_cards_from_file(file_name):
    """Read Russian-English cards from ``RU_ENG_OUTPUT_FOLDER + file_name``.

    The path is built from RU_ENG_OUTPUT_FOLDER, the constant declared for this
    reader, so the function does not depend on OUTPUT_FOLDER being assigned
    further down in the cell.

    Args:
        file_name: Suffix appended to ``RU_ENG_OUTPUT_FOLDER`` to form the path.

    Returns:
        The parsed JSON content of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    file_path = RU_ENG_OUTPUT_FOLDER + file_name
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


RU_FINN_FOLDER_WITH_JSON = '../../data/russian-finnish/cards/curated_platform_cards/'
def get_ru_finn_cards_from_file(file_name):
    """Read Russian-Finnish cards and drop those carrying an ``isMarkedDeleted`` key.

    The filter looks only at the *presence* of the key: a card that has
    ``isMarkedDeleted`` set to any value (including a falsy one) is excluded.

    Args:
        file_name: Name appended to ``RU_FINN_FOLDER_WITH_JSON`` to form the path.

    Returns:
        A new list with the cards that have no ``isMarkedDeleted`` key, in file
        order.
    """
    with open(RU_FINN_FOLDER_WITH_JSON + file_name, mode='r', encoding='utf-8') as fh:
        all_cards = json.load(fh)
    return [entry for entry in all_cards if 'isMarkedDeleted' not in entry.keys()]


# Path *prefix* (note the trailing ``ru_eng_``), not a directory: file names are
# appended to it directly.
OUTPUT_FOLDER = '../../data/russian-english/cards/test_cards/ru_eng_'
def write_cards_to_file(file_name, cards):
    """Serialize ``cards`` as pretty-printed UTF-8 JSON to ``OUTPUT_FOLDER + file_name``.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) with a
    two-space indent. An existing file at the target path is overwritten.

    Args:
        file_name: Suffix appended to ``OUTPUT_FOLDER`` to form the path.
        cards: JSON-serializable object to write.
    """
    target = OUTPUT_FOLDER + file_name
    with open(target, mode='w', encoding='utf-8') as out:
        json.dump(cards, out, indent=2, ensure_ascii=False)
        
        
# Re-assigned here with the same value that get_eval_results_from_file reads
# from; it is a path *prefix* ending in ``ru_eng_``, not a directory.
EVAL_FOLDER = '../../data/russian-english/cards/eval_results/ru_eng_'
def write_eval_results_to_file(file_name, results):
    """Serialize ``results`` as pretty-printed UTF-8 JSON to ``EVAL_FOLDER + file_name``.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) with a
    two-space indent. An existing file at the target path is overwritten.

    Args:
        file_name: Suffix appended to ``EVAL_FOLDER`` to form the path.
        results: JSON-serializable object to write.
    """
    target = EVAL_FOLDER + file_name
    with open(target, mode='w', encoding='utf-8') as out:
        json.dump(results, out, indent=2, ensure_ascii=False)

In [35]:
def eval_phrase_card(card, model=GPT_4o):
    """Ask the chat model to judge the translation of a single word or phrase.

    Only the source word and the translated word are sent to the model; the
    sentence fields of the card are unpacked but not used.

    Args:
        card: Dict whose values are unpacked positionally into five fields.
            NOTE(review): this assumes exactly five values in the order
            (source word, source sentence, translated word, translated
            sentence, id) — confirm against the card files.
        model: Model name passed to ``client.chat.completions.create``.

    Returns:
        The model's JSON reply parsed into a dict, with the card id added under
        the ``'id'`` key.

    Raises:
        ValueError: If ``card`` does not hold exactly five values.
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    src_word, _src_sentence, tr_word, _tr_sentence, tr_id = card.values()

    system_prompt = '''You are a multilingual assistant who is proficient in Russian and English.'''

    user_prompt = f"""
**Task**: Evaluate the correctness and naturalness of English translations for a given Russian word or phrase. Confirm the following:

The English translation accurately conveys the meaning and context of the Russian version.
The English translation sounds natural to a native speaker.

**Guidelines**:

The translated word or phrase does not need to be the most common or best possible translation of the Russian version. As long as it retains the original meaning and sounds natural and correct, it is acceptable.
Your evaluation should focus on whether the translation is acceptable or identify any issues if present.
If corrections are needed, provide only the final corrected English translation.
If no correction is needed, set suggestedFix to null.

**Input Example**:

Word or Phrase in Russian: {src_word}
Word or Phrase in English: {tr_word}

**Output Format**: Respond in JSON using the structure below:
 
{{
  "translationAccuracy": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct",
    "suggestedFix": "Suggested correction if there is an issue or null if no correction is needed"
  }}
}}

**Notes**:

The English translation does not have to be the most commonly used or the best translation, as long as it preserves the meaning of the Russian word or phrase.
Prioritize naturalness, correctness, and clarity when evaluating the translation.
Ensure all explanations are detailed, clear, and actionable.
    """

    # Surrounding whitespace of both prompts is stripped before sending.
    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": user_prompt.strip()},
    ]
    response = client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=messages,
    )

    raw_reply = response.choices[0].message.content
    res = json.loads(raw_reply.strip())
    res['id'] = tr_id
    return res

In [36]:
def eval_full_card(card, model=GPT_4o):
    """Ask the chat model to judge a card's word and example sentence together.

    The prompt requests four verdict sections: ``englishSentenceCorrectness``,
    ``wordUsage``, ``wordTranslationAccuracy`` and
    ``sentenceTranslationAccuracy``.

    Args:
        card: Dict whose values are unpacked positionally into five fields.
            NOTE(review): this assumes exactly five values in the order
            (source word, source sentence, translated word, translated
            sentence, id) — confirm against the card files.
        model: Model name passed to ``client.chat.completions.create``.

    Returns:
        The model's JSON reply parsed into a dict, with the card id added under
        the ``'id'`` key.

    Raises:
        ValueError: If ``card`` does not hold exactly five values.
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    src_word, src_sentence, tr_word, tr_sentence, tr_id = card.values()

    system_prompt = '''You are a multilingual assistant who is proficient in Russian and English.'''

    user_prompt = f"""
**Task**: Evaluate the correctness and naturalness of an English word and sentence based on their translations from Russian. Assess the following:

The quality and naturalness of the English sentence.
The accuracy of the word’s translation and its usage in the sentence.
The overall accuracy of the translations for both the word and sentence from Russian to English.

**Evaluation Points**:

Verify that the English sentence is grammatically correct, natural, and conveys the meaning of the Russian sentence. The sentence does not need to be an exact translation as long as it preserves the original meaning and sounds natural.
Confirm that the English word reflects the meaning of the Russian word. It is acceptable if the word appears in a different form (e.g., conjugated, pluralized) or is replaced by a synonym, as long as it aligns with the intended meaning and fits naturally within the sentence.
Prioritize naturalness, clarity, and correctness when suggesting fixes.

**Output**: Respond in JSON format as follows:
{{
    "englishSentenceCorrectness": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct.",
    "suggestedFix": "Suggested corrected sentence if there is an issue, or null if not applicable."
    }},
    "wordUsage": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct.",
    "suggestedFixSentence": "Suggested corrected sentence if the word usage is incorrect, or null if not applicable.",
    "suggestedFixWord": "Suggested corrected word if the word usage is incorrect, or null if not applicable."
    }},
    "wordTranslationAccuracy": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct.",
    "suggestedFix": "Suggested correction for translation issues, or null if not applicable."
    }},
    "sentenceTranslationAccuracy": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct.",
    "suggestedFix": "Suggested correction for translation issues, or null if not applicable."
    }}
}}

**Input Example**:

Word in Russian: {src_word}
Word in English: {tr_word}
Sentence in Russian: {src_sentence}
Sentence in English: {tr_sentence}

**Notes**:

The word does not have to appear in its given form as long as it fits naturally and conveys the intended meaning.
The sentence does not have to be an exact translation of the Russian sentence; preserving the meaning and sounding natural are the main priorities.
Provide detailed explanations and actionable corrections wherever applicable.
    """

    # Surrounding whitespace of both prompts is stripped before sending.
    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": user_prompt.strip()},
    ]
    response = client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=messages,
    )

    raw_reply = response.choices[0].message.content
    res = json.loads(raw_reply.strip())
    res['id'] = tr_id
    return res

In [8]:
def check_eval(eval_result, is_phrase_card):
    """Collapse an evaluation result into a single pass/fail value.

    Args:
        eval_result: Dict shaped like the output of eval_phrase_card or
            eval_full_card.
        is_phrase_card: Truthy when ``eval_result`` comes from a phrase card.

    Returns:
        For phrase cards, the ``isCorrect`` value of ``translationAccuracy``
        as stored. Otherwise ``True`` only if ``isCorrect`` is truthy in all
        four sections of a full-card evaluation.

    Raises:
        KeyError: If an expected section or its ``isCorrect`` field is missing.
    """
    if not is_phrase_card:
        sections = (
            'wordTranslationAccuracy',
            'sentenceTranslationAccuracy',
            'englishSentenceCorrectness',
            'wordUsage',
        )
        # A list (not a generator) keeps every lookup eager, so a missing
        # section raises KeyError even when an earlier verdict is already False.
        verdicts = [eval_result[section]['isCorrect'] for section in sections]
        return all(verdicts)

    return eval_result['translationAccuracy']['isCorrect']

In [9]:
# Nine file names, numbered 1 through 9: sm1_new_kap1.json ... sm1_new_kap9.json
file_names = ['sm1_new_kap' + str(number) + '.json' for number in range(1, 10)]

In [10]:
# Pick the first file name; the bare expression below displays it as the cell output.
first_file = file_names[0]
first_file

'sm1_new_kap1.json'

In [21]:
# Load the cards and the previously stored evaluation results for the first file.
ru_eng_cards = get_ru_eng_cards_from_file(first_file)
eval_results = get_eval_results_from_file(first_file)

In [15]:
def is_phrase_card(card):
    """Return True when the card's ``sentenceFirstLang`` field is the empty string.

    Such cards are evaluated with eval_phrase_card instead of eval_full_card.

    Raises:
        KeyError: If the card has no ``sentenceFirstLang`` key.
    """
    source_sentence = card['sentenceFirstLang']
    return source_sentence == ''

In [28]:
# Evaluate every loaded card: eval_res collects the raw model verdicts,
# eval_acc the matching pass/fail flags from check_eval.
eval_res, eval_acc = [], []
for card in tqdm(ru_eng_cards):
    phrase_only = is_phrase_card(card)
    evaluator = eval_phrase_card if phrase_only else eval_full_card
    cur_eval = evaluator(card)

    eval_res.append(cur_eval)
    eval_acc.append(check_eval(cur_eval, phrase_only))

  0%|          | 0/151 [00:00<?, ?it/s]

In [34]:
# Re-evaluate, file by file, the cards whose stored evaluation already passed,
# and print the share of them that pass again.
for file_name in tqdm(file_names[1:]):
    ru_eng_cards = get_ru_eng_cards_from_file(file_name)
    eval_results = get_eval_results_from_file(file_name)

    eval_res = []
    eval_acc = []
    # NOTE(review): cards and stored results are paired by position; zip stops
    # silently at the shorter list — confirm both files always line up.
    # The loop variable is `prev_eval`, not `eval`, so the builtin is not
    # shadowed for the rest of the kernel session.
    for card, prev_eval in tqdm(list(zip(ru_eng_cards, eval_results))):
        phrase_only = is_phrase_card(card)
        # Cards that failed the stored evaluation are skipped.
        if not check_eval(prev_eval, phrase_only):
            continue
        cur_eval = eval_phrase_card(card) if phrase_only else eval_full_card(card)

        eval_res.append(cur_eval)
        eval_acc.append(check_eval(cur_eval, phrase_only))

    # Guard the ratio: if every card was skipped there is nothing to average.
    if eval_acc:
        print(file_name, ' accuracy ', sum(eval_acc) / len(eval_acc))
    else:
        print(file_name, ' accuracy ', 'n/a (no cards were re-evaluated)')

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/192 [00:00<?, ?it/s]

sm1_new_kap2.json  accuracy  1.0


  0%|          | 0/238 [00:00<?, ?it/s]

sm1_new_kap3.json  accuracy  0.9698275862068966


  0%|          | 0/267 [00:00<?, ?it/s]

sm1_new_kap4.json  accuracy  0.9847908745247148


  0%|          | 0/153 [00:00<?, ?it/s]

sm1_new_kap5.json  accuracy  0.9671052631578947


  0%|          | 0/239 [00:00<?, ?it/s]

sm1_new_kap6.json  accuracy  0.9535864978902954


  0%|          | 0/184 [00:00<?, ?it/s]

sm1_new_kap7.json  accuracy  0.95


  0%|          | 0/260 [00:00<?, ?it/s]

sm1_new_kap8.json  accuracy  0.9728682170542635


  0%|          | 0/226 [00:00<?, ?it/s]

sm1_new_kap9.json  accuracy  0.9727272727272728


In [23]:
# Pass/fail flags of the evaluations loaded from disk, paired by position with
# their cards. Needs numpy (`np`) to be imported before this cell runs.
eval_acc_2 = np.array([
    check_eval(prev_eval, is_phrase_card(card))
    for prev_eval, card in zip(eval_results, ru_eng_cards)
])

In [24]:
# Share of the stored evaluations that pass check_eval (displayed as the cell output).
sum(eval_acc_2) / len(eval_acc_2)

np.float64(1.0)

In [29]:
# Share of the flags currently in eval_acc that are truthy (displayed as the cell output).
sum(eval_acc) / len(eval_acc)

0.9867549668874173

In [18]:
import numpy as np
# Wrap the result dicts and their pass/fail flags in arrays so the flags can
# be used as a mask over the results.
np_res = np.array(eval_res)
np_acc = np.array(eval_acc)

In [20]:
# Show the evaluation results whose flag compares equal to 0 (i.e. False).
np_res[np_acc == 0]

array([{'englishSentenceCorrectness': {'isCorrect': True, 'explanation': "The English sentence 'Hello! How are you doing?' is grammatically correct and natural. It appropriately uses the English word 'How are you?' in context.", 'suggestedFix': None}, 'wordUsage': {'isCorrect': False, 'explanation': "While 'How are you doing?' is a common and natural expression in English, the word being checked for accuracy is 'How are you?', which should be used in its original form for this particular evaluation.", 'suggestedFixSentence': 'Hello! How are you?', 'suggestedFixWord': 'How are you?'}, 'wordTranslationAccuracy': {'isCorrect': True, 'explanation': "The translation of 'Как ты?' to 'How are you?' is accurate. Both phrases are direct equivalents in meaning.", 'suggestedFix': None}, 'sentenceTranslationAccuracy': {'isCorrect': False, 'explanation': "Although 'Hello! How are you doing?' is natural and idiomatic in English, it is not a literal translation of the Russian sentence 'Привет! Как ты