In [5]:
#!/usr/bin/env python
# coding: utf-8
import openai
from openai import AzureOpenAI, AsyncAzureOpenAI
from tqdm.notebook import tqdm
import os
from dotenv import load_dotenv, find_dotenv
import requests
import base64
import asyncio
from tqdm.asyncio import tqdm_asyncio


# create ".env" file and put the line with the key in it
# OPENAI_API_KEY="<get_your_key_from_platform.openai.com"
def get_openai_api_key():
    _ = load_dotenv(find_dotenv())

    return os.getenv("OPENAI_API_KEY")

OPENAI_API_KEY = get_openai_api_key()
# client = openai.OpenAI(api_key=OPENAI_API_KEY)  # Replace "YOUR_API_KEY" with your actual OpenAI API key

azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
client = AsyncAzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
    api_version="2024-07-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

GPT_3_5_TURBO = "gpt-3.5-turbo"
GPT_4_TURBO_PREVIEW = "gpt-4-turbo-preview"
GPT_4 = 'gpt-4'
GPT_4o = 'gpt-4o'

In [4]:
%pip install requests

Collecting requests
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests)
  Downloading charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl.metadata (34 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.2.3-py3-none-any.whl.metadata (6.5 kB)
Downloading requests-2.32.3-py3-none-any.whl (64 kB)
Downloading charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl (102 kB)
Downloading urllib3-2.2.3-py3-none-any.whl (126 kB)
Installing collected packages: urllib3, charset-normalizer, requests
Successfully installed charset-normalizer-3.4.0 requests-2.32.3 urllib3-2.2.3
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
async def generate_english_sentences_async(words, system_prompt, model="GPT_4o"):
    word_to_sentence = {}
    
    async def fetch_sentence(word):
        user_prompt = f"""Generate an idiomatic, simple, useful sentence in English containing this word or phrase:
        
        === 
        '{word}'
        ===
         
        The sentence should be useful and easy to understand for someone who just started learning English. Provide only the English sentence, without any translation or additional information.
        """
        delay = 1
        while True:
            try:
                response = await client.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt}
                    ]
                )
                break
            except Exception:
                await asyncio.sleep(delay)
                continue
        
        
        sentence = response.choices[0].message.content if response.choices else 'No response generated.'
        word_to_sentence[word] = sentence.strip()
    
    tasks = [fetch_sentence(word) for word in words]
    await tqdm_asyncio.gather(*tasks, desc="Generating sentence examples")
    
    return word_to_sentence

In [7]:
system_prompt = '''You are generator of simple and usefull sentences.'''
await generate_english_sentences_async(["Apple", "Cat", "Brown", "Red", "Basic", "Train", "Forget", "Class", "School", "Last", "Stop talkng", "Nothing", "Justin"], system_prompt, model=GPT_4o)

Generating sentence examples: 100%|██████████| 13/13 [00:00<00:00, 14.90it/s]


{'Justin': 'Justin is always on time for class.',
 'Brown': 'The dog has soft, brown fur.',
 'Last': 'Save the last piece of cake for me, please.',
 'Class': 'I have an English class every Monday and Wednesday.',
 'Apple': 'She eats an apple every morning for breakfast.',
 'School': 'I go to school every morning.',
 'Train': 'I take the train to work every morning.',
 'Forget': "Don't forget to lock the door.",
 'Red': 'The red apple is on the table.',
 'Stop talkng': 'Please stop talking during the movie.',
 'Basic': 'Learning basic English words is important.',
 'Nothing': 'Nothing is impossible if you try your best.',
 'Cat': 'The cat is sitting on the windowsill.'}

In [37]:
import json


FOLDER_WITH_JSON = '../../data/russian-finnish/cards/curated_platform_cards/'
def load_cards_from_file(file_name):
    data = []
    file_path = FOLDER_WITH_JSON + file_name
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

In [38]:
file_names = [f'sm1_new_kap{i}.json' for i in range(1, 10)]

first_chapter = []
for file_name in tqdm(file_names):
    first_chapter.extend(load_cards_from_file(file_name))

  0%|          | 0/1 [00:00<?, ?it/s]

In [39]:
client = AsyncAzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-07-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)



In [40]:
import os
import asyncio
import json
from tqdm.asyncio import tqdm_asyncio
from asyncio import Queue

client = AsyncAzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-07-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

async def evaluate_cards_async(cards, first_lang='Russian', second_lang='Finnish', model="GPT_4o"):
    results = Queue()
    
    async def evaluate_card(card):
        system_prompt = f"""
        You are a language expert proficient in both {first_lang} and {second_lang}."""

        user_prompt = f"""
        You are given a flashcard containing a word and a sentence in {first_lang}, along with its translation and 
        a sentence in {second_lang}. Your task is to evaluate the correctness of the flashcard. Here’s what you need to check:

        1. Whether the sentence in {first_lang} is grammatically correct and natural.
        2. Whether the sentence in {second_lang} is grammatically correct and natural.
        3. Whether the translation from {second_lang} to {first_lang} is accurate.
        4. Whether the word in {first_lang} occurs in some form in the {first_lang} sentence.
        5. Whether the word in {second_lang} occurs in some form in the {second_lang} sentence.

        If both sentence in {first_lang} and {second_lang} are missing it is ok, just check the translation to {second_lang}. 

        Check that the word in {first_lang} appear in the sentence in some form, disregarding the case and form of the word, or the order of the words, 
        if the words in {first_lang} consists of several words. For example, if the word is 'в то же время' and in the sentence it's 'в одно и то же время', it's ok. Or if 'когда опять' in the words and 'когда моя сестра опять' is in the sentence. 
        If there are several synonyms in {first_lang} word field, then if one of them appears in the sentences in some form it's ok. 
        For example, if {first_lang} word is 'выносить, красть' and in the sentence 'украсть' is used, it's ok.

        If the word doesn't appear in some form, suggest using synonyms or related terms in the 
        translation to ensure the sentence remains natural and accurate. You can change either the sentence translation 
        or the word translation. For example, if the word is "досуг," it can be translated as "досуг" or "свободное время". So if the word "свободное время" 
        is in some form used in the sentence and it sounds natural, suggested fix can be to change the word translation to "досуг, свободное время", 
        keeping the sentence intact.

        In the suggestedFix fields, don't provide explanations or instructions, just provide the final corrected string.
        If it's better to fix the word, not the sentence, return null for suggestedFixSentence.

        Provide a detailed evaluation for each point and suggest fixes where necessary.

        Here is the flashcard:

        Word in {first_lang}: {card['wordFirstLang']}
        Sentence in {first_lang}: {card['sentenceFirstLang']}
        Word in {second_lang}: {card['wordSecondLang']}
        Sentence in {second_lang}: {card['sentenceSecondLang']}

        Respond in JSON format with the following structure:

        {{
          "{first_lang}Sentence": {{
            "isCorrect": true/false,
            "explanation": "Detailed explanation if there is an issue or why it's correct",
            "suggestedFix": "Suggested correction if there is an issue"
          }},
          "{second_lang}Sentence": {{
            "isCorrect": true/false,
            "explanation": "Detailed explanation if there is an issue or why it's correct",
            "suggestedFix": "Suggested correction if there is an issue"
          }},
          "translationAccuracy": {{
            "isCorrect": true/false,
            "explanation": "Detailed explanation if there is an issue or why it's correct",
            "suggestedFix": "Suggested correction if there is an issue"
          }},
          "{first_lang}WordUsage": {{
            "isCorrect": true/false,
            "explanation": "Detailed explanation if there is an issue or why it's correct",
            "suggestedFixSentence": "Suggested correction to the sentence if there is an issue",
            "suggestedFixWord": "Suggested correction to the word if there is an issue"
          }},
          "{second_lang}WordUsage": {{
            "isCorrect": true/false,
            "explanation": "Detailed explanation if there is an issue or why it's correct",
            "suggestedFix": "Suggested correction if there is an issue"
          }}
        }}

        If both sentences in {first_lang} and {second_lang} are missing, return only the translationAccuracy block.
        """

        delay = 1
        while True:
            try:
                response = await client.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt}
                    ]
                )
                await results.put(json.loads(response.choices[0].message.content.strip()))
                break
            except Exception:
                await asyncio.sleep(delay)
                delay += 1
                continue
    
    # Create tasks for each card to evaluate them concurrently
    tasks = [evaluate_card(card) for card in cards]
    await tqdm_asyncio.gather(*tasks, desc="Evaluating flashcards")
    
    final_results = []
    while not results.empty():
        final_results.append(await results.get())

    return final_results

# Example usage
# asyncio.run(evaluate_cards_async(cards, first_lang, second_lang, model))


In [41]:
results = await evaluate_cards_async(first_chapter[:100])


Evaluating flashcards:   0%|                                                                   | 0/100 [00:00<?, ?it/s][A
Evaluating flashcards:   1%|▌                                                          | 1/100 [00:23<39:22, 23.86s/it][A
Evaluating flashcards:   3%|█▊                                                         | 3/100 [00:23<10:03,  6.22s/it][A
Evaluating flashcards:   5%|██▉                                                        | 5/100 [00:25<05:34,  3.52s/it][A
Evaluating flashcards:   7%|████▏                                                      | 7/100 [01:28<23:48, 15.36s/it][A
Evaluating flashcards:   9%|█████▎                                                     | 9/100 [01:28<14:32,  9.59s/it][A
Evaluating flashcards:  11%|██████▍                                                   | 11/100 [01:29<09:17,  6.27s/it][A
Evaluating flashcards:  13%|███████▌                                                  | 13/100 [01:31<06:40,  4.60s/it][A
Evaluating flas

Evaluating flashcards:  92%|█████████████████████████████████████████████████████▎    | 92/100 [12:57<01:19,  9.91s/it][A
Evaluating flashcards:  93%|█████████████████████████████████████████████████████▉    | 93/100 [12:58<00:52,  7.55s/it][A
Evaluating flashcards:  94%|██████████████████████████████████████████████████████▌   | 94/100 [13:56<02:09, 21.63s/it][A
Evaluating flashcards:  95%|███████████████████████████████████████████████████████   | 95/100 [13:56<01:17, 15.49s/it][A
Evaluating flashcards:  96%|███████████████████████████████████████████████████████▋  | 96/100 [13:57<00:44, 11.12s/it][A
Evaluating flashcards:  97%|████████████████████████████████████████████████████████▎ | 97/100 [13:58<00:24,  8.16s/it][A
Evaluating flashcards:  98%|████████████████████████████████████████████████████████▊ | 98/100 [13:59<00:12,  6.05s/it][A
Evaluating flashcards: 100%|█████████████████████████████████████████████████████████| 100/100 [14:57<00:00,  8.97s/it][A


In [42]:
len(results)

100

In [43]:
sum([res['translationAccuracy']['isCorrect'] for res in results]) / len([res['translationAccuracy']['isCorrect'] for res in results])

0.99

In [46]:
[res['translationAccuracy'] for res in results if not res['translationAccuracy']['isCorrect']]

[{'isCorrect': False,
  'explanation': "The translation is not accurate because 'среда' means 'Wednesday' in English, not 'keskiviikko'. 'Keskiviikko' is the translation for 'Wednesday' in Finnish.",
  'suggestedFix': 'Reset the translation to Russian: среда – Finnish: keskiviikko or Russian: среда – Finnish: ympäristö if it means ’environment’'}]

In [1]:
def eval_full_card(card, model=GPT_4o):
    src_word, src_sentence, tr_word, tr_sentence, tr_id = card.values()

    system_prompt = '''You are a multilingual assistant who is proficient in Russian and English.'''

    user_prompt = f"""
**Task**: Evaluate the correctness of an English word and sentence based on their translations from Russian. Assess the following:

The quality and naturalness of the English sentence.
The accuracy of the word’s translation and its usage in the sentence.
The overall accuracy of the translations for both the word and sentence from Russian to English.

**Evaluation Points**:

Verify that the English sentence is grammatically correct, natural, and includes the English word appropriately. Provide corrections if necessary.
Confirm that both the word and sentence translations are accurate. Suggest improvements where needed.
Prioritize naturalness, clarity, and correctness in all suggestions.

**Output**: Respond in JSON format as follows:
{{
    "englishSentenceCorrectness": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct.",
    "suggestedFix": "Suggested corrected sentence if there is an issue, or null if not applicable."
    }},
    "wordUsage": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct.",
    "suggestedFixSentence": "Suggested corrected sentence if the word usage is incorrect, or null if not applicable.",
    "suggestedFixWord": "Suggested corrected word if the word usage is incorrect, or null if not applicable."
    }},
    "wordTranslationAccuracy": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct.",
    "suggestedFix": "Suggested correction for translation issues, or null if not applicable."
    }},
    "sentenceTranslationAccuracy": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct.",
    "suggestedFix": "Suggested correction for translation issues, or null if not applicable."
    }}
}}

**Input Example**:

Word in Russian: {src_word}
Word in English: {tr_word}
Sentence in Russian: {src_sentence}
Sentence in English: {tr_sentence}
Note: Provide detailed explanations and actionable corrections wherever applicable. Ensure that all feedback adheres to this format for clarity and usability.
    """

    user_prompt = f"""
**Task**: Evaluate the correctness and naturalness of an English word and sentence based on their translations from Russian. Assess the following:

The quality and naturalness of the English sentence.
The accuracy of the word’s translation and its usage in the sentence.
The overall accuracy of the translations for both the word and sentence from Russian to English.

**Evaluation Points**:

Verify that the English sentence is grammatically correct, natural, and conveys the meaning of the Russian sentence. The sentence does not need to be an exact translation as long as it preserves the original meaning and sounds natural.
Confirm that the English word reflects the meaning of the Russian word. It is acceptable if the word appears in a different form (e.g., conjugated, pluralized) or is replaced by a synonym, as long as it aligns with the intended meaning and fits naturally within the sentence.
Prioritize naturalness, clarity, and correctness when suggesting fixes.

**Output**: Respond in JSON format as follows:
{{
    "englishSentenceCorrectness": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct.",
    "suggestedFix": "Suggested corrected sentence if there is an issue, or null if not applicable."
    }},
    "wordUsage": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct.",
    "suggestedFixSentence": "Suggested corrected sentence if the word usage is incorrect, or null if not applicable.",
    "suggestedFixWord": "Suggested corrected word if the word usage is incorrect, or null if not applicable."
    }},
    "wordTranslationAccuracy": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct.",
    "suggestedFix": "Suggested correction for translation issues, or null if not applicable."
    }},
    "sentenceTranslationAccuracy": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct.",
    "suggestedFix": "Suggested correction for translation issues, or null if not applicable."
    }}
}}

**Input Example**:

Word in Russian: {src_word}
Word in English: {tr_word}
Sentence in Russian: {src_sentence}
Sentence in English: {tr_sentence}

**Notes**:

The word does not have to appear in its given form as long as it fits naturally and conveys the intended meaning.
The sentence does not have to be an exact translation of the Russian sentence; preserving the meaning and sounding natural are the main priorities.
Provide detailed explanations and actionable corrections wherever applicable.
    """

    response = client.chat.completions.create(
      model=model,
      response_format={ "type": "json_object" },
      messages=[
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": user_prompt.strip()}
      ]
    )

    res = json.loads(response.choices[0].message.content.strip())
    res['id'] = tr_id

    return res

NameError: name 'GPT_4o' is not defined

In [2]:
def eval_phrase_card(card, model=GPT_4o):
    src_word, src_sentence, tr_word, tr_sentence, tr_id = card.values()

    system_prompt = '''You are a multilingual assistant who is proficient in Russian and English.'''

    user_prompt = f"""
**Task**: Evaluate the correctness and naturalness of English translations for a given Russian word or phrase. Confirm if:

The English translation accurately conveys the meaning and context of the Russian version.
The English translation sounds natural to a native speaker.

**Guidelines**:

Your evaluation does not need to suggest the best possible translation, only confirm if it is acceptable or identify issues if present.
If corrections are needed, provide only the final corrected English translation.
If no correction is needed, set suggestedFix to null.

**Input Example**:

Word or Phrase in Russian: {src_word}
Word or Phrase in English: {tr_word}

**Output Format**: Respond in JSON using the structure below:
 
{{
  "translationAccuracy": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct",
    "suggestedFix": "Suggested correction if there is an issue or null if no correction is needed"
  }}
}}

**Note**: Ensure all explanations are clear and actionable.
    """

    user_prompt = f"""
**Task**: Evaluate the correctness and naturalness of English translations for a given Russian word or phrase. Confirm the following:

The English translation accurately conveys the meaning and context of the Russian version.
The English translation sounds natural to a native speaker.

**Guidelines**:

The translated word or phrase does not need to be the most common or best possible translation of the Russian version. As long as it retains the original meaning and sounds natural and correct, it is acceptable.
Your evaluation should focus on whether the translation is acceptable or identify any issues if present.
If corrections are needed, provide only the final corrected English translation.
If no correction is needed, set suggestedFix to null.

**Input Example**:

Word or Phrase in Russian: {src_word}
Word or Phrase in English: {tr_word}

**Output Format**: Respond in JSON using the structure below:
 
{{
  "translationAccuracy": {{
    "isCorrect": true/false,
    "explanation": "Detailed explanation if there is an issue or why it's correct",
    "suggestedFix": "Suggested correction if there is an issue or null if no correction is needed"
  }}
}}

**Notes**:

The English translation does not have to be the most commonly used or the best translation, as long as it preserves the meaning of the Russian word or phrase.
Prioritize naturalness, correctness, and clarity when evaluating the translation.
Ensure all explanations are detailed, clear, and actionable.
    """

    response = client.chat.completions.create(
      model=model,
      response_format={ "type": "json_object" },
      messages=[
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": user_prompt.strip()}
      ]
    )

    res = json.loads(response.choices[0].message.content.strip())
    res['id'] = tr_id

    return res

NameError: name 'GPT_4o' is not defined