In [1]:
import tiktoken
import openai
from openai import AzureOpenAI, AsyncAzureOpenAI
from tqdm.notebook import tqdm
import os
from dotenv import load_dotenv, find_dotenv
import requests
import base64

# Model name strings. GPT_4o is the default `model` of improve_only_phrase_cards.
GPT_3_5_TURBO = "gpt-3.5-turbo"
GPT_4_TURBO_PREVIEW = "gpt-4-turbo-preview"
GPT_4 = "gpt-4"
GPT_4o = "gpt-4o"

def get_openai_api_key():
    """Load the .env file located by find_dotenv(), then read OPENAI_API_KEY.

    Returns:
        The value of the OPENAI_API_KEY environment variable, or None when it
        is not set.
    """
    dotenv_path = find_dotenv()
    load_dotenv(dotenv_path)
    return os.getenv("OPENAI_API_KEY")

# get_openai_api_key() also loads the .env file, so any Azure settings stored
# there are visible to the os.getenv calls below.
OPENAI_API_KEY = get_openai_api_key()
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")

# Shared client used by every chat-completion call in this notebook.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-07-01-preview",
    azure_endpoint=azure_endpoint,
)

In [2]:
import json


EVAL_FOLDER = '../../data/russian-english/cards/eval_results/ru_eng_'
def get_eval_results_from_file(file_name):
    """Read one evaluation-results JSON file.

    Args:
        file_name: Suffix appended to EVAL_FOLDER to form the path
            (EVAL_FOLDER is a path prefix, not just a directory).

    Returns:
        Whatever json.load produces for the file's content.
    """
    with open(EVAL_FOLDER + file_name, 'r', encoding='utf-8') as src:
        return json.load(src)

In [3]:
import json


OUTPUT_FOLDER = '../../data/russian-english/cards/test_cards/ru_eng_'
def get_cards_from_file(file_name):
    """Read one cards JSON file.

    Args:
        file_name: Suffix appended to OUTPUT_FOLDER to form the path
            (OUTPUT_FOLDER is a path prefix, not just a directory).

    Returns:
        Whatever json.load produces for the file's content.
    """
    with open(OUTPUT_FOLDER + file_name, 'r', encoding='utf-8') as src:
        return json.load(src)

In [4]:
# One file name per chapter, numbered 1 through 9.
file_names = [
    f'sm1_new_kap{i}.json'
    for i in range(1, 10)
]

In [6]:
# Load the first file's cards and its evaluation results for a trial run.
cards_1, eval_results_1 = (
    get_cards_from_file(file_names[0]),
    get_eval_results_from_file(file_names[0]),
)

In [25]:
def improve_only_phrase_cards(basic_cards, basic_eval_results, system_prompt, first_lang='Russian', second_lang='English', model=GPT_4o):
    """Re-translate and re-evaluate the cards that have no example sentence.

    Cards whose sentence value is non-empty are passed through unchanged together
    with their existing evaluation result. For every other card the word/phrase is
    translated again with `model`, and the new translation is then judged in a
    second chat-completion call that must answer with a JSON object.

    Args:
        basic_cards: Card dicts. Each card must hold exactly five values in the
            order word, sentence, <unused>, <unused>, id.
        basic_eval_results: Evaluation results paired with `basic_cards` by
            position (`zip` ignores surplus items of the longer sequence).
        system_prompt: System message sent with both requests.
        first_lang: Language of the source word/phrase.
        second_lang: Language to translate into.
        model: Model used for the translation request.

    Returns:
        A `(cards, eval_results)` tuple of two lists with equal length whose
        entries correspond by index. When processing a card fails, the error is
        printed and the original card/evaluation pair is kept, so no card is
        dropped and the two lists never drift apart.
    """
    cards = []
    eval_results = []
    for basic_card, basic_eval_result in tqdm(list(zip(basic_cards, basic_eval_results))):
        card_id = None  # known only after unpacking; used in the error message
        try:
            # Relies on the value order of the card dict (see Args).
            word, sentence, _, _, card_id = basic_card.values()
            if sentence != '':
                cards.append(basic_card)
                eval_results.append(basic_eval_result)
                continue

            user_prompt = f'''You will get simple word or phrase on {first_lang}. Your task is to translate this word or phrase into {second_lang}.
            You don't need to translate the phrase literally. You need to translate it in such a way that the meaning of the phrase is preserved and the translation sounds natural.

            === 
            Given word or phrase: '{word}'
            ===


            Return only {second_lang} translated word or phrase and nothing else.
            '''

            response = client.chat.completions.create(
              model=model,
              messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
              ]
            )

            tr_word = response.choices[0].message.content

            card = {}
            card["wordFirstLang"] = word
            card["sentenceFirstLang"] = ""
            card["wordSecondLang"] = tr_word.strip()
            card["sentenceSecondLang"] = ""
            card["id"] = card_id

            # NOTE(review): the prompt words the direction as second_lang -> first_lang
            # although the card was translated first_lang -> second_lang; wording kept.
            user_prompt = f"""
            You are given a word or phrase in {first_lang}, along with its translation in {second_lang}. Your task is to evaluate the correctness of the translation.

            You have to check whether the translation from {second_lang} to {first_lang} is accurate.

            If the translation isn't accurate then suggest fix with explanation why you considered it as a mistake.




            In the suggestedFix fields don't provide explanations or instructions, just provide the final corrected string.
            If it's better to fix word, not sentence, return null for suggestedFixSentence.


            Provide a detailed evaluation for each point and suggest fixes where necessary.

            Here is the words or phrases:
            
            ======
            Word in {first_lang}: {card['wordFirstLang']}
            ======
            
            ======
            Word in {second_lang}: {card['wordSecondLang']}
            ======

            Respond in JSON format with the following structure:

            {{
              "translationAccuracy": {{
                "isCorrect": true/false,
                "explanation": "Detailed explanation if there is an issue or why it's correct",
                "suggestedFix": "Suggested correction if there is an issue"
              }}
            }}
            """

            # NOTE(review): the evaluation request always uses "gpt-4o", regardless of
            # `model` — confirm this is intended.
            response = client.chat.completions.create(
              model="gpt-4o",
              response_format={ "type": "json_object" },
              messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
              ]
            )

            res = json.loads(response.choices[0].message.content.strip())
            res['id'] = card['id']

        except Exception as e:
            # Not every exception has a `.message` attribute, so format the exception
            # itself; reading `e.message` here could raise and abort the whole run.
            print(f"Card id={card_id!r} kept unchanged: {type(e).__name__}: {e}")
            cards.append(basic_card)
            eval_results.append(basic_eval_result)
        else:
            # Append both halves only after both requests succeeded, so the two
            # lists stay aligned. (Skipped for the pass-through `continue` above.)
            cards.append(card)
            eval_results.append(res)
    return cards, eval_results

In [26]:
# Trial run on the first 22 cards of the first file.
system_prompt = 'You are the Russian to English translater for beginners.'
lol, kek = improve_only_phrase_cards(
    basic_cards=cards_1[:22],
    basic_eval_results=eval_results_1[:22],
    system_prompt=system_prompt,
)

  0%|          | 0/22 [00:00<?, ?it/s]

In [27]:
# Spot-check index 21 of both trial-run lists: the card and the evaluation stored at that position.
lol[21], kek[21]

({'wordFirstLang': 'Спокойной ночи! Спи крепко!',
  'sentenceFirstLang': '',
  'wordSecondLang': 'Good night! Sleep tight!',
  'sentenceSecondLang': '',
  'id': 1021},
 {'translationAccuracy': {'isCorrect': True,
   'explanation': "The translation 'Good night! Sleep tight!' accurately captures the meaning of the Russian phrase 'Спокойной ночи! Спи крепко!' Both convey a similar sentiment often used before sleeping, where 'Спокойной ночи!' translates to 'Good night!' and 'Спи крепко!' translates to 'Sleep tight!'",
   'suggestedFix': None},
  'id': 1021})

In [20]:
import json


OUTPUT_FOLDER = '../../data/russian-english/cards/test_cards/ru_eng_'
def write_cards_to_file(file_name, cards):
    """Serialize `cards` as JSON to OUTPUT_FOLDER + file_name, replacing the file.

    The JSON is written with a 2-space indent and non-ASCII characters are kept
    as-is rather than escaped.
    """
    destination = OUTPUT_FOLDER + file_name
    with open(destination, 'w', encoding='utf-8') as out:
        json.dump(cards, out, ensure_ascii=False, indent=2)

In [21]:
import json


EVAL_FOLDER = '../../data/russian-english/cards/eval_results/ru_eng_'
def write_eval_results_to_file(file_name, results):
    """Serialize `results` as JSON to EVAL_FOLDER + file_name, replacing the file.

    The JSON is written with a 2-space indent and non-ASCII characters are kept
    as-is rather than escaped.
    """
    destination = EVAL_FOLDER + file_name
    with open(destination, 'w', encoding='utf-8') as out:
        json.dump(results, out, ensure_ascii=False, indent=2)

In [29]:
# Chapters 1-7: improve the phrase-only cards of each file, then write the cards
# and evaluation results back to the same paths they were read from.
file_names = [f'sm1_new_kap{i}.json' for i in range(1, 8)]
system_prompt = '''You are the Russian to English translater for beginners.'''

for file_name in tqdm(file_names):
    ru_cards = get_cards_from_file(file_name)
    eval_results = get_eval_results_from_file(file_name)

    final_cards, final_eval_results = improve_only_phrase_cards(
        basic_cards=ru_cards,
        basic_eval_results=eval_results,
        system_prompt=system_prompt,
    )

    write_cards_to_file(file_name, final_cards)
    write_eval_results_to_file(file_name, final_eval_results)

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/192 [00:00<?, ?it/s]

  0%|          | 0/238 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/152 [00:00<?, ?it/s]

  0%|          | 0/239 [00:00<?, ?it/s]

  0%|          | 0/184 [00:00<?, ?it/s]

In [30]:
# Chapter 8 only: improve the phrase-only cards of the file, then write the cards
# and evaluation results back to the same paths they were read from.
file_names = [f'sm1_new_kap{i}.json' for i in range(8, 9)]
system_prompt = '''You are the Russian to English translater for beginners.'''

for file_name in tqdm(file_names):
    ru_cards = get_cards_from_file(file_name)
    eval_results = get_eval_results_from_file(file_name)

    final_cards, final_eval_results = improve_only_phrase_cards(
        basic_cards=ru_cards,
        basic_eval_results=eval_results,
        system_prompt=system_prompt,
    )

    write_cards_to_file(file_name, final_cards)
    write_eval_results_to_file(file_name, final_eval_results)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/259 [00:00<?, ?it/s]