In [1]:
from google.auth import default
credentials, project = default()

In [14]:
%load_ext autoreload
%autoreload 2
import os
import sys
from pathlib import Path
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from dotenv import load_dotenv
load_dotenv()

from src.utils import (load_json, get_first_n_items, save_json, save_text_file, load_text_file, clean_filename, read_from_gcs, upload_to_gcs)  # noqa: E402
from src.config_loader import config
from src.translation import review_translations_with_anthropic, process_translations_in_batches
config.TARGET_LANGUAGE_NAME

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'French'

# Translation Refinement

## Issues with Google Translate

It can sound stiff / formal and be more related to written text rather than verbal.

## Approach

Use Sonnet 3.5 via the Anthropic API, using a tool to return adjusted translations in JSON format for re-uploading to our phrase translation store (json file)

In [16]:
# get current translated JSON file

phrase_translations = read_from_gcs(bucket_name=config.GCS_PRIVATE_BUCKET, file_path=f"collections/LM1000/translations/{config.TARGET_LANGUAGE_NAME.lower()}.json")

In [17]:
test_dict = get_first_n_items(phrase_translations, 3)

In [18]:
test_dict

{'he_might_offer_to_help_with_the_work': {'english': 'He might offer to help with the work',
  'french': "Il pourrait proposer d'aider au travail"},
 'i_know_exactly_what_youre_talking_about': {'english': "I know exactly what you're talking about",
  'french': 'Je sais exactement de quoi tu parles'},
 'i_suppose_we_should_leave_now_right': {'english': 'I suppose we should leave now, right?',
  'french': "Je suppose que nous devrions partir maintenant, n'est-ce pas ?"}}

In [19]:
improved_translations = process_translations_in_batches(test_dict, model = "haiku-3-5-latest")

Processing translation batches: 100%|██████████| 1/1 [00:00<00:00,  5.58it/s]

Processing batch 1/1
API call error: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.'}}
Updated 0 translations out of 0





In [21]:
import os
from dotenv import load_dotenv
load_dotenv()
os.getenv("ANTHROPIC_API_KEY")

'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'

In [13]:
improved_translations

{'he_might_offer_to_help_with_the_work': {'english': 'He might offer to help with the work',
  'french': 'Il pourrait proposer d&#39;aider au travail'},
 'i_know_exactly_what_youre_talking_about': {'english': "I know exactly what you're talking about",
  'french': 'Je sais exactement de quoi tu parles'},
 'i_suppose_we_should_leave_now_right': {'english': 'I suppose we should leave now, right?',
  'french': 'Je suppose que nous devrions partir maintenant, n&#39;est-ce pas ?'}}

### Translations

In [5]:
# get the LM1000.json() file

LM1000 = read_from_gcs(bucket_name=config.GCS_PRIVATE_BUCKET,
file_path = "collections/LM1000/LM1000-with-stories.json")


Config file has been modified. Reloading...
setting voice override: es-ES-Neural2-A
setting voice override: es-ES-Chirp-HD-D


In [36]:
config.TARGET_LANGUAGE_NAME

'German'

We now want a dictionary where the key is the hash of the phrase via clean_filename -> to make an efficient lookup

In [37]:
from src.translation import translate_phrases
language_name_lower = config.TARGET_LANGUAGE_NAME.lower()

results = dict()
for story in LM1000:
    # get all translations from a list
    english_phrases = LM1000[story]
    translated_phrases = translate_phrases(english_phrases)
    for phrase, translation in translated_phrases:
        phrase_key = clean_filename(phrase)
        results[phrase_key] = {"english" : phrase,
                                language_name_lower : translation}


In [38]:
upload_to_gcs(results, bucket_name=config.GCS_PRIVATE_BUCKET, file_name=f"collections/LM1000/translations/{language_name_lower}.json")

'gs://andy7475-audio-language-trainer/collections/LM1000/translations/german.json'

In [39]:
results

{'he_might_offer_to_help_with_the_work': {'english': 'He might offer to help with the work',
  'german': 'Er könnte anbieten, bei der Arbeit zu helfen'},
 'i_know_exactly_what_youre_talking_about': {'english': "I know exactly what you're talking about",
  'german': 'Ich weiß genau, wovon du sprichst'},
 'i_suppose_we_should_leave_now_right': {'english': 'I suppose we should leave now, right?',
  'german': 'Ich schätze, wir sollten jetzt gehen, oder?'},
 'lets_call_for_help_the_car_broke_down': {'english': "Let's call for help - the car broke down",
  'german': 'Rufen wir um Hilfe - das Auto ist kaputt'},
 'please_answer_all_questions_honestly': {'english': 'Please answer all questions honestly',
  'german': 'Bitte beantworten Sie alle Fragen ehrlich'},
 'ill_definitely_note_your_concerns_about_this': {'english': "I'll definitely note your concerns about this.",
  'german': 'Ich werde Ihre diesbezüglichen Bedenken auf jeden Fall zur Kenntnis nehmen.'},
 'oh_no_i_dropped_the_glass_and_it

## How to get a translation from the dictionary

In [33]:
translation_lookup = results

example_phrase = LM1000['story_a_fishing_trip'][5]

translation_lookup[clean_filename(example_phrase)]

{'english': 'It might rain later, bring an umbrella',
 'spanish': 'Puede que llueva más tarde, lleva un paraguas.'}