In [None]:
# Obtain Google Cloud credentials for this session. Per the google-auth docs,
# default() returns a (credentials, project_id) pair, unpacked here.
# NOTE(review): neither name is referenced again in the cells visible here;
# presumably the GCS helpers rely on the same ambient credentials — confirm.
from google.auth import default
credentials, project = default()

In [None]:
%load_ext autoreload
%autoreload 2
import os
import sys
from pathlib import Path
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from dotenv import load_dotenv
load_dotenv()

from src.utils import (load_json, get_first_n_items, save_json, save_text_file, load_text_file, clean_filename, read_from_gcs, upload_to_gcs)  # noqa: E402
from src.config_loader import config
from src.translation import review_translations_with_anthropic, process_translations_in_batches
config.TARGET_LANGUAGE_NAME

# Translation Refinement

## Issues with Google Translate

Google Translate output can sound stiff or formal, reading more like written text than natural spoken language.

## Approach

Use Sonnet 3.5 via the Anthropic API, with a tool call that returns the adjusted translations in JSON format, so they can be re-uploaded to our phrase translation store (a JSON file).

In [None]:
# Load the current phrase-translation JSON for the target language from GCS.
# NOTE(review): the same path is written by the upload cell further down, so
# on a first run this file may not exist yet — confirm the intended order.
phrase_translations = read_from_gcs(
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_path=f"collections/LM1000/translations/{config.TARGET_LANGUAGE_NAME.lower()}.json",
)

In [None]:
# Take a small sample of the stored translations to trial the refinement on
# (presumably the first 3 entries, going by the helper's name — confirm).
test_dict = get_first_n_items(phrase_translations, 3)

In [None]:
# Display the sample before it is passed to the refinement step.
test_dict

In [None]:
# Run the sample through the batch refinement helper.
# NOTE(review): the approach notes mention Sonnet 3.5, while this call passes
# a Haiku model id — confirm which model is intended.
improved_translations = process_translations_in_batches(
    test_dict,
    model="haiku-3-5-latest",
)

In [None]:
# Display the refined sample so it can be compared against `test_dict`.
improved_translations

### Translations

In [None]:
# Load the LM1000 collection (the "with stories" JSON) from the private bucket.
LM1000 = read_from_gcs(
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_path="collections/LM1000/LM1000-with-stories.json",
)


In [None]:
# Display the target language that the translations below will be produced for.
config.TARGET_LANGUAGE_NAME

We now want a dictionary whose key is the hash of each phrase (computed via `clean_filename`), so that translations can be looked up efficiently.

In [None]:
from src.translation import translate_phrases

# Lower-cased language name: used as the translation key inside each entry
# (and again in the upload path in a later cell).
language_name_lower = config.TARGET_LANGUAGE_NAME.lower()

# Build the lookup table: clean_filename(phrase) -> {"english": ..., <language>: ...}.
# Each story's phrase list is translated in a single translate_phrases call.
results = {}
for story_phrases in LM1000.values():
    for english_text, translated_text in translate_phrases(story_phrases):
        results[clean_filename(english_text)] = {
            "english": english_text,
            language_name_lower: translated_text,
        }


In [None]:
# Persist the lookup table to the private bucket; the file is named after the target language.
upload_to_gcs(
    results,
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_name=f"collections/LM1000/translations/{language_name_lower}.json",
)

In [None]:
# Inspect the lookup table that was just built and uploaded.
# NOTE(review): this renders every entry; consider previewing only a few items.
results

## How to get a translation from the dictionary

In [None]:
# `results` is keyed by clean_filename(phrase), so it serves directly as the lookup table.
translation_lookup = results

# Pick one phrase from a story, derive its key the same way the table was
# built, and display the matching entry.
example_phrase = LM1000['story_a_fishing_trip'][5]
example_key = clean_filename(example_phrase)
translation_lookup[example_key]