In [1]:
from google.auth import default
credentials, project = default()

In [11]:
%load_ext autoreload
%autoreload 2
import os
import sys
from pathlib import Path
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.utils import (load_json, save_json, save_text_file, load_text_file, clean_filename, read_from_gcs, upload_to_gcs)  # noqa: E402
from src.config_loader import config
PAY_FOR_API = True #change to True to run cells that cost money via API calls

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# GCS Phrase Storage

## System Overview

The goal is to store phrases and their sequence within stories, ensuring minimal introduction of new words between phrases. This approach is crucial for maintaining linguistic consistency and learning efficiency.

## Phrase Sequence Management

Key principles:
- Preserve the original order of phrases in stories
- Track the context and progression of language learning
- Minimise cognitive load by reducing new word introductions

## Translation Storage

### Design Rationale

We propose a robust translation dictionary with the following characteristics:
- English phrases as primary keys
- Multilingual translations stored as nested dictionary
- Flexible structure allowing future refinements

### Proposed Dictionary Structure

```python
{
    "clean_filename_key": {
        "english": "hello",
        "french": "bonjour"
    }
}
```

### Advantages
- Rapid lookup capabilities
- Easy to download and update
- Supports iterative translation improvements
- Language-agnostic design
- Scalable across multiple language resources


How to handle audio segments in dialogue:
```json
{
  "introduction": {
    "title": "Introduction",
    "complete_audio": "story_mystery_introduction_complete.mp3",
    "fast_audio": "story_mystery_introduction_complete_fast.mp3",
    "utterances": [
      {
        "speaker": "Alex",
        "text": "Bonjour, comment vas-tu aujourd'hui?",
        "audio_file": "story_mystery_introduction_alex_0.mp3",
        "original_text": "Hello, how are you today?"
      },
      {
        "speaker": "Sam",
        "text": "Je vais bien, merci!",
        "audio_file": "story_mystery_introduction_sam_0.mp3",
        "original_text": "I'm doing well, thanks!"
      }
    ]
  },
  "development": {
    "title": "Development",
    "complete_audio": "story_mystery_development_complete.mp3",
    "utterances": [
      // More dialogue...
    ]
  }
}```

### Translations

In [5]:
# get the LM1000.json() file

LM1000 = read_from_gcs(bucket_name=config.GCS_PRIVATE_BUCKET,
file_path = "collections/LM1000/LM1000-with-stories.json")


Config file has been modified. Reloading...
setting voice override: es-ES-Neural2-A
setting voice override: es-ES-Chirp-HD-D


In [29]:
config.TARGET_LANGUAGE_NAME

Config file has been modified. Reloading...
setting voice override: fr-FR-Neural2-G
setting voice override: fr-FR-Neural2-G


'Spanish'

We now want a dictionary where the key is the hash of the phrase via clean_filename -> to make an efficient lookup

In [30]:
from src.translation import translate_phrases
language_name_lower = config.TARGET_LANGUAGE_NAME.lower()

results = dict()
for story in LM1000:
    # get all translations from a list
    english_phrases = LM1000[story]
    translated_phrases = translate_phrases(english_phrases)
    for phrase, translation in translated_phrases:
        phrase_key = clean_filename(phrase)
        results[phrase_key] = {"english" : phrase,
                                language_name_lower : translation}


In [31]:
upload_to_gcs(results, bucket_name=config.GCS_PRIVATE_BUCKET, file_name=f"collections/LM1000/translations/{language_name_lower}.json")

'gs://andy7475-audio-language-trainer/collections/LM1000/translations/spanish.json'

In [32]:
results

{'he_might_offer_to_help_with_the_work': {'english': 'He might offer to help with the work',
  'spanish': 'Podría ofrecerse a ayudar con el trabajo.'},
 'i_know_exactly_what_youre_talking_about': {'english': "I know exactly what you're talking about",
  'spanish': 'Sé exactamente de qué estás hablando.'},
 'i_suppose_we_should_leave_now_right': {'english': 'I suppose we should leave now, right?',
  'spanish': 'Supongo que deberíamos irnos ahora, ¿no?'},
 'lets_call_for_help_the_car_broke_down': {'english': "Let's call for help - the car broke down",
  'spanish': 'Pidamos ayuda: el coche se averió.'},
 'please_answer_all_questions_honestly': {'english': 'Please answer all questions honestly',
  'spanish': 'Por favor responda todas las preguntas honestamente.'},
 'ill_definitely_note_your_concerns_about_this': {'english': "I'll definitely note your concerns about this.",
  'spanish': 'Definitivamente tomaré nota de tus preocupaciones sobre esto.'},
 'oh_no_i_dropped_the_glass_and_it_brok

## How to get a translation from the dictionary

In [33]:
translation_lookup = results

example_phrase = LM1000['story_a_fishing_trip'][5]

translation_lookup[clean_filename(example_phrase)]

{'english': 'It might rain later, bring an umbrella',
 'spanish': 'Puede que llueva más tarde, lleva un paraguas.'}