In [1]:
# Import google.auth's default() and unpack its two return values.
# NOTE(review): presumably this resolves Application Default Credentials and the
# active GCP project for the GCS helpers used later — confirm; neither name is
# referenced again in the visible cells.
from google.auth import default
credentials, project = default()

In [2]:
%load_ext autoreload
%autoreload 2
import os
import sys
from tqdm import tqdm
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.convert import ( clean_filename)  
from src.gcs_storage import read_from_gcs, upload_to_gcs, get_translated_phrases_path
from src.config_loader import config
config.TARGET_LANGUAGE_NAME

'Swedish'

# GCS Phrase Storage

## System Overview

The goal is to store phrases and their sequence within stories, ensuring minimal introduction of new words between phrases. This approach is crucial for maintaining linguistic consistency and learning efficiency.

## Phrase Sequence Management

Key principles:
- Preserve the original order of phrases in stories
- Track the context and progression of language learning
- Minimise cognitive load by reducing new word introductions

## Translation Storage

### Design Rationale

We propose a robust translation dictionary with the following characteristics:
- Keys derived from the English phrase via `clean_filename` (the English text itself is stored under `"english"`)
- Multilingual translations stored as a nested dictionary
- Flexible structure allowing future refinements

### Proposed Dictionary Structure

```python
{
    "clean_filename_key": {
        "english": "hello",
        "french": "bonjour"
    }
}
```

### Advantages
- Rapid lookup capabilities
- Easy to download and update
- Supports iterative translation improvements
- Language-agnostic design
- Scalable across multiple language resources


How to handle audio segments in dialogue:
```json
{
  "introduction": {
    "title": "Introduction",
    "complete_audio": "story_mystery_introduction_complete.mp3",
    "fast_audio": "story_mystery_introduction_complete_fast.mp3",
    "utterances": [
      {
        "speaker": "Alex",
        "text": "Bonjour, comment vas-tu aujourd'hui?",
        "audio_file": "story_mystery_introduction_alex_0.mp3",
        "original_text": "Hello, how are you today?"
      },
      {
        "speaker": "Sam",
        "text": "Je vais bien, merci!",
        "audio_file": "story_mystery_introduction_sam_0.mp3",
        "original_text": "I'm doing well, thanks!"
      }
    ]
  },
  "development": {
    "title": "Development",
    "complete_audio": "story_mystery_development_complete.mp3",
    "utterances": [
      // More dialogue...
    ]
  }
}
```

### Translations

In [3]:
# Read the LM1000 story collection from the private bucket and work out the
# GCS path under which its translated phrases are stored.
from src.gcs_storage import get_story_collection_path

LM1000 = read_from_gcs(
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_path=get_story_collection_path(collection="LM1000"),
)
translated_phrases_path = get_translated_phrases_path(collection="LM1000")


We now want a dictionary whose keys are produced by passing each phrase through `clean_filename`, so that looking up a translation is efficient.

In [5]:
from src.translation import translate_phrases

# Lower-cased target language name; used as the key that holds each translation.
language_name_lower = config.TARGET_LANGUAGE_NAME.lower()

# Lookup table keyed by clean_filename(phrase). If two phrases produce the same
# key, the entry written later replaces the earlier one.
results = {}
for story_name, story_items in tqdm(LM1000.items(), desc="translating stories"):
    # Each story item carries its English text under the 'phrase' field.
    source_phrases = [entry['phrase'] for entry in story_items]
    # translate_phrases is iterated as (phrase, translation) pairs.
    for english_text, translated_text in translate_phrases(source_phrases):
        lookup_key = clean_filename(english_text)
        results[lookup_key] = {
            "english": english_text,
            language_name_lower: translated_text,
        }


translating stories: 100%|██████████| 20/20 [00:38<00:00,  1.94s/it]


In [39]:
# Preview only the first few entries: echoing the whole dictionary floods the
# notebook output and bloats the saved file.
dict(list(results.items())[:5])

In [6]:
# Write the translation lookup to the private bucket. The call stays as the
# last expression so that its return value is shown as the cell output.
upload_to_gcs(
    results,
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_name=translated_phrases_path,
)

'gs://audio-language-trainer-private-content/collections/LM1000/swedish/translations.json'

## How to get a translation from the dictionary

In [None]:
# Reuse the in-memory dictionary built by the translation loop as the lookup.
translation_lookup = results

# Story entries are dicts whose English text lives under 'phrase' (the same
# field the translation loop reads), so extract that string before building
# the lookup key; passing the whole entry to clean_filename would not match
# any key in the dictionary.
example_phrase = LM1000['story_a_fishing_trip'][5]['phrase']

translation_lookup[clean_filename(example_phrase)]