In [1]:
from google.auth import default
credentials, project = default()

In [6]:
%load_ext autoreload
%autoreload 2
import os
import sys
from tqdm import tqdm
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.convert import ( clean_filename)  
from src.gcs_storage import read_from_gcs, upload_to_gcs
from src.config_loader import config

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# GCS Phrase Storage

## System Overview

The goal is to store phrases and their sequence within stories, ensuring minimal introduction of new words between phrases. This approach is crucial for maintaining linguistic consistency and learning efficiency.

## Phrase Sequence Management

Key principles:
- Preserve the original order of phrases in stories
- Track the context and progression of language learning
- Minimise cognitive load by reducing new word introductions

## Translation Storage

### Design Rationale

We propose a robust translation dictionary with the following characteristics:
- English phrases as primary keys
- Multilingual translations stored as nested dictionary
- Flexible structure allowing future refinements

### Proposed Dictionary Structure

```python
{
    "clean_filename_key": {
        "english": "hello",
        "french": "bonjour"
    }
}
```

### Advantages
- Rapid lookup capabilities
- Easy to download and update
- Supports iterative translation improvements
- Language-agnostic design
- Scalable across multiple language resources


How to handle audio segments in dialogue:
```json
{
  "introduction": {
    "title": "Introduction",
    "complete_audio": "story_mystery_introduction_complete.mp3",
    "fast_audio": "story_mystery_introduction_complete_fast.mp3",
    "utterances": [
      {
        "speaker": "Alex",
        "text": "Bonjour, comment vas-tu aujourd'hui?",
        "audio_file": "story_mystery_introduction_alex_0.mp3",
        "original_text": "Hello, how are you today?"
      },
      {
        "speaker": "Sam",
        "text": "Je vais bien, merci!",
        "audio_file": "story_mystery_introduction_sam_0.mp3",
        "original_text": "I'm doing well, thanks!"
      }
    ]
  },
  "development": {
    "title": "Development",
    "complete_audio": "story_mystery_development_complete.mp3",
    "utterances": [
      // More dialogue...
    ]
  }
}```

### Translations

In [18]:
# get the LM1000.json() file
from src.gcs_storage import get_story_collection_path
LM1000 = read_from_gcs(bucket_name=config.GCS_PRIVATE_BUCKET,
file_path = get_story_collection_path())


In [19]:
LM1000

b'{"story_sunset_wedding_blues": [{"phrase": "Don\'t worry, the alarm will sound if there\'s danger", "score": 39.0, "new_story_verbs": 4, "new_story_vocab": 4, "new_global_verbs": 5, "new_global_vocab": 6, "total_new_words": 11}, {"phrase": "Let\'s go to bed early so we feel better tomorrow", "score": 33.0, "new_story_verbs": 3, "new_story_vocab": 5, "new_global_verbs": 3, "new_global_vocab": 8, "total_new_words": 11}, {"phrase": "I know exactly what you\'re talking about", "score": 26.0, "new_story_verbs": 2, "new_story_vocab": 5, "new_global_verbs": 2, "new_global_vocab": 5, "total_new_words": 7}, {"phrase": "Still, I think we should give it a try.", "score": 23.0, "new_story_verbs": 2, "new_story_vocab": 3, "new_global_verbs": 3, "new_global_vocab": 4, "total_new_words": 7}, {"phrase": "Have you seen the feast they prepared for the wedding?", "score": 23.0, "new_story_verbs": 2, "new_story_vocab": 3, "new_global_verbs": 3, "new_global_vocab": 4, "total_new_words": 7}, {"phrase": "I

We now want a dictionary where the key is the hash of the phrase via clean_filename -> to make an efficient lookup

In [9]:
from src.translation import translate_phrases
language_name_lower = config.TARGET_LANGUAGE_NAME.lower()

results = dict()
for story in tqdm(LM1000, desc="translating stories"):
    # get all translations from a list
    english_phrases = LM1000[story]
    translated_phrases = translate_phrases(english_phrases)
    for phrase, translation in translated_phrases:
        phrase_key = clean_filename(phrase)
        results[phrase_key] = {"english" : phrase,
                                language_name_lower : translation}


100%|██████████| 57/57 [00:00<00:00, 59147.78it/s]
100%|██████████| 53/53 [00:00<?, ?it/s]
100%|██████████| 81/81 [00:00<00:00, 41624.43it/s]
100%|██████████| 51/51 [00:00<?, ?it/s]
100%|██████████| 59/59 [00:00<00:00, 60697.56it/s]
100%|██████████| 52/52 [00:00<00:00, 26151.54it/s]
100%|██████████| 37/37 [00:00<?, ?it/s]
100%|██████████| 33/33 [00:00<00:00, 32963.09it/s]
100%|██████████| 18/18 [00:00<?, ?it/s]
100%|██████████| 38/38 [00:00<?, ?it/s]
100%|██████████| 50/50 [00:00<?, ?it/s]
100%|██████████| 33/33 [00:00<?, ?it/s]
100%|██████████| 24/24 [00:00<00:00, 23933.26it/s]
100%|██████████| 18/18 [00:00<00:00, 17988.44it/s]
100%|██████████| 35/35 [00:00<00:00, 17528.43it/s]
100%|██████████| 19/19 [00:00<?, ?it/s]
100%|██████████| 49/49 [00:00<00:00, 24572.08it/s]
100%|██████████| 17/17 [00:00<00:00, 16993.13it/s]
100%|██████████| 42/42 [00:00<00:00, 41873.25it/s]
100%|██████████| 26/26 [00:00<00:00, 26008.09it/s]
100%|██████████| 39/39 [00:00<?, ?it/s]


In [10]:
upload_to_gcs(results, bucket_name=config.GCS_PRIVATE_BUCKET, file_name=f"collections/LM1000/translations/{language_name_lower}.json")

'gs://andy7475-audio-language-trainer/collections/LM1000/translations/french.json'

In [39]:
results

## How to get a translation from the dictionary

In [None]:
translation_lookup = results

example_phrase = LM1000['story_a_fishing_trip'][5]

translation_lookup[clean_filename(example_phrase)]