In [3]:
# Resolve Google Application Default Credentials for this session.
# `default()` returns a (credentials, project) pair; both are kept as notebook globals.
from google.auth import default
credentials, project = default()

In [4]:
%load_ext autoreload
%autoreload 2
import os
import sys
from pathlib import Path
# Make the parent of the working directory importable (the `src` package is imported below),
# adding it to sys.path only if it is not already there.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
from dotenv import load_dotenv
load_dotenv()
from tqdm import tqdm

from src.utils import (load_json, get_first_n_items, list_story_folders, save_json, save_text_file, load_text_file)  # noqa: E402
from src.story import upload_story_image, prepare_dialogue_with_wiktionary
from src.gcs_storage import check_blob_exists, read_from_gcs, upload_to_gcs, get_story_translated_dialogue_path, get_story_dialogue_path
from src.config_loader import config
# Bare expression: makes the notebook display the currently configured target language name.
config.TARGET_LANGUAGE_NAME

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'French'

# Story Data

* based around a story_name which is part of a collection (LM1000 > story_the_birthday_party)
* translated dialogue (text file)
    * comes from google translate
    * enriched with Wiktionary links for each utterance ('wiktionary_links')
* audio file for each utterance of Sam and Alex
* 1 x fast audio file for each story part, introduction etc
* an image for each story part
* challenges for each story
    * from a single challenges.json file for each story
    * the per-language customisation happens when the challenges.html file is created: the language name is added into the prompt

## Storage overview

We want to save the English story dialogue as dialogue.json

Translations are then stored by language_name, followed by each phrase as an audio clip. We might want higher-quality audio for the stories, e.g. a Chirp3 voice.

```raw
collections/LM1000/stories/story_murder_mystery/dialogue.json
collections/LM1000/stories/story_murder_mystery/dialogue/french/translated_dialogue.json
collections/LM1000/stories/story_murder_mystery/images/introduction.png etc
collections/LM1000/stories/story_murder_mystery/audio/french/introduction/part_0_sam.mp3
collections/LM1000/stories/story_murder_mystery/audio/french/introduction/part_1_alex.mp3
collections/LM1000/stories/story_murder_mystery/audio/french/introduction/fast.mp3
```


This allows us to quickly retrieve a phrase based on the bucket name and the phrase key, as well as modify individual phrases for later correction

In [24]:
# Re-run the config loader before reading values from it.
# NOTE(review): `_load_config` is a private method - presumably it re-reads the config
# source so edits made after the kernel started are picked up; confirm.
config._load_config()
# Lower-cased target language name, printed as a sanity check.
language_name = config.TARGET_LANGUAGE_NAME.lower()
print(language_name)
# Last expression: displays the voice models configured for the "stories" enum type.
config.get_voice_models(enum_type="stories")

swedish


(VoiceInfo(name='en-GB-Chirp3-HD-Algenib', provider=<VoiceProvider.GOOGLE: 'google'>, voice_id='en-GB-Chirp3-HD-Algenib', language_code='en-GB'),
 VoiceInfo(name='aSLKtNoVBZlxQEMsnGL2', provider=<VoiceProvider.ELEVENLABS: 'elevenlabs'>, voice_id='aSLKtNoVBZlxQEMsnGL2', language_code='sv-SE'),
 VoiceInfo(name='7UMEOkIJdI4hjmR2SWNq', provider=<VoiceProvider.ELEVENLABS: 'elevenlabs'>, voice_id='7UMEOkIJdI4hjmR2SWNq', language_code='sv-SE'))

In [6]:
from src.gcs_storage import get_stories_from_collection

# Fetch the stories belonging to the LM1000 collection.
# NOTE(review): presumably a list of story names (e.g. 'story_midnight_garden_mystery') - confirm.
all_stories = get_stories_from_collection(collection="LM1000")

In [25]:
# Manual override: restrict the run to a single story.
# This replaces any value previously assigned to `all_stories`; skip this cell to process the full list.
all_stories = ['story_midnight_garden_mystery']

## Loop all stories (text)

Translate each story's dialogue and upload the translated version.

In [26]:
# loop stories and translate
from src.gcs_storage import (
    check_blob_exists,
    get_story_dialogue_path,
    get_story_translated_dialogue_path,
    read_from_gcs,
)
from src.dialogue_generation import translate_and_upload_dialogue

# True  -> translate again and overwrite an existing translation (the behaviour this cell had before)
# False -> skip stories that already have a translated dialogue in the bucket
RETRANSLATE_EXISTING = True

# Iterate over `all_stories` (as every other cell does) rather than a hard-coded list,
# so narrowing or widening the run only needs one edit.
for story_name in all_stories:
    # Paths of the English dialogue and of its translation inside the private bucket.
    story_file_path = get_story_dialogue_path(story_name, collection="LM1000")
    translated_file_path = get_story_translated_dialogue_path(story_name, collection="LM1000")

    if check_blob_exists(config.GCS_PRIVATE_BUCKET, translated_file_path):
        if not RETRANSLATE_EXISTING:
            print(f"{story_name} already translated - skipping")
            continue
        print(f"{story_name} already translated - translating again and overwriting")

    # get the dialogue, translate it and upload the result
    story_dialogue = read_from_gcs(config.GCS_PRIVATE_BUCKET, story_file_path)
    translate_and_upload_dialogue(story_dialogue, story_name, collection="LM1000")

adding translations:   0%|          | 0/3 [00:00<?, ?it/s]

Beginning translation for introduction


adding translations:  33%|███▎      | 1/3 [00:02<00:04,  2.22s/it]

Translated dialogue
Beginning translation for development


adding translations:  67%|██████▋   | 2/3 [00:04<00:02,  2.18s/it]

Translated dialogue
Beginning translation for resolution


adding translations: 100%|██████████| 3/3 [00:06<00:00,  2.19s/it]

Translated dialogue





Translated dialogue uploaded to gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/dialogue/swedish/translated_dialogue.json


## Add wiktionary links to each story utterance

In [27]:
# For every story that already has a translation: read it, pass it through
# prepare_dialogue_with_wiktionary, and write the result back to the same blob.
for story_name in all_stories:
    print(f"processing {story_name}")
    translated_file_path = get_story_translated_dialogue_path(story_name, collection="LM1000")

    if check_blob_exists(config.GCS_PRIVATE_BUCKET, translated_file_path):
        translated_dialogue = read_from_gcs(config.GCS_PRIVATE_BUCKET, translated_file_path)
        translated_dialogue_with_links = prepare_dialogue_with_wiktionary(translated_dialogue)
        # re-upload under the same path, now with the embedded wiktionary_links
        uploaded = upload_to_gcs(
            obj=translated_dialogue_with_links,
            bucket_name=config.GCS_PRIVATE_BUCKET,
            file_name=translated_file_path,
        )
        print(f"uploaded {story_name} : {uploaded}")
    else:
        # nothing to annotate until the translation step has run for this story
        print(f"{story_name} not yet translated")

processing story_midnight_garden_mystery


Getting dialogue links for story_parts: 100%|██████████| 3/3 [01:11<00:00, 23.73s/it]


uploaded story_midnight_garden_mystery : gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/dialogue/swedish/translated_dialogue.json


## Generate audio and upload

### Loop through stories to generate audio

In [None]:
# Display the stories that the audio-generation loop will iterate over.
all_stories

In [28]:
from src.audio_generation import generate_dialogue_audio_and_upload

# Generate and upload audio for each story's translated dialogue.
# overwrite=True is passed through unchanged.
for story_name in all_stories:
    # get the translated dialogue
    translated_file_path = get_story_translated_dialogue_path(story_name, collection="LM1000")

    # Same guard as the Wiktionary step: a story without a translation is reported
    # and skipped instead of failing part-way through the loop.
    if not check_blob_exists(config.GCS_PRIVATE_BUCKET, translated_file_path):
        print(f"{story_name} not yet translated")
        continue

    translated_dialogue = read_from_gcs(config.GCS_PRIVATE_BUCKET, translated_file_path)
    generate_dialogue_audio_and_upload(translated_dialogue, story_name, overwrite=True)

Processing story parts:   0%|          | 0/3 [00:00<?, ?it/s]

Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/introduction/part_0_alex.mp3




Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/introduction/part_1_sam.mp3




Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/introduction/part_2_alex.mp3




Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/introduction/part_3_sam.mp3




Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/introduction/part_4_alex.mp3


Processing story parts:  33%|███▎      | 1/3 [00:54<01:49, 54.88s/it]

Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/introduction/part_5_sam.mp3




Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/development/part_0_alex.mp3




Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/development/part_1_sam.mp3




Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/development/part_2_alex.mp3




Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/development/part_3_sam.mp3




Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/development/part_4_alex.mp3




Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/development/part_5_sam.mp3


Processing story parts:  67%|██████▋   | 2/3 [01:58<01:00, 60.08s/it]

Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/development/part_6_alex.mp3




Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/resolution/part_0_sam.mp3




Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/resolution/part_1_alex.mp3




Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/resolution/part_2_sam.mp3




Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/resolution/part_3_alex.mp3




Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/resolution/part_4_sam.mp3




Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/resolution/part_5_alex.mp3




Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/resolution/part_6_sam.mp3


Processing story parts: 100%|██████████| 3/3 [03:09<00:00, 63.32s/it]

Generated and uploaded: gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/resolution/part_7_alex.mp3





### Fast audio

One fast audio file is generated for each story part.

In [29]:
from src.audio_generation import generate_and_upload_fast_audio

# One fast mp3 is uploaded per story part; the progress bar tracks whole stories.
story_progress = tqdm(all_stories)
for story_name in story_progress:
    generate_and_upload_fast_audio(
        story_name,
        collection="LM1000",
        overwrite=True,
    )

  0%|          | 0/1 [00:00<?, ?it/s]
[A
[A
[A
Collecting utterance audio for introduction: 6it [00:00, 15.23it/s]


Generating fast audio for introduction...




Uploaded fast audio for introduction to gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/introduction/fast.mp3



[A
[A
[A
Collecting utterance audio for development: 7it [00:00, 13.89it/s]


Generating fast audio for development...




Uploaded fast audio for development to gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/development/fast.mp3



[A
[A
[A
[A
Collecting utterance audio for resolution: 8it [00:00, 14.36it/s]


Generating fast audio for resolution...


Processing story_midnight_garden_mystery in swedish: 100%|██████████| 3/3 [00:15<00:00,  5.02s/it]
100%|██████████| 1/1 [00:15<00:00, 15.06s/it]

Uploaded fast audio for resolution to gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/audio/swedish/resolution/fast.mp3





## Upload images
For when we have them stored locally

In [19]:
# Local folder holding one sub-folder of images per story. It is resolved relative to
# the current working directory and does not change between stories, so compute it once.
# NOTE(review): assumes the notebook is run from its own folder, one level below the project root - confirm.
notebook_dir = Path().absolute()
story_dir = notebook_dir.parent / "outputs" / "stories"

for story_name in all_stories:
    # get the dialogue; iterating it yields the story parts, which name the image files
    story_file_path = get_story_dialogue_path(story_name, collection="LM1000")
    story_dialogue = read_from_gcs(config.GCS_PRIVATE_BUCKET, story_file_path)

    for story_part in story_dialogue:
        image_file = story_dir / story_name / f"{story_name}_{story_part}.png"
        # Explicit error instead of a bare assert: it names the missing file and
        # is not stripped when Python runs with -O.
        if not image_file.is_file():
            raise FileNotFoundError(
                f"Missing image for {story_name} / {story_part}: {image_file}"
            )
        upload_story_image(image_file, story_part, story_name)

Image uploaded to gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/images/introduction.png
Image uploaded to gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/images/development.png
Image uploaded to gs://audio-language-trainer-private-content/collections/LM1000/stories/story_midnight_garden_mystery/images/resolution.png


## Upload challenges

In [None]:
from src.chat import get_html_challenge_inputs, create_html_challenges
from src.gcs_storage import upload_to_gcs, get_story_challenges_path, get_story_translated_challenges_path

notebook_dir = Path().absolute()  # current working directory of the notebook
phrase_dir = notebook_dir.parent / "data" / "phrases"  # where we store text files of phrases

for story_name in all_stories:
    story_dir = notebook_dir.parent / "outputs" / "stories" / story_name
    chat_dir = story_dir / config.TARGET_LANGUAGE_NAME

    # Step 1: load the local scenarios file and upload it to the private bucket.
    scenario_dicts = load_json(file_path=f"{story_dir}/scenarios.json")
    challenge_file_path = get_story_challenges_path(story_name, collection="LM1000")
    upload_to_gcs(
        obj=scenario_dicts,
        bucket_name=config.GCS_PRIVATE_BUCKET,
        file_name=challenge_file_path,
    )

    # Step 2: build the challenges web page from the scenarios and upload it to the public bucket.
    challenges = get_html_challenge_inputs(scenario_dicts)
    chat_webpage_file = create_html_challenges(
        challenges,
        output_dir=chat_dir,
        story_name=story_name,
    )
    gcs_chat_file_path = get_story_translated_challenges_path(story_name)
    upload_to_gcs(
        obj=chat_webpage_file,
        bucket_name=config.GCS_PUBLIC_BUCKET,
        file_name=gcs_chat_file_path,
    )