In [3]:
# Obtain Google credentials for this kernel session. default() is unpacked
# into a (credentials, project) pair bound at notebook level.
# NOTE(review): presumably resolved via Application Default Credentials — confirm
# the environment provides them before running the GCS cells.
from google.auth import default
credentials, project = default()

Config file has been modified. Reloading...


In [8]:
%load_ext autoreload
%autoreload 2
import os
import sys
from tqdm import tqdm
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)


from src.config_loader import config
print(config.TARGET_LANGUAGE_NAME)
english_voice, female_voice, male_voice = config.get_voice_models()
english_voice_story, female_voice_story, male_voice_story = config.get_voice_models("stories")
COLLECTION = "WarmUp150"
print(female_voice.voice_id, female_voice_story.voice_id, male_voice_story.voice_id)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
French
fr-FR-Neural2-G fr-FR-Chirp3-HD-Zephyr fr-FR-Chirp3-HD-Puck


We now want a dictionary keyed by the hash of each phrase (computed with `clean_filename`), so that translations can be looked up efficiently.

In [13]:
from src.audio_generation import (generate_and_upload_fast_audio,
                                  generate_dialogue_audio_and_upload,
                                  upload_phrases_audio_to_gcs)
from src.chat import create_html_challenges, get_html_challenge_inputs
from src.convert import clean_filename
from src.dialogue_generation import translate_and_upload_dialogue
from src.gcs_storage import (check_blob_exists, get_stories_from_collection,
                             get_story_challenges_path,
                             get_story_collection_path,
                             get_story_dialogue_path,
                             get_story_translated_dialogue_path,
                             get_translated_phrases_path,
                             get_wiktionary_cache_path, read_from_gcs,
                             upload_to_gcs)
from src.story import (create_album_files, create_and_upload_html_story,
                       prepare_dialogue_with_wiktionary,
                       prepare_story_data_from_gcs)
from src.translation import (review_story_dialogue_translations,
                             review_translated_phrases_batch,
                             translate_phrases)
from src.wiktionary import add_wiktionary_links

In [None]:


# --- Pipeline inputs ---------------------------------------------------------
# Stories belonging to the collection; iterated by every stage below.
all_stories = get_stories_from_collection(collection=COLLECTION)

# Collection file: indexed by story, each value is a list of items that carry
# a 'phrase' field.
story_collection = read_from_gcs(
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_path=get_story_collection_path(collection=COLLECTION),
)

# GCS path where the translated-phrase dictionary for this collection is kept.
translated_phrases_path = get_translated_phrases_path(collection=COLLECTION)

# Lower-cased target language name, used as the translation key of each entry.
language_name_lower = config.TARGET_LANGUAGE_NAME.lower()

# Machine-translate the phrases of every story and index each result by
# clean_filename(phrase). A phrase that occurs in several stories ends up with
# a single entry (the latest translation wins).
results = {}
for story in all_stories:
    # Pull the English text out of each item in the story's phrase list.
    english_phrases = [entry['phrase'] for entry in story_collection[story]]
    for phrase, translation in translate_phrases(english_phrases):
        results[clean_filename(phrase)] = {
            "english": phrase,
            language_name_lower: translation,
        }
        
# Store the raw translations, read them back from GCS, have the batch reviewed
# by the named model, then overwrite the stored file with the reviewed version.
upload_to_gcs(
    results,
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_name=translated_phrases_path,
)
phrase_translations = read_from_gcs(
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_path=translated_phrases_path,
)
improved_translations = review_translated_phrases_batch(
    phrase_translations,
    model="claude-3-5-sonnet-latest",
)
upload_to_gcs(
    obj=improved_translations,
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_name=translated_phrases_path,
)

# Wiktionary: load the word-link cache, attach links to the reviewed phrases
# (overwrite=False is passed through to add_wiktionary_links), then store both
# the linked phrases and the updated cache.
word_link_cache = read_from_gcs(
    config.GCS_PRIVATE_BUCKET,
    file_path=get_wiktionary_cache_path(),
)
phrase_translations, word_link_cache = add_wiktionary_links(
    improved_translations,
    word_link_cache,
    overwrite=False,
)
upload_to_gcs(
    obj=phrase_translations,
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_name=get_translated_phrases_path(collection=COLLECTION),
)
upload_to_gcs(
    word_link_cache,
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_name=get_wiktionary_cache_path(),
)

# Phrase audio for the linked phrase dictionary.
result = upload_phrases_audio_to_gcs(
    phrase_translations,
    overwrite=False,
)

# Translate each story's dialogue and upload the result.
#
# OVERWRITE_TRANSLATIONS controls what happens when a translated dialogue is
# already present in GCS:
#   True  - translate again and replace it (what this cell has always done,
#           since the skip was commented out);
#   False - leave the existing translation untouched and move on.
# The printed message now states which of the two actually happens.
OVERWRITE_TRANSLATIONS = True

for story_name in all_stories:
    # Source dialogue and the destination of its translation.
    story_file_path = get_story_dialogue_path(story_name, collection=COLLECTION)
    translated_file_path = get_story_translated_dialogue_path(story_name, collection=COLLECTION)
    if check_blob_exists(config.GCS_PRIVATE_BUCKET, translated_file_path):
        if not OVERWRITE_TRANSLATIONS:
            print(f"{story_name} already translated - skipping")
            continue
        print(f"{story_name} already translated - translating again")
    story_dialogue = read_from_gcs(config.GCS_PRIVATE_BUCKET, story_file_path)
    translate_and_upload_dialogue(story_dialogue, story_name, collection=COLLECTION)

# Review pass: fetch each translated dialogue, run it through the reviewer and
# write the reviewed version back to the same GCS path.
for story_name in all_stories:
    translated_file_path = get_story_translated_dialogue_path(story_name, collection=COLLECTION)
    reviewed_dialogue = review_story_dialogue_translations(
        read_from_gcs(config.GCS_PRIVATE_BUCKET, translated_file_path)
    )
    upload_to_gcs(
        obj=reviewed_dialogue,
        bucket_name=config.GCS_PRIVATE_BUCKET,
        file_name=translated_file_path,
    )

# Embed Wiktionary links into every translated dialogue that exists in GCS;
# stories without a translation are reported and left alone.
for story_name in all_stories:
    translated_file_path = get_story_translated_dialogue_path(story_name, collection=COLLECTION)
    if check_blob_exists(config.GCS_PRIVATE_BUCKET, translated_file_path):
        translated_dialogue = read_from_gcs(config.GCS_PRIVATE_BUCKET, translated_file_path)
        translated_dialogue_with_links = prepare_dialogue_with_wiktionary(translated_dialogue)
        # Replace the stored dialogue with the link-enriched version.
        upload_to_gcs(
            obj=translated_dialogue_with_links,
            bucket_name=config.GCS_PRIVATE_BUCKET,
            file_name=translated_file_path,
        )
    else:
        print(f"{story_name} not yet translated")

# Dialogue audio: generate and upload audio for each translated dialogue.
# overwrite=True is passed, so existing audio is presumably regenerated.
for story_name in all_stories:
    translated_dialogue = read_from_gcs(
        config.GCS_PRIVATE_BUCKET,
        get_story_translated_dialogue_path(story_name, collection=COLLECTION),
    )
    generate_dialogue_audio_and_upload(
        translated_dialogue,
        story_name,
        collection=COLLECTION,
        overwrite=True,
    )

# Fast audio: one fast mp3 file is uploaded for each story part.
for story_name in tqdm(all_stories):
    generate_and_upload_fast_audio(
        story_name,
        collection=COLLECTION,
        overwrite=True,
    )

# Challenge pages: read each story's scenario definitions, convert them into
# HTML challenge inputs and build the page.
for story_name in all_stories:
    scenario_dicts = read_from_gcs(
        bucket_name=config.GCS_PRIVATE_BUCKET,
        file_path=get_story_challenges_path(story_name, collection=COLLECTION),
    )
    # create_html_challenges both creates and uploads the page.
    chat_webpage_file = create_html_challenges(
        get_html_challenge_inputs(scenario_dicts),
        story_name=story_name,
        collection=COLLECTION,
    )

# Story pages and albums: assemble each story's data from GCS once, then feed
# it to the HTML-story builder followed by the album-file builder.
for story_name in all_stories:
    print(story_name)
    story_data = prepare_story_data_from_gcs(story_name, collection=COLLECTION)
    for publish in (create_and_upload_html_story, create_album_files):
        publish(story_data, story_name, collection=COLLECTION)


## How to get a translation from the dictionary

In [None]:
# Upload the shared styles, then rebuild the index pages for the listed
# languages and collections.
from src.story import update_all_index_pages_hierarchical, upload_styles_to_gcs

upload_styles_to_gcs()
update_all_index_pages_hierarchical(
    languages=["French", "Spanish", "German", "Swedish"],
    collections=["LM1000", "WarmUp150"],
)