# Import

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2
import os
from dotenv import load_dotenv
import sys
import os
import networkx as nx
import ipywidgets as widgets
from collections import defaultdict
from tqdm import tqdm

# Add the parent of the current working directory (the folder that contains
# 'src') to the Python path so the `from src....` imports below resolve.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
# Load environment variables from .env file (the setup notes below ask for
# GOOGLE_PROJECT_ID to be defined there).
load_dotenv()

from src.dialogue_generation import get_vocab_from_dialogue, update_vocab_usage
from src.dialogue_generation import generate_story_plan, generate_dialogue_prompt, generate_dialogue, generate_recap
from src.audio_generation import  text_to_speech, play_audio, generate_audio_from_dialogue, generate_normal_and_fast_audio, generate_translated_phrase_audio, join_audio_segments, export_audio, async_process_phrases
from src.phrase import generate_practice_phrases_from_dialogue
from src.initialise import initialise_usage_data
from src.utils import save_json, convert_defaultdict, save_defaultdict, load_json, create_pdf_booklet
from src.translation import translate_dialogue, translate_phrases
from src.audio_generation import create_m4a_with_timed_lyrics

# Identifier for this story; it is embedded in the output file names used below.
STORY_NAME = "swedish_high_coast_2"
# JSON file that the later cells load from and save to for this story.
STORY_DATA_PATH = f"../outputs/story_data_{STORY_NAME}.json"


Searching for config.json...
Checking: c:\Users\i5\Documents\Python Scripts\audio-language-trainer\notebooks\config.json
Checking: c:\Users\i5\Documents\Python Scripts\audio-language-trainer\src\config.json
Found config file at: c:\Users\i5\Documents\Python Scripts\audio-language-trainer\src\config.json
Successfully loaded config from: c:\Users\i5\Documents\Python Scripts\audio-language-trainer\src\config.json
Multiple country codes available for en: en-AU, en-GB, en-IN, en-US
Config loader initialized.
Config file location: c:\Users\i5\Documents\Python Scripts\audio-language-trainer\src\config.json
Current working directory: c:\Users\i5\Documents\Python Scripts\audio-language-trainer\notebooks
Searching for config.json...
Checking: c:\Users\i5\Documents\Python Scripts\audio-language-trainer\notebooks\config.json
Checking: c:\Users\i5\Documents\Python Scripts\audio-language-trainer\src\config.json
Found config file at: c:\Users\i5\Documents\Python Scripts\audio-language-trainer\src\con

## Setup Google Cloud credentials and prerequisites
You will need a Google Project with the following APIs enabled:
* Text to Speech
* Translate
* Vertex AI with the following Anthropic models enabled (from the model garden)
    * Sonnet 3.5
    * Haiku
* Add your GOOGLE_PROJECT_ID to the .env file

You should alter src/config.json which contains your target language.


In [3]:
from google.auth import default
# Resolve the Google credentials and project for this environment.
# NOTE(review): neither `credentials` nor `project` is read again in the
# visible cells - presumably this is a check that authentication works.
credentials, project = default()

# Audio Language Trainer Workflow

The aim of this project is to create audio material for you to practise a foreign language. It needs to be engaging and be tailored to words you want to practise. 

The overall steps we follow are:

1. Create an outline story plan based on a theme you select (e.g. 'an adventure', 'a holiday in Rome'). An LLM produces a story plan following a typical story arc (exposition, rising action, climax, falling action, resolution). This ensures an engaging plot.
2. Flesh out the story using your practice vocabulary and grammatical concepts. Vocab and concepts are sampled from lists you provide in the 'data' folder (vocab_usage.json and grammar_concepts_usage.json), with sampling being skewed towards words you haven't heard yet. The output here is a dialogue between two people (Sam and Alex).

Recaps are generated between each story part so when the LLM generates the next dialogue it logically continues from the previous one.

3. The dialogue is broken up into shorter practice phrases via a 'language graph' concept, so we give you not just the long-form dialogue to listen to and practise with, but also smaller, mixed-up phrases based on the vocab in the story, starting small and building to more complex phrases.
4. Your vocab list is updated based on the produced dialogue.
5. The smaller phrases and main dialogue are translated into your target language and converted to speech.
6. Research shows that listening to double-speed audio (on words you already know) can help with your listening comprehension for a foreign language (it helps the brain with the ability to separate distinct words). We therefore create a fast version of the dialogue for listening practice.
7. The audio files are stitched together to create an MP3 file for each part in the story (there are 5 parts to the story). The stages for each audio lesson are:
* dialogue in the target language
* practice phrases of the form 'how do you say "practice phrase" in "target language"?'. A pause (where you speak in the foreign language), then the correct translation is played twice, first fast, then slow.
* repeat of the dialogue in the target language so you can satisfy yourself you understand it properly
* 12 repeated playings of the fast version of the dialogue to improve your listening comprehension.

The intent is that you then listen to the next audio lesson in the story.


## Setup your vocab and grammatical concepts
You should populate or edit
* known_vocab_list.json 
* grammar_concepts.json

### Initialise the vocab and grammar counters
This creates vocab_usage.json (setting all values to 0) and grammar_concepts_usage.json (setting all values to 'true' and counts to 0)

You can tweak these to minimise what words and concepts you are exposed to

In [None]:
# overwrite=False stops you wiping all your usage data if it already exists.
initialise_usage_data(overwrite=False)

# Begin Lesson Generation

## Create a story plan

In [7]:
%autoreload 2
from src.dialogue_generation import get_least_used_words, add_usage_to_words

verbs_for_story = get_least_used_words("verbs", 10)
vocab_for_story = get_least_used_words("vocab", 30)

story_plan = generate_story_plan(story_guide = "hiking the high coast in Sweden", verb_list=verbs_for_story, vocab_list=vocab_for_story, test = False, story_name= STORY_NAME) #the test parameter will provide pre-canned responses avoiding LLM costs

Data saved to ../outputs/story_plan_swedish_high_coast_2.json


In [8]:
# Attach usage information to the selected words; the results are passed to
# generate_dialogue_prompt as verb_usage_str / vocab_usage_str further down.
verbs_for_story_usage = add_usage_to_words(verbs_for_story, "verbs")
vocab_for_story_usage = add_usage_to_words(vocab_for_story, "vocab")

## Create all dialogue

1. Create dialogue LLM prompt based on the story part
2. LLM generates dialogue
3. LLM generates recap
4. move to next story part and repeat


In [None]:
# Set to False to skip this cell's paid LLM calls entirely.
PAY_FOR_LLM = True

if PAY_FOR_LLM:
    # story part -> {"dialogue_generation_prompt", "dialogue", "recap"}
    story_data_dict = defaultdict(lambda: defaultdict(str))
    # Seed recap for the first part; afterwards each part's recap feeds the
    # next prompt so the dialogue continues logically.
    recap = "This is the beginning of the story."
    for story_part, story_part_outline in story_plan.items():
        prompt = generate_dialogue_prompt(story_part=story_part,
                                          story_part_outline=story_part_outline,
                                          last_recap=recap,
                                          verb_usage_str=verbs_for_story_usage,
                                          vocab_usage_str=vocab_for_story_usage,
                                          verb_use_count=5,
                                          vocab_use_count=10,
                                          grammar_concept_count=5,
                                          grammar_use_count=3)
        dialogue = generate_dialogue(prompt)

        # Record the vocab this dialogue used, then refresh the usage strings
        # so the next part's prompt sees the updated counts.
        vocab_used = get_vocab_from_dialogue(dialogue)
        update_vocab_usage(vocab_used)
        verbs_for_story_usage = add_usage_to_words(verbs_for_story, "verbs")
        vocab_for_story_usage = add_usage_to_words(vocab_for_story, "vocab")

        recap = generate_recap(dialogue, test=False)
        story_data_dict[story_part]["dialogue_generation_prompt"] = prompt
        story_data_dict[story_part]["dialogue"] = dialogue
        story_data_dict[story_part]["recap"] = recap

    # Save only when this cell actually generated the data. With the flag off,
    # an unconditional save would raise NameError on a fresh kernel or
    # overwrite the story file with whatever stale dict the kernel holds.
    save_defaultdict(story_data_dict, STORY_DATA_PATH)


Data saved to ../data/vocab_usage.json
Data saved to ../data/vocab_usage.json
Data saved to ../data/vocab_usage.json
Data saved to ../data/vocab_usage.json
Data saved to ../data/vocab_usage.json


### Build phrases from dialogue

Here we:
1. Break up the dialogue into separate sentences. For this bit we don't care who the speaker is, we just want to create different phrases of different lengths and combinations based on the vocab in the dialogue
2. We use another LLM call to do this, with some one-shot learning

In [3]:
# Reload the saved story data so this section can be run from the saved file.
story_data_dict = load_json(STORY_DATA_PATH)

In [8]:
# Derive practice phrases from each part's dialogue (an LLM call per part)
# and store them next to the dialogue.
for story_part in story_data_dict:
    part_data = story_data_dict[story_part]
    part_data["corrected_phrase_list"] = generate_practice_phrases_from_dialogue(
        part_data["dialogue"]
    )

In [9]:
# Persist the practice phrases that were just added to the story data.
save_defaultdict(story_data_dict, STORY_DATA_PATH)

Data saved to ../outputs/story_data_swedish_high_coast.json


### Translate dialogue and phrases

In [10]:
# Set to False to skip the paid translation calls.
PAY_FOR_TRANSLATE_API = True

if PAY_FOR_TRANSLATE_API:
    for story_part in tqdm(story_data_dict):
        part_data = story_data_dict[story_part]

        # Translate the dialogue first, then the practice phrases.
        translated_dialogue = translate_dialogue(part_data["dialogue"])
        translated_phrase_list = translate_phrases(part_data["corrected_phrase_list"])

        part_data["translated_dialogue"] = translated_dialogue
        part_data["translated_phrase_list"] = translated_phrase_list


100%|██████████| 5/5 [00:17<00:00,  3.51s/it]


In [11]:
# Persist the translated dialogue and phrases.
save_defaultdict(story_data_dict, STORY_DATA_PATH)

Data saved to ../outputs/story_data_swedish_high_coast.json


## Generate Audio Lesson

The steps here are
1. The target language dialogue at normal speed
2. Each corrected and translated phrase in the form english - target fast - target slow
3. Each dialogue utterance in the form english - target fast - target slow
4. The 2 x sped up target language dialogue

In [4]:
# attach audio segment data to story_data.json for later incorporation into M4A file
%autoreload 2
from src.audio_generation import async_process_phrases, generate_audio_from_dialogue, async_process_phrases_v2
%autoawait


IPython autoawait is `on`, and set to use `asyncio`


In [26]:
# Reload the saved story data before generating audio.
story_data_dict =load_json(STORY_DATA_PATH)

In [27]:
# Trial of async_process_phrases_v2 on only the first three phrases of one part.
# Top-level `await` relies on IPython autoawait (enabled in an earlier cell).
# NOTE(review): the recorded output of this cell shows
# "AttributeError: 'coroutine' object has no attribute 'done'".
translated_phrases = story_data_dict["resolution"]["translated_phrase_list"][0:3]
tranlsated_phrases_audio = await async_process_phrases_v2(translated_phrases)

<coroutine object AsyncFuture.result at 0x000001DF906B42B0>
Waiting for Jag måste avsluta...


AttributeError: 'coroutine' object has no attribute 'done'

<coroutine object AsyncFuture.result at 0x000001DFFA5FE330>
Waiting for I must finish...
<coroutine object AsyncFuture.result at 0x000001DFFA5FDF20>
Waiting for It's beautiful...
<coroutine object AsyncFuture.result at 0x000001DF906B4AD0>
Waiting for Det är vackert...
<coroutine object AsyncFuture.result at 0x000001DF90287510>
Waiting for Du undervisar...
<coroutine object AsyncFuture.result at 0x000001DF906B41E0>
Waiting for Det är vackert...
<coroutine object AsyncFuture.result at 0x000001DF903CF780>
Waiting for Jag måste avsluta...
<coroutine object AsyncFuture.result at 0x000001DFFA79E740>
Waiting for Du undervisar...
<coroutine object AsyncFuture.result at 0x000001DF903CF780>
Waiting for You're teaching...


In [None]:
# Run the original async phrase processor over every phrase of the
# "resolution" part. Top-level `await` relies on IPython autoawait.
translated_phrases = story_data_dict["resolution"]["translated_phrase_list"]
tranlsated_phrases_audio = await async_process_phrases(translated_phrases)

In [5]:

PAY_FOR_TEXT_TO_SPEECH = True

story_data_dict =load_json(STORY_DATA_PATH)
if PAY_FOR_TEXT_TO_SPEECH:
    for story_part in tqdm(story_data_dict):

        translated_dialogue_audio_segments = generate_audio_from_dialogue(story_data_dict[story_part]["translated_dialogue"])
        story_data_dict[story_part]["translated_dialogue_audio"] = translated_dialogue_audio_segments
        normal_translated_clip, fast_translated_clips = generate_normal_and_fast_audio(translated_dialogue_audio_segments)
        story_data_dict[story_part]["translated_dialogue_audio_fast"] = fast_translated_clips
        print(f"{story_part} dialogue done")
        #now do phrases asynchronoulsy (still unsure if Google API allows this, not getting huge speed up)
        translated_phrases = story_data_dict[story_part]["translated_phrase_list"]
        tranlsated_phrases_audio = await async_process_phrases(translated_phrases)
        story_data_dict[story_part]["translated_phrase_list_audio"] = tranlsated_phrases_audio
        print(f"{story_part} phrases done")


       


  0%|          | 0/5 [00:00<?, ?it/s]

exposition dialogue done


 20%|██        | 1/5 [04:37<18:29, 277.45s/it]

exposition phrases done
rising_action dialogue done


 40%|████      | 2/5 [09:18<13:58, 279.46s/it]

rising_action phrases done
climax dialogue done


 60%|██████    | 3/5 [14:08<09:28, 284.45s/it]

climax phrases done
falling_action dialogue done


 80%|████████  | 4/5 [18:59<04:46, 286.76s/it]

falling_action phrases done
resolution dialogue done


100%|██████████| 5/5 [23:26<00:00, 281.40s/it]

resolution phrases done





TypeError: Object of type AudioSegment is not JSON serializable

In [8]:
import pickle

# The story data now holds audio objects, which JSON cannot serialise (see
# the TypeError recorded above), so it is pickled next to the JSON file.
pickle_path = F'{STORY_DATA_PATH}.pkl'
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(story_data_dict, pickle_file)

### Generate M4A file with synchronised captions

In [6]:
from pydub import AudioSegment
from src.config_loader import config

# Audio clips and captions are built as two parallel lists: entry i of the
# captions list is the text shown while entry i of the audio list plays.
PAUSE_TEXT = "---------"
THINKING_GAP = AudioSegment.silent(duration=config.THINKING_GAP_MS)
GAP_BETWEEN_PHRASES = AudioSegment.silent(duration=500)

# Accumulates every part's clips and captions for the combined lesson file.
full_audio_list = []
full_captions_list = []

for story_part in story_data_dict:
    part_data = story_data_dict[story_part]
    dialogue_list = [utterance["text"] for utterance in part_data["translated_dialogue"]]
    dialogue_audio_list = part_data["translated_dialogue_audio"]

    # Section 1: translated dialogue, introduced by a short gap and a title.
    audio_list = [GAP_BETWEEN_PHRASES, *dialogue_audio_list]
    captions_list = [f"{story_part} - First dialogue", *dialogue_list]

    # Section 2: practice phrases.
    audio_list.append(GAP_BETWEEN_PHRASES)
    captions_list.append(f"{story_part} - Practice phrases")

    for step, phrase in enumerate(part_data["translated_phrase_list"]):
        english_text, target_text = phrase[0], phrase[1]

        # Indices 0/1/2 are bound as English, slow target and normal target,
        # going by the variable names - confirm against async_process_phrases.
        phrase_audio = part_data["translated_phrase_list_audio"][step]
        english_audio = phrase_audio[0]
        target_audio_slow = phrase_audio[1]
        target_audio_normal = phrase_audio[2]

        # Prompt, thinking pause, answer at normal speed, then answer slow.
        phrase_sequence = (
            (english_audio, english_text),
            (THINKING_GAP, PAUSE_TEXT),
            (target_audio_normal, target_text),
            (GAP_BETWEEN_PHRASES, PAUSE_TEXT),
            (target_audio_slow, target_text),
            (GAP_BETWEEN_PHRASES, PAUSE_TEXT),
        )
        for clip, caption in phrase_sequence:
            audio_list.append(clip)
            captions_list.append(caption)

    # Section 3: fast dialogue, appended as one entry with a title caption.
    audio_list.append(part_data["translated_dialogue_audio_fast"])
    captions_list.append(f"{story_part} - Repeated Fast Dialogue")

    # Section 4: the dialogue once more.
    audio_list.append(GAP_BETWEEN_PHRASES)
    captions_list.append(f"{story_part} - Final Dialogue")
    audio_list.extend(dialogue_audio_list)
    captions_list.extend(dialogue_list)

    # One M4A per story part, then fold this part into the combined lesson.
    create_m4a_with_timed_lyrics(audio_list, captions_list, F"{STORY_NAME}_{story_part}.m4a")
    full_audio_list.extend(audio_list)
    full_captions_list.extend(captions_list)



In [7]:
# Add the complete story as a single dialogue at the end of the lesson.
all_dialogue_audio = []
all_dialogue_captions = []

for story_part in story_data_dict:
    part_data = story_data_dict[story_part]
    dialogue_list = [utterance["text"] for utterance in part_data["translated_dialogue"]]
    dialogue_audio_list = part_data["translated_dialogue_audio"]

    # Each part's dialogue is followed by a short silent gap.
    all_dialogue_audio += [*dialogue_audio_list, GAP_BETWEEN_PHRASES]
    all_dialogue_captions += [*dialogue_list, PAUSE_TEXT]

full_audio_list.extend(all_dialogue_audio)
full_captions_list.extend(all_dialogue_captions)

create_m4a_with_timed_lyrics(full_audio_list, full_captions_list, F"final_lesson_{STORY_NAME}.m4a")

## Create PDF Booklet
So you can see the spelling of the phrases and dialogue

In [8]:
# Write the booklet for this story to the outputs folder.
create_pdf_booklet(story_data_dict, output_filename=f"../outputs/story_booklet_{STORY_NAME}.pdf")
