# Import

In [1]:
%load_ext autoreload

In [39]:
%autoreload 2
import os
from dotenv import load_dotenv
import sys
import os
import networkx as nx
import ipywidgets as widgets
from collections import defaultdict
from tqdm import tqdm

# Make the parent of the current working directory importable so that the
# `src` package resolves from this notebook.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

# Read settings from the .env file into the process environment
load_dotenv()

from src.dialogue_generation import generate_story_plan, generate_dialogue_prompt, generate_dialogue, generate_recap
from src.audio_generation import  text_to_speech, play_audio, generate_audio_from_dialogue, generate_normal_and_fast_audio, generate_translated_phrase_audio, join_audio_segments, export_audio
from src.phrase import correct_grammar, correct_phrases, generate_practice_phrases_from_dialogue
from src.initialise import initialise_usage_data
from src.utils import save_json, convert_defaultdict, save_defaultdict, load_json
from src.translation import translate_dialogue, translate_phrases

STORY_DATA_PATH = "../data/story_data.json"


## Setup Google Cloud credentials and prerequisites
You will need a Google Project with the following APIs enabled:
* Text to Speech
* Translate
* Vertex AI with the following Anthropic models enabled (from the model garden)
    * Sonnet 3.5
    * Haiku
* Add your GOOGLE_PROJECT_ID to the .env file

You should alter src/config.json which contains your target language.


In [3]:
# Resolve Google Cloud credentials for the APIs listed in the setup notes above.
# NOTE(review): `project` is presumably the project ID detected from the
# environment (may be None if it cannot be determined) — confirm.
from google.auth import default
credentials, project = default()

# Audio Language Trainer Workflow

The aim of this project is to create audio material for you to practise a foreign language. It needs to be engaging and be tailored to words you want to practise. 

The overall steps we follow are:

1. Create an outline story plan based on a theme you select (e.g. 'an adventure', 'a holiday in Rome'). An LLM produces a story plan following a typical story arc (exposition, rising action, climax, falling action, resolution). This ensures an engaging plot.
2. Flesh out the story using your practice vocabulary and grammatical concepts. Vocab and concepts are sampled from lists you provide in the 'data' folder (vocab_usage.json and grammar_concepts_usage.json), with sampling being skewed towards words you haven't heard yet. The output here is a dialogue between two people (Sam and Alex).

Recaps are generated between each story part so when the LLM generates the next dialogue it logically continues from the previous one.

3. The dialogue is broken up into shorter practice phrases via a 'language graph' concept, so you get not just the long-form dialogue to listen to and practise with, but also smaller, mixed-up phrases based on the vocab in the story, starting small and building up to more complex phrases.
4. Your vocab list is updated based on the produced dialogue.
5. The smaller phrases and main dialogue are translated into your target language and converted to speech.
6. Research shows that listening to double-speed audio (of words you already know) can help with your listening comprehension in a foreign language (it helps the brain learn to separate distinct words). We therefore create a fast version of the dialogue for listening practice.
7. The audio files are stitched together to create an MP3 file for each part in the story (there are 5 parts to the story). The stages for each audio lesson are:
* dialogue in the target language
* practice phrases of the form 'how do you say "practice phrase" in "target language"?'. A pause follows (where you speak in the foreign language), then the correct translation is played twice, first fast, then slow.
* repeat of the dialogue in the target language so you can satisfy yourself you understand it properly
* 12 repeated playings of the fast version of the dialogue to improve your listening comprehension.

The intent is that you then listen to the next audio lesson in the story.


## Setup your vocab and grammatical concepts
You should populate or edit
* known_vocab_list.json 
* grammar_concepts.json

### Initialise the vocab and grammar counters
This creates vocab_usage.json (setting all values to 0) and grammar_concepts_usage.json (setting all values to 'true' and counts to 0)

You can tweak these to minimise what words and concepts you are exposed to

In [19]:
# overwrite=False leaves existing usage files in place, so accumulated usage
# data is not wiped; pass overwrite=True to reinitialise them.
initialise_usage_data(overwrite=False)

Usage files already exist. Set overwrite=True to reinitialize.


# Begin Lesson Generation

## Create a story plan

In [74]:

#this is where all the text data goes (prompts, dialogue, recaps etc)

In [25]:
# test=True returns pre-canned responses, which avoids LLM costs
story_plan = generate_story_plan(
    story_guide="an outdoor adventure",
    test=True,
)
story_plan

Data saved to ../data/story_plan.json


{'exposition': 'Two friends, Alex and Sam, decide to learn a new language together.',
 'rising_action': 'They face challenges in their studies and personal lives that test their commitment.',
 'climax': 'A language competition is announced, pushing them to their limits.',
 'falling_action': 'They prepare for the competition, supporting each other through difficulties.',
 'resolution': 'They participate in the competition, growing closer as friends and more confident in their language skills.'}

## Create all dialogue

1. Create dialogue LLM prompt based on the story part
2. LLM generates dialogue
3. LLM generates recap
4. move to next story part and repeat


In [61]:
# Set to True to (re)generate the dialogue with paid LLM calls. When False this
# cell does nothing and `story_data_dict` has to be loaded from STORY_DATA_PATH.
PAY_FOR_LLM = False

if PAY_FOR_LLM:
    # story part -> field name -> text; unknown fields default to ""
    story_data_dict = defaultdict(lambda: defaultdict(str))
    # Seed recap for the first story part; every later part is prompted with
    # the recap of the dialogue generated just before it.
    recap = "This is the beginning of the story."
    for story_part, story_part_outline in story_plan.items():
        prompt = generate_dialogue_prompt(
            story_part=story_part,
            story_part_outline=story_part_outline,
            last_recap=recap,
            verb_count=10,
            verb_use_count=5,
            vocab_count=30,
            vocab_use_count=10,
            grammar_concept_count=10,
            grammar_use_count=3,
        )
        dialogue = generate_dialogue(prompt)
        recap = generate_recap(dialogue, test=False)
        story_data_dict[story_part]["dialogue_generation_prompt"] = prompt
        story_data_dict[story_part]["dialogue"] = dialogue
        story_data_dict[story_part]["recap"] = recap


In [64]:
#save_defaultdict(story_data_dict, STORY_DATA_PATH)


Data saved to ../data/story_data.json


### Update the vocab lists based on the dialogue

The grammatical concepts are updated during prompt creation as it is more difficult to extract these from the dialogue

In [65]:
%autoreload 2
from src.dialogue_generation import get_vocab_from_dialogue, update_vocab_usage

# Feed the vocabulary found in each story part's dialogue into the usage counts
for story_part in story_plan:
    dialogue = story_data_dict[story_part]["dialogue"]
    vocab_used = get_vocab_from_dialogue(dialogue)
    update_vocab_usage(vocab_used)

Data saved to ../data/vocab_usage.json
Data saved to ../data/vocab_usage.json
Data saved to ../data/vocab_usage.json
Data saved to ../data/vocab_usage.json
Data saved to ../data/vocab_usage.json


### Build phrases from dialogue

Here we:
1. Break up the dialogue into separate sentences. For this bit we don't care who the speaker is, we just want to create different phrases of different lengths and combinations based on the vocab in the dialogue
2. We use another LLM call to do this, with some one-shot learning

In [57]:
story_data_dict = load_json(STORY_DATA_PATH)

In [40]:
# Derive the practice phrases for every story part from its dialogue and store
# them alongside it under "corrected_phrase_list".
for story_part, part_data in story_data_dict.items():
    dialogue = part_data["dialogue"]
    part_data["corrected_phrase_list"] = generate_practice_phrases_from_dialogue(dialogue)

In [41]:
save_defaultdict(story_data_dict, STORY_DATA_PATH)

Data saved to ../data/story_data.json


### Translate dialogue and phrases

In [42]:
# Set to False to skip the paid translation API calls
PAY_FOR_TRANSLATE_API = True

if PAY_FOR_TRANSLATE_API:
    for story_part in tqdm(story_data_dict):
        part_data = story_data_dict[story_part]

        # Translate the full dialogue first, then the practice phrases
        dialogue = part_data["dialogue"]
        translated_dialogue = translate_dialogue(dialogue)

        corrected_phrase_list = part_data["corrected_phrase_list"]
        translated_phrase_list = translate_phrases(corrected_phrase_list)

        # Keep both translations next to their source text for this story part
        part_data["translated_dialogue"] = translated_dialogue
        part_data["translated_phrase_list"] = translated_phrase_list


100%|██████████| 7/7 [00:14<00:00,  2.06s/it]
100%|██████████| 33/33 [01:06<00:00,  2.03s/it]
100%|██████████| 7/7 [00:14<00:00,  2.02s/it]
100%|██████████| 36/36 [01:11<00:00,  2.00s/it]
100%|██████████| 8/8 [00:15<00:00,  1.94s/it]
100%|██████████| 35/35 [01:08<00:00,  1.97s/it]
100%|██████████| 7/7 [00:13<00:00,  2.00s/it]
100%|██████████| 33/33 [01:05<00:00,  1.98s/it]
100%|██████████| 8/8 [00:17<00:00,  2.15s/it]
100%|██████████| 38/38 [01:15<00:00,  1.99s/it]
100%|██████████| 5/5 [07:04<00:00, 84.81s/it]


In [43]:
save_defaultdict(story_data_dict, STORY_DATA_PATH)

Data saved to ../data/story_data.json


## Generate Audio Lesson

The steps here are
1. The target language dialogue at normal speed
2. Each corrected and translated phrase in the form english - target fast - target slow
3. Each dialogue utterance in the form english - target fast - target slow
4. The 2 x sped up target language dialogue

### Fast and slow translated dialogue

In [44]:
# Set to False to skip the paid text-to-speech calls (no audio is exported)
PAY_FOR_TEXT_TO_SPEECH = True

# One joined audio lesson per story part, in iteration order
audio_lessons = []

if PAY_FOR_TEXT_TO_SPEECH:
    for story_part in tqdm(story_data_dict):
        part_data = story_data_dict[story_part]
        single_audio_lesson_filename = story_part + ".mp3"

        # The lesson opens with the translated dialogue at normal speed
        translated_dialogue_audio_segments = generate_audio_from_dialogue(part_data["translated_dialogue"])
        normal_translated_clip, fast_translated_clips = generate_normal_and_fast_audio(translated_dialogue_audio_segments)
        lesson_segments = [normal_translated_clip]

        # ...continues with one practice clip per translated phrase
        lesson_segments.extend(
            generate_translated_phrase_audio(translated_phrase)
            for translated_phrase in tqdm(part_data["translated_phrase_list"])
        )

        # ...and closes with the fast version of the dialogue
        lesson_segments.append(fast_translated_clips)

        single_audio_lesson = join_audio_segments(lesson_segments)
        audio_lessons.append(single_audio_lesson)
        export_audio(single_audio_lesson, f"../audio/{single_audio_lesson_filename}")

    # All story parts back to back, separated by a 3 second gap
    full_lesson = join_audio_segments(audio_lessons, gap_ms=3000)
    export_audio(full_lesson, filename="../audio/full_lesson.mp3")


100%|██████████| 7/7 [00:14<00:00,  2.05s/it]
100%|██████████| 33/33 [03:53<00:00,  7.07s/it]
100%|██████████| 7/7 [00:14<00:00,  2.05s/it]]
100%|██████████| 36/36 [04:16<00:00,  7.13s/it]
100%|██████████| 8/8 [00:16<00:00,  2.04s/it]]
100%|██████████| 35/35 [03:50<00:00,  6.58s/it]
100%|██████████| 7/7 [00:17<00:00,  2.55s/it]]
100%|██████████| 33/33 [03:40<00:00,  6.68s/it]
100%|██████████| 8/8 [00:18<00:00,  2.31s/it]]
100%|██████████| 38/38 [04:34<00:00,  7.22s/it]
100%|██████████| 5/5 [21:54<00:00, 262.89s/it]


In [66]:
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_LEFT

def _translation_table(rows):
    """Build the two-column English / target-language table used in the PDF.

    Args:
        rows: Table rows; the first row is the header, each later row holds
            the two cells of one translation pair.

    Returns:
        A ReportLab ``Table`` with the grey header / beige body styling applied.
    """
    table = Table(rows, colWidths=[250, 250])
    # Rule order is kept as-is: later rules override earlier ones on overlap.
    table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
        ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
        ('FONTNAME', (0, 0), (-1, -1), 'Helvetica'),
        ('FONTSIZE', (0, 0), (-1, 0), 14),
        ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
        ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
        ('TEXTCOLOR', (0, 1), (-1, -1), colors.black),
        ('FONTSIZE', (0, 1), (-1, -1), 10),
        ('TOPPADDING', (0, 1), (-1, -1), 6),
        ('BOTTOMPADDING', (0, 1), (-1, -1), 6),
        ('GRID', (0, 0), (-1, -1), 1, colors.black),
        ('VALIGN', (0, 0), (-1, -1), 'TOP'),
    ]))
    return table


def _text_cell(text, style):
    """Return *text* as a Paragraph that renders it literally.

    Paragraph interprets its input as XML-like markup, so a raw ``&`` or ``<``
    in generated or translated text could be misread as markup or rejected.
    Existing HTML entities (e.g. ``&#39;``) are decoded first so that they are
    not double-escaped.
    """
    from html import unescape
    from xml.sax.saxutils import escape

    return Paragraph(escape(unescape(str(text))), style)


def create_comprehensive_pdf(story_data_dict, output_filename="comprehensive_story.pdf"):
    """Write a PDF listing the phrases and dialogue of every story part.

    For each story part the PDF contains a heading, a "Translated Phrases"
    table (when the part has a ``"translated_phrase_list"``) and a "Dialogue"
    table (when the part has both ``"dialogue"`` and ``"translated_dialogue"``).

    Args:
        story_data_dict: Mapping of story part name to that part's data.
            ``"translated_phrase_list"`` is iterated as (English, target
            language) pairs; ``"dialogue"`` and ``"translated_dialogue"`` are
            iterated in lockstep and each item is read via its ``"text"`` key.
        output_filename: Path of the PDF file to create.

    Returns:
        None. The document is written to ``output_filename``.
    """
    doc = SimpleDocTemplate(output_filename, pagesize=letter)
    elements = []

    styles = getSampleStyleSheet()
    title_style = styles['Heading1']
    title_style.alignment = 1  # Center alignment
    subtitle_style = styles['Heading2']
    subtitle_style.alignment = 1  # Center alignment

    # Create a custom style for table cells
    cell_style = ParagraphStyle(
        'CellStyle',
        parent=styles['Normal'],
        fontSize=10,
        leading=12,
        alignment=TA_LEFT,
    )

    elements.append(Paragraph("Comprehensive Story Translation", title_style))
    elements.append(Spacer(1, 12))

    for story_part, data in story_data_dict.items():
        # Story part title; keys such as "rising_action" read better with spaces
        heading = story_part.replace("_", " ").capitalize()
        elements.append(Paragraph(heading, subtitle_style))
        elements.append(Spacer(1, 12))

        # Process translated phrases
        if "translated_phrase_list" in data:
            elements.append(Paragraph("Translated Phrases", styles['Heading3']))
            phrase_rows = [["English", "Target Language"]]  # Table header
            phrase_rows.extend(
                [_text_cell(english, cell_style), _text_cell(target, cell_style)]
                for english, target in data["translated_phrase_list"]
            )
            elements.append(_translation_table(phrase_rows))
            elements.append(Spacer(1, 12))

        # Process dialogue
        if "dialogue" in data and "translated_dialogue" in data:
            elements.append(Paragraph("Dialogue", styles['Heading3']))
            dialogue_rows = [["English", "Target Language"]]  # Table header
            # zip stops at the shorter list, so unmatched utterances are omitted
            dialogue_rows.extend(
                [
                    _text_cell(english['text'], cell_style),
                    _text_cell(target['text'], cell_style),
                ]
                for english, target in zip(data["dialogue"], data["translated_dialogue"])
            )
            elements.append(_translation_table(dialogue_rows))
            elements.append(Spacer(1, 24))  # Add extra space between story parts

    doc.build(elements)



In [67]:
create_comprehensive_pdf(story_data_dict)