In [1]:
# Load IPython's autoreload extension so the %autoreload magic is available.
%load_ext autoreload

In [2]:
%autoreload 2
from dotenv import load_dotenv
import importlib
import sys
import os
import pickle
from pathlib import Path
load_dotenv()
# Add the parent directory of 'src' to the Python path
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

    
from src.config_loader import config, VoiceManager
from src.phrase import generate_phrases_with_llm, generate_phrases_from_vocab_dict, generate_scenario_phrases, generate_scenario_vocab_building_phrases
from src.utils import (load_json, create_test_story_dict, anthropic_generate, save_text_file,
load_text_file, save_json, load_json, load_pickle)
from src.convert import clean_filename
from src.gcs_storage import upload_to_gcs, get_phrase_path, get_phrase_index_path, read_from_gcs, get_phrase_to_story_index_path, get_story_collection_path, get_story_dialogue_path
from src.anki_tools import AnkiCollectionReader, export_to_anki_with_images, get_deck_contents
from src.dialogue_generation import get_story_prompt, generate_story
from src.config_loader import config, VoiceManager, VoiceInfo, VoiceType, VoiceProvider
from src.generate import add_audio, add_translations
from src.nlp import plot_vocabulary_growth, calculate_new_words
from pprint import pprint
import random
import os


setting voice override: fr-FR-Chirp3-HD-Zephyr
setting voice override: fr-FR-Chirp3-HD-Puck
FFmpeg path added to system PATH: C:\Program Files\ffmpeg-7.0-essentials_build\bin


## Sorting stories

We now need to load the stories from the story collection path and use the phrase.json and index.json files to help sort them.

In [3]:
from src.gcs_storage import get_phrase_index_path, get_story_index_path

# Each path helper is resolved for the LM1000 collection and the object at that
# path is read from the private bucket, in the order the targets are listed.
story_phrases, LM1000_phrases, phrase_index, story_index = (
    read_from_gcs(config.GCS_PRIVATE_BUCKET, path_helper(collection="LM1000"))
    for path_helper in (
        get_story_collection_path,  # -> story_phrases
        get_phrase_path,            # -> LM1000_phrases
        get_phrase_index_path,      # -> phrase_index
        get_story_index_path,       # -> story_index
    )
)



In [None]:
from src.phrase import build_phrase_to_story_index

# Build the phrase -> story index for the LM1000 collection.
phrase_to_stories = build_phrase_to_story_index(collection="LM1000")

# Store the index in the private bucket at its dedicated LM1000 path.
upload_to_gcs(
    obj=phrase_to_stories,
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_name=get_phrase_to_story_index_path(collection="LM1000"),
)

In [None]:
# Collect every LM1000 phrase whose cleaned name has no entry in the
# phrase -> story index.
phrases_without_story = [
    candidate
    for candidate in LM1000_phrases
    if clean_filename(candidate) not in phrase_to_stories
]



In [35]:
# Fetch the dialogue for every story in the LM1000 collection, keyed by story name.
from collections import defaultdict

all_story_dialogue = defaultdict(
    dict,
    {
        name: read_from_gcs(
            config.GCS_PRIVATE_BUCKET,
            get_story_dialogue_path(name, collection="LM1000"),
        )
        for name in story_phrases
    },
)


In [94]:
# Show which story names have had their dialogue loaded.
all_story_dialogue.keys()

dict_keys(['story_the_birthday_party', 'story_camping_trip_gone_awry', 'story_a_fishing_trip', 'story_job_interview_gone_wrong', 'story_midnight_garden_mystery', 'story_rainy_football_match', 'story_sunset_wedding_blues', 'story_team_spirit', 'story_winter_in_sweden', 'story_forgetful_diver', 'story_unexpected_career_change', 'story_brussels_blend', 'story_road_trip', 'story_the_marathon', 'story_better_than_a_movie', 'story_teaching_music', 'story_the_power_cut', 'story_a_missed_stop', 'story_unexpected_wedding_guests', 'story_workplace_stress_vacation'])

In [95]:
from src.gcs_storage import get_story_index_path
from src.nlp import create_story_index

# Rebuild the story index from the loaded dialogue. This rebinds `story_index`,
# replacing the copy read from GCS earlier in the notebook, and it is slow: the
# recorded run took roughly 10 minutes for 20 stories.
story_index = create_story_index(all_story_dialogue)



Indexing stories...: 100%|██████████| 20/20 [10:08<00:00, 30.43s/it]


In [96]:
# Store the story index in the private bucket at the LM1000 story index path.
upload_to_gcs(
    obj=story_index,
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_name=get_story_index_path(collection="LM1000"),
)


'gs://audio-language-trainer-private-content/collections/LM1000/story_index.json'

In [12]:
# Spot-check the index: count the entries stored under 'verbs' for one story.
len(story_index['story_vocab']['story_the_birthday_party']['verbs'])

45

In [9]:
from src.nlp import assign_phrases_to_stories

# Cap each story at 847 // 20 == 42 phrases.
# NOTE(review): 847 and 20 look like the phrase count and the story count —
# confirm, and consider deriving them from the loaded data instead.
assignments = assign_phrases_to_stories(
    phrase_index=phrase_index,
    story_index=story_index,
    max_phrases_per_story=847 // 20,
)


Optimized Story Sequence:
------------------------------------------------------------
Story                          Score      Verbs      Vocab     
------------------------------------------------------------
story_sunset_wedding_blues     150        33         84        
story_better_than_a_movie      151        33         85        
story_unexpected_wedding_guests 152        32         88        
story_workplace_stress_vacation 159        37         85        
story_the_marathon             169        40         89        
story_midnight_garden_mystery  174        35         104       
story_teaching_music           174        35         104       
story_job_interview_gone_wrong 178        39         100       
story_winter_in_sweden         178        37         104       
story_a_fishing_trip           179        36         107       
story_team_spirit              180        44         92        
story_brussels_blend           180        39         102       
story_rainy_footb

In [20]:
# Flatten the per-story assignments into a single list of phrase strings,
# keeping the story order and the order of phrases within each story.
final_phrases = [
    entry['phrase']
    for entries in assignments.values()
    for entry in entries
]

In [23]:
# Pass the flattened, story-ordered phrase list to plot_vocabulary_growth (from src.nlp).
plot_vocabulary_growth(final_phrases)

In [25]:
from src.gcs_storage import get_story_collection_path

# Store the story -> phrase assignments at the LM1000 story collection path.
# NOTE(review): this is the same path `story_phrases` is read from earlier in
# the notebook, so the upload presumably overwrites that object — confirm this
# is intended.
upload_to_gcs(
    obj=assignments,
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_name=get_story_collection_path(collection="LM1000"),
)

'gs://audio-language-trainer-private-content/collections/LM1000/LM1000.json'

In [24]:
# Display the full assignments mapping (story name -> list of scored phrase records).
assignments

{'story_sunset_wedding_blues': [{'phrase': "Don't worry, the alarm will sound if there's danger",
   'score': 39.0,
   'new_story_verbs': 4,
   'new_story_vocab': 4,
   'new_global_verbs': 5,
   'new_global_vocab': 6,
   'total_new_words': 11},
  {'phrase': "Let's go to bed early so we feel better tomorrow",
   'score': 33.0,
   'new_story_verbs': 3,
   'new_story_vocab': 5,
   'new_global_verbs': 3,
   'new_global_vocab': 8,
   'total_new_words': 11},
  {'phrase': "I know exactly what you're talking about",
   'score': 26.0,
   'new_story_verbs': 2,
   'new_story_vocab': 5,
   'new_global_verbs': 2,
   'new_global_vocab': 5,
   'total_new_words': 7},
  {'phrase': 'Still, I think we should give it a try.',
   'score': 23.0,
   'new_story_verbs': 2,
   'new_story_vocab': 3,
   'new_global_verbs': 3,
   'new_global_vocab': 4,
   'total_new_words': 7},
  {'phrase': 'Have you seen the feast they prepared for the wedding?',
   'score': 23.0,
   'new_story_verbs': 2,
   'new_story_vocab': 3,