In [1]:
%load_ext autoreload

In [2]:
%autoreload 2
from dotenv import load_dotenv
import importlib
import sys
import os
import pickle
from pathlib import Path
load_dotenv()
# Put the parent of the current working directory on the import path so that
# the `src` package can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

    
from src.config_loader import config, VoiceManager
from src.phrase import generate_phrases_with_llm, generate_phrases_from_vocab_dict, generate_scenario_phrases, generate_scenario_vocab_building_phrases
from src.utils import (load_json, create_test_story_dict, anthropic_generate, save_text_file,
load_text_file, save_json, load_json, load_pickle)
from src.convert import clean_filename
from src.gcs_storage import upload_to_gcs, get_phrase_path, get_phrase_index_path, read_from_gcs, get_phrase_to_story_index_path, get_story_collection_path, get_story_dialogue_path
from src.anki_tools import AnkiCollectionReader, export_to_anki_with_images, get_deck_contents
from src.dialogue_generation import get_story_prompt, generate_story
from src.config_loader import config, VoiceManager, VoiceInfo, VoiceType, VoiceProvider
from src.generate import add_audio, add_translations
from src.nlp import plot_vocabulary_growth, calculate_new_words
from pprint import pprint
import random
import os


setting voice override: fr-FR-Chirp3-HD-Zephyr
setting voice override: fr-FR-Chirp3-HD-Puck
FFmpeg path added to system PATH: C:\Program Files\ffmpeg-7.0-essentials_build\bin


## Sorting stories

We now need to load the stories from the story collection path and use the phrase.json and index.json files to help sort them.

In [3]:
from src.gcs_storage import get_phrase_index_path, get_story_index_path

# Read the LM1000 artefacts from the private bucket, in this order:
# story collection, phrases, phrase index, story index.
story_phrases, LM1000_phrases, phrase_index, story_index = (
    read_from_gcs(config.GCS_PRIVATE_BUCKET, build_path(collection="LM1000"))
    for build_path in (
        get_story_collection_path,
        get_phrase_path,
        get_phrase_index_path,
        get_story_index_path,
    )
)



In [None]:
from src.phrase import build_phrase_to_story_index
# Build the phrase -> story index for the LM1000 collection.
phrase_to_stories = build_phrase_to_story_index(collection="LM1000")

# Store the index in the private bucket at the collection's index path.
upload_to_gcs(
    obj=phrase_to_stories,
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_name=get_phrase_to_story_index_path(collection="LM1000"),
)

In [None]:
# Collect every LM1000 phrase whose cleaned key has no entry in the
# phrase -> story index.
phrases_without_story = [
    candidate
    for candidate in LM1000_phrases
    if clean_filename(candidate) not in phrase_to_stories
]



In [35]:
# Fetch the dialogue of every story in the collection, keyed by story name.
from collections import defaultdict

# NOTE(review): the default factory is never exercised in this cell; a plain
# dict would hold the same data - confirm nothing downstream relies on it.
all_story_dialogue = defaultdict(dict)
all_story_dialogue.update(
    (
        name,
        read_from_gcs(
            config.GCS_PRIVATE_BUCKET,
            get_story_dialogue_path(name, collection="LM1000"),
        ),
    )
    for name in story_phrases
)


In [94]:
# Show the story names for which dialogue was loaded.
all_story_dialogue.keys()

dict_keys(['story_the_birthday_party', 'story_camping_trip_gone_awry', 'story_a_fishing_trip', 'story_job_interview_gone_wrong', 'story_midnight_garden_mystery', 'story_rainy_football_match', 'story_sunset_wedding_blues', 'story_team_spirit', 'story_winter_in_sweden', 'story_forgetful_diver', 'story_unexpected_career_change', 'story_brussels_blend', 'story_road_trip', 'story_the_marathon', 'story_better_than_a_movie', 'story_teaching_music', 'story_the_power_cut', 'story_a_missed_stop', 'story_unexpected_wedding_guests', 'story_workplace_stress_vacation'])

In [95]:
from src.gcs_storage import get_story_index_path
from src.nlp import create_story_index

# Build the story index from the loaded dialogue. This rebinds `story_index`,
# which an earlier cell also loads from GCS. Slow: the recorded run took about
# 10 minutes for 20 stories.
story_index = create_story_index(all_story_dialogue)



Indexing stories...: 100%|██████████| 20/20 [10:08<00:00, 30.43s/it]


In [96]:
# Store the story index in the private bucket; the returned value is shown as
# the cell output.
upload_to_gcs(
    obj=story_index,
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_name=get_story_index_path(collection="LM1000"),
)


'gs://audio-language-trainer-private-content/collections/LM1000/story_index.json'

In [12]:
# Spot check: number of entries under 'verbs' for one story in the index.
len(story_index['story_vocab']['story_the_birthday_party']['verbs'])

45

In [9]:
from src.nlp import assign_phrases_to_stories
# Assign phrases to stories with a cap on how many each story may receive.
# NOTE(review): 847 // 20 == 42. 20 matches the number of stories loaded above;
# 847 is presumably the number of phrases to place - confirm.
assignments = assign_phrases_to_stories(
    phrase_index=phrase_index,
    story_index=story_index,
    max_phrases_per_story=847 // 20,
)


Optimized Story Sequence:
------------------------------------------------------------
Story                          Score      Verbs      Vocab     
------------------------------------------------------------
story_sunset_wedding_blues     150        33         84        
story_better_than_a_movie      151        33         85        
story_unexpected_wedding_guests 152        32         88        
story_workplace_stress_vacation 159        37         85        
story_the_marathon             169        40         89        
story_midnight_garden_mystery  174        35         104       
story_teaching_music           174        35         104       
story_job_interview_gone_wrong 178        39         100       
story_winter_in_sweden         178        37         104       
story_a_fishing_trip           179        36         107       
story_team_spirit              180        44         92        
story_brussels_blend           180        39         102       
story_rainy_footb

In [11]:
# Display the whole story index for inspection (the output is large).
story_index

{'verb_index': {'help': ['story_workplace_stress_vacation',
   'story_job_interview_gone_wrong',
   'story_the_marathon',
   'story_the_power_cut',
   'story_winter_in_sweden',
   'story_the_birthday_party'],
  'can': ['story_brussels_blend',
   'story_teaching_music',
   'story_a_fishing_trip',
   'story_team_spirit',
   'story_midnight_garden_mystery',
   'story_a_missed_stop',
   'story_workplace_stress_vacation',
   'story_winter_in_sweden',
   'story_unexpected_wedding_guests',
   'story_the_marathon',
   'story_rainy_football_match',
   'story_road_trip',
   'story_unexpected_career_change',
   'story_the_power_cut',
   'story_camping_trip_gone_awry',
   'story_sunset_wedding_blues',
   'story_the_birthday_party',
   'story_forgetful_diver'],
  'plan': ['story_teaching_music',
   'story_midnight_garden_mystery',
   'story_winter_in_sweden',
   'story_road_trip',
   'story_unexpected_career_change',
   'story_camping_trip_gone_awry',
   'story_the_birthday_party'],
  'do': ['story

In [16]:
from src.nlp import get_vocab_dictionary_from_phrases
# Vocabulary dictionary of the phrases assigned to one story; each assignment
# entry carries its text under the 'phrase' key.
get_vocab_dictionary_from_phrases(
    [entry['phrase'] for entry in assignments['story_sunset_wedding_blues']]
)

{'verbs': ['call',
  'want',
  'could',
  'getting',
  'should',
  'stay',
  'must',
  'get',
  'understand',
  'figure',
  'say',
  'look',
  'enhance',
  'die',
  'know',
  'bring',
  'graduate',
  'put',
  'shall',
  'offer',
  'might',
  'care',
  'watch',
  'try',
  'speak',
  'make',
  'let',
  'forget',
  'improve',
  'grow',
  'pay',
  'work',
  'give',
  'would',
  'change',
  'have',
  'prepare',
  'hang',
  'defend',
  'stop',
  'support',
  'hope',
  'miss',
  'hear',
  'open',
  'marry',
  'believe',
  'can',
  'feel',
  'bother',
  'raise',
  'do',
  'hold',
  'help',
  'think',
  "'ve",
  'win',
  'take',
  'save',
  'shake',
  'hurry',
  'stand',
  'be',
  'worry',
  'start',
  'drop',
  'catch',
  'ensure',
  'face',
  'be',
  'will',
  'have',
  'finish',
  'notice',
  'dive',
  'see',
  'need',
  'break',
  'refer',
  'answer',
  'do',
  'knock',
  'talk',
  'remember',
  'go',
  'sound',
  'head'],
 'vocab': ['her',
  'we',
  'each',
  'programme',
  'tomorrow',
  '

In [106]:
# Look up the 'cook' entry of the phrase verb index.
# NOTE(review): the integers are presumably positions into
# phrase_index['phrases'] - confirm.
phrase_index['verb_index']['cook']

[225, 421, 358, 398, 463, 52, 468, 246, 533, 376]

In [105]:
# Show the phrase stored at position 590 of the phrase index.
phrase_index['phrases'][590]

"Don't you hate waking up early in the morning?"

In [91]:
# Inspect the verb -> list-of-story-names mapping of the story index.
story_index['verb_index']

{'help': ['story_workplace_stress_vacation',
  'story_job_interview_gone_wrong',
  'story_the_marathon',
  'story_the_power_cut',
  'story_winter_in_sweden',
  'story_the_birthday_party'],
 'can': ['story_brussels_blend',
  'story_teaching_music',
  'story_a_fishing_trip',
  'story_team_spirit',
  'story_midnight_garden_mystery',
  'story_a_missed_stop',
  'story_workplace_stress_vacation',
  'story_winter_in_sweden',
  'story_unexpected_wedding_guests',
  'story_the_marathon',
  'story_rainy_football_match',
  'story_road_trip',
  'story_unexpected_career_change',
  'story_the_power_cut',
  'story_camping_trip_gone_awry',
  'story_sunset_wedding_blues',
  'story_the_birthday_party',
  'story_forgetful_diver'],
 'plan': ['story_teaching_music',
  'story_midnight_garden_mystery',
  'story_winter_in_sweden',
  'story_road_trip',
  'story_unexpected_career_change',
  'story_camping_trip_gone_awry',
  'story_the_birthday_party'],
 'do': ['story_a_fishing_trip',
  'story_team_spirit',
  'st

In [70]:
# Write the best-story-match mapping to a JSON file next to the notebook.
# NOTE(review): `phrase_key_to_best_story_match` is not assigned in any cell
# shown in this notebook; it must already exist in the kernel.
save_json(phrase_key_to_best_story_match, "./phrase_key_to_best_story_match.json")

In [None]:
# Number of entries in the LM1000 phrase collection.
len(LM1000_phrases)

In [None]:
# Print each phrase key whose first listed story differs from the first key of
# its best-match entry.
for phrase_key in phrase_to_stories:
    best_story = list(phrase_key_to_best_story_match[phrase_key])[0]
    current_story = phrase_to_stories[phrase_key][0]
    if best_story == current_story:
        continue
    print(f"{phrase_key} : current {current_story} - best {best_story}")

In [None]:
# Display the full best-story-match mapping for inspection.
phrase_key_to_best_story_match

In [None]:
# For each story, compare the vocabulary of the flashcard phrases assigned to
# it with the vocabulary of the story's dialogue, and keep the result of
# `find_missing_vocabulary` under the story name in `results`.
# NOTE(review): `all_stories` and `df_opt` are not assigned in any cell shown
# in this notebook; they must already exist in the kernel.
from src.nlp import find_missing_vocabulary, get_vocab_dictionary_from_phrases, get_vocab_dict_from_dialogue
notebook_dir = Path().absolute()
story_dir = notebook_dir.parent / "outputs" / "stories"
results = {}
for story_name in all_stories:
    story_path = story_dir / story_name / f"{story_name}.json"
    story_dialogue = load_json(story_path)
    print(f"Doing story: {story_name}")
    # Select rows with a boolean mask instead of an f-string `query`, so a
    # story name containing a quote character cannot break the expression.
    flashcard_phrases = df_opt.loc[df_opt["story"] == story_name, "EnglishText"]
    vocab_flashcards = get_vocab_dictionary_from_phrases(flashcard_phrases)
    vocab_story = get_vocab_dict_from_dialogue(story_dialogue)
    missing_vocab = find_missing_vocabulary(vocab_flashcards, vocab_story)
    results[story_name] = missing_vocab

In [16]:
# Rank stories by combined verb + vocab coverage, highest total first.
coverage_dict = {
    name: report['coverage_stats']['verb_coverage'] + report['coverage_stats']['vocab_coverage']
    for name, report in results.items()
}

# sorted() is stable, so stories with equal totals keep their original order.
story_name_order = sorted(coverage_dict, key=coverage_dict.get, reverse=True)
sorted_dict = {name: coverage_dict[name] for name in story_name_order}

In [17]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import Set, List, Dict, Tuple
from src.nlp import optimize_stories_and_phrases


In [None]:
# Run the story/phrase optimiser on `df_opt`, passing the coverage-ranked
# story order computed above.
# NOTE(review): `df_opt` is not assigned in any cell shown in this notebook.
df_opt2 = optimize_stories_and_phrases(df_opt, story_name_order)

In [31]:
# Write the optimised frame to a CSV in the working directory. to_csv is called
# with its defaults, so the frame's index is written as the first column.
df_opt2.to_csv("STORY AND PHRASE ORDER.csv")

In [None]:
# Per-story non-null counts for each column, accumulated down the story groups.
df_opt2.groupby('story').count().cumsum()

In [None]:
# Vocabulary-growth plot for the phrases in the optimised order.
plot_vocabulary_growth(df_opt2['EnglishText'])

In [None]:
# The same plot for the frame before optimisation, for comparison.
plot_vocabulary_growth(df_opt['EnglishText'])

# Save as JSON file

In [4]:
import pandas as pd

# Reload the story/phrase ordering from the CSV in the working directory.
# NOTE(review): if the file was written with the index included, it comes back
# as an extra unnamed first column; only 'story' and 'EnglishText' are used below.
df = pd.read_csv("STORY AND PHRASE ORDER.csv")

In [None]:
# Build {story: [EnglishText, ...]} for upload.
# sort=False keeps the stories in the order they first appear in the CSV; the
# default (sort=True) would reorder the keys alphabetically.
# NOTE(review): assumes the CSV row order is the intended story sequence - confirm.
# Selecting the column before apply() also avoids handing the grouping column
# to the applied function.
json_dict = df.groupby("story", sort=False)["EnglishText"].apply(list).to_dict()

In [None]:
# Show which bucket the upload below will target.
config.GCS_PRIVATE_BUCKET

In [None]:
# Upload the story -> phrases mapping to the private bucket.
upload_to_gcs(
    obj=json_dict,
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_name="collections/LM1000/LM1000-with-stories.json",
)