In [1]:
%load_ext autoreload

In [3]:
%autoreload 2
from dotenv import load_dotenv
import importlib
import sys
import os
import pickle
from pathlib import Path
load_dotenv()
# Make the notebook's parent directory importable so that `src.*` packages resolve.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

    
from src.config_loader import config, VoiceManager
from src.phrase import generate_phrases_with_llm, generate_phrases_from_vocab_dict, generate_scenario_phrases, generate_scenario_vocab_building_phrases
from src.utils import (load_json, create_test_story_dict, anthropic_generate, save_text_file,
load_text_file, save_json, load_json, load_pickle)
from src.convert import clean_filename
from src.gcs_storage import upload_to_gcs, get_phrase_path, get_phrase_index_path, read_from_gcs, get_phrase_to_story_index_path, get_story_collection_path, get_story_dialogue_path
from src.anki_tools import AnkiCollectionReader, export_to_anki_with_images, get_deck_contents
from src.dialogue_generation import get_story_prompt, generate_story
from src.config_loader import config, VoiceManager, VoiceInfo, VoiceType, VoiceProvider
from src.generate import add_audio, add_translations
from src.nlp import plot_vocabulary_growth, calculate_new_words
from pprint import pprint
import random
import os


## Sorting stories

We now need to fetch the stories from the story collection path and use the `phrase.json` and `index.json` files to help sort them.

In [4]:
from src.gcs_storage import get_phrase_index_path, get_story_index_path

# Load the four LM1000 artefacts from the private bucket. Each path helper is
# called with the same collection name.

# Story collection (iterated later to obtain story names).
story_phrases = read_from_gcs(
    config.GCS_PRIVATE_BUCKET,
    get_story_collection_path(collection="LM1000"),
)

# The collection's phrases.
LM1000_phrases = read_from_gcs(
    config.GCS_PRIVATE_BUCKET,
    get_phrase_path(collection="LM1000"),
)

# Index over the phrases.
phrase_index = read_from_gcs(
    config.GCS_PRIVATE_BUCKET,
    get_phrase_index_path(collection="LM1000"),
)

# Index over the stories.
story_index = read_from_gcs(
    config.GCS_PRIVATE_BUCKET,
    get_story_index_path(collection="LM1000"),
)



In [None]:
from src.phrase import build_phrase_to_story_index
# Build the phrase -> stories index for the LM1000 collection ...
phrase_to_stories = build_phrase_to_story_index(collection="LM1000")

# ... and persist it to the private bucket at the collection's index path.
upload_to_gcs(
    phrase_to_stories,
    config.GCS_PRIVATE_BUCKET,
    get_phrase_to_story_index_path(collection="LM1000"),
)

In [None]:
# Phrases without a story: every LM1000 phrase whose cleaned key is absent
# from the phrase -> stories index.
phrases_without_story = [
    phrase
    for phrase in LM1000_phrases
    if clean_filename(phrase) not in phrase_to_stories
]



In [35]:
# let's get all story dialogue
from collections import defaultdict

# Download the dialogue for every story in the collection, keyed by story name.
all_story_dialogue = defaultdict(dict)
for story_name in story_phrases:
    story_dialogue = read_from_gcs(
        config.GCS_PRIVATE_BUCKET,
        get_story_dialogue_path(story_name, collection="LM1000"),
    )
    all_story_dialogue.update({story_name: story_dialogue})


In [94]:
# Sanity check: list the story names for which dialogue was loaded.
all_story_dialogue.keys()

dict_keys(['story_the_birthday_party', 'story_camping_trip_gone_awry', 'story_a_fishing_trip', 'story_job_interview_gone_wrong', 'story_midnight_garden_mystery', 'story_rainy_football_match', 'story_sunset_wedding_blues', 'story_team_spirit', 'story_winter_in_sweden', 'story_forgetful_diver', 'story_unexpected_career_change', 'story_brussels_blend', 'story_road_trip', 'story_the_marathon', 'story_better_than_a_movie', 'story_teaching_music', 'story_the_power_cut', 'story_a_missed_stop', 'story_unexpected_wedding_guests', 'story_workplace_stress_vacation'])

In [95]:
from src.gcs_storage import get_story_index_path
from src.nlp import create_story_index

# Rebuild the story index from the downloaded dialogue. This rebinds `story_index`,
# replacing any value loaded earlier under the same name.
# NOTE: slow -- the recorded run shows roughly 10 minutes for 20 stories.
story_index = create_story_index(all_story_dialogue)



Indexing stories...: 100%|██████████| 20/20 [10:08<00:00, 30.43s/it]


In [96]:
# Persist the freshly built story index; the call's return value is displayed.
upload_to_gcs(
    story_index,
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_name=get_story_index_path(collection="LM1000"),
)


'gs://audio-language-trainer-private-content/collections/LM1000/story_index.json'

In [108]:
# Spot check: number of entries under 'verbs' for one story in the story index.
len(story_index['story_vocab']['story_the_birthday_party']['verbs'])

45

In [23]:
from src.nlp import assign_phrases_to_stories
# Assign phrases to stories using the two indexes. The call prints assignment
# statistics (see the recorded output).
assignments = assign_phrases_to_stories(
    phrase_index=phrase_index,
    story_index=story_index,
    max_phrases_per_story=50,  # cap on phrases per story
    top_n=5  # NOTE(review): presumably the number of candidate stories per phrase -- confirm in src.nlp
)


Assignment Statistics:
Total phrases: 841
Remaining unassigned phrases: 350

Phrases per story:
story_the_birthday_party: 27 phrases
  Verb coverage: 120.0%
  Vocab coverage: 92.6%
story_camping_trip_gone_awry: 28 phrases
  Verb coverage: 135.1%
  Vocab coverage: 92.9%
story_a_fishing_trip: 23 phrases
  Verb coverage: 116.7%
  Vocab coverage: 90.7%
story_job_interview_gone_wrong: 22 phrases
  Verb coverage: 97.4%
  Vocab coverage: 93.0%
story_midnight_garden_mystery: 26 phrases
  Verb coverage: 142.9%
  Vocab coverage: 90.4%
story_rainy_football_match: 21 phrases
  Verb coverage: 90.7%
  Vocab coverage: 90.5%
story_sunset_wedding_blues: 22 phrases
  Verb coverage: 133.3%
  Vocab coverage: 91.7%
story_team_spirit: 20 phrases
  Verb coverage: 90.9%
  Vocab coverage: 91.3%
story_winter_in_sweden: 26 phrases
  Verb coverage: 129.7%
  Vocab coverage: 92.3%
story_forgetful_diver: 28 phrases
  Verb coverage: 113.6%
  Vocab coverage: 91.6%
story_unexpected_career_change: 25 phrases
  Verb cov

In [101]:
# Average phrases per story if they were spread evenly.
# NOTE(review): 841 and 20 are presumably the total phrase count and the number
# of stories reported elsewhere in this notebook -- confirm.
841/20

42.05

In [24]:
from src.nlp import analyze_phrase_story_vocabulary_overlap
# Compare the vocabulary of each story with that of its assigned phrases.
# The call prints a coverage report (see the recorded output).
analysis = analyze_phrase_story_vocabulary_overlap(
    assignments=assignments,
    story_index=story_index,
    phrase_index=phrase_index,
)


=== VOCABULARY COVERAGE ANALYSIS ===
Target verbs covered by flashcards: 82.2%
Target vocabulary covered by flashcards: 54.6%

Verbs needing new flashcards:
['panic', 'enjoy', 'shop', 'order', 'tackle'] ...

Vocabulary needing new flashcards:
['8', 'try', 'cleanup', 'homemade', 'sure'] ...

Vocabulary Analysis for story_the_birthday_party:
Assigned phrases: 27
Verb coverage: 82.2%
Vocabulary coverage: 54.6%
Missing verbs: 8
Missing vocabulary: 49
=== VOCABULARY COVERAGE ANALYSIS ===
Target verbs covered by flashcards: 83.8%
Target vocabulary covered by flashcards: 52.2%

Verbs needing new flashcards:
['grab', 'soak', 'hurry', 'heat', 'pack'] ...

Vocabulary needing new flashcards:
['bend', 'almost', 'cave', 'stove', 'supply'] ...

Vocabulary Analysis for story_camping_trip_gone_awry:
Assigned phrases: 28
Verb coverage: 83.8%
Vocabulary coverage: 52.2%
Missing verbs: 6
Missing vocabulary: 54
=== VOCABULARY COVERAGE ANALYSIS ===
Target verbs covered by flashcards: 80.6%
Target vocabulary

In [106]:
# Entries recorded for the verb 'cook' in the phrase index (a list of integers
# in the recorded output; presumably positions into phrase_index['phrases'] -- confirm).
phrase_index['verb_index']['cook']

[225, 421, 358, 398, 463, 52, 468, 246, 533, 376]

In [105]:
# Look up a single phrase by its integer position in the phrase index.
phrase_index['phrases'][590]

"Don't you hate waking up early in the morning?"

In [91]:
# Inspect the story-side verb index (verb -> list of story names in the recorded output).
# NOTE(review): this displays the entire index; consider showing only a small sample.
story_index['verb_index']

{'help': ['story_workplace_stress_vacation',
  'story_job_interview_gone_wrong',
  'story_the_marathon',
  'story_the_power_cut',
  'story_winter_in_sweden',
  'story_the_birthday_party'],
 'can': ['story_brussels_blend',
  'story_teaching_music',
  'story_a_fishing_trip',
  'story_team_spirit',
  'story_midnight_garden_mystery',
  'story_a_missed_stop',
  'story_workplace_stress_vacation',
  'story_winter_in_sweden',
  'story_unexpected_wedding_guests',
  'story_the_marathon',
  'story_rainy_football_match',
  'story_road_trip',
  'story_unexpected_career_change',
  'story_the_power_cut',
  'story_camping_trip_gone_awry',
  'story_sunset_wedding_blues',
  'story_the_birthday_party',
  'story_forgetful_diver'],
 'plan': ['story_teaching_music',
  'story_midnight_garden_mystery',
  'story_winter_in_sweden',
  'story_road_trip',
  'story_unexpected_career_change',
  'story_camping_trip_gone_awry',
  'story_the_birthday_party'],
 'do': ['story_a_fishing_trip',
  'story_team_spirit',
  'st

In [70]:
# Save the phrase-key -> best-story-match mapping next to the notebook.
# NOTE(review): `phrase_key_to_best_story_match` is not defined in any visible cell;
# this cell relies on state created elsewhere and will fail on a fresh kernel.
save_json(phrase_key_to_best_story_match, "./phrase_key_to_best_story_match.json")

In [None]:
# Number of phrases in the LM1000 collection.
len(LM1000_phrases)

In [None]:
# Report every phrase whose first currently assigned story differs from the first
# key of its best-match mapping.
# NOTE(review): assumes that mapping is ordered best-first -- confirm where it is built.
for phrase_key, assigned_stories in phrase_to_stories.items():
    best_story = list(phrase_key_to_best_story_match[phrase_key])[0]
    current_story = assigned_stories[0]
    if current_story == best_story:
        continue
    print(f"{phrase_key} : current {current_story} - best {best_story}")

In [None]:
# Display the phrase-key -> best-story-match mapping.
# NOTE(review): not defined in any visible cell; depends on state created elsewhere.
phrase_key_to_best_story_match

In [None]:
# place
from src.nlp import find_missing_vocabulary, get_vocab_dictionary_from_phrases, get_vocab_dict_from_dialogue
# For each story, compare the vocabulary of its flashcard phrases with the
# vocabulary of its dialogue and record what is missing.
# NOTE(review): `all_stories` and `df_opt` are not defined in any visible cell;
# this cell relies on state created elsewhere.
notebook_dir = Path().absolute()
story_dir = notebook_dir.parent / "outputs" / "stories"
results = {}
for story_name in all_stories:
    story_path = story_dir / story_name / f"{story_name}.json"
    story_dialogue = load_json(story_path)
    print(f"Doing story: {story_name}")
    # Bind the story name with `@` instead of interpolating it into the query
    # string: an f-string breaks (or mis-parses) when the name contains a quote.
    flashcard_phrases = df_opt.query("story == @story_name")['EnglishText']
    vocab_flashcards = get_vocab_dictionary_from_phrases(flashcard_phrases)
    vocab_story = get_vocab_dict_from_dialogue(story_dialogue)
    missing_vocab = find_missing_vocabulary(vocab_flashcards, vocab_story)
    results[story_name] = missing_vocab

In [16]:
# Rank stories by combined verb + vocabulary coverage, highest first, so the
# stories with the fewest missing words come out on top.
coverage_dict = {
    story: (
        story_result['coverage_stats']['verb_coverage']
        + story_result['coverage_stats']['vocab_coverage']
    )
    for story, story_result in results.items()
}

ranked_pairs = sorted(coverage_dict.items(), key=lambda pair: pair[1], reverse=True)
sorted_dict = dict(ranked_pairs)
story_name_order = list(sorted_dict)

In [17]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import Set, List, Dict, Tuple
from src.nlp import optimize_stories_and_phrases


In [None]:
# Run the story/phrase optimisation using the coverage-ranked story order.
# NOTE(review): `df_opt` is not defined in any visible cell; depends on state created elsewhere.
df_opt2 = optimize_stories_and_phrases(df_opt, story_name_order)

In [31]:
# Persist the optimised story/phrase order next to the notebook.
# NOTE(review): `to_csv` writes the index by default, so reading this file back
# yields an extra unnamed column. Consider `index=False` if no consumer needs the index.
df_opt2.to_csv("STORY AND PHRASE ORDER.csv")

In [None]:
# Running total of rows per story. `sort=False` keeps the stories in the order
# they first appear in df_opt2; the default alphabetical sort would make the
# cumulative sum follow story names rather than the optimised sequence.
df_opt2.groupby('story', sort=False).count().cumsum()

In [None]:
# Vocabulary growth over the phrases of the optimised frame (df_opt2).
plot_vocabulary_growth(df_opt2['EnglishText'])

In [None]:
# Same plot for the frame that was fed into the optimisation (df_opt), for comparison.
plot_vocabulary_growth(df_opt['EnglishText'])

# Save as JSON file

In [4]:
import pandas as pd

# Reload the saved story/phrase order from the CSV next to the notebook.
df = pd.read_csv("STORY AND PHRASE ORDER.csv")

In [None]:
# Build {story: [phrases...]} from the ordered CSV.
# - `sort=False` keeps stories in the order they first appear in the file; the
#   default sorts group keys alphabetically and would discard the story order.
# - Selecting the column before `apply` avoids running `apply` over the whole
#   frame, grouping column included, which newer pandas versions deprecate.
json_dict = df.groupby("story", sort=False)["EnglishText"].apply(list).to_dict()

In [None]:
# Display the name of the private bucket the next cell uploads to.
config.GCS_PRIVATE_BUCKET

In [None]:
# Publish the story -> phrases mapping to the private bucket.
upload_to_gcs(
    obj=json_dict,
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_name="collections/LM1000/LM1000-with-stories.json",
)