In [2]:
%load_ext autoreload

In [3]:
%autoreload 2
from dotenv import load_dotenv
import importlib
import sys
import os
import pickle
from pathlib import Path
load_dotenv()
# Register the notebook's parent directory (the folder holding the `src`
# package) on the import path, unless it is already listed there.
module_path = os.path.abspath("..")
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

    
from src.config_loader import config, VoiceManager
from src.phrase import generate_phrases_with_llm, generate_phrases_from_vocab_dict, generate_scenario_phrases, generate_scenario_vocab_building_phrases
from src.utils import (load_json, construct_gcs_path, upload_to_gcs, upload_story_to_gcs,
create_test_story_dict, anthropic_generate, save_text_file,
load_text_file, save_json, load_json, upload_to_gcs, load_pickle)
from src.anki_tools import convert_anki_to_story_dict, AnkiCollectionReader, export_to_anki_with_images, get_deck_contents
from src.dialogue_generation import get_story_prompt, generate_story
from src.config_loader import config, VoiceManager, VoiceInfo, VoiceType, VoiceProvider
from src.generate import add_audio, add_translations
from src.nlp import plot_vocabulary_growth, calculate_new_words
from pprint import pprint
import random
import os


setting voice override: sv-SE-SofieNeural
setting voice override: sv-SE-MattiasNeural
FFmpeg path added to system PATH: C:\Program Files\ffmpeg-7.0-essentials_build\bin


In [4]:
# Read the deck names through AnkiCollectionReader.
# The lookup sits in try/finally so the reader is always closed, even when
# get_deck_names() raises; otherwise a failed run would leave it open.
reader = AnkiCollectionReader()
reader.connect()
try:
    # Later cells iterate this with .items(), so it is used as a mapping.
    DECK_NAMES = reader.get_deck_names()
finally:
    reader.close()


In [5]:
# Select the decks whose full name begins with the LM1000 prefix.
# `startswith` is used instead of a substring test (`prefix in name`) so a
# deck that merely contains this text somewhere inside its name is not
# picked up by accident.
prefix = "RapidRetention - Swedish::LM1000::"
matching_decks = [name for name in DECK_NAMES.values() if name.startswith(prefix)]


In [6]:
# Display the list of selected deck names.
matching_decks

['RapidRetention - Swedish::LM1000::Train Adventure',
 'RapidRetention - Swedish::LM1000::Fishing Trip',
 'RapidRetention - Swedish::LM1000::Sunset Wedding',
 'RapidRetention - Swedish::LM1000::Marathon',
 'RapidRetention - Swedish::LM1000::Job Interview',
 'RapidRetention - Swedish::LM1000::Coffee Adventure',
 'RapidRetention - Swedish::LM1000::Community Park',
 'RapidRetention - Swedish::LM1000::Power Outage',
 'RapidRetention - Swedish::LM1000::Winter Wilderness',
 'RapidRetention - Swedish::LM1000::Workplace Stress',
 'RapidRetention - Swedish::LM1000::Camping Trip',
 'RapidRetention - Swedish::LM1000::Birthday Party',
 'RapidRetention - Swedish::LM1000::Music Project',
 'RapidRetention - Swedish::LM1000::Movie Time',
 'RapidRetention - Swedish::LM1000::Wedding Guests',
 'RapidRetention - Swedish::LM1000::Midnight Garden',
 'RapidRetention - Swedish::LM1000::Underwater',
 'RapidRetention - Swedish::LM1000::Career Change',
 'RapidRetention - Swedish::LM1000::Unexpected Holiday',
 'R

In [7]:
# One get_deck_contents() result per selected deck, kept in the same order
# as matching_decks.
all_content = [get_deck_contents(deck_name) for deck_name in matching_decks]

In [8]:
import pandas as pd

In [9]:
# Stack the per-deck frames into a single table.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# every deck's own index labels are kept and can repeat across decks, which
# makes label-based lookups ambiguous.
# NOTE(review): assumes the per-deck index carries no meaning (note_id is a
# regular column) - confirm against get_deck_contents.
df_lm1000 = pd.concat(all_content, ignore_index=True)

In [10]:
# Show the column labels of the combined deck table.
df_lm1000.columns

Index(['note_id', 'model_name', 'tags', 'n_cards', 'avg_ease', 'total_reps',
       'avg_reps', 'total_lapses', 'avg_lapses', 'avg_interval', 'TargetText',
       'TargetAudio', 'TargetAudioSlow', 'EnglishText', 'WiktionaryLinks',
       'Picture', 'TargetLanguageName', 'knowledge_score'],
      dtype='object')

In [11]:
# Keep just the tag and English phrase columns; the 'tags' column is
# relabelled 'story' because later cells use its values as story names.
df_opt = df_lm1000.loc[:, ['tags', 'EnglishText']].rename(columns={'tags': 'story'})

In [12]:
# Call plot_vocabulary_growth on the English phrases in their original order
# (the same call is made on the optimised table further down for comparison).
plot_vocabulary_growth(df_opt['EnglishText'])

In [13]:
# Distinct values of the 'story' column, in order of first appearance.
all_stories = df_opt['story'].unique()

In [15]:
# find missing vocab per story
from src.nlp import find_missing_vocabulary, get_vocab_dictionary_from_phrases, get_vocab_dict_from_dialogue

# Story files are read from <parent of cwd>/outputs/stories/<story>/<story>.json
notebook_dir = Path().absolute()
story_dir = notebook_dir.parent / "outputs" / "stories"

# Maps story name -> the value returned by find_missing_vocabulary for it.
results = {}
for story_name in all_stories:
    story_path = story_dir / story_name / f"{story_name}.json"
    story_dialogue = load_json(story_path)
    print(f"Doing story: {story_name}")
    # Select this story's phrases with a boolean mask rather than a
    # string-built .query(): interpolating the name into a query expression
    # breaks as soon as a story name contains a quote or backslash.
    story_phrases = df_opt.loc[df_opt['story'] == story_name, 'EnglishText']
    vocab_flashcards = get_vocab_dictionary_from_phrases(story_phrases)
    vocab_story = get_vocab_dict_from_dialogue(story_dialogue)
    missing_vocab = find_missing_vocabulary(vocab_flashcards, vocab_story)
    results[story_name] = missing_vocab

Doing story: story_unexpected_train_adventure
=== VOCABULARY COVERAGE ANALYSIS ===
Target verbs covered by flashcards: 55.8%
Target vocabulary covered by flashcards: 50.5%

Verbs needing new flashcards:
['get', 'pack', 'explore', 'relax', 'know'] ...

Vocabulary needing new flashcards:
['instead', 'there', 'sometimes', 'unexpected', 'yes'] ...
Doing story: story_fishing_trip_gone_awry
=== VOCABULARY COVERAGE ANALYSIS ===
Target verbs covered by flashcards: 91.7%
Target vocabulary covered by flashcards: 70.1%

Verbs needing new flashcards:
['retrace', 'bite', 'sound'] 

Vocabulary needing new flashcards:
['lifesaver', 'sometimes', 'agreed', 'yes', 'absolutely'] ...
Doing story: story_sunset_wedding_blues
=== VOCABULARY COVERAGE ANALYSIS ===
Target verbs covered by flashcards: 60.6%
Target vocabulary covered by flashcards: 52.4%

Verbs needing new flashcards:
['believe', 'know', 'remember', 'feel', 'speak'] ...

Vocabulary needing new flashcards:
['inspiring', 'there', 'thank', 'everythi

In [16]:
# Rank the stories by combined coverage (verb coverage + vocabulary coverage),
# highest first, so the stories with the least missing vocabulary lead.
coverage_dict = {
    name: analysis['coverage_stats']['verb_coverage'] + analysis['coverage_stats']['vocab_coverage']
    for name, analysis in results.items()
}

# sorted() is stable, so stories with equal totals keep their original order.
ranked_pairs = sorted(coverage_dict.items(), key=lambda pair: pair[1], reverse=True)
sorted_dict = dict(ranked_pairs)
story_name_order = [*sorted_dict]

In [17]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import Set, List, Dict, Tuple
from src.nlp import optimize_stories_and_phrases


In [22]:
# Run optimize_stories_and_phrases (from src.nlp) on the story/phrase table,
# passing the coverage-ranked story order computed above.
df_opt2 = optimize_stories_and_phrases(df_opt, story_name_order)

Processing stories...


Parsing phrases: 100%|██████████| 57/57 [00:00<00:00, 123.38it/s]
Parsing phrases: 100%|██████████| 53/53 [00:00<00:00, 151.86it/s]
Parsing phrases: 100%|██████████| 81/81 [00:00<00:00, 152.60it/s]
Parsing phrases: 100%|██████████| 51/51 [00:00<00:00, 161.91it/s]
Parsing phrases: 100%|██████████| 59/59 [00:00<00:00, 171.02it/s]
Parsing phrases: 100%|██████████| 52/52 [00:00<00:00, 180.56it/s]
Parsing phrases: 100%|██████████| 37/37 [00:00<00:00, 170.50it/s]
Parsing phrases: 100%|██████████| 33/33 [00:00<00:00, 177.43it/s]
Parsing phrases: 100%|██████████| 18/18 [00:00<00:00, 159.26it/s]
Parsing phrases: 100%|██████████| 38/38 [00:00<00:00, 187.17it/s]
Parsing phrases: 100%|██████████| 50/50 [00:00<00:00, 191.57it/s]
Parsing phrases: 100%|██████████| 33/33 [00:00<00:00, 192.98it/s]
Parsing phrases: 100%|██████████| 24/24 [00:00<00:00, 200.00it/s]
Parsing phrases: 100%|██████████| 18/18 [00:00<00:00, 199.94it/s]
Parsing phrases: 100%|██████████| 35/35 [00:00<00:00, 199.96it/s]
Parsing ph

Optimising phrases within stories...


100%|██████████| 21/21 [00:01<00:00, 13.77it/s]


In [31]:
# Write the optimised table to a CSV file; the path is relative, so the file
# lands in the current working directory. The index is written as well
# because index=False is not passed.
df_opt2.to_csv("STORY AND PHRASE ORDER.csv")

In [30]:
# Running total of rows per story.
# sort=False keeps the groups in the order the stories first appear in
# df_opt2; the default (sort=True) orders them alphabetically by story name,
# so the cumulative sum would not follow the optimised story sequence.
# NOTE(review): assumes df_opt2 rows are already in the optimised sequence -
# confirm against optimize_stories_and_phrases.
df_opt2.groupby('story', sort=False).count().cumsum()

Unnamed: 0_level_0,EnglishText,new_words,sequence_position
story,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
story_birthday_party_planning_mishap,57,57,57
story_camping_trip_gone_awry,110,110,110
story_community_park,191,191,191
story_fishing_trip_gone_awry,242,242,242
story_job_interview_gone_wrong,301,301,301
story_midnight_garden_mystery,353,353,353
story_rainy_football_match,390,390,390
story_sunset_wedding_blues,423,423,423
story_surprise_hospital_adventure,441,441,441
story_swedish_adventure_in_winter_wilderness,479,479,479


In [24]:
# Call plot_vocabulary_growth on the phrases of the optimised table (df_opt2).
plot_vocabulary_growth(df_opt2['EnglishText'])

In [25]:
# Same plot for the un-optimised table (df_opt), for side-by-side comparison
# with the df_opt2 plot.
plot_vocabulary_growth(df_opt['EnglishText'])