# Story Generation
We remember things better as stories. The plan here is to pick a subset of our phrases, extract the vocabulary, and generate a story based on it. We can then pull in more flashcards / phrases to ensure more complete phrase coverage.

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all loaded modules
# before each cell runs, so edits to imported modules are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [5]:
import os
import pickle
import random
import sys
from pathlib import Path
from pprint import pprint

from dotenv import load_dotenv

# Put the parent of the working directory on sys.path BEFORE importing anything
# from `src`. Running this after the `from src...` lines has no effect on a
# fresh kernel, because those imports would already have failed.
# NOTE(review): assumes the kernel's working directory sits one level below the
# directory that contains `src` - confirm.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.config_loader import config
from src.nlp import (
    create_flashcard_index,
    get_vocab_dict_from_dialogue,
    get_vocab_dictionary_from_phrases,
)
from src.utils import load_json, load_text_file, save_json

# Load variables from a .env file into the process environment.
load_dotenv()

In [None]:


filepath = "../data/longman_1000_phrases.txt"
phrases = load_text_file(filepath)

# Print the label on its own and pretty-print the list itself: handing pprint
# an f-string only shows the quoted repr of one long string.
print("First few phrases:")
pprint(phrases[:10])

# We already have flashcards generated for some phrases.
# A flashcard index lets us select the flashcards that cover a specific
# vocabulary range. Building it is quite computationally expensive; it is
# generated using create_flashcard_index.



## Create the flashcard index
The index makes it very fast to find the flashcards that match a given vocab list.

In [None]:
# Building the index is slow, so reuse the JSON copy on disk whenever one exists.
notebook_dir = Path.cwd()  # current working directory of the kernel
# Despite its name, data_dir is the path of the index *file*, not a directory.
data_dir = notebook_dir.parent.joinpath("data", "longman_1000_phrase_index.json")

if not data_dir.exists():
    # First run: build the index from the loaded phrases and cache it.
    phrase_index = create_flashcard_index(phrases)
    save_json(phrase_index, data_dir)
else:
    phrase_index = load_json(data_dir)


## Sample some phrases to generate the story from
This will pin the story to the vocab found in some pre-existing phrases

In [None]:
# The story vocabulary comes from the first 50 phrases (a fixed slice, not a
# random sample).
story_source_phrases = phrases[:50]
vocab_dict_flashcards = get_vocab_dictionary_from_phrases(story_source_phrases)

Now generate the story

In [None]:
from src.dialogue_generation import generate_story

story_path = notebook_dir.parent / "data" / "stories" / "test_story" / "story_community_park.json"

# Generating a story is expensive, so reuse the saved copy when there is one.
if story_path.exists():
    story_50_phrases = load_json(story_path)
else:
    # Create the nested story folder up front so a missing directory surfaces
    # before the generation call rather than after it, in case save_json does
    # not create parent directories itself.
    story_path.parent.mkdir(parents=True, exist_ok=True)
    story_50_phrases = generate_story(vocab_dict_flashcards)
    save_json(story_50_phrases, story_path)


We find that the LLM goes a bit beyond the vocab found in the flashcards

In [None]:
from src.nlp import get_vocab_dict_from_dialogue

# Extract the vocabulary actually used in the generated story.
# NOTE(review): limit_story_parts=None presumably means "use every story part" - confirm.
vocab_dict_story = get_vocab_dict_from_dialogue(
    story_50_phrases,
    limit_story_parts=None,
)

In [None]:
from src.nlp import find_missing_vocabulary

# Compare the flashcard vocabulary with the vocabulary found in the story.
vocab_overlap = find_missing_vocabulary(
    vocab_dict_flashcards,
    vocab_dict_story,
)

In [None]:
from src.nlp import get_matching_flashcards_indexed

# Use the prebuilt phrase index to pull every existing phrase we need to cover
# the vocab in our story.
results = get_matching_flashcards_indexed(
    vocab_dict_story,
    phrase_index,
)

In [None]:
# Collect the phrase text of every selected card.
proposed_flashcard_phrases = [
    selected.get('phrase')
    for selected in results['selected_cards']
]

# Re-derive the vocabulary of those phrases and compare it with the story again.
vocab_from_new_flashcards = get_vocab_dictionary_from_phrases(
    proposed_flashcard_phrases
)
new_overlap = find_missing_vocabulary(
    vocab_from_new_flashcards,
    vocab_dict_story,
)

In [None]:
# Whatever is still reported as missing is the gap we can fill with new
# flashcards.
missing_vocab_dict = new_overlap["missing_vocab"]
missing_vocab_dict

In [None]:
from src.phrase import generate_phrases_from_vocab_dict

# Generate phrases from the vocabulary that is still missing.
missing_phrases = generate_phrases_from_vocab_dict(
    missing_vocab_dict
)
missing_phrases

In [None]:
# Existing cards selected from the index plus the newly generated phrases.
num_cards = len(results["selected_cards"])
total_cards_needed = num_cards + len(missing_phrases)
print(f"We need {total_cards_needed} flashcards to cover the story")

In [None]:
from src.utils import save_text_file

# Save the existing phrases followed by the newly generated ones as the
# phrase list for this story.
story_phrase_list = proposed_flashcard_phrases + missing_phrases
save_text_file(
    story_phrase_list,
    "../data/stories/test_story/test_phrases.txt",
)

We will need to generate images for the missing phrases; then we can create an Anki deck for that particular story.

In [None]:
from src.images import add_images_to_phrases

# Manual switch for the image-generation call below.
# NOTE(review): the name suggests the call is billable - confirm.
PAY_FOR_API = True

output_dir = notebook_dir.parent.joinpath("data", "longman_phrase_images", "longman1000")

# Switch the call off when the expected image folder is not present.
if not output_dir.exists():
    print("wrong directory")
    PAY_FOR_API = False

if PAY_FOR_API:
    image_files_and_prompts = add_images_to_phrases(
        phrases=missing_phrases,
        output_dir=output_dir,
    )



## Linking stories to flash cards
We will use Anki's tag feature. Given a list of the English phrases required to understand a story, we can tag the notes for each of those phrases within a specific Anki deck.

In [6]:
# Load the phrase list saved for the story. This rebinds `phrases`, which
# earlier in the notebook held the Longman phrase list.
story_phrases_file = "../data/stories/test_story/test_phrases.txt"
phrases = load_text_file(story_phrases_file)

In [8]:
from src.anki_tools import AnkiCollectionReader

# Show the available decks so the target deck name can be copied exactly.
with AnkiCollectionReader() as reader:
    deck_names = reader.get_deck_names()
    pprint(deck_names)

{1: 'Default',
 1731524665442: 'Swedish EAL',
 1731700590019: 'Custom study session',
 1732020971325: 'RapidRetention - Swedish - LM1000',
 1732309563077: 'RapidRetention - Dutch - LM1000',
 1732312948269: 'RapidRetention - German - LM1000',
 1732313960891: 'RapidRetention - Arabic - LM1000',
 1732314196963: 'RapidRetention - Spanish - LM1000',
 1732314413500: 'RapidRetention - Japanese - LM1000',
 1732316149591: 'RapidRetention - Russian - LM1000',
 1732316158895: 'RapidRetention - Basque - LM1000',
 1732316821915: 'RapidRetention - French - LM1000',
 1732316936163: 'RapidRetention - Italian - LM1000',
 1732460522330: 'RapidRetention - Persian - LM1000',
 1732465028917: 'RapidRetention - Mandarin Chinese - LM1000',
 1732637740663: 'RapidRetention - Welsh - LM1000',
 1732869083179: 'RapidRetention - Serbian - LM1000',
 1732980361514: 'RapidRetention - Russian - GCSE',
 1732993700879: 'Persian Alphabet',
 1733170456922: 'RapidRetention - Swedish - GCSE',
 1733171641992: 'RapidRetention 

In [10]:
from src.anki_tools import add_tag_to_matching_notes

# Tag the notes in the chosen deck that match the story's phrases.
deck_name = "RapidRetention - Swedish - LM1000"
updates, errors = add_tag_to_matching_notes(deck_name=deck_name, phrases=phrases, tag="story_community_park")

# Report the update count, then list any errors one per line.
print(f"Updated {updates} notes")
if errors:
    print("Errors encountered:")
    for message in errors:
        print(f"- {message}")

audio-language-trainer\src\anki_tools.py:223:save() is deprecated: saving is automatic
Updated 88 notes
