# Story Generation
We remember things better as stories. The plan here is to pick a subset of our phrases, extract the vocabulary, and generate a story based on it. We can then pull in more flashcards / phrases to ensure a more complete phrase coverage.

The story name will be story_some_title; when added as a 'tag' in Anki, this will add a hyperlink to a Google Cloud bucket path of the form bucket/language/story_name/story_name.html

This means it is easy to add new stories to an existing flashcard deck, and the links will update as soon as you add the tags

In [None]:
%load_ext autoreload
%autoreload 2
from dotenv import load_dotenv
load_dotenv()

# Opt-in flag for cells that cost money via API calls.
# NOTE(review): the original comment said "change to True", but the flag is already True and
# nothing in the part of the notebook visible here reads it - confirm the intended default.
PAY_FOR_API = True

In [None]:
import random
from pathlib import Path
from pprint import pprint
from src.config_loader import config
from src.nlp import (
    create_flashcard_index,
    get_vocab_dict_from_dialogue,
    get_vocab_dictionary_from_phrases,
    find_missing_vocabulary,
)
from src.utils import load_json, load_text_file, save_json, save_pickle, upload_to_gcs
from src.anki_tools import get_deck_contents, AnkiCollectionReader
# Add the parent directory of 'src' to the Python path


### Add directories
Story images can be re-used between languages, but audio files are language-specific. We therefore structure the story directory as story_name/language: audio files go in the 'language/' sub-directory, while images and the English JSON file live in the story_name directory.

In [4]:
# Everything is resolved relative to the directory the notebook is run from.
notebook_dir = Path.cwd()
# Text files of phrases live in <parent of notebook dir>/data/phrases.
phrase_dir = notebook_dir.parent.joinpath("data", "phrases")
# Generated stories are written to <parent of notebook dir>/outputs/stories.
story_dir = notebook_dir.parent.joinpath("outputs", "stories")

We already have flashcards generated for some phrases.
A flashcard index lets us select flashcards that cover a specific vocabulary range. It is quite computationally expensive to build, and is generated
using create_flashcard_index.

In [None]:
# Name of the phrase list; the phrases are read from "<PHRASE_LIST_NAME>.txt" in phrase_dir.
PHRASE_LIST_NAME = "longman_1000_phrases"
phrase_file = phrase_dir / f"{PHRASE_LIST_NAME}.txt"
phrases = load_text_file(phrase_file)
# Print the label on its own and pretty-print the list itself: pprint() of an
# f-string only shows the quoted repr of one long string, not a formatted list.
print("First few phrases:")
pprint(phrases[:10])



## create the flashcard index
This makes it very fast to find matching flashcards from a given vocab list

In [6]:
# Building the index is a long process, so reuse the cached JSON copy when there is one.
notebook_dir = Path.cwd()
index_file = phrase_dir.joinpath(f"{PHRASE_LIST_NAME}_index.json")

if not index_file.exists():
    # No cached index for this phrase list yet: build it and save it next to the phrases.
    phrase_index = create_flashcard_index(phrases)
    save_json(data=phrase_index, file_path=index_file)
else:
    phrase_index = load_json(index_file)



## Sample some phrases to generate the story from
This will pin the story to the vocab found in some pre-existing phrases

In [None]:
# Phrases we already know can be used as the basis for a story; start by listing
# the available decks so the right deck name can be chosen.
# NOTE: you must close Anki Desktop when trying to form a connection here
with AnkiCollectionReader() as reader:
    pprint(reader.get_deck_names())

# This prints deck_id : deck_name -> copy the relevant deck_name for use below.

In [None]:
# Deck whose notes drive the story; copy the name from the deck listing.
DECK_NAME = "RapidRetention - Swedish - LM1000"
# Per the original note, loading the deck also calculates a knowledge score; later cells
# read the "knowledge_score" and "EnglishText" columns of this frame.
df = get_deck_contents(deck_name=DECK_NAME)
df.head()

Find phrases we know, and limit the flashcard index to those

In [10]:
from src.phrase import get_phrase_indices

# Keep the cards whose knowledge score is above 0.2, best-known first, and take
# their English text.
known_phrases = (
    df.loc[df["knowledge_score"] > 0.2]
    .sort_values("knowledge_score", ascending=False)["EnglishText"]
    .to_list()
)

# The index needs each known phrase's integer location within phrase_index["phrases"].
known_phrase_indicies = get_phrase_indices(
    known_phrases=known_phrases,
    all_phrases=phrase_index["phrases"],
)

In [11]:
from src.nlp import remove_unknown_index_values
from copy import deepcopy

# Phrases we don't know must not be retrieved from the index and linked to a story,
# so filter a copy of the index and leave phrase_index itself untouched.
known_index = deepcopy(phrase_index)
for _index_name in ("verb_index", "vocab_index"):
    known_index[_index_name] = remove_unknown_index_values(
        known_phrase_indicies, known_index[_index_name]
    )

In [12]:
# 75 phrases should give a decent amount of vocab; sample fewer when fewer are known.
sampled_phrases = random.sample(known_phrases, k=min(len(known_phrases), 75))
vocab_dict_flashcards = get_vocab_dictionary_from_phrases(sampled_phrases)

Now generate the story

In [None]:
from src.dialogue_generation import generate_story
# Generate a story from the sampled flashcard vocabulary; the result is unpacked into
# the story's name and its dialogue, both of which later cells rely on.
# NOTE(review): presumably this is one of the calls that costs money via an API - confirm.
story_name, story_dialogue = generate_story(vocab_dict_flashcards)


In [None]:
import re

# Turn the generated title into an identifier that is safe as a directory name,
# a URL path segment and an Anki tag. Every run of characters that is not a
# letter, digit or underscore (spaces, tabs, newlines, punctuation, quotes)
# collapses to a single underscore, e.g. "Roblox Bot Trouble" becomes
# "story_roblox_bot_trouble".
_story_slug = re.sub(r"\W+", "_", story_name.lower()).strip("_")
if not _story_slug:
    raise ValueError(f"cannot derive a story name from {story_name!r}")
clean_story_name = f"story_{_story_slug}"
story_path = story_dir / clean_story_name / f"{clean_story_name}.json"

# Make sure the story directory exists before writing into it; this is a no-op
# when it is already there (whether save_json creates it is not visible here).
story_path.parent.mkdir(parents=True, exist_ok=True)
save_json(story_dialogue, story_path)
print(f"saved story to {story_path}")

We find that the LLM goes a bit beyond the vocab found in the flashcards

In [None]:

# Extract the vocabulary of the generated story (limit_story_parts=None is passed explicitly).
vocab_dict_story = get_vocab_dict_from_dialogue(story_dialogue, limit_story_parts=None)
# Compare the flashcard vocabulary with the story vocabulary.
# NOTE(review): despite the name `vocab_overlap`, this holds the return value of
# find_missing_vocabulary; another cell reads that helper's result via its "missing_vocab" key.
vocab_overlap = find_missing_vocabulary(vocab_dict_flashcards, vocab_dict_story)

Let's retrieve flashcards we know that better fit the story vocab

In [None]:
from src.nlp import get_matching_flashcards_indexed

# Pull the existing phrases needed to cover the story vocab. known_index was
# restricted to flashcards we know, so only known cards can be selected here.
known_results = get_matching_flashcards_indexed(vocab_dict_story, known_index)
known_flashcards = [selected.get("phrase") for selected in known_results["selected_cards"]]

# Mean knowledge score of the selected cards.
_selected_mask = df["EnglishText"].isin(known_flashcards)
print("Average knowledge: ", df.loc[_selected_mask, "knowledge_score"].mean())

# Story vocabulary that the selected known cards do not cover.
known_vocab_dict = get_vocab_dictionary_from_phrases(known_flashcards)
missing_vocab = find_missing_vocabulary(
    vocab_dict_source=known_vocab_dict,
    vocab_dict_target=vocab_dict_story,
)
missing_vocab_dict = missing_vocab["missing_vocab"]


Now supplement these with any remaining flashcards we don't yet know

In [None]:
# The known cards should already give a higher match; whatever vocab is still
# missing is now drawn from the full (unfiltered) index.
additional_results = get_matching_flashcards_indexed(missing_vocab_dict, phrase_index)
additional_flashcards = [
    selected.get("phrase") for selected in additional_results["selected_cards"]
]
print(len(additional_flashcards))

# Combine both selections and check what story vocab, if any, is still uncovered.
all_flashcards = [*additional_flashcards, *known_flashcards]
all_flashcards_vocab_dict = get_vocab_dictionary_from_phrases(all_flashcards)
final_missing_vocab = find_missing_vocabulary(all_flashcards_vocab_dict, vocab_dict_story)


In [None]:
# Report how many flashcards are in all_flashcards, the list used to cover the story.
print(f"We need {len(all_flashcards)} flashcards to cover the story")

## Generate the story files
Once you are happy with the flashcard coverage, you can:
* translate and add audio
* create the story images
* create the story album files (M4a files with synced lyrics)
* create the story HTML file using those previous files, and upload to Google Cloud Storage
* tag the flashcards with the story name... this means you can link to the story from within Anki (the template uses tags to auto-create hyperlinks)

In [19]:
# if you have already generated a specific story, you can just load it here instead:
# clean_story_name = "story_roblox_bot_trouble"
# story_dialogue = load_json(story_dir / clean_story_name / f"{clean_story_name}.json" )

In [None]:
from src.generate import add_translations, add_audio
# First add translations to the story dialogue, then feed that result to add_audio;
# story_dialogue_audio is rebound by each step and ends up holding the add_audio result.
# NOTE(review): presumably these are calls that cost money via an API - confirm.
story_dialogue_audio = add_translations(story_dialogue)
story_dialogue_audio = add_audio(story_dialogue_audio)

In [24]:
# The dialogue now holds target-language content, so it is saved in the language directory.
save_pickle(
    data=story_dialogue_audio,
    file_path=story_dir.joinpath(
        clean_story_name,
        config.TARGET_LANGUAGE_NAME,
        f"{clean_story_name}.pkl",
    ),
)

Image files for each part of the story:

In [None]:
from src.images import generate_and_save_story_images

# Images are generated from the untranslated story_dialogue and written to the
# story's own directory rather than a language sub-directory.
image_data = generate_and_save_story_images(
    story_dict=story_dialogue,
    output_dir=story_dir / clean_story_name,
    story_name=clean_story_name,
)


In [None]:
# Display the value returned by generate_and_save_story_images.
image_data

M4A audio files which you will be able to download and play via a media player.
They have synced lyrics which can be viewed in the Oto Music Player app

In [None]:
from PIL import Image
from src.story import create_album_files

# The album cover is the image of the first story part. Iterating image_data
# yields its keys (assumes it is a dict, as the original `.keys()` call implied).
# May need to change depending on the size of the story and which parts it has.
FIRST_STORY_PART = next(iter(image_data), None)
if FIRST_STORY_PART is None:
    raise ValueError("image_data is empty: generate the story images first")

album_image_path = story_dir / clean_story_name / f"{clean_story_name}_{FIRST_STORY_PART}.png"

# Image.open keeps the underlying file open until the image is closed, so scope
# it with a context manager while the M4A files are created.
with Image.open(album_image_path) as album_image:
    create_album_files(
        story_data_dict=story_dialogue_audio,
        cover_image=album_image,
        output_dir=story_dir / clean_story_name / config.TARGET_LANGUAGE_NAME,
        story_name=clean_story_name,
    )

Now we generate the main html file - this wraps up the M4A files and image files within it, so it's self-contained

In [None]:
from src.story import create_html_story

# Pass the story's root directory: per the original note, the language
# sub-folders are picked up automatically.
create_html_story(
    story_dialogue_audio,
    story_dir.joinpath(clean_story_name),
    story_name=clean_story_name,
    component_path="../src/StoryViewer.js",
)

Upload to a public google cloud bucket

In [None]:
# The HTML file sits in the target-language sub-directory of the story.
html_story_path = story_dir.joinpath(
    clean_story_name,
    config.TARGET_LANGUAGE_NAME,
    f"{clean_story_name}.html",
)
upload_to_gcs(html_file_path=html_story_path)

## Linking stories to flash cards
We will use the Anki tag feature. Given a list of English phrases that are required to understand a story, we can tag each of those phrases within a specific Anki deck.

The card template will turn any tag starting story_ into a hyperlink to the public google cloud bucket

In [None]:
# NOTE: per the original author, this sometimes needs running twice...
from src.anki_tools import add_tag_to_matching_notes

# Tag every note in the deck that matches one of the story's flashcard phrases.
updates, errors = add_tag_to_matching_notes(
    tag=clean_story_name,
    phrases=all_flashcards,
    deck_name=DECK_NAME,
)

print(f"Updated {updates} notes")
if errors:
    print("Errors encountered:")
    print("\n".join(f"- {error}" for error in errors))

In [None]:
df_deck = get_deck_contents(DECK_NAME)


def _has_story_tag(tags):
    """Return True when a note's tags include ``clean_story_name``."""
    # NOTE(review): the layout of the "tags" column is not visible here, so both
    # a space-separated string and a list-like of tags are accepted - confirm.
    if isinstance(tags, str):
        return clean_story_name in tags.split()
    try:
        return clean_story_name in tags
    except TypeError:  # e.g. NaN/None for a note without tags
        return False


# A note can carry several tags (for example more than one story), so test for
# membership; equality with the whole tags value would miss such notes.
df_deck[df_deck["tags"].map(_has_story_tag).astype(bool)].shape

In [None]:
# We should know most of the vocab, so the scores of the story's cards should skew high.
def _has_story_tag(tags):
    """Return True when a note's tags include ``clean_story_name``."""
    # NOTE(review): the layout of the "tags" column is not visible here, so both
    # a space-separated string and a list-like of tags are accepted - confirm.
    if isinstance(tags, str):
        return clean_story_name in tags.split()
    try:
        return clean_story_name in tags
    except TypeError:  # e.g. NaN/None for a note without tags
        return False


# Select by tag membership: a note may carry other tags besides this story's.
df_deck[df_deck["tags"].map(_has_story_tag).astype(bool)].knowledge_score.hist()