# Story Generation
We remember things better as stories. The plan here is to pick a subset of our phrases, extract the vocabulary, and generate a story based on it. We can then pull in more flashcards / phrases to ensure more complete phrase coverage.

The story name will be story_some_title; when added as a 'tag' into Anki, this will add a hyperlink to a google cloud bucket of a specific format of bucket/language/story_name/story_name.html

This means it is easy to add new stories to an existing flashcard deck, and the links will update as soon as you add the tags

In [None]:
%load_ext autoreload
%autoreload 2
from dotenv import load_dotenv

# Load environment variables from a local .env file (python-dotenv).
load_dotenv()

# Switch for cells that cost money via API calls; it is currently enabled.
# NOTE(review): the previous comment said "change to True" although the value is already True,
# and no cell visible in this notebook reads this flag - confirm it is still wired up.
PAY_FOR_API = True

In [None]:
import random
from pathlib import Path
from pprint import pprint

from src.anki_tools import AnkiCollectionReader, get_deck_contents
from src.config_loader import config
from src.nlp import (
    create_flashcard_index,
    find_missing_vocabulary,
    get_vocab_dict_from_dialogue,
    get_vocab_dictionary_from_phrases,
    get_index_subset
)
from src.utils import (
    load_json,
    load_text_file,
    save_json,
    save_pickle,
    load_pickle,
    upload_story_to_gcs,
    upload_to_gcs,
)

from src.phrase import get_phrase_indices
from copy import deepcopy


### Add directories
Story images can be re-used between languages, but audio files are language-specific. So we structure the story directory as story_name/language: audio files go in 'language/', while the images and the English JSON file live in the story_name dir.

In [None]:
# Resolve the working directories relative to wherever the notebook is run from.
notebook_dir = Path().absolute()  # expected to be the notebooks folder
project_root = notebook_dir.parent
phrase_dir = project_root.joinpath("data", "phrases")  # text files of phrases
story_dir = project_root.joinpath("outputs", "stories")  # generated stories

We already have flashcards generated for some phrases.
A flashcard index allows us to select flashcards that cover a specific vocabulary range. It is quite computationally expensive to build, so it is generated once
using create_flashcard_index and then cached on disk.

In [None]:
# Name of the phrase list; it selects the text file read from phrase_dir.
PHRASE_LIST_NAME = "longman_1000_phrases"
phrase_file = phrase_dir.joinpath(PHRASE_LIST_NAME + ".txt")
phrases = load_text_file(phrase_file)

# Quick sanity check of what was loaded.
preview = phrases[:10]
pprint(f"First few phrases {preview}")



## create the flashcard index
This makes it very fast to find matching flashcards from a given vocab list

In [None]:
# Building the index is slow, so reuse the cached JSON whenever it is already on disk.
notebook_dir = Path().absolute()  # This gives src/notebooks
index_file = phrase_dir / f"{PHRASE_LIST_NAME}_index.json"

if not index_file.exists():
    # First run: build the index and cache it next to the phrase list.
    phrase_index = create_flashcard_index(phrases)
    save_json(data=phrase_index, file_path=index_file)
else:
    phrase_index = load_json(index_file)



## Sample some phrases to generate the story from
This will pin the story to the vocab found in some pre-existing phrases

In [None]:
# List the decks in the Anki collection so we can pick the one holding phrases we know.
# NOTE: you must close Anki Desktop when trying to form a connection here
with AnkiCollectionReader() as reader:
    deck_names = reader.get_deck_names()
    pprint(deck_names)

# The output is deck_id : deck_name -> copy the relevant deck_name into the next cell

## RESTART here to refresh the list of phrases without tags

In [None]:
# Deck to read from; copy the name from the deck list printed by the previous cell.
DECK_NAME = "RapidRetention - Swedish::LM1000"
# Load the deck into `df`; later cells read its 'tags', 'EnglishText' and 'TargetText' columns.
df = get_deck_contents(deck_name=DECK_NAME) #calculates knowledge score
# Last expression of the cell, so the notebook displays the first rows.
df.head()

In [None]:
# Count the notes whose tags are empty, i.e. not yet assigned to any story.
untagged_count = df.query("tags == ''").shape[0]
print(f"{untagged_count} phrases left")

# We want to arrive at all phrases assigned to stories (via tags)
So we create an untagged index - an index of flashcards that do not have a tag. We will use these to link to story vocabulary.

In [None]:

# Split the deck into phrases already linked to a story (tagged) and those still free (untagged).
untagged_rows = df.query("tags == ''")
phrases_with_tags = df.query("tags != ''")['EnglishText'].tolist()
phrases_without_tags = untagged_rows['EnglishText'].tolist()

# How many words are yet to be assigned to a story?
available_vocab = get_vocab_dictionary_from_phrases(phrases_without_tags)
print(len(available_vocab['verbs']),  len(available_vocab['vocab']))

# Integer position of each untagged phrase within phrase_index['phrases'].
phrases_without_tags_indicies = get_phrase_indices(
    known_phrases=phrases_without_tags,
    all_phrases=phrase_index['phrases'],
)

# Work on a copy so phrase_index itself stays complete, then narrow both lookups to the
# untagged phrases so that a phrase already linked to a story is never retrieved again.
untagged_index = deepcopy(phrase_index)
for index_key in ('verb_index', 'vocab_index'):
    untagged_index[index_key] = get_index_subset(
        phrases_without_tags_indicies, untagged_index[index_key]
    )

## If generating a new story - random sample some new phrases

We want to sample from phrases that have no tags

In [None]:
# Draw up to 100 untagged phrases at random (fewer if the deck has fewer left).
sample_size = min(100, len(phrases_without_tags))
sampled_phrases = random.sample(phrases_without_tags, sample_size)

# Vocabulary dictionary of the sampled phrases; this is what the story is generated from.
vocab_dict_flashcards = get_vocab_dictionary_from_phrases(sampled_phrases)

Now generate the story

In [None]:
from src.dialogue_generation import generate_story

# Produces the story title and its dialogue from the sampled vocabulary.
# NOTE(review): presumably a paid API call - consider guarding it with PAY_FOR_API; confirm.
story_name, story_dialogue = generate_story(vocab_dict_flashcards)


## If using pre-generated story that we want to assign tags to?
Then overwrite the story name and load the json dialogue file

In [None]:
# To tag a pre-generated story instead, uncomment the next line with its title
# and swap the save_json call below for the commented-out load_json call.
#story_name = "unexpected_music_project"
clean_story_name = f"story_{story_name.lower().replace(' ', '_')}"

story_path = story_dir / clean_story_name / f"{clean_story_name}.json"

# A brand-new story has no folder yet; create it up front (no-op when it already exists).
story_path.parent.mkdir(parents=True, exist_ok=True)

#story_dialogue = load_json(story_path)
save_json(data=story_dialogue, file_path=story_path)
print(f"saved {clean_story_name} to {story_path}")

# Vocabulary of the story, with no limit on the number of story parts considered.
vocab_dict_story = get_vocab_dict_from_dialogue(story_dialogue, limit_story_parts=None)

Let's retrieve flashcards we know that better fit the story vocab

In [None]:
from src.nlp import get_matching_flashcards_indexed

# Find the minimal set of untagged flashcards needed to cover the story vocabulary.
candidate_flashcards = get_matching_flashcards_indexed(vocab_dict_story, untagged_index)

# Keep only the phrase text of each selected card.
selected_cards = candidate_flashcards['selected_cards']
candidate_phrases = [selected.get('phrase') for selected in selected_cards]


We can check the coverage below. We want stories to stretch learners, so roughly 70% is fine.

In [None]:

# Vocabulary covered by the candidate flashcards...
known_vocab_dict = get_vocab_dictionary_from_phrases(candidate_phrases)

# ...compared against the vocabulary the story actually uses.
missing_vocab = find_missing_vocabulary(
    vocab_dict_source=known_vocab_dict,
    vocab_dict_target=vocab_dict_story,
)
missing_vocab_dict = missing_vocab["missing_vocab"]


Now supplement these with any remaining flashcards we don't yet know

# Add tags to the flashcard deck

In [None]:
# Confirmation of what the next cell will change, shown before any note is modified.
print(f"We are going to add '{clean_story_name}' tag to {len(candidate_phrases)} phrases within '{DECK_NAME}'")

In [None]:
# Tag every candidate phrase in the deck with the story name.
# NOTE: sometimes this needs running twice...
from src.anki_tools import add_tag_to_matching_notes

updates, errors = add_tag_to_matching_notes(deck_name=DECK_NAME, phrases=candidate_phrases, tag=clean_story_name)

print(f"Updated {updates} notes")
if errors:
    # One bullet line per reported problem.
    print("Errors encountered:")
    print("\n".join(f"- {problem}" for problem in errors))

## Repeat
We can now update phrases without tags at the top of this notebook and generate another story

## Generate the story files
Once you are happy with the flashcard coverage, you can:
* translate and add audio
* create the story images
* create the story album files (M4a files with synced lyrics)
* create the story HTML file using those previous files, and upload to Google Cloud Storage
* tag the flashcards with the story name... this will then mean you can link to the story from within Anki (the template uses tags to auto-create hyperlinks)

In [None]:
# Scratch check: display one randomly sampled value from the deck's 'TargetText' column.
str(df['TargetText'].sample().values[0])

In [None]:
from src.config_loader import config

In [None]:
# Calls the config object's underscore-prefixed loader directly.
# NOTE(review): presumably used to re-read settings after editing them - confirm there is no public reload API.
config._load_config()

In [None]:
# NOTE: this scratch cell held only the incomplete expression `vm.` (a SyntaxError that
# stops "Run All"); it has been reduced to this comment. `vm` is not defined in this notebook.

In [None]:
# Display whatever voice models the config object reports (last expression, so the notebook shows it).
config.get_voice_models()

In [None]:
from src.audio_generation import text_to_speech

# Smoke test of speech synthesis: a short fixed sample, target-language config, male voice.
sample_text = "Hej! Hur mår du idag? Jag hoppas att allt är bra med dig. Det är en vacker dag ute, och jag tänkte ta en promenad i parken senare."
text_to_speech(sample_text, config_language="target", gender="MALE")

In [None]:
# Pick the story to build. The commented names are other existing stories, kept here
# so one can be pasted in as the value of clean_story_name.
# story_midnight_garden_mystery
clean_story_name = "story_workplace_stress_vacation"
# story_unexpected_music_project
# story_rainy_football_match
# story_unexpected_train_adventure
# story_unexpected_marathon_adventure
# story_sunset_wedding_blues
# story_unexpected_wedding_guests
# story_unexpected_career_change
# story_unexpected_coffee_adventure
# story_unexpected_movie_adventure
# story_surprise_hospital_adventure
# story_unexpected_power_outage

print(f"About to generate {clean_story_name}")

In [None]:
# Load the dialogue JSON that was saved for the selected story.
story_path = story_dir.joinpath(clean_story_name, f"{clean_story_name}.json")
story_dialogue = load_json(story_path)

In [None]:
from src.generate import add_audio, add_translations

# Two passes over the dialogue: translations first, then audio on the translated result.
translated_dialogue = add_translations(story_dialogue)
story_dialogue_audio = add_audio(translated_dialogue)

In [None]:
# The dialogue now holds target-language content, so it is stored in the language sub-folder.
language_dir = story_dir / clean_story_name / config.TARGET_LANGUAGE_NAME
pickle_path = language_dir / f"{clean_story_name}.pkl"
save_pickle(data=story_dialogue_audio, file_path=pickle_path)
# To resume later without regenerating, reload with:
#story_dialogue_audio = load_pickle(story_dir / clean_story_name / config.TARGET_LANGUAGE_NAME / f"{clean_story_name}.pkl")

Image files for each part of the story:

In [None]:
from src.images import generate_and_save_story_images

# Images go in the story folder itself (not the language sub-folder) so they can be shared between languages.
story_folder = story_dir / clean_story_name
image_data = generate_and_save_story_images(
    story_dict=story_dialogue_audio,
    output_dir=story_folder,
    story_name=clean_story_name,
)


M4A audio files which you will be able to download and play via a media player.
They have synced lyrics which can be viewed in the Oto Music Player app

In [None]:
from PIL import Image
from src.story import create_album_files, generate_index_html

# The image of the first story part doubles as the album cover.
# May need to change depending on the size of the story and which parts it has.
FIRST_STORY_PART = list(story_dialogue_audio.keys())[0]
cover_path = story_dir / clean_story_name / f"{clean_story_name}_{FIRST_STORY_PART}.png"

# Image.open keeps the file handle open; the context manager closes it once the
# album files have been written.
with Image.open(cover_path) as album_image:
    # create m4a files in the language sub-folder
    create_album_files(
        story_data_dict=story_dialogue_audio,
        cover_image=album_image,
        output_dir=story_dir / clean_story_name / config.TARGET_LANGUAGE_NAME,
        story_name=clean_story_name,
    )

Now we generate the main html file - this wraps up the M4A files and image files within it, so it's self-contained

In [None]:
from src.story import create_html_story

# image_dir is the story folder; the language sub-folders will be picked up automatically.
story_folder = story_dir / clean_story_name
create_html_story(story_data_dict=story_dialogue_audio, image_dir=story_folder, story_name=clean_story_name)

Upload to a public google cloud bucket

In [None]:
html_story_path = story_dir / clean_story_name / config.TARGET_LANGUAGE_NAME / f"{clean_story_name}.html"
# Fail loudly with the missing path; a bare assert gives no message and is skipped under `python -O`.
if not html_story_path.exists():
    raise FileNotFoundError(f"Story HTML not found: {html_story_path}")
upload_story_to_gcs(html_file_path=html_story_path)

Now update and reupload our index.html - which allows users to navigate all the stories

In [None]:
# Rebuild the story index page, then push it (will default to public GCS bucket).
generate_index_html()
upload_to_gcs(file_path="../outputs/stories/index.html", content_type="text/html")


In [None]:
# Batch version of the single-story cells above: build and publish each listed story end to end.
# It relies on the imports made in those cells. Commented-out names are excluded from this run.
stories = [
    # "story_workplace_stress_vacation",
    # "story_unexpected_music_project",
    # "story_rainy_football_match",
    # "story_unexpected_train_adventure",
    # "story_unexpected_marathon_adventure",
    # "story_sunset_wedding_blues",
    # "story_unexpected_wedding_guests",
    "story_unexpected_career_change",
    "story_unexpected_coffee_adventure",
    "story_unexpected_movie_adventure",
    "story_surprise_hospital_adventure",
    "story_unexpected_power_outage",
]

for clean_story_name in stories:
    print(f"About to generate {clean_story_name}")
    story_folder = story_dir / clean_story_name
    language_folder = story_folder / config.TARGET_LANGUAGE_NAME

    # Load the saved dialogue, then add translations and audio.
    story_path = story_folder / f"{clean_story_name}.json"
    story_dialogue = load_json(story_path)
    story_dialogue_audio = add_translations(story_dialogue)
    story_dialogue_audio = add_audio(story_dialogue_audio)
    save_pickle(data=story_dialogue_audio, file_path=language_folder / f"{clean_story_name}.pkl")

    # Images live in the story folder so they can be shared between languages.
    image_data = generate_and_save_story_images(
        story_dict=story_dialogue_audio,
        output_dir=story_folder,
        story_name=clean_story_name,
    )

    # The image of the first story part doubles as the album cover.
    # May need to change depending on the size of the story and which parts it has.
    FIRST_STORY_PART = list(story_dialogue_audio.keys())[0]
    cover_path = story_folder / f"{clean_story_name}_{FIRST_STORY_PART}.png"
    # Close the cover image's file handle once the m4a files are written, so handles
    # do not accumulate across stories.
    with Image.open(cover_path) as album_image:
        create_album_files(
            story_data_dict=story_dialogue_audio,
            cover_image=album_image,
            output_dir=language_folder,
            story_name=clean_story_name,
        )

    create_html_story(
        story_data_dict=story_dialogue_audio,
        image_dir=story_folder,  # the language sub-folders will be picked up automatically
        story_name=clean_story_name,
    )

    html_story_path = language_folder / f"{clean_story_name}.html"
    # Fail loudly with the missing path; a bare assert gives no message and is skipped under `python -O`.
    if not html_story_path.exists():
        raise FileNotFoundError(f"Story HTML not found: {html_story_path}")
    upload_story_to_gcs(html_file_path=html_story_path)

    # Refresh the index after every story so each upload is reachable even if a later story fails.
    generate_index_html()
    # will default to public GCS bucket
    upload_to_gcs(
        file_path="../outputs/stories/index.html",
        content_type="text/html",
    )