# Story Generation
We remember things better as stories. The plan here is to pick a subset of our phrases, extract the vocabulary, and generate a story based on it. We can then pull in more flashcards / phrases to ensure more complete coverage of the story's vocabulary.

In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys

# Put the parent of the current working directory on sys.path so that the
# `src` package can be imported from this notebook.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Safety switch: change to True to run cells that cost money via API calls.
PAY_FOR_API = False

In [2]:
import os
import pickle
import random
import sys
from pathlib import Path
from pprint import pprint

from dotenv import load_dotenv

from src.config_loader import config
from src.nlp import (
    create_flashcard_index,
    get_vocab_dict_from_dialogue,
    get_vocab_dictionary_from_phrases,
)
from src.utils import load_json, load_text_file, save_json

# Load environment variables from a .env file, if one is found
# (presumably the API credentials used by the paid cells — confirm).
load_dotenv()


True

In [None]:


# Load the source phrase list that the rest of the notebook works from.
filepath = "../data/longman_1000_phrases.txt"
phrases = load_text_file(filepath)

# Print the label on its own line: pprint() applied to an f-string shows the
# quoted repr of one long string rather than a readable list.
print("First few phrases:")
pprint(phrases[:10])

# We already have flashcards generated for some phrases.
# A flashcard index lets us select the flashcards that cover a specific
# vocabulary range. It is quite computationally expensive to build, and is
# generated using create_flashcard_index.



## Create the flashcard index
This makes it very fast to find matching flashcards from a given vocab list

In [5]:
# Building the index is a long process, so reuse the cached JSON when present.
notebook_dir = Path.cwd()  # current working directory; presumably src/notebooks — confirm
# NOTE: despite its name, `data_dir` is the path of the index *file*.
data_dir = notebook_dir.parent / "data" / "longman_1000_phrase_index.json"

if not data_dir.exists():
    # No cache yet: build the index from the phrases and persist it.
    phrase_index = create_flashcard_index(phrases)
    save_json(phrase_index, data_dir)
else:
    phrase_index = load_json(data_dir)


## Sample some phrases to generate the story from
This will pin the story to the vocab found in some pre-existing phrases

In [None]:
# Take the first 50 phrases (a fixed slice, not a random sample) as the
# vocabulary source for the story.
story_seed_phrases = phrases[:50]
vocab_dict_flashcards = get_vocab_dictionary_from_phrases(story_seed_phrases)

Now generate the story

In [6]:
from src.dialogue_generation import generate_story

# Location of the cached story; it is only generated when no file exists yet.
story_path = notebook_dir.parent / "data" / "stories" / "test_story" / "story_community_park.json"

if story_path.exists():
    # Reuse the previously generated story.
    story_50_phrases = load_json(story_path)
elif PAY_FOR_API:
    # Paid call: generate the story from the flashcard vocabulary and cache it.
    story_50_phrases = generate_story(vocab_dict_flashcards)
    # Make sure the target folder exists before writing the cache file.
    story_path.parent.mkdir(parents=True, exist_ok=True)
    save_json(story_50_phrases, story_path)
else:
    # Without this branch `story_50_phrases` would stay undefined and later
    # cells would fail with an unhelpful NameError.
    raise FileNotFoundError(
        f"No cached story at {story_path} and PAY_FOR_API is False; "
        "set PAY_FOR_API = True to generate the story via the API."
    )


We find that the LLM goes a bit beyond the vocab found in the flashcards

In [10]:
# Display the story: a nested dict of story parts (e.g. 'introduction'), each
# with a 'dialogue' list of {'speaker', 'text'} entries.
story_50_phrases

{'introduction': {'dialogue': [{'speaker': 'Alex',
    'text': "Hey Sam, did you hear about the new park they're building downtown?"},
   {'speaker': 'Sam',
    'text': "No, I haven't. When did they start planning this?"},
   {'speaker': 'Alex',
    'text': 'It was in the newspaper last month. They want to create a beautiful community space.'},
   {'speaker': 'Sam',
    'text': "That sounds great! Do you think it'll have a cycling path?"},
   {'speaker': 'Alex',
    'text': "I hope so. It'd be nice to have a place to ride without worrying about traffic."},
   {'speaker': 'Sam',
    'text': 'Definitely. How about we go check out the location this evening?'},
   {'speaker': 'Alex',
    'text': "Sure, I'm free after work. Should we meet there around 6?"},
   {'speaker': 'Sam',
    'text': "Perfect. I'll bring some snacks for a little picnic if you'd like."},
   {'speaker': 'Alex',
    'text': "That's a great idea. I can bring my guitar too. We could have a mini party!"},
   {'speaker': 'S

In [None]:
from src.nlp import get_vocab_dict_from_dialogue

# Extract the vocabulary used by the story dialogue.
# limit_story_parts=None is passed explicitly (presumably "use every part" — confirm).
vocab_dict_story = get_vocab_dict_from_dialogue(
    story_50_phrases,
    limit_story_parts=None,
)

In [None]:
from src.nlp import find_missing_vocabulary

# Compare the flashcard vocabulary against the vocabulary found in the story.
vocab_overlap = find_missing_vocabulary(
    vocab_dict_flashcards,
    vocab_dict_story,
)

In [None]:
from src.nlp import get_matching_flashcards_indexed

# Pull every existing phrase needed to cover the story's vocabulary, using the
# prebuilt phrase index for the lookup.
results = get_matching_flashcards_indexed(
    vocab_dict_story,
    phrase_index,
)

In [None]:
# Phrase text of each selected card (None when a card has no 'phrase' key).
proposed_flashcard_phrases = [
    selected.get('phrase')
    for selected in results['selected_cards']
]

# Re-derive the vocabulary from the selected phrases and compare it with the
# story vocabulary again.
vocab_from_new_flashcards = get_vocab_dictionary_from_phrases(
    proposed_flashcard_phrases
)
new_overlap = find_missing_vocabulary(
    vocab_from_new_flashcards,
    vocab_dict_story,
)

In [None]:
# We can fill the gap with some new flashcards: take the 'missing_vocab' entry
# of the comparison result and display it.

missing_vocab_dict = new_overlap['missing_vocab']
missing_vocab_dict

In [None]:
from src.phrase import generate_phrases_from_vocab_dict

# Generate new phrases for the vocabulary that is still missing, then display them.
# NOTE(review): if this helper calls a paid API it should probably be guarded
# by PAY_FOR_API like the other paid cells — confirm.
missing_phrases = generate_phrases_from_vocab_dict(missing_vocab_dict)
missing_phrases

In [None]:
# Total = existing cards selected from the index + newly generated phrases.
num_cards = len(results["selected_cards"])
total_cards_needed = num_cards + len(missing_phrases)
print(f"We need {total_cards_needed} flashcards to cover the story")

In [None]:
from src.utils import save_text_file

# Persist the phrase list for the story: existing phrases first, then the new ones.
story_phrase_file = "../data/stories/test_story/test_phrases.txt"
all_story_phrases = proposed_flashcard_phrases + missing_phrases
save_text_file(all_story_phrases, story_phrase_file)

We will need to generate images for the missing phrases; then we can create an Anki deck for that particular story.

In [None]:
from src.images import add_images_to_phrases

# Folder the generated images are written to; it must already exist.
output_dir = notebook_dir.parent / "data" / "longman_phrase_images" / "longman1000"

# Honour the notebook-wide PAY_FOR_API switch instead of forcing it to True
# here: overriding it made "Run All" spend money regardless of the setting at
# the top of the notebook, and changed the flag for every later cell.
output_dir_ready = output_dir.exists()
if not output_dir_ready:
    print("wrong directory")
elif not PAY_FOR_API:
    print("Skipping image generation: PAY_FOR_API is False")

if PAY_FOR_API and output_dir_ready:
    # Paid call: generate an image for each newly created phrase.
    image_files_and_prompts = add_images_to_phrases(phrases=missing_phrases, output_dir=output_dir)



## Linking stories to flash cards
We will use the Anki tag feature. Given a list of English phrases that are required to understand a story, we can tag the notes for each of those phrases within a specific Anki deck.

In [6]:
# Load the phrases saved for the test story.
# NOTE: this rebinds `phrases`, which earlier in the notebook held the list
# loaded from longman_1000_phrases.txt.
story_phrases_path = "../data/stories/test_story/test_phrases.txt"
phrases = load_text_file(story_phrases_path)

In [8]:
from src.anki_tools import AnkiCollectionReader

# List the decks in the Anki collection (deck id -> deck name) to find the target deck.
with AnkiCollectionReader() as anki_reader:
    deck_names = anki_reader.get_deck_names()
    pprint(deck_names)

{1: 'Default',
 1731524665442: 'Swedish EAL',
 1731700590019: 'Custom study session',
 1732020971325: 'RapidRetention - Swedish - LM1000',
 1732309563077: 'RapidRetention - Dutch - LM1000',
 1732312948269: 'RapidRetention - German - LM1000',
 1732313960891: 'RapidRetention - Arabic - LM1000',
 1732314196963: 'RapidRetention - Spanish - LM1000',
 1732314413500: 'RapidRetention - Japanese - LM1000',
 1732316149591: 'RapidRetention - Russian - LM1000',
 1732316158895: 'RapidRetention - Basque - LM1000',
 1732316821915: 'RapidRetention - French - LM1000',
 1732316936163: 'RapidRetention - Italian - LM1000',
 1732460522330: 'RapidRetention - Persian - LM1000',
 1732465028917: 'RapidRetention - Mandarin Chinese - LM1000',
 1732637740663: 'RapidRetention - Welsh - LM1000',
 1732869083179: 'RapidRetention - Serbian - LM1000',
 1732980361514: 'RapidRetention - Russian - GCSE',
 1732993700879: 'Persian Alphabet',
 1733170456922: 'RapidRetention - Swedish - GCSE',
 1733171641992: 'RapidRetention 

In [10]:
from src.anki_tools import add_tag_to_matching_notes
# Tag every note in the chosen deck that matches one of the story phrases.
deck_name = "RapidRetention - Swedish - LM1000"
story_tag = "story_community_park"
updates, errors = add_tag_to_matching_notes(deck_name=deck_name, phrases=phrases, tag=story_tag)

# Report how many notes were updated, then list any errors one per line.
print(f"Updated {updates} notes")
if errors:
    print("Errors encountered:")
    print("\n".join(f"- {error}" for error in errors))

audio-language-trainer\src\anki_tools.py:223:save() is deprecated: saving is automatic
Updated 88 notes


In [34]:
from src.anki_tools import get_deck_contents, add_knowledge_score


# Load the contents of the deck named above into a table; the displayed output
# has one row per note, including tags, review statistics and a knowledge_score column.
df_deck = get_deck_contents(deck_name)

In [36]:
# Show the notes ordered from highest to lowest knowledge_score.
df_deck.sort_values(by="knowledge_score", ascending=False)

Unnamed: 0,note_id,model_name,tags,n_cards,avg_ease,total_reps,avg_reps,total_lapses,avg_lapses,avg_interval,TargetText,TargetAudio,TargetAudioSlow,EnglishText,WiktionaryLinks,Picture,TargetLanguageName,knowledge_score
46,1732020511532,Language Practice With Images++,,3,280.0,9,3.0,0,0.0,53.3,Kommer de att lära oss hur man applicerar smink?,[sound:4442968e-cc2a-4c2d-ae1d-651ef4b60172.mp3],[sound:69bee3d7-f0ee-47b1-94e6-dc1ffc9ae57b.mp3],Are they going to teach us how to apply makeup?,"<a href=""https://en.wiktionary.org/wiki/kommer...","<img src=""5d11ecf0-dbff-4043-88b1-0902abc965a1...",Swedish,0.378
327,1732023524630,Language Practice With Images++,,3,280.0,10,3.3,0,0.0,54.3,Är du redo för festen ikväll?,[sound:06049362-4504-4f34-bdd9-d5213a75ea4c.mp3],[sound:0a402d7a-6360-48ee-953b-565b42469f37.mp3],Are you ready for the party tonight?,"<a href=""https://en.wiktionary.org/wiki/%C3%A4...","<img src=""f32df7ae-328e-4c07-b9a7-1f0d40f7e433...",Swedish,0.378
446,1732024523785,Language Practice With Images++,,3,280.0,10,3.3,0,0.0,53.0,Tänk på gapet mellan tåget och perrongen,[sound:2042fb3d-f849-48dd-be78-70d66190082b.mp3],[sound:08093457-1c8e-4935-90c3-3f7966a55e35.mp3],Mind the gap between the train and the platform,"<a href=""https://en.wiktionary.org/wiki/t%C3%A...","<img src=""4bc37b79-0848-4d89-95b4-1b99c740429f...",Swedish,0.376
36,1732020511492,Language Practice With Images++,story_community_park,3,280.0,10,3.3,0,0.0,52.3,Går du genom parken varje kväll?,[sound:074e3e18-114e-4fe9-9e46-785344942ee4.mp3],[sound:08dcb0de-4fbe-43c7-ab56-d56e4510c235.mp3],Do you walk through the park every evening?,"<a href=""https://en.wiktionary.org/wiki/g%C3%A...","<img src=""70d988fc-0496-4408-848d-e210e739dd47...",Swedish,0.375
280,1732023030318,Language Practice With Images++,,3,280.0,9,3.0,0,0.0,50.0,Kan du visa mig hur man organiserar detta?,[sound:d7cb2207-ec3a-46b9-848c-5fefcf21dfb3.mp3],[sound:d3822dc9-3adc-4599-9c35-a3ab95edebe3.mp3],Can you show me how to organize this?,"<a href=""https://en.wiktionary.org/wiki/kan#Sw...","<img src=""15a3928a-a2bf-4911-9f96-a04ffd04b070...",Swedish,0.373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350,1732024003622,Language Practice With Images++,,3,0.0,0,0.0,0,0.0,0.0,Jag är så glad för din skull,[sound:cb00e745-d723-4cdd-b63c-831b07b65c14.mp3],[sound:5a336bc0-e720-44b2-ba7e-ee1ec74bc42f.mp3],I'm so happy for you,"<a href=""https://en.wiktionary.org/wiki/jag#Sw...","<img src=""c1fbaea6-694f-4c71-abf1-e7115c2e7e0e...",Swedish,0.000
351,1732024003626,Language Practice With Images++,,3,0.0,0,0.0,0,0.0,0.0,Jag svarar snart på dina frågor,[sound:6f755ebe-31d2-4546-870d-4cde9b1ea7b8.mp3],[sound:c433d488-45ea-4185-ab30-bcda291bf880.mp3],I'll answer your questions soon,"<a href=""https://en.wiktionary.org/wiki/jag#Sw...","<img src=""0f11b583-c3af-4cd9-b2db-14add4f8ea10...",Swedish,0.000
353,1732024003634,Language Practice With Images++,,3,0.0,0,0.0,0,0.0,0.0,Har du övervägt alla dina alternativ?,[sound:8fd911c3-a7c1-4740-a47c-f7f74b2f7ded.mp3],[sound:a5005cff-614e-4d8a-888c-18bc6fbe249c.mp3],Have you considered all your options?,"<a href=""https://en.wiktionary.org/wiki/har#Sw...","<img src=""6a27e1e7-749e-4874-91ac-c4292ee63f17...",Swedish,0.000
354,1732024003638,Language Practice With Images++,,3,0.0,0,0.0,0,0.0,0.0,Är du orolig för ekonomin?,[sound:6a777f36-f9a5-471b-9ead-4395131666f6.mp3],[sound:996ecdbb-c233-4ecd-a784-0757ba94d88b.mp3],Are you worried about the economy?,"<a href=""https://en.wiktionary.org/wiki/%C3%A4...","<img src=""0cbc4ae9-ce87-40f7-adc3-f6a65d56d6bf...",Swedish,0.000
