In [1]:
%load_ext autoreload
%autoreload 2

from setup_imports import *  # noqa: F401,F403

from src.phrases.generation import generate_phrases_from_vocab_dict
from src.utils import (
    save_text_file,
    load_text_file,
)
from src.phrases.phrase_model import Phrase
from src.phrases.search import (
    get_phrases_by_collection,
    get_vocab_from_phrases,
)

# Phrase Generation

We generate short, simple verb phrases, plus associated vocab-only phrases, from a `vocab_dict`. Verbs are deliberately left out of the vocab phrases to make those cards easier and to keep verb practice and vocab practice as separate tasks.

In [2]:
# Target of this run: every phrase created below is tagged with this
# collection and deck, and the data file names are derived from them.
COLLECTION: str = "LM1000"
DECK: str = "Pack06"

In [5]:
# Read the verb and vocab word lists that belong to this collection.
verbs_path = f"../data/{COLLECTION}_verbs.txt"
vocab_path = f"../data/{COLLECTION}_vocab.txt"
all_verbs = load_text_file(verbs_path)
all_vocab = load_text_file(vocab_path)
print(f"num verbs: {len(all_verbs)}, num vocab: {len(all_vocab)}")

num verbs: 150, num vocab: 365


In [6]:
# Batch selection for one deck.
# Used-up words are removed from the source lists later in this notebook
# (section "Remove words from original list"), so each new deck takes its
# batch from the front of the lists and FROM_INDEX stays at 0.
VERBS_PER_DECK = 10
VOCAB_PER_DECK = 50

FROM_INDEX = 0
TO_INDEX = FROM_INDEX + VERBS_PER_DECK
# The vocab window advances in step with the verb window (50 vocab words for
# every 10 verbs). Integer arithmetic keeps the index exact instead of
# round-tripping through a float.
VOCAB_FROM_INDEX = FROM_INDEX * VOCAB_PER_DECK // VERBS_PER_DECK
VOCAB_TO_INDEX = VOCAB_FROM_INDEX + VOCAB_PER_DECK

some_verbs = all_verbs[FROM_INDEX:TO_INDEX]
some_vocab = all_vocab[VOCAB_FROM_INDEX:VOCAB_TO_INDEX]

# Input for the phrase generator: fresh list copies of both slices.
current_dict = {
    "verbs": list(some_verbs),
    "vocab": list(some_vocab),
}

print(
    f"FROM_INDEX: {FROM_INDEX}, TO_INDEX: {TO_INDEX}, "
    f"VOCAB_FROM_INDEX: {VOCAB_FROM_INDEX}, VOCAB_TO_INDEX: {VOCAB_TO_INDEX}, "
    f"len verbs: {len(some_verbs)}, len vocab: {len(some_vocab)}"
)
print(current_dict)

FROM_INDEX: 0, TO_INDEX: 10, VOCAB_FROM_INDEX: 0, VOCAB_TO_INDEX: 50, len verbs: 10, len vocab: 50
{'verbs': ['miss', 'grow', 'shut', 'suffer', 'happen', 'drop', 'start', 'get', 'draw', 'prepare'], 'vocab': ['class', 'football', 'bottle', 'interest', 'someone', 'start', 'wrong', 'share', 'just', 'parent', 'quarter', 'minister', 'town', 'occasion', 'somewhere', 'community', 'road', 'normal', 'presumably', 'until', 'rid', 'could', 'north', 'brother', 'proper', 'afternoon', 'nearly', 'weekend', 'almost', 'fairly', 'birthday', 'slightly', 'onto', 'under', 'up', 'example', 'excellent', 'poor', 'check', 'perhaps', 'board', 'most', 'opinion', 'might', 'no one', 'success', 'leg', 'past', 'any', 'environment']}


In [7]:
# Run phrase generation for the selected batch of verbs and vocab.
MAX_ITERATIONS = 20
generated_phrases = generate_phrases_from_vocab_dict(current_dict, max_iterations=MAX_ITERATIONS)

2026-02-14 20:52:38 - audio-language-trainer - INFO - generation.py:65 - Starting verb phrase generation. 10 verbs to process.
2026-02-14 20:52:38 - audio-language-trainer - INFO - generation.py:111 -   [1/10] Generating phrases for verb: 'miss'
2026-02-14 20:52:43 - audio-language-trainer - INFO - generation.py:111 -   [2/10] Generating phrases for verb: 'grow'
2026-02-14 20:52:48 - audio-language-trainer - INFO - generation.py:111 -   [3/10] Generating phrases for verb: 'shut'
2026-02-14 20:52:51 - audio-language-trainer - INFO - generation.py:111 -   [4/10] Generating phrases for verb: 'suffer'
2026-02-14 20:52:55 - audio-language-trainer - INFO - generation.py:111 -   [5/10] Generating phrases for verb: 'happen'
2026-02-14 20:52:58 - audio-language-trainer - INFO - generation.py:111 -   [6/10] Generating phrases for verb: 'drop'
2026-02-14 20:53:03 - audio-language-trainer - INFO - generation.py:111 -   [7/10] Generating phrases for verb: 'start'
2026-02-14 20:53:07 - audio-languag

In [8]:
# Persist this deck's phrases as a text file named after the collection and deck.
# NOTE(review): only element 0 of the generator's result is saved - presumably
# the phrase list; confirm against generate_phrases_from_vocab_dict.
deck_phrases_path = f"../data/{COLLECTION}-{DECK}.txt"
save_text_file(generated_phrases[0], deck_phrases_path)

In [9]:
# Reload the phrases from the deck file; from here on `generated_phrases` is
# whatever load_text_file returns for that file.
# To work from a hand-written list instead, point the path at another file,
# e.g. ../data/phrases/survival.txt.
deck_phrases_path = f"../data/{COLLECTION}-{DECK}.txt"
generated_phrases = load_text_file(deck_phrases_path)

In [10]:
# Sanity check: number of phrases loaded for this deck.
len(generated_phrases)

73

In [None]:
# Optional alternative to the creation loop in the next cell: look the phrases
# up by collection and deck instead of building them from the text file.
# Note that the next cell rebinds ALL_PHRASES, so run one cell or the other.
ALL_PHRASES = get_phrases_by_collection(COLLECTION, DECK)

In [11]:
# Build a Phrase for every generated line, tag it with the target
# collection/deck and upload it. Phrases that Phrase.create rejects are
# skipped, but recorded so they can be fixed and retried.
ALL_PHRASES = []
skipped_phrases = []  # (phrase text, error message) pairs
for phrase in generated_phrases:
    try:
        p = Phrase.create(phrase)
    except ValueError as e:
        # Say which phrase failed; the bare error alone does not identify it.
        print(f"Skipping {phrase!r}: {e}")
        skipped_phrases.append((phrase, str(e)))
        continue
    p.collection = COLLECTION
    p.deck = DECK
    # Optional enrichment steps, enabled per run as needed:
    # p.generate_image()
    # p.translate("sv-SE", refine=True)
    p.upload()
    # p.generate_audio(context="flashcard", language="sv-SE", split_on_space=True)
    # p.upload()
    ALL_PHRASES.append(p)

print(f"created: {len(ALL_PHRASES)}, skipped: {len(skipped_phrases)}")

2026-02-14 21:42:27 - audio-language-trainer - INFO - phrase_model.py:371 - Uploading phrase i_miss_my_family_d1823a with all translations to Firestore and GCS
2026-02-14 21:42:27 - audio-language-trainer - INFO - phrase_model.py:1017 - Uploading all multimedia for en-GB translation
2026-02-14 21:42:29 - audio-language-trainer - INFO - phrase_model.py:371 - Uploading phrase did_you_miss_the_bus_843057 with all translations to Firestore and GCS
2026-02-14 21:42:29 - audio-language-trainer - INFO - phrase_model.py:1017 - Uploading all multimedia for en-GB translation
2026-02-14 21:42:31 - audio-language-trainer - INFO - phrase_model.py:371 - Uploading phrase she_will_miss_the_meeting_f78838 with all translations to Firestore and GCS
2026-02-14 21:42:31 - audio-language-trainer - INFO - phrase_model.py:1017 - Uploading all multimedia for en-GB translation
2026-02-14 21:42:32 - audio-language-trainer - INFO - phrase_model.py:371 - Uploading phrase he_missed_the_target_997b44 with all trans

## Remove words from original list

In [12]:
# Collect what this deck's phrases used: the vocab returned by
# get_vocab_from_phrases plus every entry of each phrase's `tokens`.
_vocab = get_vocab_from_phrases(ALL_PHRASES)
_tokens = []
for _phrase in ALL_PHRASES:
    _tokens.extend(_phrase.tokens)
print(f"num vocab: {len(_vocab)}, {_vocab}")

num vocab: 147, ["'", 'a', 'about', 'afternoon', 'allergy', 'an', 'anniversary', 'approach', 'available', 'away', 'bench', 'birthday', 'board', 'book', 'bottle', 'breakfast', 'brother', 'bus', 'cake', 'car', 'check', 'class', 'clean', 'community', 'company', 'conclusion', 'consequence', 'construction', 'curtain', 'daily', 'day', 'desk', 'different', 'dinner', 'door', 'early', 'edge', 'environment', 'every', 'exam', 'example', 'excellent', 'fair', 'fairly', 'family', 'football', 'for', 'fresh', 'from', 'fruit', 'glass', 'good', 'he', 'her', 'home', 'i', 'in', 'injury', 'interest', 'it', 'just', 'late', 'later', 'leg', 'loss', 'me', 'meeting', 'memory', 'message', 'milk', 'minister', 'most', 'my', "n't", 'nearly', 'next', 'no', 'normal', 'north', 'not', 'now', 'occasion', 'of', 'off', 'on', 'one', 'opinion', 'opportunity', 'parent', 'party', 'past', 'perfect', 'perhaps', 'phone', 'picture', 'plant', 'please', 'poor', 'presumably', 'proper', 'quarter', 'quickly', 'report', 'road', 'routin

In [13]:
# Remove the words this deck consumed from the master lists.
# The whole verb batch is dropped rather than extracted from the phrases,
# because every verb in the batch is known to have been used.
used_verbs = set(some_verbs)
used_vocab = set(_vocab) | set(_tokens)

# Keep the original list order (and drop duplicates) instead of building sets:
# set iteration order is arbitrary, and these lists are written back to disk
# and sliced from the front for the next deck.
remaining_verbs = [verb for verb in dict.fromkeys(all_verbs) if verb not in used_verbs]
remaining_vocab = [word for word in dict.fromkeys(all_vocab) if word not in used_vocab]

print(f"num verbs: {len(remaining_verbs)}, num vocab: {len(remaining_vocab)}")

num verbs: 140, num vocab: 302


In [14]:
# Write the leftover words back to the collection's verb and vocab files.
# These are the same paths the lists were loaded from, so the files on disk
# shrink with every deck that is produced.
for leftover_words, list_path in (
    (remaining_verbs, f"../data/{COLLECTION}_verbs.txt"),
    (remaining_vocab, f"../data/{COLLECTION}_vocab.txt"),
):
    save_text_file(leftover_words, list_path)

# Create bespoke phrases

In [None]:
# Hand-written phrases to add to this deck.
new_phrases = [
    "Here you go",
]

In [None]:
# Create each hand-written phrase, tag it with the target collection/deck,
# then generate its image, translation and audio.
BESPOKE_LANGUAGE = "de-DE"
for phrase_text in new_phrases:
    bespoke_phrase = Phrase.create(phrase_text)
    bespoke_phrase.collection = COLLECTION
    bespoke_phrase.deck = DECK
    bespoke_phrase.generate_image()
    bespoke_phrase.translate(BESPOKE_LANGUAGE, refine=True)
    # Uploaded twice: once after image + translation, once more after audio.
    bespoke_phrase.upload()
    bespoke_phrase.generate_audio(context="flashcard", language=BESPOKE_LANGUAGE)
    bespoke_phrase.upload()