In [None]:
%load_ext autoreload
%autoreload 2

from setup_imports import *  # noqa: F401,F403

from src.phrases.generation import generate_phrases_from_vocab_dict
from src.utils import (
    load_json,
    save_text_file,
    get_longman_verb_vocab_dict,
    load_text_file,
)
from src.phrases.phrase_model import Phrase
from src.phrases.search import get_verbs_from_phrases, get_vocab_from_phrases

# Phrase Generation

We generate short, simple verb phrases and associated vocab-only phrases from a `vocab_dict`. Verbs are deliberately left out of the vocab phrases to make the cards easier and to split verb practice and vocab practice into separate tasks.

In [None]:
# Load the Cambridge GCSE vocab list JSON from the sibling data directory.
# Forward slashes are used because "\d" and "\g" are invalid string escapes
# (a SyntaxWarning on recent Python) and backslash separators only resolve on
# Windows; "/" works on every platform, Windows included.
vocab_dict = load_json("../data/gcse_vocab_list_cambridge.json")

In [None]:
# Build the verb/vocab dictionary from the Longman Communication 3000 JSON,
# restricted to category "S2".
# The path uses forward slashes: "\d" and "\l" are invalid string escapes and
# backslash separators are Windows-only.
# NOTE(review): the commented-out cell below reads first2000['verbs'] and
# first2000['vocab'], so the result presumably has those two keys - confirm.
first2000 = get_longman_verb_vocab_dict(
    "../data/longman-communication-3000.json", category="S2"
)

In [None]:
# let's create then shuffle-up verbs and vocab

# from random import shuffle


# all_verbs = (first2000['verbs'])
# all_vocab = (first2000['vocab'])
# shuffle(all_verbs)
# shuffle(all_vocab)

In [None]:
# save_text_file(all_verbs, "../data/LM2000_verbs.txt")
# save_text_file(all_vocab, "../data/LM2000_vocab.txt")



In [None]:
# Read back the word lists that are still waiting to be turned into phrases.
# These files are overwritten at the end of the notebook once a batch of words
# has been used.
all_verbs, all_vocab = (
    load_text_file("../data/LM2000_verbs.txt"),
    load_text_file("../data/LM2000_vocab.txt"),
)
print(f"num verbs: {len(all_verbs)}, num vocab: {len(all_vocab)}")

In [None]:
# --- Batch window -----------------------------------------------------------
# Each batch takes 10 verbs and 50 vocab words; the vocab window is derived
# from the verb window so the two stay aligned (5 vocab words per verb).
FROM_INDEX = 0  # <--- we start by 0 now as we remove used up words
TO_INDEX = FROM_INDEX + 10
VOCAB_FROM_INDEX = int(FROM_INDEX / 10 * 50)
VOCAB_TO_INDEX = VOCAB_FROM_INDEX + 50

# --- Destination for the generated cards ------------------------------------
COLLECTION, DECK = "LM2000", "Pack04"

# --- Slice out this batch's words -------------------------------------------
some_verbs = all_verbs[FROM_INDEX:TO_INDEX]
some_vocab = all_vocab[VOCAB_FROM_INDEX:VOCAB_TO_INDEX]

# Dictionary handed to the phrase generator; the lists are copies so the
# source word lists are not shared with it.
current_dict = {
    "verbs": list(some_verbs),
    "vocab": list(some_vocab),
}

# Echo the window and the selected words for a visual sanity check.
print(
    f"FROM_INDEX: {FROM_INDEX}, TO_INDEX: {TO_INDEX}, VOCAB_FROM_INDEX: {VOCAB_FROM_INDEX}, VOCAB_TO_INDEX: {VOCAB_TO_INDEX}, len verbs: {len(some_verbs)}, len vocab: {len(some_vocab)}"
)
print(current_dict)

In [None]:
# Ask the generator for phrases covering this batch's verbs and vocab, capped
# at 20 iterations.
# NOTE(review): later cells index the result with [0] to get the phrase list,
# so this presumably returns a tuple/list - confirm against the generator.
generated_phrases = generate_phrases_from_vocab_dict(current_dict, max_iterations=20)

In [None]:
# Display the first element of the generator result; this is the object that
# the next cell writes to the batch's text file.
generated_phrases[0]

In [None]:
# Persist this batch's phrases so the (slow) generation step need not be
# repeated. Forward slashes replace the original backslashes: "\d" and "\L"
# are invalid string escapes and backslash separators only resolve on Windows.
save_text_file(generated_phrases[0], f"../data/LM2000-{FROM_INDEX}-{TO_INDEX}.txt")

In [None]:
# Reload the saved batch. Note that this rebinds `generated_phrases` from the
# raw generator result to the list of phrases read from the file.
# Forward slashes replace the original backslashes: "\d" and "\L" are invalid
# string escapes and backslash separators only resolve on Windows.
generated_phrases = load_text_file(f"../data/LM2000-{FROM_INDEX}-{TO_INDEX}.txt")

In [None]:
# Collection and deck assigned to every phrase created below. Repeated here so
# the upload cells can be run after a reload without re-running the batch
# configuration cell.
COLLECTION, DECK = "LM2000", "Pack04"

In [None]:
# Turn every generated phrase string into a Phrase, enrich it, and upload it.
# Strings that Phrase.create rejects with ValueError are reported and skipped;
# any other failure stops the loop.
ALL_PHRASES = []
for phrase_text in generated_phrases:
    try:
        card = Phrase.create(phrase_text)
    except ValueError as err:
        # Report the rejected phrase and move on to the next one.
        print(err)
    else:
        card.collection = COLLECTION
        card.deck = DECK
        card.generate_image()
        card.translate("sv-SE", refine=True)
        # First upload happens after the image and translation steps ...
        card.upload()
        card.generate_audio(context="flashcard", language="sv-SE")
        # ... and a second one after the audio step.
        card.upload()
        ALL_PHRASES.append(card)

## Remove words from original list

In [None]:
# COLLECTION = "LM2000"
# DECK = "Pack04"

# _phrases = get_phrases_by_collection(COLLECTION, DECK)
# Collect the verbs and vocab words used by the phrases processed above.
_verbs = get_verbs_from_phrases(ALL_PHRASES)
_vocab = get_vocab_from_phrases(ALL_PHRASES)
# Report counts, as the "num ..." labels promise; the original interpolated the
# whole collections instead of their sizes.
print(f"num verbs: {len(_verbs)}, num vocab: {len(_vocab)}")

In [None]:
# Drop the words consumed by this batch from the pending word lists.
# The original used set.difference, which discards the list order; because the
# result is written back to disk, the file order then became arbitrary and
# could differ from run to run. Filtering the original lists keeps the existing
# order, and dict.fromkeys removes duplicates just as the set did.
_used_verbs = set(_verbs)
_used_vocab = set(_vocab)
remaining_verbs = list(
    dict.fromkeys(word for word in all_verbs if word not in _used_verbs)
)
remaining_vocab = list(
    dict.fromkeys(word for word in all_vocab if word not in _used_vocab)
)

print(f"num verbs: {len(remaining_verbs)}, num vocab: {len(remaining_vocab)}")

In [None]:
# Overwrite the pending word lists with what is left after this batch, so the
# next run starts again from index 0.
for _remaining_words, _destination in (
    (remaining_verbs, "../data/LM2000_verbs.txt"),
    (remaining_vocab, "../data/LM2000_vocab.txt"),
):
    save_text_file(_remaining_words, _destination)