In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
from pathlib import Path
# Put the notebook's parent directory on sys.path (once) so that the
# `src` package imported below can be resolved.
module_path = os.path.abspath("..")
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from src.utils import load_json, save_text_file  # noqa: E402

# Parent of the current working directory; the data folders hang off it.
src_dir = Path.cwd().parent
# Directory that the phrase text files are written to.
phrase_dir = src_dir.joinpath("data", "phrases")
# Gate for cells that make paid API calls: True runs them, set to False to skip them.
PAY_FOR_API = True

setting voice override: it-IT-Wavenet-E
setting voice override: it-IT-Wavenet-F


# Flash Card Generation 01

## Generate English phrases

The core way we store vocabulary for generating phrases, and then flashcards, is in a dictionary with two keys: 'verbs' (for verbs in the infinitive form, like 'be', 'run') and 'vocab' (for everything else).

The intent is that a vocab list is a core learning requirement (e.g. for an exam), and that it is easier to remember words in the context of common phrases, i.e. learning the phrase 'I want' and separately learning the noun 'cake' is less efficient than learning the phrase 'I want some cake, please'.

It is even better if we link that phrase to an image and associated audio. This is the dual-coding theory of language learning, and it leads to retention and recall benefits.

The first step is generating your English phrases from your vocab list.

### Longman corpus

A common 'starter' corpus containing the core words you should learn, grouped into the 1st 1000 words, the 2nd 1000 words, etc.

You can replace vocab_dict with any custom made python dictionary with 'verbs' and 'vocab' keys

In [2]:
from src.utils import get_longman_verb_vocab_dict
from src.phrase import generate_phrases_from_vocab_dict, generate_scenario_phrases, generate_scenario_vocab_building_phrases

# Location of the Longman Communication 3000 word list, relative to this notebook.
file_path = '../data/longman-communication-3000.json' # JSON in a specific format
# The second argument picks the band: S1 = 1st 1000 words used in speech,
# W2 = 2nd 1000 words used in writing, etc. (so "S2" is presumably the 2nd 1000 spoken words — confirm).
vocab_dict = get_longman_verb_vocab_dict(file_path, "S2")

In [3]:
# Sanity check: show the first ten entries under the 'verbs' and 'vocab' keys.
print(f" first 10 verbs: {vocab_dict['verbs'][:10]}, \nand first 10 other words: {vocab_dict['vocab'][:10]}")

 first 10 verbs: ['achieve', 'act', 'address', 'admit', 'advise', 'affect', 'aim', 'announce', 'apologize', 'appear'], 
and first 10 other words: ['ability', 'abuse', 'access', 'accident', 'accommodation', 'activity', 'address', 'administration', 'adult', 'advance']


### Creating conversational phrases from a vocabulary dictionary

This function will iterate through the vocabulary dictionary (by sampling) until it is exhausted.
We run a check against generated phrases so we can 'tick off' words already used.

Phrases are generated using an LLM

In [11]:
# Alternative vocabulary source: the GCSE vocab list.
# NOTE: this rebinds `vocab_dict`, replacing the Longman dictionary loaded earlier.
# Forward slashes keep the path portable and avoid the invalid "\d" / "\g"
# escape sequences that a backslash-separated literal would contain.
vocab_dict = load_json("../data/gcse_vocab_list_cambridge.json")

In [None]:
# Paid step: only runs when PAY_FOR_API is True. When it is skipped,
# `generated_phrases` is not assigned by this cell.
if PAY_FOR_API:
    # Overrides for the prompt. To fall back to the defaults (6 - 9 word
    # phrases and no more than 2 verbs), drop these two assignments together
    # with the matching keyword arguments in the call below.
    length_phrase = "4-5 words long, for beginner GCSE level, but treat common lexical chunks (I'm going to.., Do you.., Let us.. etc) as a single 'word'"
    verbs_per_phrase = "one verb (but OK for an additional auxillary verb if necessary)"
    # Whether to tweak the prompt to set phrases within the target country.
    localise = False

    # It takes about 15 iterations to go through 200 verbs and 800 vocab
    # (1000 words total). You will end up with about 1000 phrases, so you get
    # practice of the same verb etc. in different contexts.
    generated_phrases = generate_phrases_from_vocab_dict(
        vocab_dict,
        max_iterations=1,
        length_phrase=length_phrase,
        verbs_per_phrase=verbs_per_phrase,
        localise=localise,
    )

Config file has been modified. Reloading...
Function that called this one: generate_phrases_with_llm. Sleeping for 20 seconds
Config file has been modified. Reloading...
Iteration 1/1
Generated 99 phrases
We have 128 verbs and 551 vocab words left
Reached maximum number of iterations (1). Stopping phrase generation.


In [None]:
# Optimise the phrase ordering.
# NOTE: this reads `generated_phrases`, which is only assigned inside the
# PAY_FOR_API-gated generation cell, so that cell must have run first.
from src.nlp import optimise_phrase_list, plot_vocabulary_growth

# window_size is forwarded to optimise_phrase_list; its meaning is not visible from this notebook — TODO confirm.
ordered_phrase_list = optimise_phrase_list(generated_phrases, window_size=21)

In [None]:
# Plot vocabulary growth for the optimised phrase ordering.
plot_vocabulary_growth(ordered_phrase_list)

In [None]:
# Write the ordered phrases to the phrases directory.
# The keyword is `lines`, matching the other save_text_file calls in this
# notebook (the previous `line=` spelling did not match them).
save_text_file(lines=ordered_phrase_list, file_path=phrase_dir / "gcse_phrases.txt")

## Generate phrases and vocab for a scenario
Use an LLM to come up with typical phrases for a scenario

In [7]:
# Free-text description of the scenario; reused by both scenario generation calls below.
scenario = "meeting new swedish people - language learning community - talking about sweden - hiking, wild swimming, nature"


In [None]:

# Ask for roughly 20 - 25 phrases for the scenario (the count is passed as a string).
# NOTE(review): this section says an LLM is used, yet the call is not gated by PAY_FOR_API — confirm whether it should be.
speaking_phrases = generate_scenario_phrases(scenario, num_phrases="20 - 25")


In [5]:

# Write the scenario speaking phrases to the phrases directory.
save_text_file(
    lines=speaking_phrases,
    file_path=phrase_dir.joinpath("swedish_language_learning.txt"),
)

Bulk out this scenario with some vocab

In [8]:
# Generate vocab-building phrases for the same scenario, then save them.
vocab_phrases = generate_scenario_vocab_building_phrases(scenario=scenario)
# NOTE(review): "lanuage" in the filename looks like a typo of "language";
# it is kept as-is so the output path does not change.
save_text_file(
    lines=vocab_phrases,
    file_path=phrase_dir.joinpath("swedish_lanuage_learning_vocab2.txt"),
)

Function that called this one: generate_scenario_vocab_building_phrases. Sleeping for 20 seconds
