In [1]:
from google.auth import default
# Resolve Google Application Default Credentials. `default()` returns a pair that is
# unpacked here into the credentials object and the detected project.
credentials, project = default()

In [34]:
%load_ext autoreload
%autoreload 2
import os
import sys
from pathlib import Path
# Put the parent of the current working directory on the import path so the
# `src.*` imports in this cell resolve when the notebook runs from its own folder.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from src.utils import load_json, save_text_file, load_text_file
from src.nlp import get_vocab_dictionary_from_phrases
from src.utils import get_longman_verb_vocab_dict, save_text_file, load_text_file
from src.phrase import generate_phrases_from_vocab_dict, generate_scenario_phrases, generate_scenario_vocab_building_phrases
from src.config_loader import config
from src.gcs_storage import get_phrase_index_path
# Last expression of the cell: displays the configured target language as a quick check.
config.TARGET_LANGUAGE_NAME

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'French'

# Flash Card Generation 01

## Generate English phrases

The core way we store vocabulary for generating phrases, and then flashcards, is in a dictionary with two keys: 'verbs' (for verbs in the infinitive form, like 'be', 'run') and 'vocab' (for everything else).

The intent is that a vocab list is a core learning requirement (e.g. for an exam), and that it is easier to remember words in the context of common phrases, i.e. learning the phrase 'I want' and separately learning the noun 'cake' is less efficient than learning the phrase 'I want some cake, please'.

It is even better if we link that phrase to an image and associated audio. This is the dual-coding theory of language learning, and it leads to retention and recall benefits.

The first step is generating your English phrases from your vocab list.

### Longman corpus

A common 'starter' corpus containing the core words you should learn, grouped by frequency band (the 1st 1000 words, the 2nd 1000 words, etc.).

You can replace vocab_dict with any custom made python dictionary with 'verbs' and 'vocab' keys

In [24]:
from src.gcs_storage import get_phrase_path, read_from_gcs, get_story_collection_path, upload_to_gcs

# Load the stored story collection for the "LM1000" collection from the private bucket.
# The next cell iterates it as a mapping of story name -> list of items, each of which
# has a 'phrase' key.
story_collection = read_from_gcs(config.GCS_PRIVATE_BUCKET, get_story_collection_path(collection="LM1000"))

In [8]:
# Flatten every story in the collection into a single list of phrases.
# All phrases are gathered here; taking a subset happens in a later cell.
all_phrases = [
    item['phrase']
    for story_name in story_collection
    for item in story_collection[story_name]
]


In [18]:
# Build the vocabulary dictionary from every phrase in the collection
# (the next cell reads its 'verbs' key).
all_vocab_dict = get_vocab_dictionary_from_phrases(all_phrases)

In [19]:
# Number of entries under the 'verbs' key for the full collection.
len(all_vocab_dict['verbs'])

279

In [None]:
# Build the warm-up vocab dict from the first 150 phrases.
# NOTE(review): the variable name says "first200" but the slice is [:150] (which matches
# the "WarmUp150" collection name used further down); the name is kept because later
# cells reference it.
first200_vocab_dict = get_vocab_dictionary_from_phrases(all_phrases[:150])

In [23]:
# Number of entries under the 'vocab' key of the warm-up dictionary.
len(first200_vocab_dict['vocab'])

332

In [2]:


# Alternative vocabulary source: the Longman Communication 3000 word list.
# NOTE(review): `vocab_dict` is not what the phrase-generation cell further down
# consumes (it passes `first200_vocab_dict`); swap it in there to use this list.
file_path = '../data/longman-communication-3000.json' # a specific format
# Band codes per the original note: S1 = 1st 1000 words used in speech, W2 = 2nd 1000
# words used in writing, etc. "S2" is presumably the 2nd 1000 spoken words - confirm.
vocab_dict = get_longman_verb_vocab_dict(file_path, "S2")

### Creating conversational phrases from a vocabulary dictionary

This function iterates through the vocabulary dictionary (by sampling from it) until it is exhausted.
We run a check against generated phrases so we can 'tick off' words already used.

Phrases are generated using an LLM

In [11]:
# or for GCSE vocab:
# Forward slashes keep the path portable across operating systems and avoid the
# invalid "\d" / "\g" escape sequences that backslashes produce in a normal
# (non-raw) string literal.
vocab_dict = load_json("../data/gcse_vocab_list_cambridge.json")

In [29]:

# Phrase-style overrides. Per the original note, the defaults are 6 - 9 word phrases
# with no more than 2 verbs; to fall back to them, drop these two variables *and* the
# matching keyword arguments from the call below.
length_phrase = "4-5 words long, for beginner GCSE level, but treat common lexical chunks (I'm going to.., Do you.., Let us.. etc) as a single 'word'"
verbs_per_phrase = "one verb (but OK for an additional auxillary verb if necessary)"
# Whether to tweak the prompt so that phrases are set within the target country.
localise = False

# Per the original note: it takes about 15 iterations to go through 200 verbs and
# 800 vocab words (1000 words total), giving about 1000 phrases, so the same verb
# gets practised in different contexts.
generated_phrases = generate_phrases_from_vocab_dict(
    first200_vocab_dict,
    max_iterations=12,
    length_phrase=length_phrase,
    verbs_per_phrase=verbs_per_phrase,
    localise=localise,
)

Iteration 1/12 - Generating 100 phrases
Function that called this one: generate_phrases_with_llm. Sleeping for 20 seconds
Generated 99 phrases
We have 78 verbs and 214 vocab words left
Iteration 2/12 - Generating 100 phrases
Function that called this one: generate_phrases_with_llm. Sleeping for 20 seconds
Generated 101 phrases
We have 6 verbs and 129 vocab words left
Iteration 3/12 - Using minimal phrase generation
Function that called this one: generate_minimal_phrases_with_llm. Sleeping for 20 seconds
Generated 50 phrases - with minimal phrase prompt
We have 1 verbs and 5 vocab words left
Iteration 4/12 - Using minimal phrase generation
Function that called this one: generate_minimal_phrases_with_llm. Sleeping for 20 seconds


Waiting for API cooldown: 100%|[34m██████████████[0m| 2/2 [00:02<00:00,  1.01s/it][0m


Generated 3 phrases - with minimal phrase prompt
We have 0 verbs and 0 vocab words left
All words have been used. Phrase generation complete. Generated 253 phrases.


In [30]:
# Inspect the generated phrases. This displays the whole list; slice it
# (e.g. generated_phrases[:20]) for a shorter preview.
generated_phrases

["Let's eat lunch tomorrow",
 'Can you play the guitar?',
 'I must finish my homework',
 "Don't wake me up early",
 'Do you like my hat?',
 'They arrived at the airport',
 "I can't find my coat",
 'Shall we go shopping today?',
 'The cake tastes delicious',
 'Could you help me, please?',
 "I'm going to the bank",
 'Did you see that film?',
 'We need to leave now',
 "Don't worry about the test",
 'Can I borrow your car?',
 "Let's have a party tonight",
 'I love your new dress',
 'Do you understand the question?',
 "They're getting married next month",
 "I'm trying to lose weight",
 "Don't forget your umbrella today",
 'Can you call me later?',
 'I hope you feel better',
 "Let's meet at the church",
 'Do you believe in ghosts?',
 "I can't stand the traffic",
 'We should grow more flowers',
 "Don't break my heart, please",
 'Can you explain this concept?',
 'I might go abroad soon',
 "Let's watch a movie tonight",
 'Do you know the way?',
 "I can't open this door",
 'We should try that re

In [38]:
# Name of the collection these phrases are stored under in the bucket.
COLLECTION = "WarmUp150"

# Persist the generated phrases to the private bucket. The call is the last
# expression of the cell, so whatever it returns is displayed.
upload_to_gcs(
    obj=generated_phrases,
    bucket_name=config.GCS_PRIVATE_BUCKET,
    file_name=get_phrase_path(collection=COLLECTION),
)

'gs://audio-language-trainer-private-content/collections/WarmUp150/phrases.json'

In [35]:
# Re-load the phrases for COLLECTION from the private bucket. This rebinds
# `generated_phrases`, so the cells below work from the stored copy rather than
# the in-memory result of the generation cell.
generated_phrases = read_from_gcs(config.GCS_PRIVATE_BUCKET,
                                  file_path=get_phrase_path(collection=COLLECTION))

In [40]:
# Count the distinct phrases. In the recorded run the generation log reported 253
# phrases while this shows 249, i.e. a few phrases were generated more than once.
len(set(generated_phrases))

249

In [41]:
from src.nlp import create_flashcard_index

# Build the flashcard index for the phrases. In the recorded run the progress bar
# shows 249 items taking about six and a half minutes, so avoid re-running this
# cell needlessly.
WarmUp150_Index = create_flashcard_index(generated_phrases)

Indexing phrases...: 100%|██████████| 249/249 [06:36<00:00,  1.59s/it]


In [42]:
# Store the index next to the phrases for the same COLLECTION in the private bucket.
upload_to_gcs(obj=WarmUp150_Index, bucket_name=config.GCS_PRIVATE_BUCKET, file_name = get_phrase_index_path(collection=COLLECTION))

'gs://audio-language-trainer-private-content/collections/WarmUp150/index.json'

## Remove redundant phrases

In [5]:
from src.phrase import remove_phrases_with_no_new_words
# NOTE(review): `phrase_dir` is not defined in any visible cell of this notebook. It
# must be set before running this section - presumably to a pathlib.Path, since it
# is combined with file names using the `/` operator.
# Load the already-known phrase list and the newer candidate list.
old_list = load_text_file(phrase_dir / "longman_1000_phrases.txt")
new_list = load_text_file(phrase_dir / "longman_2000_phrases.txt")
# Filter the newer list against the known one; going by the helper's name, phrases
# that introduce no new words are dropped.
new_list_2 = remove_phrases_with_no_new_words(known_phrases=old_list, new_phrases=new_list)

In [7]:
# Write the filtered list back to "longman_2000_phrases.txt" - the same file name that
# `new_list` was loaded from, so the unfiltered version on disk is replaced.
save_text_file(new_list_2, phrase_dir / "longman_2000_phrases.txt")

In [None]:
# Save the ordered GCSE phrase list. The keyword is `lines=`, matching the other
# keyword-style save_text_file calls in this notebook (the original used `line=`).
# NOTE(review): `ordered_phrase_list` and `phrase_dir` are not defined in any visible
# cell of this notebook; define them before running this cell.
save_text_file(lines=ordered_phrase_list, file_path=phrase_dir / "gcse_phrases.txt")

## Generate phrases and vocab for a scenario
Use an LLM to come up with typical phrases for a scenario

In [7]:
# Free-text description of the scenario; it is passed to both scenario phrase
# generators in the cells below.
scenario = "meeting new swedish people - language learning community - talking about sweden - hiking, wild swimming, nature"


In [None]:

# Generate phrases for the scenario; the requested count is passed as the text
# range "20 - 25" rather than as a number.
speaking_phrases = generate_scenario_phrases(scenario, num_phrases="20 - 25")


In [5]:

# Save the scenario phrases to a text file under `phrase_dir`.
# NOTE(review): `phrase_dir` is not defined in any visible cell of this notebook.
save_text_file(lines=speaking_phrases, file_path=phrase_dir / "swedish_language_learning.txt")

Bulk out this scenario with some additional vocab-building phrases.

In [None]:
# Generate additional vocab-building phrases for the same scenario and save them.
vocab_phrases = generate_scenario_vocab_building_phrases(scenario=scenario)
# NOTE(review): the file name spells "lanuage", unlike "swedish_language_learning.txt"
# used for the speaking phrases - confirm whether that is intended before relying on it.
# `phrase_dir` is also not defined in any visible cell of this notebook.
save_text_file(lines=vocab_phrases, file_path=phrase_dir / "swedish_lanuage_learning_vocab2.txt")