In [1]:
%load_ext autoreload
from google.auth import default
credentials, project = default()

In [2]:
%autoreload 2
from dotenv import load_dotenv
import sys
import os
load_dotenv()
# Add the parent directory of 'src' to the Python path
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

    
from src.convert import clean_filename
from src.gcs_storage import upload_to_gcs, get_phrase_path, get_story_names,read_from_gcs, get_phrase_to_story_index_path, get_story_collection_path, get_story_dialogue_path
from src.nlp import plot_vocabulary_growth
from collections import defaultdict
from src.gcs_storage import get_story_index_path
from src.nlp import create_story_index
from src.config_loader import config
COLLECTION = "LM2000"


## Sorting stories

We need to now get stories from the story collection path and refer to the phrase.json and index.json files in order to help sort them

In [3]:
all_story_names = get_story_names(collection=COLLECTION)
len(all_story_names)

28

In [4]:
# let's get all story dialogue


all_story_dialogue = defaultdict(dict)
for story_name in all_story_names:
    story_dialogue = read_from_gcs(config.GCS_PRIVATE_BUCKET, get_story_dialogue_path(story_name, collection=COLLECTION))
    all_story_dialogue[story_name] = story_dialogue

print(all_story_dialogue.keys())


story_index = create_story_index(all_story_dialogue)


upload_to_gcs(story_index, config.GCS_PRIVATE_BUCKET, get_story_index_path(collection=COLLECTION))


dict_keys(['story_attic_mystery_box', 'story_backyard_garden_mystery', 'story_basketball_championship_dreams', 'story_beach_cleanup_challenge', 'story_christmas_tree_emergency', 'story_city_concert', 'story_cottage_vacation_mystery', 'story_factory_crisis_solution', 'story_hidden_treasure_discovery', 'story_hidden_treasure_mission', 'story_hospital_snow_storm', 'story_kitchen_disaster_drama', 'story_market_mystery_solved', 'story_midnight_library_mystery', 'story_midnight_mystery_escape', 'story_midnight_office_mystery', 'story_mount_everest_challenge', 'story_museum_crisis_decision', 'story_ocean_rescue_mission', 'story_swimming_pool_mystery', 'story_tropical_island_mystery', 'story_underground_city_mystery', 'story_underground_treasure_discovery', 'story_wedding_pizza_crisis', 'story_wedding_reception_chaos', 'story_wedding_reservation_crisis', 'story_weekend_museum_mystery', 'story_weekend_safari_adventure'])


Indexing stories...: 100%|██████████| 28/28 [20:00<00:00, 42.86s/it]


'gs://audio-language-trainer-private-content/collections/LM2000/story_index.json'

In [9]:
from src.gcs_storage import get_phrase_index_path, get_story_index_path

phrase_index = read_from_gcs(config.GCS_PRIVATE_BUCKET, get_phrase_index_path(collection=COLLECTION))
story_index = read_from_gcs(config.GCS_PRIVATE_BUCKET, get_story_index_path(collection=COLLECTION))



In [10]:
from src.nlp import assign_phrases_to_stories
assignments = assign_phrases_to_stories(
    phrase_index=phrase_index,
    story_index=story_index,
    max_phrases_per_story=60,
)


Optimized Story Sequence:
------------------------------------------------------------
Story                          Score      Verbs      Vocab     
------------------------------------------------------------
story_midnight_library_mystery 202        39         124       
story_beach_cleanup_challenge  214        45         124       
story_kitchen_disaster_drama   214        47         120       
story_midnight_mystery_escape  215        47         121       
story_swimming_pool_mystery    222        48         126       
story_christmas_tree_emergency 227        52         123       
story_midnight_office_mystery  229        49         131       
story_underground_city_mystery 229        56         117       
story_factory_crisis_solution  231        43         145       
story_weekend_museum_mystery   231        51         129       
story_wedding_reception_chaos  232        52         128       
story_attic_mystery_box        236        44         148       
story_city_concert 

In [None]:
assignments

{'story_mysterious_wedding_gift': [{'phrase': "I can't figure it out",
   'score': 24.0,
   'new_story_verbs': 2,
   'new_story_vocab': 4,
   'new_global_verbs': 2,
   'new_global_vocab': 4,
   'total_new_words': 6},
  {'phrase': 'Do you know the way?',
   'score': 20.0,
   'new_story_verbs': 2,
   'new_story_vocab': 2,
   'new_global_verbs': 2,
   'new_global_vocab': 3,
   'total_new_words': 5},
  {'phrase': "Let's have tea this afternoon",
   'score': 20.0,
   'new_story_verbs': 2,
   'new_story_vocab': 2,
   'new_global_verbs': 2,
   'new_global_vocab': 4,
   'total_new_words': 6},
  {'phrase': "I'm trying to lose weight",
   'score': 19.0,
   'new_story_verbs': 2,
   'new_story_vocab': 1,
   'new_global_verbs': 3,
   'new_global_vocab': 2,
   'total_new_words': 5},
  {'phrase': 'I might get a new job',
   'score': 18.0,
   'new_story_verbs': 2,
   'new_story_vocab': 1,
   'new_global_verbs': 2,
   'new_global_vocab': 3,
   'total_new_words': 5},
  {'phrase': 'Look at that beautiful

In [11]:
from src.nlp import plot_vocabulary_growth_from_assignments
plot_vocabulary_growth_from_assignments(assignments)

In [12]:
from src.gcs_storage import get_story_collection_path
upload_to_gcs(obj = assignments,
            bucket_name=config.GCS_PRIVATE_BUCKET,
            file_name = get_story_collection_path(collection=COLLECTION))

'gs://audio-language-trainer-private-content/collections/LM2000/LM2000.json'