In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from dotenv import load_dotenv
import sys
import os
import pickle
load_dotenv()
# Add the parent directory of 'src' to the Python path
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

    
from src.config_loader import config, VoiceManager
from src.phrase import generate_phrases_with_llm, generate_phrases_from_vocab_dict
from src.utils import load_json
from src.anki_tools import convert_anki_to_story_dict, AnkiCollectionReader, export_to_anki_with_images
from src.utils import load_text_file, save_json, load_json
from src.dialogue_generation import get_story_prompt, generate_story
from src.config_loader import config, VoiceManager, VoiceInfo, VoiceType, VoiceProvider
from pprint import pprint
import random
import os

from langcodes import Language

ANKI_PATH = "C:/Users/i5/AppData/Roaming/Anki2/User 1/collection.anki2"

FFmpeg path added to system PATH: C:\Program Files\ffmpeg-7.0-essentials_build\bin


In [8]:
from src.utils import load_text_file, save_json, load_json
from src.nlp import get_vocab_dictionary_from_phrases, get_vocab_dict_from_dialogue, create_flashcard_index
from src.config_loader import config
from pprint import pprint
import random

filepath = "../data/longman_1000_phrases.txt"
phrases = load_text_file(filepath)
pprint(f"First few phrases {phrases[:10]}")

("First few phrases ['Do you want to become a famous writer?', 'Let me show "
 "you around the city', 'We need to handle this situation carefully', 'Stop "
 'wasting time on this\', \'Do you like playing the guitar at night?\', "I\'m '
 'taking a vacation next month", "Don\'t forget to wear a helmet while '
 'cycling", "Let\'s cut unnecessary expenses this year", "We\'re producing a '
 'new product soon", \'Did you remember to turn off the stove?\']')


In [None]:
from src.utils import plot_vocabulary_growth

# Example usage

plot_vocabulary_growth(phrases, window=10)  # Using smaller window for this example

In [16]:
import spacy
import pandas as pd
import numpy as np
from typing import List, Set, Dict, Tuple
from tqdm import tqdm

def prepare_phrase_dataframe(phrases: List[str]) -> pd.DataFrame:
    """Create DataFrame with parsed phrases and extract content words.
    
    Args:
        phrases: List of phrases to analyze
        
    Returns:
        DataFrame with columns:
            - phrase: Original phrase
            - doc: Spacy Doc object
            - content_words: Set of lemmatized content words
    """
    nlp = spacy.load('en_core_web_sm')
    
    # Create base dataframe
    df = pd.DataFrame({'phrase': phrases})
    
    # Parse phrases
    df['doc'] = [nlp(phrase.lower()) for phrase in tqdm(phrases, desc='Parsing phrases')]
    
    # Extract content words
    df['content_words'] = df['doc'].apply(
        lambda doc: {token.lemma_ for token in doc 
                    if not token.is_stop and token.pos_ != 'PUNCT'}
    )
    
    # Calculate total words per phrase
    df['total_words'] = df['content_words'].apply(len)
    
    return df

def calculate_new_words(row: pd.Series, known_vocab: Set[str]) -> Dict:
    """Calculate new vocabulary metrics for a row."""
    new_words = row['content_words'] - known_vocab
    return {
        'new_words': len(new_words),
        'new_vocab': new_words,
        'new_ratio': len(new_words) / row['total_words'] if row['total_words'] > 0 else 0
    }

def optimize_sequence(df: pd.DataFrame, window_size: int = 5) -> pd.DataFrame:
    """Optimize the sequence of phrases for steady vocabulary acquisition.
    
    Args:
        df: DataFrame with parsed phrases (from prepare_phrase_dataframe)
        window_size: Size of rolling window for local optimization
        
    Returns:
        DataFrame with optimized sequence and metrics
    """
    # Calculate ideal rate
    total_vocab = set().union(*df['content_words'].values)
    ideal_rate = len(total_vocab) / len(df)
    
    # Initialize result storage
    result_indices = []
    known_vocab = set()
    available_indices = set(df.index)
    
    # Choose initial phrase closest to ideal rate
    initial_metrics = df.apply(
        lambda row: calculate_new_words(row, known_vocab), 
        axis=1
    ).apply(pd.Series)
    
    best_start_idx = (initial_metrics['new_ratio'] - ideal_rate).abs().idxmin()
    result_indices.append(best_start_idx)
    available_indices.remove(best_start_idx)
    known_vocab.update(df.loc[best_start_idx, 'content_words'])
    
    # Build rest of sequence
    with tqdm(total=len(df)-1, desc='Optimizing sequence') as pbar:
        while available_indices:
            # Calculate rolling mean of recent phrases
            if len(result_indices) >= window_size:
                recent_indices = result_indices[-window_size:]
                recent_new_words = np.mean([
                    len(df.loc[idx, 'content_words'] - known_vocab)
                    for idx in recent_indices
                ])
            else:
                recent_new_words = ideal_rate
            
            # Score remaining phrases
            candidates = []
            for idx in available_indices:
                metrics = calculate_new_words(df.loc[idx], known_vocab)
                score = abs(metrics['new_words'] - ideal_rate)
                # Add penalty for big deviations from recent average
                local_penalty = abs(metrics['new_words'] - recent_new_words)
                score += local_penalty * 0.5  # Weight local smoothness
                candidates.append((idx, metrics, score))
            
            # Select best candidate
            best_idx = min(candidates, key=lambda x: x[2])[0]
            result_indices.append(best_idx)
            available_indices.remove(best_idx)
            known_vocab.update(df.loc[best_idx, 'content_words'])
            pbar.update(1)
    
    # Create result DataFrame
    result_df = df.loc[result_indices].copy()
    result_df['sequence_position'] = range(len(result_df))
    
    # Calculate final metrics
    metrics = []
    known = set()
    for _, row in result_df.iterrows():
        m = calculate_new_words(row, known)
        metrics.append(m)
        known.update(row['content_words'])
    
    metrics_df = pd.DataFrame(metrics)
    result_df = pd.concat([result_df, metrics_df], axis=1)
    
    return result_df

def analyze_sequence(df: pd.DataFrame) -> Dict:
    """Analyze and print statistics about the optimized sequence."""
    stats = {
        'avg_new_words': df['new_words'].mean(),
        'std_new_words': df['new_words'].std(),
        'min_new_words': df['new_words'].min(),
        'max_new_words': df['new_words'].max(),
        'total_phrases': len(df),
        'cumulative_vocab': len(set().union(*df['content_words'].values))
    }
    
    print("\nSequence Analysis:")
    print(f"Total phrases: {stats['total_phrases']}")
    print(f"Total vocabulary: {stats['cumulative_vocab']}")
    print(f"Average new words per phrase: {stats['avg_new_words']:.2f}")
    print(f"Standard deviation: {stats['std_new_words']:.2f}")
    print(f"Min new words: {stats['min_new_words']}")
    print(f"Max new words: {stats['max_new_words']}")
    
    return stats

def optimize_vocab_sequence(phrases: List[str], window_size: int = 5) -> Tuple[pd.DataFrame, Dict]:
    """Main function to optimize phrase sequence for vocabulary acquisition.
    
    Args:
        phrases: List of phrases to optimize
        window_size: Size of rolling window for local optimization
        
    Returns:
        Tuple of (optimized DataFrame, statistics dictionary)
    """
    # Prepare data
    df = prepare_phrase_dataframe(phrases)
    
    # Optimize sequence
    optimized_df = optimize_sequence(df, window_size)
    
    # Analyze results
    stats = analyze_sequence(optimized_df)
    
    return optimized_df, stats

# Example usage:
def test_optimizer(phrases):
    print("Optimizing phrase sequence...")
    df, stats = optimize_vocab_sequence(phrases)
    
    print("\nOptimized Sequence:")
    for _, row in df.iterrows():
        print(f"{row['sequence_position'] + 1}. {row['phrase']} (New words: {row['new_words']})")
    
    return df, stats

In [18]:
# Get optimized sequence and statistics
optimized_df, stats = optimize_vocab_sequence(phrases)



Parsing phrases: 100%|██████████| 841/841 [00:04<00:00, 191.83it/s]
Optimizing sequence: 100%|██████████| 840/840 [00:17<00:00, 49.19it/s] 


Sequence Analysis:
Total phrases: 841
Total vocabulary: 1068
Average new words per phrase: 1.27
Standard deviation: 0.61
Min new words: 0
Max new words: 4





In [21]:
new_arrangement = optimized_df['phrase'].tolist()

In [22]:
plot_vocabulary_growth(new_arrangement)

In [23]:
new_arrangement

['Do you want to become a famous writer?',
 'Should we move to another location?',
 'Shall we move on to something else?',
 "I'll be at the event myself.",
 'Shall we think of something else?',
 "Don't you think being together is what matters most?",
 'Still, I think we should give it a try.',
 'Have you already thought about this a lot?',
 'I tried to catch up with the others',
 'Can you show me how it works?',
 "She'll get that phone call from work",
 'Shall we try that restaurant?',
 'Can you show me how to organize this?',
 'Did they really see a UFO?',
 "Shouldn't we mention this to the others?",
 'Just say what you mean',
 "We can't make any mistakes now",
 'We all make typing mistakes - even me!',
 "I'm so happy for you",
 'Make your own choice in this matter',
 "I feel happy when I'm with you",
 'I think we should leave now',
 'He might charge his phone before leaving',
 'I meant to call but forgot',
 'He might mention your name in the report',
 'I just want to feel happy and c

In [None]:
save_json(test_vocab_dict, "test_dict.json")

In [56]:
test_story = generate_story(test_vocab_dict)

Function that called this one: generate_story. Sleeping for 20 seconds


In [57]:
save_json(test_story, "test_story_50_phrases.json")

In [58]:
vocab_used = get_vocab_dict_from_dialogue(test_story)

In [67]:
from src.generate import add_translations


test_story = add_translations(test_story)

adding translations:   0%|          | 0/3 [00:00<?, ?it/s]

Beginning translation for introduction
Config file has been modified. Reloading...


adding translations:  33%|███▎      | 1/3 [00:01<00:03,  1.73s/it]

Translated dialogue
Beginning translation for development


adding translations:  67%|██████▋   | 2/3 [00:03<00:01,  1.65s/it]

Translated dialogue
Beginning translation for resolution


adding translations: 100%|██████████| 3/3 [00:04<00:00,  1.65s/it]

Translated dialogue





In [71]:
from src.generate import add_audio

test_story_audio = add_audio(test_story)

adding audio:   0%|          | 0/3 [00:00<?, ?it/s]

Beginning text-to-speech for introduction


Generating dialogue audio: 100%|██████████| 16/16 [02:25<00:00,  9.06s/it]
adding audio:  33%|███▎      | 1/3 [02:41<05:23, 161.63s/it]

Text-to-speech for dialogue done
Beginning text-to-speech for development


Generating dialogue audio: 100%|██████████| 19/19 [02:50<00:00,  8.96s/it]
adding audio:  67%|██████▋   | 2/3 [05:33<02:47, 167.57s/it]

Text-to-speech for dialogue done
Beginning text-to-speech for resolution


Generating dialogue audio: 100%|██████████| 18/18 [02:40<00:00,  8.93s/it]
adding audio: 100%|██████████| 3/3 [08:15<00:00, 165.18s/it]

Text-to-speech for dialogue done





In [75]:
from src.utils import create_html_story

output_dir = "../outputs/test/test.html"
create_html_story(
            test_story_audio,
            output_dir,
            component_path="../src/StoryViewer.js",
            title="test_short_story",
        )

HTML story created at: ../outputs/test/test.html


In [59]:
compare_vocab_overlap(test_vocab_dict, vocab_used)

=== VERB ANALYSIS ===
Original verbs: 74
Verbs used in story: 67
Verbs from original used: 32 (43.2%)
New verbs introduced: 35
Examples of new verbs: ['look', 'delay', 'come', 'hope', 'ride']

=== VOCABULARY ANALYSIS ===
Original vocabulary: 153
Vocabulary used in story: 180
Vocabulary from original used: 66 (43.1%)
New vocabulary introduced: 114
Examples of new vocabulary: ['proud', 'able', 'worth', 'and', 'empty']


{'verb_overlap': {'be',
  'bring',
  'build',
  'can',
  'do',
  'enjoy',
  'excuse',
  'find',
  'forget',
  'get',
  'go',
  'have',
  'hear',
  'let',
  'like',
  'love',
  'might',
  'organize',
  'pay',
  'plan',
  'play',
  'remember',
  'see',
  'should',
  'show',
  'spend',
  'support',
  'thank',
  'think',
  'want',
  'wear',
  'will'},
 'new_verbs': {"'ve",
  'accomplish',
  'ask',
  'believe',
  'brainstorm',
  'check',
  'come',
  'could',
  'create',
  'cycle',
  'delay',
  'discuss',
  'eat',
  'expect',
  'give',
  'happen',
  'help',
  'hope',
  'involve',
  'know',
  'lead',
  'look',
  'maintain',
  'meet',
  'plant',
  'ride',
  'say',
  'sound',
  'speak',
  'start',
  'suppose',
  'take',
  'turn',
  'worry',
  'would'},
 'unused_verbs': {'accept',
  'answer',
  'apply',
  'bet',
  'call',
  'carry',
  'choose',
  'collect',
  'cook',
  'fit',
  'join',
  'keep',
  'learn',
  'leave',
  'lift',
  'live',
  'lose',
  'move',
  'need',
  'offer',
  'open',
  'produ

In [None]:
# Create index once - to retrieve matching flashcards that already exist
from src.nlp import create_flashcard_index, get_matching_flashcards_indexed

flashcard_index = create_flashcard_index(phrases)


10it [00:13,  1.36s/it]


In [None]:
save_json(flashcard_index, "test_flashcard_index.json")


In [60]:

# Use indexed version for faster matching
results = get_matching_flashcards_indexed(vocab_used, flashcard_index)

verb matches: 100%|██████████| 67/67 [00:00<?, ?it/s]
vocab matches: 100%|██████████| 180/180 [00:00<?, ?it/s]
ranking cards: 100%|██████████| 809/809 [00:00<00:00, 134869.90it/s]
verb matches: 100%|██████████| 64/64 [00:00<00:00, 63897.99it/s]
vocab matches: 100%|██████████| 173/173 [00:00<?, ?it/s]
ranking cards: 100%|██████████| 788/788 [00:00<00:00, 196979.05it/s]
verb matches: 100%|██████████| 60/60 [00:00<?, ?it/s]
vocab matches: 100%|██████████| 168/168 [00:00<00:00, 168012.18it/s]
ranking cards: 100%|██████████| 771/771 [00:00<00:00, 154270.03it/s]
verb matches: 100%|██████████| 59/59 [00:00<?, ?it/s]
vocab matches: 100%|██████████| 162/162 [00:00<00:00, 162398.96it/s]
ranking cards: 100%|██████████| 762/762 [00:00<00:00, 189194.32it/s]
verb matches: 100%|██████████| 57/57 [00:00<?, ?it/s]
vocab matches: 100%|██████████| 157/157 [00:00<00:00, 156861.77it/s]
ranking cards: 100%|██████████| 740/740 [00:00<00:00, 185145.85it/s]
verb matches: 100%|██████████| 55/55 [00:00<?, ?it/s]

In [65]:
proposed_flashcard_phrases = [card.get('phrase') for card in results['selected_cards']]
vocab_from_flashcards = get_vocab_dictionary_from_phrases(proposed_flashcard_phrases)

In [66]:
compare_vocab_overlap(vocab_used, vocab_from_flashcards)

=== VERB ANALYSIS ===
Original verbs: 67
Verbs used in story: 103
Verbs from original used: 59 (88.1%)
New verbs introduced: 44
Examples of new verbs: ['listen', 'provide', 'write', 'push', 'feed']

=== VOCABULARY ANALYSIS ===
Original vocabulary: 180
Vocabulary used in story: 274
Vocabulary from original used: 148 (82.2%)
New vocabulary introduced: 126
Examples of new vocabulary: ['problem', 'solution', 'noise', 'sorry', 'food']


{'verb_overlap': {"'ve",
  'ask',
  'be',
  'believe',
  'bring',
  'build',
  'can',
  'check',
  'come',
  'could',
  'discuss',
  'do',
  'eat',
  'enjoy',
  'excuse',
  'expect',
  'find',
  'forget',
  'get',
  'give',
  'go',
  'happen',
  'have',
  'hear',
  'help',
  'hope',
  'know',
  'lead',
  'let',
  'like',
  'look',
  'love',
  'maintain',
  'meet',
  'might',
  'organize',
  'pay',
  'plan',
  'play',
  'remember',
  'say',
  'see',
  'should',
  'show',
  'sound',
  'speak',
  'spend',
  'start',
  'support',
  'suppose',
  'take',
  'thank',
  'think',
  'turn',
  'want',
  'wear',
  'will',
  'worry',
  'would'},
 'new_verbs': {'break',
  'carry',
  'choose',
  'complain',
  'cook',
  'drop',
  'end',
  'ensure',
  'explain',
  'feed',
  'feel',
  'fight',
  'follow',
  'graduate',
  'imagine',
  'injure',
  'listen',
  'live',
  'make',
  'mind',
  'move',
  'must',
  'need',
  'note',
  'provide',
  'push',
  'put',
  'reach',
  'read',
  'save',
  'seem',
  'shall

["He's going to show us around the new office",
 'Do you think it could still happen, even if it seems unlikely?',
 'Bye for now, see you later this evening!',
 "I'm here to help in any way I can",
 "Shouldn't we spend more time with family?",
 "Instead of complaining, let's find a solution together.",
 'Look at that beautiful sunset over there',
 "I'll get back to your email as soon as I can",
 'Remember when we met at the coffee shop last week?',
 "Don't forget to wear a helmet while cycling",
 "Did the minister really support the community's interests?",
 "Did you hear they're shutting down the factory next month?",
 'Sometimes we just need to mind our own business',
 'Make sure to pay attention to every little detail',
 'Do you like playing the guitar at night?',
 'I should have chosen a different career path',
 'The traffic will be terrible for drivers during the holiday',
 'She might forget to bring her lunch today',
 "I'm planning to organize the office party soon",
 "Let's not 

In [62]:
results['remaining_vocab']

{'verbs': {'accomplish',
  'brainstorm',
  'create',
  'cycle',
  'delay',
  'involve',
  'plant',
  'ride'},
 'vocab': {'6',
  'able',
  'agreed',
  'alright',
  'apparently',
  'call',
  'campaign',
  'charge',
  'construction',
  'disappointing',
  'downtown',
  'empty',
  'exactly',
  'finger',
  'fundraiser',
  'fundraising',
  'glad',
  'handiwork',
  'hey',
  'hmm',
  'maybe',
  'mini',
  'outdoors',
  'perfect',
  'potluck',
  'productive',
  'proud',
  'snack',
  'sore',
  'spot',
  'support',
  'wow'}}

In [None]:
matching_phrases = get_matching_flashcards(vocab_used, phrases)

1. Generate story from broad Longman vocabulary pool
story = generate_story_from_vocab(longman_vocab_dict)

2. Extract all vocabulary used in story
story_vocab = extract_vocab_from_story(story)

3. Compare with Longman dictionary to identify source
used_vocab = compare_vocab_overlap(longman_vocab_dict, story_vocab)

4. get existing flashcards we already have
5. generate new ones
. Generate flashcards for:
    - All story vocabulary that appears in Longman list
    - Common connecting words needed for natural speech
    - Tag cards with which story they appear in
flashcards = generate_flashcards_for_story(story_vocab, story_id)

In [None]:
from pydub import AudioSegment

In [None]:
AudioSegment.silent(100)


In [None]:
vm._load_google_voices("fr-FR")

In [None]:
vm.voices

In [None]:

import azure.cognitiveservices.speech as speechsdk
locale="fr-FR"
language_object = Language.get(locale)
speech_key = os.getenv("AZURE_API_KEY")
if not speech_key:
    print("Warning: AZURE_API_KEY not found in environment variables")



In [None]:


service_region = os.getenv("AZURE_REGION", "eastus")
speech_config = speechsdk.SpeechConfig(
    subscription=speech_key, region=service_region
)
speech_synthesizer = speechsdk.SpeechSynthesizer(
    speech_config=speech_config
)

result = speech_synthesizer.get_voices_async(locale=locale).get()


In [None]:
voice = result.voices[0]

In [None]:
voice.local_name

In [None]:
voice = result.voices[0]


In [None]:
voice.voice_type._name_

In [None]:
all

In [None]:
all = []
for voice in result.voices:
    voice_type = (
        VoiceType.NEURAL
        if voice.voice_type._name_ == "OnlineNeural"
        else VoiceType.STANDARD
    )

    voice_info = VoiceInfo(
        name=voice.local_name,
        provider=VoiceProvider.AZURE,
        voice_type=voice_type,
        gender=voice.gender._name_.upper(),
        language_code=voice.locale,
        country_code=language_object.territory,
        voice_id=voice.short_name,
    )

    all.append(voice_info)



In [None]:
vm._load_azure_voices("fr-FR")

In [None]:
from google.cloud import texttospeech

client = texttospeech.TextToSpeechClient()
response = client.list_voices(language_code="fr-FR")

In [None]:
response.voices[2]

In [None]:
lang.is_valid()

In [None]:
lang.territory_name()

In [None]:
lang.language_name()

In [None]:
from src.translation import translate_from_english

In [None]:
translate_from_english("hello", "cmn-TW")

In [None]:
config._load_config()

In [None]:
config.get_voice_models()

In [None]:
config.voice_manager._load_azure_voices("fr-FR")
config.voice_manager.voices

In [None]:
config.voice_manager._lazy_load_voices()

In [None]:
config.voice_manager.voices

In [None]:
reader = AnkiCollectionReader(ANKI_PATH)

In [None]:
reader.close()

In [None]:
stats = reader.col.stats()

In [None]:
stats.

In [None]:
config._load_config()

In [None]:
import time
from collections import defaultdict
import pandas as pd

def calculate_knowledge_score(collection_path: str, deck_name: str) -> pd.DataFrame:
    """
    Calculate a knowledge score (0-1) for each card in the deck based on review history.
    
    Factors considered:
    - Current interval (longer intervals suggest better knowledge)
    - Ease factor (higher ease suggests better retention)
    - Review success rate (ratio of Good/Easy vs Again buttons)
    - Time since last review (recent successful reviews weighted more)
    - Review time trends (decreasing review times suggest familiarity)
    
    Args:
        collection_path: Path to the .anki2 collection file
        deck_name: Name of the deck to analyze
    
    Returns:
        dict: Card IDs mapped to their knowledge scores and contributing factors
    """
    
    
    scores = {}
    with AnkiCollectionReader(collection_path) as reader:
        # Get deck ID
        deck = reader.col.decks.by_name(deck_name)
        if not deck:
            raise ValueError(f"Deck '{deck_name}' not found")
            
        # Get all cards in deck
        card_ids = reader.col.find_cards(f"did:{deck['id']}")
        
        # For each card, analyze its review history
        for card_id in card_ids:
            card = reader.col.get_card(card_id)
            note = reader.col.get_note(card.nid)
            
            # Get review logs for this card
            reviews = reader.col.db.all(
                "SELECT ease, ivl, factor, time, type FROM revlog WHERE cid = ? ORDER BY id",
                card_id
            )
            
            if not reviews:
                scores[card_id] = {
                    'score': 0,
                    'reason': 'No reviews yet',
                    'note_fields': dict(note.items())
                }
                continue
                
            # Calculate component scores
            
            # 1. Interval score (0-0.4): Longer intervals suggest better knowledge
            max_interval = 365  # Cap at 1 year for scoring
            current_interval = abs(card.ivl)  # Use absolute value to handle negative intervals
            interval_score = min(current_interval / max_interval, 1) * 0.4
            
            # 2. Ease score (0-0.2): Higher ease factors suggest better retention
            min_ease = 1300  # Minimum ease factor
            max_ease = 3100  # Maximum ease factor
            ease_score = (card.factor - min_ease) / (max_ease - min_ease) * 0.2
            ease_score = max(0, min(ease_score, 0.2))  # Clamp between 0-0.2
            
            # 3. Review success score (0-0.3)
            success_count = sum(1 for r in reviews if r[0] >= 3)  # Count Good/Easy
            total_reviews = len(reviews)
            success_score = (success_count / total_reviews) * 0.3 if total_reviews > 0 else 0
            
            # 4. Review time trend score (0-0.1)
            # Lower and/or decreasing review times suggest familiarity
            if len(reviews) >= 3:
                recent_times = [r[3] for r in reviews[-3:]]  # Last 3 review times
                avg_time = sum(recent_times) / len(recent_times)
                time_score = min(1, max(0, (30000 - avg_time) / 30000)) * 0.1  # Scale around 30s
            else:
                time_score = 0
                
            # Calculate final score
            final_score = interval_score + ease_score + success_score + time_score
            
            # Store results
            scores[card_id] = {
                'score': round(final_score, 3),
                'components': {
                    'interval_score': round(interval_score, 3),
                    'ease_score': round(ease_score, 3),
                    'success_score': round(success_score, 3),
                    'time_score': round(time_score, 3)
                },
                'stats': {
                    'current_interval': current_interval,
                    'ease_factor': card.factor,
                    'review_success_rate': round(success_count / total_reviews, 2) if total_reviews > 0 else 0,
                    'total_reviews': total_reviews
                },
                'note_fields': dict(note.items())
            }
    
    # Convert to DataFrame
    rows = []
    for card_id, data in scores.items():
        row = {
            'card_id': card_id
        }
        
        # Add note fields
        row.update(data['note_fields'])
        
        # Add component scores
        if 'components' in data:
            row.update(data['components'])
        
        # Add statistics
        if 'stats' in data:
            row.update(data['stats'])
        
        # Add final score
        row['knowledge_score'] = data['score']
        
        rows.append(row)
    
    df = pd.DataFrame(rows)
    
    # Reorder columns to have card_id first, then content fields, then scores
    score_cols = ['interval_score', 'ease_score', 'success_score', 'time_score', 'knowledge_score']
    stat_cols = ['current_interval', 'ease_factor', 'review_success_rate', 'total_reviews']
    content_cols = [col for col in df.columns if col not in score_cols + stat_cols + ['card_id']]
    
    df = df[['card_id'] + content_cols + stat_cols + score_cols]
    
    return df

def print_knowledge_scores(scores: dict, num_examples: int = 5):
    """
    Print formatted knowledge scores with examples of high and low scoring cards.
    
    Args:
        scores: Dictionary of scores from calculate_knowledge_score()
        num_examples: Number of high/low scoring examples to show
    """
    # Calculate overall statistics
    all_scores = [s['score'] for s in scores.values()]
    avg_score = sum(all_scores) / len(all_scores) if all_scores else 0
    
    print(f"Analysis of {len(scores)} cards:")
    print(f"Average knowledge score: {avg_score:.3f}")
    print(f"Score distribution:")
    
    # Show score distribution
    ranges = [(0, 0.2), (0.2, 0.4), (0.4, 0.6), (0.6, 0.8), (0.8, 1.0)]
    for low, high in ranges:
        count = sum(1 for s in all_scores if low <= s < high)
        print(f"{low:.1f}-{high:.1f}: {count} cards ({count/len(all_scores)*100:.1f}%)")
    
    # Show examples of highest and lowest scoring cards
    sorted_scores = sorted(scores.items(), key=lambda x: x[1]['score'])
    
    print(f"\nLowest {num_examples} scoring cards:")
    for card_id, data in sorted_scores[:num_examples]:
        print(f"\nCard ID: {card_id}")
        print(f"Score: {data['score']:.3f}")
        if 'components' in data:
            print("Score components:")
            for component, value in data['components'].items():
                print(f"  {component}: {value:.3f}")
        print("Content:")
        for field, content in data['note_fields'].items():
            # Truncate long content for display
            content_preview = content[:100] + "..." if len(content) > 100 else content
            print(f"  {field}: {content_preview}")
    
    print(f"\nHighest {num_examples} scoring cards:")
    for card_id, data in sorted_scores[-num_examples:]:
        print(f"\nCard ID: {card_id}")
        print(f"Score: {data['score']:.3f}")
        if 'components' in data:
            print("Score components:")
            for component, value in data['components'].items():
                print(f"  {component}: {value:.3f}")
        print("Content:")
        for field, content in data['note_fields'].items():
            # Truncate long content for display
            content_preview = content[:100] + "..." if len(content) > 100 else content
            print(f"  {field}: {content_preview}")

# Example usage:
# scores = calculate_knowledge_score("path/to/collection.anki2", "My Deck")
# print_knowledge_scores(scores)

In [None]:
df = calculate_knowledge_score(ANKI_PATH, "RapidRetention - Swedish - LM1000")

# View summary statistics


In [None]:

# Sort by knowledge score
df.groupby("EnglishText").agg({"knowledge_score" : "mean"}).sort_values(by="knowledge_score", ascending=False).head(60)

In [None]:
extract_vo

In [None]:
length_phrase = "4-5 words long, but treat common lexical chunks (I'm going to.., Do you.., Let us.. etc) as a single word"
verbs_per_phrase = "one verb (but OK for an additional adverb if required)"
gcse_phrases = generate_phrases_from_vocab_dict(vocab_dict=gcse_vocab, max_iterations=1)

In [None]:
gcse_phrases

In [None]:
vm._lazy_load_voices()

In [None]:
persian_text = "سلام دنیا"
aud = slow_text_to_speech(persian_text, config_language="target", gender="MALE")

In [None]:
aud

In [None]:
_remove_within_brackets("falling (over)")

In [None]:
import csv
from typing import List, Tuple, Dict, Set
from dataclasses import dataclass
from collections import defaultdict
from src.phrase import generate_phrases_from_vocab_dict

@dataclass
class ProcessingResult:
    pairs: List[Tuple[str, str]]
    problem_lines: Dict[int, str]  # line number -> original line content
    skipped_lines: Dict[int, str]  # line number -> reason for skipping

def process_anki_file(file_path: str) -> ProcessingResult:
    """
    Process tab-separated Anki export data from a file and pair up variations.
    If there are more English variations than French, repeat the French term.
    
    Args:
        file_path: Path to the tab-separated text file
        
    Returns:
        ProcessingResult containing:
        - processed pairs
        - dictionary of problem lines
        - dictionary of skipped lines
    """
    result = []
    problem_lines = {}
    skipped_lines = {}
    
    # Read and process the file
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            # Skip empty lines and comments
            line = line.strip()
            if not line:
                skipped_lines[line_num] = "Empty line"
                continue
            if line.startswith('#'):
                skipped_lines[line_num] = "Comment line"
                continue
                
            # Split line into columns
            columns = line.split('\t')
            if len(columns) < 2:
                skipped_lines[line_num] = "Insufficient columns"
                continue
                
            french_terms = columns[0].strip()
            english_terms = columns[1].strip()
            
            # Split variations
            french_variations = [term.strip() for term in french_terms.split('/')]
            english_variations = [term.strip() for term in english_terms.replace('/', ',').split(',')]
            
            # If there are more English variations than French ones
            if len(french_variations) < len(english_variations):
                if len(french_variations) == 1:  # If there's only one French term, repeat it
                    french_term = french_variations[0]
                    # Filter out 'i.e.' from English variations
                    english_variations = [eng for eng in english_variations if eng.lower() != 'i.e.']
                    # Create pairs with repeated French term
                    for eng in english_variations:
                        if eng:  # Only add if English term is non-empty
                            result.append((french_term, eng))
                else:
                    # If multiple French terms but still fewer than English, record problem
                    problem_lines[line_num] = line
                    print(f"\nWARNING - Complex mismatch on line {line_num}:")
                    print(f"French variations ({len(french_variations)}): {french_variations}")
                    print(f"English variations ({len(english_variations)}): {english_variations}")
            else:
                # Normal case where French variations >= English variations
                for fr, eng in zip(french_variations, english_variations):
                    if fr and eng:  # Only add if both terms are non-empty
                        result.append((fr, eng))
    
    return ProcessingResult(pairs=result, problem_lines=problem_lines, skipped_lines=skipped_lines)


processed_data = process_anki_file(file_path)



In [None]:
def categorize_word_pairs(pairs: List[Tuple[str, str]]) -> Dict[str, Set[Tuple[str, str]]]:
    """
    Categorize word pairs into verbs and vocab based on English 'to ' prefix.
    
    Args:
        pairs: List of tuples containing (french_word, english_word)
        
    Returns:
        Dictionary with 'verbs' and 'vocab' keys containing sets of (french, english) pairs
    """
    vocab_dict = {
        'verbs': set(),
        'vocab': set()
    }
    
    for french, english in pairs:
        # Clean up any trailing/leading whitespace
        french = french.strip()
        english = english.strip()
        
        # Check if it's a verb (starts with 'to ')
        if english.lower().startswith('to ') | english.lower().startswith('to be '):
            # Remove 'to ' and add to verbs
            english_cleaned = english[3:].strip()  # Remove 'to ' prefix
            vocab_dict['verbs'].add(english_cleaned)
        else:
            # Add to vocab
            vocab_dict['vocab'].add(english)
    
    vocab_dict['vocab'] = list(vocab_dict['vocab'])
    vocab_dict['verbs'] = list(vocab_dict['verbs'])
    return vocab_dict

In [None]:
gcse_dict = categorize_word_pairs(processed_data.pairs)

In [None]:
from src.utils import save_json


save_json(gcse_dict, "../outputs/gcse_dict.json")

In [None]:
some_phrases = generate_phrases_from_vocab_dict(gcse_dict, 1)

In [None]:
some_phrases

In [None]:
speech_key

In [None]:
import os
import json
import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv
load_dotenv()
# Configure speech service
speech_key = os.getenv("AZURE_API_KEY")
service_region = "eastus"  # Default region
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)

# Get available voices
result = speech_synthesizer.get_voices_async().get()



In [None]:
vars(voice)

In [None]:
for voice in result.voices:
    print(voice._short_name)

In [None]:
for voice in result.voices:
    print(voice.)

In [None]:
import os
import azure.cognitiveservices.speech as speechsdk
from IPython.display import Audio
import io

# Configure speech service
speech_key = os.getenv("AZURE_API_KEY")
service_region = "eastus"
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

# Note: the voice setting will not overwrite the voice element in input SSML.
speech_config.speech_synthesis_voice_name = "en-US-AmberNeural"

text = "Hello World!"

# use the default speaker as audio output.
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)

result = speech_synthesizer.speak_text_async(text).get()
# Check result
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
    print("Speech synthesized for text [{}]".format(text))
elif result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = result.cancellation_details
    print("Speech synthesis canceled: {}".format(cancellation_details.reason))
    if cancellation_details.reason == speechsdk.CancellationReason.Error:
        print("Error details: {}".format(cancellation_details.error_details))



In [None]:
from src.audio_generation import play_audio

display(Audio(result.audio_data, autoplay=True))

In [None]:
# # Load environment variables from .env file
# load_dotenv()
# #from src.config_loader import config
# from src.utils import create_html_story, create_test_story_dict, load_json
# from src.audio_generation import text_to_speech
# from src.translation import tokenize_text
# from src.anki_tools import generate_wiktionary_links
from src.nlp import filter_matching_phrases
from src.utils import load_json


In [None]:
import json


with open("../data/gcse_vocab_list_cambridge.json", "r") as gcse:
    gcse_dict = json.load(gcse)

In [None]:
with open("../data/longman_phrases_convo_1000.txt", "r") as core:
    core_phrases = [line.strip("\n") for line in core.readlines()]


In [None]:
gcse_phrases = filter_matching_phrases(core_phrases, gcse_dict)

In [None]:
gcse_phrases

In [None]:
image_dict = load_json("..\data\longman_phrase_images\phrase_image_dict.json")

In [None]:
phrases = []
for key in image_dict:
    if isinstance(image_dict.get(key), str):
        continue
    else:
        phrases.append(image_dict[key]['phrase'])

In [None]:
phrases

In [None]:
import random


test_phrases = random.sample(phrases, 50)

In [None]:
import json
import os
from pathlib import Path

def merge_json_files(directory_path):
    """
    Merge multiple JSON files containing transformed phrases into a single JSON file,
    removing duplicates based on both original and conversational fields.
    
    Args:
        directory_path (str): Path to the directory containing JSON files
        
    Returns:
        dict: Merged dictionary containing unique transformed phrases
    """
    # Use a set to track unique phrases (as tuples of original and conversational)
    unique_phrases = set()
    all_phrases = []
    
    # Convert directory path to Path object
    dir_path = Path(directory_path)
    
    # Counter for tracking statistics
    total_phrases = 0
    
    # Iterate through all JSON files in the directory
    for json_file in dir_path.glob('*.json'):
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
                
                # Extract transformed_phrases from each file
                if 'transformed_phrases' in data:
                    phrases = data['transformed_phrases']
                    total_phrases += len(phrases)
                    
                    # Process each phrase
                    for phrase in phrases:
                        # Create tuple of the phrase fields for uniqueness checking
                        phrase_tuple = (phrase['original'], phrase['conversational'])
                        
                        # Only add if we haven't seen this combination before
                        if phrase_tuple not in unique_phrases:
                            unique_phrases.add(phrase_tuple)
                            all_phrases.append(phrase)
                else:
                    print(f"Warning: 'transformed_phrases' not found in {json_file}")
                    
        except json.JSONDecodeError:
            print(f"Error: Could not parse JSON from {json_file}")
        except Exception as e:
            print(f"Error processing {json_file}: {str(e)}")
    
    # Create merged dictionary
    merged_data = {
        "transformed_phrases": all_phrases
    }
    
    # Save merged data to a new JSON file
    output_path = dir_path / 'merged_phrases.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged_data, f, indent=2, ensure_ascii=False)
    
    # Print statistics
    duplicates = total_phrases - len(all_phrases)
    print(f"Processing complete:")
    print(f"- Total phrases processed: {total_phrases}")
    print(f"- Unique phrases: {len(all_phrases)}")
    print(f"- Duplicates removed: {duplicates}")
    print(f"Output saved to: {output_path}")
    
    return merged_data

In [None]:
merged_dat = merge_json_files("../outputs/phrase_changes")

In [None]:
def save_conversational_phrases(transformed_phrases, output_file:str ="../data/longman_phrases_convo_1000.txt") -> None:
    """
    Extract and save conversational phrases to a text file.
    
    Args:
        transformed_phrases: List of dictionaries containing 'original' and 'conversational' phrases
        output_file: Path to the output text file
        
    Prints summary of the operation.
    """
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            for phrase_pair in transformed_phrases:
                f.write(phrase_pair['conversational'] + '\n')
                
        # Count phrases saved
        phrase_count = len(transformed_phrases)
        print(f"\nOperation Summary:")
        print(f"Successfully saved {phrase_count} phrases to {output_file}")
        
    except Exception as e:
        print(f"Error saving phrases to {output_file}: {str(e)}")
        raise

In [None]:
save_conversational_phrases(merged_dat["transformed_phrases"])

In [None]:
import os
import shutil
from typing import Dict, List
from src.utils import clean_filename

def copy_rename_phrase_images(transformed_phrases: List[Dict[str, str]], image_dir: str="..\data\longman_phrase_images") -> None:
    """
    Copy and rename image files based on transformed phrases.
    
    Args:
        transformed_phrases: List of dictionaries containing 'original' and 'conversational' phrases
        image_dir: Directory containing the image files
        
    Prints summary of operations and lists any missing original images.
    """
    successful_copies = 0
    missing_originals = []
    
    # Ensure image directory exists
    if not os.path.exists(image_dir):
        raise FileNotFoundError(f"Image directory not found: {image_dir}")
        
    for phrase_pair in transformed_phrases:
        original_phrase = phrase_pair['original']
        conversational_phrase = phrase_pair['conversational']
        
        # Generate filenames
        original_filename = clean_filename(original_phrase) + '.png'
        new_filename = clean_filename(conversational_phrase) + '.png'
        
        original_path = os.path.join(image_dir, original_filename)
        new_path = os.path.join(image_dir, new_filename)
        
        # Check if original file exists
        if not os.path.exists(original_path):
            missing_originals.append(original_phrase)
            continue
            
        # Skip if destination file already exists
        if os.path.exists(new_path):
            print(f"Warning: Destination file already exists, skipping: {new_filename}")
            continue
            
        try:
            # Copy the file
            shutil.copy2(original_path, new_path)
            successful_copies += 1
        except Exception as e:
            print(f"Error copying {original_filename} to {new_filename}: {str(e)}")
            
    # Print summary
    print("\nOperation Summary:")
    print(f"Successfully copied and renamed: {successful_copies} images")
    print(f"Missing original images: {len(missing_originals)}")
    
    if missing_originals:
        print("\nMissing original images for these phrases:")
        for phrase in missing_originals:
            print(f"- {phrase}")
            
    return successful_copies, missing_originals

In [None]:
successful_copies, missing_originals = copy_rename_phrase_images(merged_dat['transformed_phrases'])

In [None]:
def next_phrase_chunk():
    for index in range(0,len(phrases), 50):
        yield phrases[index:index+50]

phrase_iter= next_phrase_chunk()

In [None]:
next(phrase_iter)

In [None]:
prompt = f"""Transform each sentence into a more natural, conversational phrase. Each original phrase has an associated teaching image, so the core situation and meaning MUST remain identical. Your response should be a valid JSON object with this structure:

{{
  "transformed_phrases": [
    {{
      "original": "<original phrase>",
      "conversational": "<transformed phrase>"
    }}
  ]
}}

Rules for transformation:

1. Keep ALL key vocabulary from the original (nouns, verbs, adjectives) - as these phrases are for learning that vocab

2. Maintain the EXACT same situation/scene as the original, since it matches an existing image

3. Transform into the most natural way someone would express this idea in conversation, mainly replace phrases that are narrative and passive. Replace with:

   - Imperative ("Don't be late...")
   - Simple statements ("The traffic was terrible...")
   - First-person expressions ("I enjoy...")
   - Questions ("Shall we...?", "Do you ...?", "Did they...?" etc)
4. Keep the original phrase if a change isn't required or modifying the phrase would result in something unnatural sounding

Examples:

{{
"original": "We watched the sunrise over the mountains",
"conversational": "We watched the sunrise over the mountains"
}},
{{
"original": "She will call you as soon as possible",
"conversational": "She will call you as soon as possible"
}},
{{
"original": "They joined the protest against high taxes",
"conversational": "Did you join the protest against high taxes?"
}},
{{
"original": "He will receive an award for his work",
"conversational": "I'm receiving an award for my work"
}},
{{
"original": "She likes to play with her pet cat",
"conversational": "Do you like playing with your pet cat?"
}},
{{
"original": "He covered his face with his hands",
"conversational": "Cover your face with your hands"
}},

Here are the phrases to transform:

{next(phrase_iter)}

Output only valid JSON following the structure above."""

pyperclip.copy(prompt)

In [None]:
import pyperclip
pyperclip.copy(prompt)

In [None]:
[image_dict.get(key).get('phrase', '') for key in image_dict]

In [None]:
resp = tokenize_text("hello world", language_code="en")

In [None]:
resp = generate_wiktionary_links("こんにちは世界", "Japanese", "ja")

In [None]:
resp

In [None]:
audio_segment = slow_text_to_speech(
        text="let's speak slowly and clearly",
    )

In [None]:
audio_segment

In [None]:
from src.audio_generation import slow_text_to_speech
from IPython.display import Audio
import io

# Test cases with languages, voice names, and potential issues
test_phrases = [
    # English with apostrophes
    ("I can't believe it!", "en-GB", "en-GB-Wavenet-D"),
    ("Don't you'll I'm they're", "en-GB", "en-GB-Wavenet-D"),
    
    # HTML entities that might appear
    ("Let&#39;s go &amp; have fun!", "en-US", "en-US-Wavenet-D"),
    
    # Italian with apostrophes
    ("L'italiano è bellissimo", "it-IT", "it-IT-Wavenet-A"),
    
    # Japanese (no apostrophes but needs tokenization)
    ("私は日本語を勉強しています", "ja-JP", "ja-JP-Wavenet-B"),
    
    # Chinese test
    ("我正在学习中文", "zh-CN", "cmn-CN-Wavenet-A"),
    
]

# Test each phrase and play the audio
for text, lang_code, voice in test_phrases[0:2]:
    print(f"\nTesting: {text}")
    print(f"Language: {lang_code}")
    print(f"Voice: {voice}")
    
    audio_segment = slow_text_to_speech(
        text, 
        language_code=lang_code,
        voice_name=voice
    )
    
    # Convert to format playable in notebook
    buffer = io.BytesIO()
    audio_segment.export(buffer, format="wav")
    buffer.seek(0)
    
    # Display audio player
    display(Audio(buffer.read(), rate=audio_segment.frame_rate))

In [None]:
clipper = EnhancedSSMLClipper(
    word_rate="0.85",    # Very slow
    word_pitch="-1st",   # Slightly lower
    break_time="300ms",  # Longer breaks
)



In [None]:
# Russian text: "Hello! How are you? I am learning Russian."
russian_text = "Здравствуйте! Как дела? Я изучаю русский язык."

# Russian neural voice name
russian_voice = "ru-RU-Standard-B"  # Female voice
clipper.synthesize_speech(russian_text, "enhanced.mp3", russian_voice)

In [None]:
audio = text_to_speech(russian_text, "ru", russian_voice)

In [None]:
italian_voice = "it-IT-Neural2-C"
italian_text1 = "Buongiorno! Come stai? Oggi vado in spiaggia."
clipper.synthesize_speech(italian_text1, "enhanced.mp3", italian_voice)

In [None]:
from src.anki_tools import generate_wiktionary_links_non_english


def test_wiktionary_links():
    """Test function to demonstrate usage."""
    test_cases = [
        ("goodbye England", "uk"),  # Ukrainian
        ("book reading", "sv"),  # Swedish
        ("coffee shop", "ja"),  # Japanese
    ]
    
    for phrase, lang_code in test_cases:
        print(f"\nTesting {lang_code} Wiktionary links for: {phrase}")
        result = generate_wiktionary_links_non_english(phrase, lang_code)
        print(f"Result: {result}")

In [None]:
config.target_language_voice_models["language_code"]