In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
from dotenv import load_dotenv
import sys
import os
import pickle

# Put the directory one level above the current working directory on the
# import path (only once) so the `src` package imported below can be found.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)
# Load environment variables from .env file
load_dotenv()
from src.config_loader import config
from src.utils import create_html_story, create_test_story_dict
from src.audio_generation import text_to_speech
from src.translation import tokenize_text
from src.anki_tools import generate_wiktionary_links


In [4]:
# Smoke test of the tokenizer on a short English phrase (the result is not displayed here)
resp = tokenize_text("hello world", language_code="en")

In [8]:
# Build Wiktionary link markup for a Japanese phrase; `resp` is displayed by the next cell
resp = generate_wiktionary_links("こんにちは世界", "Japanese", "ja")

In [9]:
# Show the generated link markup as this cell's output
resp

'<a href="https://en.wiktionary.org/wiki/%E3%81%93%E3%82%93%E3%81%AB%E3%81%A1%E3%81%AF#Japanese">こんにちは</a> <a href="https://en.wiktionary.org/wiki/%E4%B8%96%E7%95%8C#Japanese">世界</a>'

In [12]:
from pydub import AudioSegment
from pydub.silence import split_on_silence, detect_nonsilent
import os
import tempfile
from google.cloud import texttospeech
import re

class SSMLWordClipper:
    """Text-to-speech helper that separates words with explicit SSML pauses.

    The input text is split into word and punctuation tokens, an SSML
    ``<break>`` element is inserted between consecutive words, and the
    resulting SSML is sent to Google Cloud Text-to-Speech.

    Attributes:
        client: The ``texttospeech.TextToSpeechClient`` used for requests.
        break_strength (str): Value of the ``strength`` attribute of each break.
        break_time (str): Value of the ``time`` attribute of each break.
        audio (bytes | None): Raw audio returned by the most recent call to
            ``synthesize_speech``; ``None`` until the first synthesis.
    """

    # A token is either a word or a single punctuation mark. Apostrophes
    # *inside* a word are kept so that contractions and elisions ("can't",
    # "L'italiano") stay in one piece instead of being split at the apostrophe
    # and spoken with a pause in the middle.
    _TOKEN_RE = re.compile(r"\w+(?:['’]\w+)*|[.,!?;]")
    _PUNCTUATION = frozenset(".,!?;")

    def __init__(self, break_strength='strong', break_time='250ms'):
        """
        Initialize the SSML Word Clipper.

        Args:
            break_strength (str): SSML break strength ('none', 'x-weak', 'weak',
                                'medium', 'strong', 'x-strong')
            break_time (str): Break duration with its unit (e.g., '250ms')
        """
        self.client = texttospeech.TextToSpeechClient()
        self.break_strength = break_strength
        self.break_time = break_time
        # Defined up front so the attribute exists before the first synthesis.
        self.audio = None

    def create_ssml(self, text):
        """
        Convert plain text to SSML with breaks between words.

        Only word characters, apostrophes inside words, and the punctuation
        marks ``. , ! ? ;`` are kept; every other character is dropped.

        Args:
            text (str): Input text

        Returns:
            str: Space-joined SSML wrapped in ``<speak>`` ... ``</speak>``.
                Empty or whitespace-only input yields ``'<speak> </speak>'``.
        """
        tokens = self._TOKEN_RE.findall(text.strip())
        break_tag = (
            f'<break strength="{self.break_strength}" time="{self.break_time}"/>'
        )

        ssml_parts = ['<speak>']
        last_index = len(tokens) - 1
        for index, token in enumerate(tokens):
            ssml_parts.append(token)
            # A break follows every token except the last one, and is skipped
            # when the next token is punctuation so the mark stays attached to
            # the word it terminates.
            if index < last_index and tokens[index + 1] not in self._PUNCTUATION:
                ssml_parts.append(break_tag)
        ssml_parts.append('</speak>')

        return ' '.join(ssml_parts)

    def synthesize_speech(self, text, output_file="output.mp3",
                          voice_name="en-US-Neural2-D", language_code="en-US",
                          speaking_rate=0.6):
        """
        Convert text to speech with distinct word breaks using SSML.

        The raw audio bytes are also kept on ``self.audio``.

        Args:
            text (str): Input text
            output_file (str): Path to save the output audio file (MP3)
            voice_name (str): Name of the Cloud Text-to-Speech voice to use
            language_code (str): Language code matching ``voice_name``
            speaking_rate (float): Speaking rate passed to the audio config;
                the default of 0.6 is deliberately slower than normal speed

        Returns:
            str: Path to the output audio file
        """
        ssml_text = self.create_ssml(text)

        # The voice is selected by its explicit name, so no gender preference
        # is sent; that keeps non-default voices from conflicting with a
        # hard-coded gender.
        voice = texttospeech.VoiceSelectionParams(
            language_code=language_code,
            name=voice_name,
        )

        audio_config = texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
            speaking_rate=speaking_rate,
        )

        synthesis_input = texttospeech.SynthesisInput(ssml=ssml_text)

        # Perform the text-to-speech request
        response = self.client.synthesize_speech(
            input=synthesis_input,
            voice=voice,
            audio_config=audio_config
        )

        self.audio = response.audio_content
        # Write the response to the output file
        with open(output_file, "wb") as out:
            out.write(self.audio)

        return output_file



In [None]:
# Compare how the pause inserted between words affects the generated audio.
#
# The imports are included in this cell because it sits above the cell that
# otherwise provides them; without them a fresh kernel (Restart & Run All)
# fails here with NameError.
import io

from IPython.display import Audio, display

from src.audio_generation import slow_text_to_speech

# Test phrase with various gap durations
# NOTE(review): the saved output of this cell was produced with values such as
# "100ms"; confirm which form (int vs. string with unit) `word_break_time`
# expects before re-running.
test_phrase = "Let's practice speaking slowly and clearly"
gaps_to_test = [
    100,
    500
]

# Test settings
language_code = "en-US"
voice_name = "en-US-Wavenet-D"

# Test each gap duration
for gap in gaps_to_test:
    print(f"\nTesting with word gap: {gap}")

    audio_segment = slow_text_to_speech(
        text=test_phrase,
        language_code=language_code,
        voice_name=voice_name,
        word_break_time=gap
    )

    # Print duration to see impact on overall length
    print(f"Total audio duration: {len(audio_segment)}ms")

    # Convert to format playable in notebook
    buffer = io.BytesIO()
    audio_segment.export(buffer, format="wav")
    buffer.seek(0)

    # Display audio player
    display(Audio(buffer.read(), rate=audio_segment.frame_rate))


Testing with word gap: 100ms
Total audio duration: 3576ms



Testing with word gap: 200ms
Total audio duration: 4056ms



Testing with word gap: 300ms
Total audio duration: 4560ms



Testing with word gap: 400ms
Total audio duration: 5064ms



Testing with word gap: 500ms
Total audio duration: 5568ms


In [None]:
from src.audio_generation import slow_text_to_speech
from IPython.display import Audio
import io

# Each entry is (text, language code, voice name); the phrases target inputs
# that could be awkward to split into words.
test_phrases = [
    # English contractions
    ("I can't believe it!", "en-US", "en-US-Wavenet-D"),
    ("Don't you'll I'm they're", "en-US", "en-US-Wavenet-D"),
    # Text that still carries HTML entities
    ("Let&#39;s go &amp; have fun!", "en-US", "en-US-Wavenet-D"),
    # Italian elision with an apostrophe
    ("L'italiano è bellissimo", "it-IT", "it-IT-Wavenet-A"),
    # Japanese: no spaces between words, so it relies on tokenization
    ("私は日本語を勉強しています", "ja-JP", "ja-JP-Wavenet-B"),
    # Chinese
    ("我正在学习中文", "zh-CN", "cmn-CN-Wavenet-A"),
]

# Synthesize every phrase and embed an audio player for it
for text, lang_code, voice in test_phrases:
    print(f"\nTesting: {text}")
    print(f"Language: {lang_code}")
    print(f"Voice: {voice}")

    audio_segment = slow_text_to_speech(text, language_code=lang_code, voice_name=voice)

    # Export to in-memory WAV bytes, which the notebook player can consume
    wav_buffer = io.BytesIO()
    audio_segment.export(wav_buffer, format="wav")
    display(Audio(wav_buffer.getvalue(), rate=audio_segment.frame_rate))

FFmpeg path added to system PATH: C:\Program Files\ffmpeg-7.0-essentials_build\bin
Config file has been modified. Reloading...
Language name: Swedish determined from code sv
Successfully loaded config from: y:\Python Scripts\audio-language-trainer\src\config.json
Multiple country codes available for en: en-AU, en-GB, en-IN, en-US

Testing: I can't believe it!
Language: en-US
Voice: en-US-Wavenet-D



Testing: Don't you'll I'm they're
Language: en-US
Voice: en-US-Wavenet-D



Testing: Let&#39;s go &amp; have fun!
Language: en-US
Voice: en-US-Wavenet-D



Testing: L'italiano è bellissimo
Language: it-IT
Voice: it-IT-Wavenet-A



Testing: 私は日本語を勉強しています
Language: ja-JP
Voice: ja-JP-Wavenet-B



Testing: 我正在学习中文
Language: zh-CN
Voice: cmn-CN-Wavenet-A



Testing: ฉันกำลังเรียนภาษาไทย
Language: th-TH
Voice: th-TH-Wavenet-A
API Tokenization failed: 400 The language th is not supported for syntax analysis.


InvalidArgument: 400 Voice 'th-TH-Wavenet-A' does not exist. Is it misspelled?

In [16]:
# Configure the per-word SSML clipper used by the examples below.
# NOTE(review): EnhancedSSMLClipper is not defined or imported in any visible
# cell — confirm which cell/module provides it, otherwise this fails on a
# fresh kernel.
clipper = EnhancedSSMLClipper(
    word_rate="0.85",    # Shows up as the prosody rate attribute in the generated SSML
    word_pitch="-1st",   # Slightly lower
    break_time="300ms",  # Longer breaks
)



In [14]:
# Russian sample text; English gloss: "Hello! How are you? I am learning Russian."
russian_text = "Здравствуйте! Как дела? Я изучаю русский язык."

# Voice name for the Russian sample. The "Standard" in the name indicates a
# standard-tier voice rather than a neural one.
# NOTE(review): the gender of this voice is not verified here — check the
# Cloud Text-to-Speech voice list.
russian_voice = "ru-RU-Standard-B"
# Synthesize with the clipper; the call prints the generated SSML and returns the output path.
clipper.synthesize_speech(russian_text, "enhanced.mp3", russian_voice)

Generated SSML:
<speak> <prosody rate="0.65" pitch="-1st" volume="0dB">Здравствуйте</prosody> ! <break time="500ms"/> <prosody rate="0.65" pitch="-1st" volume="0dB">Как</prosody> <break time="300ms"/> <prosody rate="0.65" pitch="-1st" volume="0dB">дела</prosody> ? <break time="500ms"/> <prosody rate="0.65" pitch="-1st" volume="0dB">Я</prosody> <break time="300ms"/> <prosody rate="0.65" pitch="-1st" volume="0dB">изучаю</prosody> <break time="300ms"/> <prosody rate="0.65" pitch="-1st" volume="0dB">русский</prosody> <break time="300ms"/> <prosody rate="0.65" pitch="-1st" volume="0dB">язык</prosody> . </speak>



'enhanced.mp3'

In [35]:
# Synthesize the same Russian text through the project's regular text_to_speech helper
# (presumably as a baseline to compare against the clipper output — TODO confirm).
audio = text_to_speech(russian_text, "ru", russian_voice)

In [17]:
# Italian sample, synthesized with the same clipper instance.
# Note: the output path "enhanced.mp3" is the same one used for the Russian
# example, so that earlier file is presumably overwritten.
italian_voice = "it-IT-Neural2-C"
italian_text1 = "Buongiorno! Come stai? Oggi vado in spiaggia."
clipper.synthesize_speech(italian_text1, "enhanced.mp3", italian_voice)

Generated SSML:
<speak> <prosody rate="0.85" pitch="-1st" volume="0dB">Buongiorno</prosody> ! <break time="500ms"/> <prosody rate="0.85" pitch="-1st" volume="0dB">Come</prosody> <break time="300ms"/> <prosody rate="0.85" pitch="-1st" volume="0dB">stai</prosody> ? <break time="500ms"/> <prosody rate="0.85" pitch="-1st" volume="0dB">Oggi</prosody> <break time="300ms"/> <prosody rate="0.85" pitch="-1st" volume="0dB">vado</prosody> <break time="300ms"/> <prosody rate="0.85" pitch="-1st" volume="0dB">in</prosody> <break time="300ms"/> <prosody rate="0.85" pitch="-1st" volume="0dB">spiaggia</prosody> . </speak>



'enhanced.mp3'

In [9]:
from src.anki_tools import generate_wiktionary_links_non_english


def test_wiktionary_links():
    """Test function to demonstrate usage."""
    test_cases = [
        ("goodbye England", "uk"),  # Ukrainian
        ("book reading", "sv"),  # Swedish
        ("coffee shop", "ja"),  # Japanese
    ]
    
    for phrase, lang_code in test_cases:
        print(f"\nTesting {lang_code} Wiktionary links for: {phrase}")
        result = generate_wiktionary_links_non_english(phrase, lang_code)
        print(f"Result: {result}")

In [13]:
# Run the demo; the generated links are printed to the cell output
test_wiktionary_links()


Testing uk Wiktionary links for: goodbye England
native url is: https://uk.wiktionary.org/wiki/goodbye
native url is: https://uk.wiktionary.org/wiki/england
Result: <a href="https://uk.wiktionary.org/wiki/goodbye#Англійська">goodbye</a> <a href="https://en.wiktionary.org/wiki/england#English">England</a>

Testing sv Wiktionary links for: book reading
native url is: https://sv.wiktionary.org/wiki/book
native url is: https://sv.wiktionary.org/wiki/reading
Result: <a href="https://sv.wiktionary.org/wiki/book#Engelska">book</a> <a href="https://sv.wiktionary.org/wiki/reading#Engelska">reading</a>

Testing ja Wiktionary links for: coffee shop
native url is: https://ja.wiktionary.org/wiki/coffee
native url is: https://ja.wiktionary.org/wiki/shop
Result: <a href="https://ja.wiktionary.org/wiki/coffee#英語">coffee</a> <a href="https://ja.wiktionary.org/wiki/shop#英語">shop</a>
