In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the project's `src` package) are reloaded before each cell executes
# and edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from dotenv import load_dotenv
import sys
import os
import pickle

# Make the parent of the current working directory importable so that the
# `src` package can be resolved by the imports that follow.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

# Read variables from a .env file into the process environment.
load_dotenv()
from src.config_loader import config
from src.utils import create_html_story, create_test_story_dict
from src.audio_generation import text_to_speech


In [22]:
from pydub import AudioSegment
from pydub.silence import split_on_silence, detect_nonsilent
import os
import tempfile
from google.cloud import texttospeech
import re

class SSMLWordClipper:
    """Synthesize speech in which consecutive words are separated by SSML breaks.

    The text is tokenized into words and punctuation marks, a ``<break>`` tag
    is placed between tokens, and the resulting SSML is sent to Google Cloud
    Text-to-Speech.
    """

    # A token is either a run of word characters/apostrophes (so contractions
    # such as "don't" stay in one piece) or a single punctuation mark.
    _TOKEN_PATTERN = re.compile(r"\b[\w']+\b|[.,!?;]")
    _PUNCTUATION = frozenset(".,!?;")

    def __init__(self, break_strength='strong', break_time='250ms'):
        """
        Initialize the SSML Word Clipper.

        Args:
            break_strength (str): SSML break strength ('none', 'x-weak', 'weak',
                                'medium', 'strong', 'x-strong')
            break_time (str): Break time in milliseconds (e.g., '250ms')
        """
        self.client = texttospeech.TextToSpeechClient()
        self.break_strength = break_strength
        self.break_time = break_time
        # Raw audio bytes of the most recent synthesis; None until
        # synthesize_speech() has been called.
        self.audio = None

    @staticmethod
    def _language_code_for(voice_name):
        """Return the first two hyphen-separated parts of a voice name.

        Example: "en-US-Neural2-D" -> "en-US".
        """
        return "-".join(voice_name.split("-")[:2])

    def create_ssml(self, text):
        """
        Convert plain text to SSML with breaks between words.

        A break is emitted after every token that is followed by a word;
        no break is emitted before a punctuation mark or after the last token.
        Characters that are neither word characters, apostrophes nor one of
        ``. , ! ? ;`` are dropped by the tokenizer.

        Args:
            text (str): Input text

        Returns:
            str: SSML formatted text with breaks between words
        """
        tokens = self._TOKEN_PATTERN.findall(text.strip())
        break_tag = (
            f'<break strength="{self.break_strength}" time="{self.break_time}"/>'
        )

        ssml_parts = ['<speak>']
        # Pair every token with its successor (None for the last token).
        for token, following in zip(tokens, tokens[1:] + [None]):
            ssml_parts.append(token)
            if following is not None and following not in self._PUNCTUATION:
                ssml_parts.append(break_tag)
        ssml_parts.append('</speak>')

        return ' '.join(ssml_parts)

    def synthesize_speech(self, text, output_file="output.mp3",
                          voice_name="en-US-Neural2-D", language_code=None,
                          speaking_rate=0.6):
        """
        Convert text to speech with distinct word breaks using SSML.

        The MP3 bytes returned by the service are stored on ``self.audio``
        and written to ``output_file``.

        Args:
            text (str): Input text
            output_file (str): Path to save the output audio file
            voice_name (str): Name of the voice to request
            language_code (str | None): Language code sent with the request;
                when None it is taken from the first two hyphen-separated
                parts of ``voice_name``
            speaking_rate (float): Speaking rate passed to the audio config
                (the default 0.6 is deliberately slower than the normal 1.0)

        Returns:
            str: Path to the output audio file
        """
        ssml_text = self.create_ssml(text)

        # The voice is selected by name; the language code is kept consistent
        # with it instead of being hard-coded separately.
        voice = texttospeech.VoiceSelectionParams(
            language_code=language_code or self._language_code_for(voice_name),
            name=voice_name,
        )

        audio_config = texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
            speaking_rate=speaking_rate,
        )

        synthesis_input = texttospeech.SynthesisInput(ssml=ssml_text)

        response = self.client.synthesize_speech(
            input=synthesis_input,
            voice=voice,
            audio_config=audio_config
        )

        self.audio = response.audio_content
        with open(output_file, "wb") as out:
            out.write(response.audio_content)

        return output_file



In [25]:
    # Basic usage
# Default configuration (strong breaks, 250ms)
clipper = SSMLWordClipper()

# Sample sentence to synthesize.
# NOTE(review): `text` was not defined in any cell of this notebook, so this
# cell failed with NameError on a fresh kernel; replace the sample as needed.
text = "The quick brown fox jumps over the lazy dog. Does every word sound clear?"

# Example 2: medium-strength breaks lasting 300ms
clipper_long = SSMLWordClipper(break_strength='medium', break_time='300ms')
clipper_long.synthesize_speech(text, "output_long_breaks.mp3")



'output_long_breaks.mp3'

In [49]:
class EnhancedSSMLClipper:
    """Synthesize speech with per-word prosody and context-dependent pauses.

    Every word is wrapped in a ``<prosody>`` element, pauses are inserted
    between words and after punctuation, and the resulting SSML is sent to
    Google Cloud Text-to-Speech.
    """

    # A token is either a run of word characters/apostrophes or a single
    # punctuation mark; whitespace only separates tokens.
    _TOKEN_PATTERN = re.compile(r"\b[\w']+\b|[.,!?;]")
    _PUNCTUATION = frozenset(".,!?;")
    # A duration is a non-negative number followed by "ms" or "s".
    _DURATION_PATTERN = re.compile(r"\s*(\d+(?:\.\d+)?)\s*(ms|s)\s*")

    def __init__(self,
                 word_rate="0.85",        # Slower rate for clearer words
                 word_pitch="-2st",       # Slightly lower pitch for clarity
                 word_volume="0dB",       # Normal volume
                 break_time="250ms",      # Default break time
                 sentence_break="500ms"): # Longer break for sentences
        """
        Initialize Enhanced SSML Clipper with more natural speech parameters.

        Args:
            word_rate (str): Speaking rate for each word (0.85 = 85% of normal speed)
            word_pitch (str): Pitch adjustment in semitones (st) or Hz
            word_volume (str): Volume adjustment in dB
            break_time (str): Break time between words (e.g. '250ms' or '0.5s')
            sentence_break (str): Break time between sentences
        """
        self.client = texttospeech.TextToSpeechClient()
        self.word_rate = word_rate
        self.word_pitch = word_pitch
        self.word_volume = word_volume
        self.break_time = break_time
        self.sentence_break = sentence_break

    @staticmethod
    def _language_code_for(voice_name):
        """Return the first two hyphen-separated parts of a voice name.

        Example: "ru-RU-Standard-B" -> "ru-RU".
        """
        return "-".join(voice_name.split("-")[:2])

    @classmethod
    def _scale_duration(cls, duration, factor):
        """Multiply a duration such as '300ms' or '0.5s' by ``factor``.

        The unit is preserved and whole results are rendered without a
        decimal part ('450ms', not '450.0ms').

        Raises:
            ValueError: if ``duration`` is not a number followed by 'ms' or 's'.
        """
        match = cls._DURATION_PATTERN.fullmatch(duration)
        if match is None:
            raise ValueError(
                f"Unsupported duration {duration!r}; expected e.g. '250ms' or '0.5s'"
            )
        # Round to avoid float noise such as 0.15000000000000002.
        scaled = round(float(match.group(1)) * factor, 3)
        number = str(int(scaled)) if scaled.is_integer() else str(scaled)
        return number + match.group(2)

    def wrap_word_with_prosody(self, word):
        """
        Wrap a word in a prosody tag carrying the configured rate, pitch and volume.
        """
        return (f'<prosody rate="{self.word_rate}" '
                f'pitch="{self.word_pitch}" '
                f'volume="{self.word_volume}">'
                f'{word}'
                f'</prosody>')

    def get_break_length(self, next_char):
        """
        Determine the pause length to use after a punctuation mark.

        Args:
            next_char (str): The punctuation mark that precedes the pause
                (the parameter name is kept for backward compatibility).

        Returns:
            str: ``sentence_break`` for '.', '!' and '?'; 1.5 times
            ``break_time`` for ',' and ';'; otherwise ``break_time``.

        Raises:
            ValueError: if ``break_time`` cannot be parsed when scaling it.
        """
        if next_char in '.!?':
            return self.sentence_break
        if next_char in ',;':
            return self._scale_duration(self.break_time, 1.5)
        return self.break_time

    def create_ssml(self, text):
        """
        Convert plain text to SSML with enhanced word control.

        Words are wrapped in prosody tags. A word is followed by a
        ``break_time`` pause unless the next token is punctuation or there is
        no next token; punctuation is followed by the pause returned by
        ``get_break_length`` unless it is the last token.

        Args:
            text (str): Input text

        Returns:
            str: The SSML document as a single space-joined string
        """
        tokens = self._TOKEN_PATTERN.findall(text.strip())
        last_index = len(tokens) - 1

        ssml_parts = ['<speak>']
        for index, token in enumerate(tokens):
            # Look ahead at the next real token; whitespace is never a token,
            # so "word ," is treated the same as "word,".
            following = tokens[index + 1] if index < last_index else None

            if token in self._PUNCTUATION:
                ssml_parts.append(token)
                if following is not None:
                    pause = self.get_break_length(token)
                    ssml_parts.append(f'<break time="{pause}"/>')
            else:
                ssml_parts.append(self.wrap_word_with_prosody(token))
                if following is not None and following not in self._PUNCTUATION:
                    ssml_parts.append(f'<break time="{self.break_time}"/>')

        ssml_parts.append('</speak>')
        return ' '.join(ssml_parts)

    def synthesize_speech(self, text, output_file="output.mp3",
                          voice_name="en-US-Neural2-D", language_code=None):
        """
        Convert text to speech with enhanced word clarity.

        Prints the generated SSML, requests MP3 audio and writes it to
        ``output_file``.

        Args:
            text (str): Input text
            output_file (str): Output file path
            voice_name (str): Name of the voice to use
            language_code (str | None): Language code sent with the request;
                when None it is taken from the first two hyphen-separated
                parts of ``voice_name`` so it always agrees with the voice

        Returns:
            str: ``output_file``
        """
        ssml_text = self.create_ssml(text)
        print(f"Generated SSML:\n{ssml_text}\n")  # For debugging

        # The voice is selected by name; language is derived from that name
        # rather than hard-coded, and no gender preference is sent because it
        # could contradict the named voice.
        voice = texttospeech.VoiceSelectionParams(
            language_code=language_code or self._language_code_for(voice_name),
            name=voice_name,
        )

        audio_config = texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
            speaking_rate=1.0,  # Base rate (will be modified by prosody tags)
            pitch=0.0,          # Base pitch (will be modified by prosody tags)
            volume_gain_db=0.0  # Base volume (will be modified by prosody tags)
        )

        synthesis_input = texttospeech.SynthesisInput(ssml=ssml_text)

        response = self.client.synthesize_speech(
            input=synthesis_input,
            voice=voice,
            audio_config=audio_config
        )

        with open(output_file, "wb") as out:
            out.write(response.audio_content)

        return output_file

In [50]:
# Very slow, slightly lower-pitched delivery with 300ms pauses between words;
# the volume and sentence-pause arguments are left at their defaults.
clipper = EnhancedSSMLClipper(break_time="300ms", word_pitch="-1st", word_rate="0.65")



In [41]:
# Russian sample sentence: "Hello! How are you? I am learning Russian."
russian_text = "Здравствуйте! Как дела? Я изучаю русский язык."

# Voice identifier for the request ("Standard" tier according to its name).
# NOTE(review): the earlier comment described this as a female neural voice;
# confirm the gender against the provider's voice list.
russian_voice = "ru-RU-Standard-B"

# The returned output path is displayed as the cell result.
clipper.synthesize_speech(russian_text, output_file="enhanced.mp3", voice_name=russian_voice)

Generated SSML:
<speak> <prosody rate="0.65" pitch="-1st" volume="0dB">Здравствуйте</prosody> ! <break time="500ms"/> <prosody rate="0.65" pitch="-1st" volume="0dB">Как</prosody> <break time="300ms"/> <prosody rate="0.65" pitch="-1st" volume="0dB">дела</prosody> ? <break time="500ms"/> <prosody rate="0.65" pitch="-1st" volume="0dB">Я</prosody> <break time="300ms"/> <prosody rate="0.65" pitch="-1st" volume="0dB">изучаю</prosody> <break time="300ms"/> <prosody rate="0.65" pitch="-1st" volume="0dB">русский</prosody> <break time="300ms"/> <prosody rate="0.65" pitch="-1st" volume="0dB">язык</prosody> . </speak>



'enhanced.mp3'

In [35]:
# Synthesize the same Russian sentence through the project's `text_to_speech`
# helper (imported from src.audio_generation), passing "ru" and the same voice.
# NOTE(review): presumably a baseline to compare with the clipper output; the
# helper's return type is not visible in this notebook — confirm before use.
audio = text_to_speech(russian_text, "ru", russian_voice)

In [51]:
# Italian sample sentence and the voice identifier used to read it.
italian_text1 = "Buongiorno! Come stai? Oggi vado in spiaggia."
italian_voice = "it-IT-Neural2-C"

# Reuses the existing `clipper` instance; the returned path is the cell result.
clipper.synthesize_speech(italian_text1, output_file="enhanced.mp3", voice_name=italian_voice)

Generated SSML:
<speak> <prosody rate="0.65" pitch="-1st" volume="0dB">Buongiorno</prosody> ! <break time="500ms"/> <prosody rate="0.65" pitch="-1st" volume="0dB">Come</prosody> <break time="300ms"/> <prosody rate="0.65" pitch="-1st" volume="0dB">stai</prosody> ? <break time="500ms"/> <prosody rate="0.65" pitch="-1st" volume="0dB">Oggi</prosody> <break time="300ms"/> <prosody rate="0.65" pitch="-1st" volume="0dB">vado</prosody> <break time="300ms"/> <prosody rate="0.65" pitch="-1st" volume="0dB">in</prosody> <break time="300ms"/> <prosody rate="0.65" pitch="-1st" volume="0dB">spiaggia</prosody> . </speak>



'enhanced.mp3'

In [9]:
from src.anki_tools import generate_wiktionary_links_non_english


def test_wiktionary_links():
    """Test function to demonstrate usage."""
    test_cases = [
        ("goodbye England", "uk"),  # Ukrainian
        ("book reading", "sv"),  # Swedish
        ("coffee shop", "ja"),  # Japanese
    ]
    
    for phrase, lang_code in test_cases:
        print(f"\nTesting {lang_code} Wiktionary links for: {phrase}")
        result = generate_wiktionary_links_non_english(phrase, lang_code)
        print(f"Result: {result}")

In [13]:
# Run the demo; it prints the generated links for each sample phrase.
test_wiktionary_links()


Testing uk Wiktionary links for: goodbye England
native url is: https://uk.wiktionary.org/wiki/goodbye
native url is: https://uk.wiktionary.org/wiki/england
Result: <a href="https://uk.wiktionary.org/wiki/goodbye#Англійська">goodbye</a> <a href="https://en.wiktionary.org/wiki/england#English">England</a>

Testing sv Wiktionary links for: book reading
native url is: https://sv.wiktionary.org/wiki/book
native url is: https://sv.wiktionary.org/wiki/reading
Result: <a href="https://sv.wiktionary.org/wiki/book#Engelska">book</a> <a href="https://sv.wiktionary.org/wiki/reading#Engelska">reading</a>

Testing ja Wiktionary links for: coffee shop
native url is: https://ja.wiktionary.org/wiki/coffee
native url is: https://ja.wiktionary.org/wiki/shop
Result: <a href="https://ja.wiktionary.org/wiki/coffee#英語">coffee</a> <a href="https://ja.wiktionary.org/wiki/shop#英語">shop</a>
