In [10]:

import datetime
import json
import logging
import os
import re
import tempfile
import time
from typing import Dict, List, Optional, Tuple

import ipywidgets as widgets
import torch
import torchaudio
from groq import Groq
from IPython.display import display, clear_output, Audio, HTML
from transformers import pipeline

In [11]:
# Notebook-wide logging: INFO and above, timestamped, via the root logger.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger shared by every class and function defined below.
logger = logging.getLogger(__name__)

In [12]:
class SpeechGenerator:
    """Generate speeches with a Groq-hosted LLM and render them as audio.

    The class builds a prompt from a topic/style/audience, streams the
    completion from the Groq chat API, can save the text to
    ``speech_outputs/`` and can synthesise a WAV file into
    ``speech_outputs/audio/`` with the SpeechT5 text-to-speech pipeline.

    Attributes:
        api_key: The Groq API key currently in use, or ``None``.
        client: The ``Groq`` client, or ``None`` until a key is set.
        output_folder: Directory that receives saved speech text files.
        audio_folder: Directory that receives generated WAV files.
        history: Metadata dicts of every speech generated by this instance.
        tts_engine: The ``text-to-speech`` pipeline, or ``None`` if it could
            not be initialised.
        vocoder: The vocoder exposed by ``tts_engine`` (if any); kept only so
            existing code reading this attribute keeps working.
        speaker_embeddings: Optional mapping ``voice id -> x-vector tensor``.
            Register real speaker embeddings here to get natural voices;
            voices without an entry use a synthetic placeholder.
    """

    # Prompt templates keyed by speech style; formatted with {duration} and {topic}.
    STYLE_TEMPLATES = {
        "formal": "Write a formal {duration}-minute speech about '{topic}' suitable for a professional audience.",
        "casual": "Write a casual, friendly {duration}-minute speech about '{topic}'.",
        "motivational": "Write an inspiring {duration}-minute motivational speech about '{topic}' that energizes the audience.",
        "persuasive": "Write a compelling {duration}-minute persuasive speech about '{topic}' to change minds.",
        "instructional": "Write a step-by-step {duration}-minute instructional speech on '{topic}'.",
        "debate": "Write a {duration}-minute debate speech about '{topic}' with strong arguments and counterpoints.",
        "humorous": "Write a funny {duration}-minute speech about '{topic}' with appropriate humor and wit.",
        "storytelling": "Write an engaging {duration}-minute speech about '{topic}' using storytelling techniques.",
    }

    # One sentence of audience-specific guidance appended to every prompt.
    AUDIENCE_GUIDANCE = {
        "general": "Make the speech accessible to a general audience with no specialized knowledge.",
        "experts": "Include technical depth suitable for experts in the field.",
        "children": "Use simple, engaging language and examples suitable for kids.",
        "students": "Be educational and engaging for a student audience.",
        "executives": "Focus on strategic implications and leadership perspectives.",
        "international": "Use globally accessible references and minimize culturally specific idioms.",
    }

    # LLM models available for speech generation
    AVAILABLE_MODELS = {
        "llama3-8b-8192": {"description": "Balanced model for general use", "max_tokens": 8192},
        "llama3-70b-8192": {"description": "Advanced model with better quality", "max_tokens": 8192},
        "gemma-7b-it": {"description": "Efficient model for simpler tasks", "max_tokens": 4096}
    }

    # TTS voice options with their characteristics
    TTS_VOICES = {
        "en_male_neutral": {"gender": "male", "style": "neutral", "description": "Clear, professional male voice"},
        "en_female_neutral": {"gender": "female", "style": "neutral", "description": "Clear, professional female voice"},
        "en_male_enthusiastic": {"gender": "male", "style": "enthusiastic", "description": "Energetic male voice for motivational content"},
        "en_female_enthusiastic": {"gender": "female", "style": "enthusiastic", "description": "Energetic female voice for motivational content"},
        "en_male_formal": {"gender": "male", "style": "formal", "description": "Formal, authoritative male voice"},
        "en_female_formal": {"gender": "female", "style": "formal", "description": "Formal, authoritative female voice"},
    }

    # Voice used when an unknown voice id is requested.
    DEFAULT_VOICE = "en_female_neutral"
    # Average speaking rate used to turn a duration into a target word count.
    WORDS_PER_MINUTE = 130
    # Silence inserted between consecutive sentences in the rendered audio.
    CHUNK_PAUSE_SECONDS = 0.5
    # Hard cap on completion tokens requested from the API.
    MAX_COMPLETION_TOKENS = 4096

    def __init__(self, api_key_path: Optional[str] = None):
        """
        Initialize the SpeechGenerator with optional API key from file.

        Creates the output folders, tries to load the TTS pipeline (failure
        is logged, not raised) and, if a path is given, loads the API key.

        Args:
            api_key_path: Path to a JSON file containing the Groq API key.

        Raises:
            FileNotFoundError, json.JSONDecodeError, KeyError: Propagated
                from ``load_api_key`` when ``api_key_path`` is given.
        """
        self.api_key = None
        self.client = None
        self.output_folder = "speech_outputs"
        self.audio_folder = os.path.join(self.output_folder, "audio")
        self.history = []
        self.tts_engine = None
        self.vocoder = None
        self.speaker_embeddings = {}

        # exist_ok avoids the check-then-create race of os.path.exists().
        for folder in (self.output_folder, self.audio_folder):
            os.makedirs(folder, exist_ok=True)

        # Initialize text-to-speech engine
        self._initialize_tts_engine()

        # Load API key if provided
        if api_key_path:
            self.load_api_key(api_key_path)

    def _initialize_tts_engine(self):
        """Initialize the text-to-speech engine using transformers.

        On any failure the error is logged and ``self.tts_engine`` is left as
        ``None`` so the rest of the application (text generation) still works.
        """
        self.tts_engine = None
        self.vocoder = None
        try:
            # Check if CUDA (GPU) is available
            device = "cuda" if torch.cuda.is_available() else "cpu"

            # Per the transformers docs, the text-to-speech pipeline for
            # SpeechT5 loads its HiFi-GAN vocoder itself and returns a
            # waveform, so no second "vocoder pipeline" is created here.
            self.tts_engine = pipeline(
                "text-to-speech",
                model="microsoft/speecht5_tts",
                device=device
            )
            # Kept for backward compatibility with code that reads .vocoder.
            self.vocoder = getattr(self.tts_engine, "vocoder", None)

            logger.info(f"TTS engine initialized on {device}")
        except Exception as e:
            logger.error(f"Failed to initialize TTS engine: {str(e)}")
            self.tts_engine = None
            self.vocoder = None

    def load_api_key(self, api_key_path: str) -> None:
        """
        Load API key from a JSON file.

        The file must contain a JSON object with a non-empty
        ``"groq_api_key"`` entry. The current key/client are only replaced
        once a valid key has been read.

        Args:
            api_key_path: Path to a JSON file containing the Groq API key.

        Raises:
            FileNotFoundError: If the API key file doesn't exist.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If the API key is not in the expected format.
        """
        try:
            with open(api_key_path, 'r') as f:
                config = json.load(f)
        except FileNotFoundError:
            logger.error(f"API key file not found at {api_key_path}")
            raise
        except json.JSONDecodeError:
            logger.error(f"Invalid JSON format in {api_key_path}")
            raise

        # A JSON list/string/number has no .get(); treat it as "key missing".
        api_key = config.get('groq_api_key') if isinstance(config, dict) else None
        if not api_key:
            raise KeyError("API key not found in config file.")

        self.api_key = api_key
        self.initialize_client()
        logger.info("API key loaded successfully.")

    def set_api_key(self, api_key: str) -> None:
        """
        Set the API key directly.

        Args:
            api_key: Groq API key. Surrounding whitespace is stripped.

        Raises:
            ValueError: If ``api_key`` is not a non-empty string. The
                previously configured key and client are left untouched.
        """
        if not isinstance(api_key, str) or not api_key.strip():
            # Previously a blank key was reported as "set successfully" while
            # the old client (if any) silently stayed in use.
            raise ValueError("API key must be a non-empty string.")

        self.api_key = api_key.strip()
        self.initialize_client()
        logger.info("API key set successfully.")

    def initialize_client(self) -> None:
        """Initialize the Groq client with the loaded API key.

        Logs a warning and leaves ``self.client`` unchanged when no key is set.
        """
        if not self.api_key:
            logger.warning("No API key available. Set API key before generating speeches.")
            return
        self.client = Groq(api_key=self.api_key)
        logger.info("Groq client initialized.")

    def build_prompt(self, topic: str, duration: int, emotion: str = "formal",
                   audience: str = "general", additional_instructions: str = "") -> str:
        """
        Build a detailed prompt for Groq-based speech generation.

        Unknown ``emotion`` values fall back to the "formal" template and
        unknown ``audience`` values to the "general" guidance.

        Args:
            topic: The speech topic.
            duration: Speech duration in minutes.
            emotion: Style of the speech (a key of ``STYLE_TEMPLATES``).
            audience: Target audience (a key of ``AUDIENCE_GUIDANCE``).
            additional_instructions: Any additional instructions for the model.

        Returns:
            str: Generated prompt for the LLM.
        """
        template = self.STYLE_TEMPLATES.get(emotion, self.STYLE_TEMPLATES["formal"])
        base_prompt = template.format(topic=topic, duration=duration)
        audience_note = self.AUDIENCE_GUIDANCE.get(audience, self.AUDIENCE_GUIDANCE["general"])

        # Target length derived from the average speaking rate.
        word_count = duration * self.WORDS_PER_MINUTE

        final_prompt = (
            f"{base_prompt}\n\n"
            f"{audience_note}\n\n"
            f"Structure the speech with an introduction, body, and conclusion.\n"
            f"Use engaging transitions, rhetorical devices, and paragraph breaks.\n"
            f"Include natural pauses (marked with [pause]) and emphasis points (marked with *emphasis*) to guide the delivery.\n"
            f"Add occasional delivery notes in [brackets] for pacing, tone, or gestures.\n"
            f"Aim for approximately {word_count} words to fill {duration} minutes when delivered aloud.\n"
        )

        if additional_instructions:
            final_prompt += f"\nAdditional instructions: {additional_instructions}\n"

        return final_prompt

    def generate_speech(self, topic: str, duration: int, emotion: str, audience: str,
                        model: str = "llama3-8b-8192", temperature: float = 0.7,
                        additional_instructions: str = "") -> Tuple[str, Dict]:
        """
        Generate a speech using the Groq API.

        The completion is streamed and concatenated; the resulting metadata
        dict is appended to ``self.history`` and also returned.

        Args:
            topic: The speech topic.
            duration: Speech duration in minutes.
            emotion: Style of the speech.
            audience: Target audience.
            model: The LLM model to use.
            temperature: Creativity parameter (0.0 to 1.0).
            additional_instructions: Additional guidance for the model.

        Returns:
            Tuple[str, Dict]: The generated speech text and metadata.

        Raises:
            ValueError: If API key is not set.
            Exception: Any error raised by the API call is logged and re-raised.
        """
        if not self.client:
            raise ValueError("API key not set. Use set_api_key() or load_api_key() first.")

        # Build prompt based on input
        prompt = self.build_prompt(topic, duration, emotion, audience, additional_instructions)

        # Generation metadata
        metadata = {
            "topic": topic,
            "duration": duration,
            "emotion": emotion,
            "audience": audience,
            "model": model,
            "temperature": temperature,
            "timestamp": None,  # Will be added after generation
            "word_count": 0,    # Will be updated after generation
        }

        try:
            # Set max tokens based on the model or default to a safe value
            max_tokens = self.AVAILABLE_MODELS.get(model, {}).get("max_tokens", 2048)

            # Groq API call
            completion = self.client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                max_tokens=min(max_tokens, self.MAX_COMPLETION_TOKENS),
                top_p=1,
                stream=True
            )

            # Collect the streamed pieces; join once instead of repeated +=.
            pieces = []
            for chunk in completion:
                # Defensive: skip a chunk that carries no choices rather than
                # failing the whole generation with IndexError.
                if not chunk.choices:
                    continue
                pieces.append(chunk.choices[0].delta.content or "")
            speech = "".join(pieces)

            # Update metadata
            metadata["timestamp"] = datetime.datetime.now().isoformat()
            metadata["word_count"] = len(speech.split())

            # Add to history
            self.history.append(metadata)

            return speech, metadata

        except Exception as e:
            logger.error(f"API error: {str(e)}")
            raise

    def save_speech(self, speech_text: str, metadata: Dict) -> str:
        """
        Save the generated speech to a file.

        The file name is built from the sanitized topic, style and audience;
        a numeric suffix is added if a file with that name already exists.

        Args:
            speech_text: The generated speech content.
            metadata: Speech metadata; must contain "topic", "audience",
                "emotion", "duration", "timestamp" and "model".

        Returns:
            str: Path to the saved file.
        """
        topic_clean = self._sanitize_filename(metadata["topic"])
        audience_clean = self._sanitize_filename(metadata["audience"])
        emotion_clean = self._sanitize_filename(metadata["emotion"])

        base_filename = f"speech_{topic_clean}_{emotion_clean}_{audience_clean}"
        file_path = os.path.join(self.output_folder, f"{base_filename}.txt")

        # Ensure uniqueness
        counter = 1
        while os.path.exists(file_path):
            file_path = os.path.join(self.output_folder, f"{base_filename}_{counter}.txt")
            counter += 1

        # Save speech with a small metadata header
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(f"# {metadata['topic']}\n")
            f.write(f"# Duration: {metadata['duration']} min | Style: {metadata['emotion']} | Audience: {metadata['audience']}\n")
            f.write(f"# Generated on: {metadata['timestamp']} using {metadata['model']}\n\n")
            f.write(speech_text)

        logger.info(f"Speech saved to {file_path}")
        return file_path

    def prepare_text_for_tts(self, text: str) -> List[str]:
        """
        Prepare the text for text-to-speech by removing or converting special markers.

        Bracketed delivery notes (``[pause]``, ``[smile]`` ...) are removed,
        ``*emphasis*`` markers are dropped while keeping the words, every
        paragraph is made to end with sentence punctuation, and the result is
        split into sentences.

        Args:
            text: The speech text with special markers.

        Returns:
            List[str]: Cleaned sentences in order, each containing at least
            one letter or digit. Empty when nothing speakable remains.
        """
        # Remove delivery notes in brackets
        text = re.sub(r'\[.*?\]', '', text)

        # Remove emphasis markers (asterisks) but keep the text
        text = re.sub(r'\*(.*?)\*', r'\1', text)

        sentences = []
        # Blank lines (possibly containing stray spaces) separate paragraphs.
        for paragraph in re.split(r'\n\s*\n', text):
            paragraph = " ".join(paragraph.split())
            if not paragraph:
                continue
            # Close the paragraph so the TTS pauses there, but never stack a
            # second period onto text that already ends a sentence.
            if paragraph[-1] not in ".!?":
                paragraph += "."
            for sentence in re.split(r'(?<=[.!?])\s+', paragraph):
                # Drop punctuation-only leftovers such as "." from "[pause]."
                if any(ch.isalnum() for ch in sentence):
                    sentences.append(sentence)

        return sentences

    def _get_speaker_embedding(self, voice: str):
        """Return the speaker embedding tensor for ``voice``.

        Uses ``self.speaker_embeddings[voice]`` when the caller registered
        one. Otherwise a deterministic, L2-normalised random vector is
        created once per voice and cached. This is only a placeholder so
        that synthesis can run without extra downloads.

        NOTE(review): SpeechT5 is documented to require an x-vector; audio
        quality with the placeholder is expected to be poor -- register real
        x-vectors in ``speaker_embeddings`` for production use.
        """
        embedding = self.speaker_embeddings.get(voice)
        if embedding is None:
            model_config = getattr(getattr(self.tts_engine, "model", None), "config", None)
            # 512 is the documented SpeechT5 default speaker_embedding_dim.
            dim = getattr(model_config, "speaker_embedding_dim", 512)
            voice_ids = list(self.TTS_VOICES)
            seed = voice_ids.index(voice) if voice in voice_ids else len(voice_ids)
            rng = torch.Generator().manual_seed(seed)
            embedding = torch.nn.functional.normalize(
                torch.randn(1, dim, generator=rng), dim=-1
            )
            self.speaker_embeddings[voice] = embedding
            logger.warning(
                f"No speaker embedding registered for voice '{voice}'; "
                "using a synthetic placeholder embedding."
            )
        return embedding

    def generate_speech_audio(self, text: str, voice: str = "en_female_neutral",
                             output_filename: Optional[str] = None) -> str:
        """
        Generate audio for the speech text using the TTS engine.

        The text is cleaned and split into sentences, each sentence is
        synthesised separately, the pieces are joined with
        ``CHUNK_PAUSE_SECONDS`` of silence in between and written as a WAV
        file into ``self.audio_folder``.

        Args:
            text: The speech text.
            voice: Voice ID to use; unknown ids fall back to ``DEFAULT_VOICE``.
            output_filename: Optional custom filename for the audio. A
                timestamp-based name is used when omitted.

        Returns:
            str: Path to the generated audio file.

        Raises:
            ValueError: If the TTS engine is unavailable or the text contains
                nothing speakable after cleaning.
            Exception: Any synthesis or file-writing error is logged and re-raised.
        """
        if not self.tts_engine:
            raise ValueError("TTS engine not available. Check logs for initialization errors.")

        # Prepare text by cleaning and chunking
        text_chunks = self.prepare_text_for_tts(text)
        if not text_chunks:
            # Previously this surfaced later as an obscure torch.cat/NameError.
            raise ValueError("No speakable text found after removing delivery markers.")

        # Generate a default filename if not provided
        if not output_filename:
            output_filename = f"speech_audio_{int(time.time())}.wav"

        output_path = os.path.join(self.audio_folder, output_filename)

        voice_id = voice if voice in self.TTS_VOICES else self.DEFAULT_VOICE
        voice_info = self.TTS_VOICES[voice_id]

        try:
            logger.info(f"Generating audio using {voice_info['description']}")

            speaker_embedding = self._get_speaker_embedding(voice_id)

            segments = []
            sample_rate = None
            silence = None

            for chunk in text_chunks:
                # The pipeline takes the text positionally and model-specific
                # arguments through forward_params; it returns a dict with
                # "audio" (waveform) and "sampling_rate".
                result = self.tts_engine(
                    chunk,
                    forward_params={"speaker_embeddings": speaker_embedding}
                )
                waveform = torch.as_tensor(result["audio"], dtype=torch.float32).flatten()

                if sample_rate is None:
                    sample_rate = int(result["sampling_rate"])
                    silence = torch.zeros(int(self.CHUNK_PAUSE_SECONDS * sample_rate))

                # Short pause between sentences, none after the last one.
                if segments:
                    segments.append(silence)
                segments.append(waveform)

            # Combine all audio chunks
            full_audio = torch.cat(segments)

            # torchaudio expects (channels, samples); the speech is mono.
            torchaudio.save(output_path, full_audio.unsqueeze(0), sample_rate)

            logger.info(f"Audio saved to {output_path}")
            return output_path

        except Exception as e:
            logger.error(f"Error generating audio: {str(e)}")
            raise

    def get_speech_history(self) -> List[Dict]:
        """
        Get the history of generated speeches.

        Returns:
            List[Dict]: List of metadata for all generated speeches. This is
            the internal list itself, not a copy.
        """
        return self.history

    @staticmethod
    def _sanitize_filename(text: str) -> str:
        """
        Convert text to a safe filename.

        Spaces become underscores, every character other than ASCII letters,
        digits and underscore is removed, and the result is cut to 30
        characters.

        Args:
            text: Text to sanitize.

        Returns:
            str: Sanitized text suitable for filenames (may be empty).
        """
        return re.sub(r'[^a-zA-Z0-9_]', '', text.replace(" ", "_"))[:30]  # Limit length


In [13]:
class SpeechGeneratorUI:
    """Class for the Jupyter notebook UI for speech generation.

    Builds an ipywidgets form (API key, model, content and voice settings),
    displays it immediately on construction, and wires the buttons to a
    ``SpeechGenerator``.

    Attributes:
        generator: The ``SpeechGenerator`` doing the actual work.
        speech_text: Text of the most recently generated speech, or ``None``.
        speech_metadata: Metadata dict of that speech, or ``None``.
        audio_path: Path of the most recently generated audio file, or ``None``.
    """

    def __init__(self, generator: "SpeechGenerator"):
        """
        Initialize the UI with a SpeechGenerator instance.

        Args:
            generator: SpeechGenerator instance.
        """
        self.generator = generator
        self.speech_text = None
        self.speech_metadata = None
        self.audio_path = None
        self.create_widgets()

    def create_widgets(self) -> None:
        """Create and display the input widgets."""
        # API Key widget
        self.api_key_input = widgets.Password(
            description="API Key:",
            placeholder="Enter Groq API key",
            layout=widgets.Layout(width='50%')
        )
        self.api_key_button = widgets.Button(description="Set API Key")
        self.api_key_button.on_click(self._on_api_key_button_click)

        # Input widgets
        self.topic_input = widgets.Text(
            value="Recent Trends in AI",
            description="Topic:",
            layout=widgets.Layout(width='80%')
        )

        self.emotion_dropdown = widgets.Dropdown(
            options=list(self.generator.STYLE_TEMPLATES.keys()),
            value="instructional",
            description="Style:"
        )

        self.duration_slider = widgets.IntSlider(
            value=3, min=1, max=15, step=1,
            description="Duration (min):"
        )

        self.audience_dropdown = widgets.Dropdown(
            options=list(self.generator.AUDIENCE_GUIDANCE.keys()),
            value="students",
            description="Audience:"
        )

        self.model_dropdown = widgets.Dropdown(
            options=list(self.generator.AVAILABLE_MODELS.keys()),
            value="llama3-8b-8192",
            description="Model:"
        )

        self.temperature_slider = widgets.FloatSlider(
            value=0.7, min=0.1, max=1.0, step=0.1,
            description="Temperature:"
        )

        self.additional_instructions = widgets.Textarea(
            value="",
            placeholder="Enter any additional instructions here...",
            description="Additional:",
            layout=widgets.Layout(width='80%', height='80px')
        )

        # TTS voice selection: label is "<Gender> <Style>", value is the voice id
        self.voice_dropdown = widgets.Dropdown(
            options=[(f"{v['gender'].capitalize()} {v['style'].capitalize()}", k) for k, v in self.generator.TTS_VOICES.items()],
            value="en_female_neutral",
            description="Voice:"
        )

        # Action buttons
        self.generate_button = widgets.Button(
            description="Generate Speech",
            button_style='primary'
        )
        self.generate_button.on_click(self._on_generate_button_click)

        # Save/audio stay disabled until a speech has been generated.
        self.save_button = widgets.Button(
            description="Save Text",
            button_style='success',
            disabled=True
        )
        self.save_button.on_click(self._on_save_button_click)

        self.audio_button = widgets.Button(
            description="Generate Audio",
            button_style='info',
            disabled=True
        )
        self.audio_button.on_click(self._on_audio_button_click)

        self.clear_button = widgets.Button(
            description="Clear Output",
            button_style='warning'
        )
        self.clear_button.on_click(self._on_clear_button_click)

        # Output areas
        self.output = widgets.Output()
        self.status_output = widgets.Output()
        self.audio_output = widgets.Output()

        # Display widgets
        display(widgets.HTML("<h2>🎤 AI Speech Generator with Audio</h2>"))
        display(widgets.HBox([self.api_key_input, self.api_key_button]))

        # Model selection area
        model_box = widgets.VBox([
            widgets.HTML("<h3>Model Settings</h3>"),
            widgets.HBox([self.model_dropdown, self.temperature_slider])
        ])

        # Content settings area
        content_box = widgets.VBox([
            widgets.HTML("<h3>Speech Content</h3>"),
            self.topic_input,
            widgets.HBox([self.emotion_dropdown, self.audience_dropdown, self.duration_slider]),
            self.additional_instructions
        ])

        # Audio settings
        audio_box = widgets.VBox([
            widgets.HTML("<h3>Audio Settings</h3>"),
            self.voice_dropdown
        ])

        # Button area
        button_box = widgets.HBox([
            self.generate_button, self.save_button, self.audio_button, self.clear_button
        ])

        display(model_box, content_box, audio_box, button_box, self.status_output, self.output, self.audio_output)

        with self.status_output:
            print("Ready to generate speeches. Please set your API key to begin.")

    @staticmethod
    def _build_audio_filename(topic_clean: str, voice_id: str) -> str:
        """Build the WAV file name for a topic/voice combination.

        Everything after the first underscore of the voice id is used (e.g.
        ``en_male_formal`` -> ``male_formal``) so that different styles of
        the same gender no longer overwrite each other's file. An id without
        an underscore is used as-is.
        """
        voice_tag = voice_id.split('_', 1)[-1]
        return f"{topic_clean}_{voice_tag}.wav"

    def _on_api_key_button_click(self, button):
        """Handle API key button click."""
        with self.status_output:
            clear_output()
            try:
                self.generator.set_api_key(self.api_key_input.value)
                print("✅ API key set successfully. Ready to generate speeches.")
            except Exception as e:
                print(f"❌ Error setting API key: {str(e)}")

    def _on_generate_button_click(self, button):
        """Handle generate button click.

        Runs the generation with the current form values, shows the text in
        the main output area and enables the save/audio buttons on success.
        """
        with self.status_output:
            clear_output()
            print("🔄 Generating speech... Please wait.")

        # Block repeated clicks from queueing up further API calls.
        self.generate_button.disabled = True

        with self.output:
            clear_output()
            try:
                # Generate speech
                speech_text, metadata = self.generator.generate_speech(
                    topic=self.topic_input.value,
                    duration=self.duration_slider.value,
                    emotion=self.emotion_dropdown.value,
                    audience=self.audience_dropdown.value,
                    model=self.model_dropdown.value,
                    temperature=self.temperature_slider.value,
                    additional_instructions=self.additional_instructions.value
                )

                # Store the speech text and metadata for saving later
                self.speech_text = speech_text
                self.speech_metadata = metadata

                # Enable save and audio buttons
                self.save_button.disabled = False
                self.audio_button.disabled = False

                # Display speech text with metadata
                print(f"# {metadata['topic']}")
                print(f"# Style: {metadata['emotion']} | Audience: {metadata['audience']} | Duration: {metadata['duration']} min")
                print(f"# Word count: {metadata['word_count']} words\n")
                print(speech_text)

                # Update status
                with self.status_output:
                    clear_output()
                    print(f"✅ Speech generated successfully with {metadata['word_count']} words (~{metadata['duration']} min).")
                    print("Use the 'Save Text' button to save the text or 'Generate Audio' to create an audio version.")

            except Exception as e:
                with self.status_output:
                    clear_output()
                    print(f"❌ Error generating speech: {str(e)}")
            finally:
                self.generate_button.disabled = False

    def _on_save_button_click(self, button):
        """Handle save button click."""
        try:
            if self.speech_text is not None:
                file_path = self.generator.save_speech(self.speech_text, self.speech_metadata)
                with self.status_output:
                    clear_output()
                    print(f"💾 Speech text saved to: {file_path}")
            else:
                with self.status_output:
                    clear_output()
                    print("❌ No speech generated yet to save.")
        except Exception as e:
            with self.status_output:
                clear_output()
                print(f"❌ Error saving speech: {str(e)}")

    def _on_audio_button_click(self, button):
        """Handle audio generation button click.

        Synthesises the current speech with the selected voice and shows an
        audio player plus a download link in the audio output area.
        """
        if not self.speech_text:
            with self.status_output:
                clear_output()
                print("❌ Please generate a speech first before creating audio.")
            return

        with self.status_output:
            clear_output()
            print("🔊 Generating audio... This may take a moment.")

        # Synthesis is slow; avoid stacking up duplicate requests.
        self.audio_button.disabled = True

        with self.audio_output:
            clear_output()
            try:
                # Generate a filename from speech metadata
                topic_clean = self.generator._sanitize_filename(self.speech_metadata["topic"])
                voice_id = self.voice_dropdown.value
                audio_filename = self._build_audio_filename(topic_clean, voice_id)

                # Generate audio
                self.audio_path = self.generator.generate_speech_audio(
                    text=self.speech_text,
                    voice=voice_id,
                    output_filename=audio_filename
                )

                # Show audio player
                audio = Audio(self.audio_path)
                display(HTML("<h3>Speech Audio Preview</h3>"))
                display(audio)

                # Show download link
                display(HTML(f'<a href="{self.audio_path}" download>Download Audio File</a>'))

                with self.status_output:
                    clear_output()
                    print(f"✅ Audio generated successfully and saved to: {self.audio_path}")
                    print("Use the audio player above to preview or download the file.")

            except Exception as e:
                with self.status_output:
                    clear_output()
                    print(f"❌ Error generating audio: {str(e)}")
                print(f"Error details: {str(e)}")
            finally:
                # A speech exists (checked above), so audio can be retried.
                self.audio_button.disabled = False

    def _on_clear_button_click(self, button):
        """Handle clear button click.

        Only the displayed output is cleared; the stored speech text and
        metadata are kept.
        """
        with self.output:
            clear_output()
        with self.audio_output:
            clear_output()
        with self.status_output:
            clear_output()
            print("Output cleared. Ready for a new speech generation.")


In [14]:
import time

# Create and run the application
def run_speech_generator():
    """Initialize and run the speech generator application.

    Creates a ``SpeechGenerator``, tries to load the Groq API key from
    ``~/.groq_api_key.json`` when that file exists, then builds and displays
    the widget UI. A missing, malformed or incomplete key file does not stop
    the UI from starting; the key can still be entered in the form.

    Returns:
        The created ``SpeechGeneratorUI`` (so the caller can reach
        ``ui.generator``), or ``None`` if initialization failed.
    """
    try:
        generator = SpeechGenerator()

        # Try to load API key from a file, but don't fail if not found
        api_key_path = os.path.join(os.path.expanduser("~"), ".groq_api_key.json")
        if os.path.exists(api_key_path):
            try:
                generator.load_api_key(api_key_path)
            except Exception as e:
                # A bad key file is a configuration problem, not a missing
                # package: report it as such and carry on without a key.
                print(f"Could not load API key from {api_key_path}: {str(e)}")
                print("You can still enter the API key in the form below.")

        # Create and display the UI
        return SpeechGeneratorUI(generator)

    except Exception as e:
        print(f"Error initializing the Speech Generator: {str(e)}")
        print("Please make sure you have all required packages installed:")
        print("pip install groq ipywidgets transformers torchaudio torch")
        return None


In [None]:
import sys

# Launch the UI when running inside a Jupyter kernel or as a plain script.
if 'ipykernel' in sys.modules or __name__ == "__main__":
    run_speech_generator()

config.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/585M [00:00<?, ?B/s]