# AI Podcast Generator - Two Hosts Conversation

This notebook generates podcasts with two AI hosts having a natural conversation about any topic you provide.

**Features:**
- Uses Mistral 7B for intelligent conversation generation
- VibeVoice for real-time text-to-speech
- A distinct voice for each of the two hosts
- Cloudflared for public access
- Beautiful Gradio UI

**Requirements:** T4 GPU or better

## Step 1: Setup Environment & Install Dependencies

In [None]:
# Check for GPU
# Prints the detected CUDA device name, or instructions for enabling a GPU
# runtime when no CUDA device is available.
import torch
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    print(f"\u2705 GPU detected: {gpu_name}")
    # Only T4, A100 and V100 are recognized by name; any other GPU gets the warning.
    if "T4" not in gpu_name and "A100" not in gpu_name and "V100" not in gpu_name:
        print("\u26a0\ufe0f Warning: For best performance, use T4 GPU or better")
else:
    print("""
    \u26a0\ufe0f WARNING: No GPU detected!
    
    To enable GPU:
    1. Click 'Runtime' > 'Change runtime type'
    2. Select 'T4 GPU'
    3. Click 'Save'
    """)

# Clone VibeVoice repository
# The `[ -d ... ] ||` guard skips the clone when /content/VibeVoice already
# exists, so re-running this cell does not clone again.
![ -d /content/VibeVoice ] || git clone --quiet --branch main --depth 1 https://github.com/microsoft/VibeVoice.git /content/VibeVoice
# NOTE: printed unconditionally; the exit status of the clone is not checked.
print("\u2705 Cloned VibeVoice repository")

# Install dependencies
!pip install -q transformers accelerate bitsandbytes gradio scipy
# Installing uv is best-effort: errors are discarded and the exit status is forced to success.
!pip install -q uv 2>/dev/null || true
# Install VibeVoice in editable mode with uv; if that command fails, fall back to pip.
!uv pip install --system -e /content/VibeVoice 2>/dev/null || pip install -q -e /content/VibeVoice
# Download the cloudflared binary into the current working directory and make it executable.
!wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared && chmod +x cloudflared
# NOTE: printed unconditionally; the install commands above are not checked for failure.
print("\u2705 Installed dependencies")

## Step 2: Download Models

In [None]:
from huggingface_hub import snapshot_download

# Download VibeVoice TTS model
# The files are placed in a fixed local directory; the TTS setup cell loads
# the model from that same path.
print("Downloading VibeVoice TTS model...")
snapshot_download("microsoft/VibeVoice-Realtime-0.5B", local_dir="/content/models/VibeVoice-Realtime-0.5B")
print("\u2705 Downloaded VibeVoice TTS model")

# Load Mistral 7B for conversation
print("\nLoading Mistral 7B for conversation generation...")
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Use 4-bit quantization to fit in T4 GPU memory
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = "mistralai/Mistral-7B-Instruct-v0.2"
llm_tokenizer = AutoTokenizer.from_pretrained(model_id)
# trust_remote_code is deliberately NOT enabled: the Mistral architecture is
# implemented inside transformers itself, so there is no reason to allow
# Python code shipped in a Hub repository to execute in this runtime.
llm_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)
print("\u2705 Loaded Mistral 7B model")

## Step 3: Setup VibeVoice TTS

In [None]:
import sys
import os
import numpy as np
from scipy.io import wavfile
from pathlib import Path

# Add VibeVoice to path
sys.path.insert(0, '/content/VibeVoice')

# Import the correct model and processor classes
from vibevoice.modular.modeling_vibevoice_streaming_inference import VibeVoiceStreamingForConditionalGenerationInference
from vibevoice.processor.vibevoice_streaming_processor import VibeVoiceStreamingProcessor

print("Loading VibeVoice TTS model...")
tts_model_path = "/content/models/VibeVoice-Realtime-0.5B"

# The processor is loaded from the same directory as the model weights.
tts_processor = VibeVoiceStreamingProcessor.from_pretrained(tts_model_path)

# Arguments shared by both load attempts; only the attention backend differs.
_tts_load_kwargs = {"torch_dtype": torch.bfloat16, "device_map": "cuda"}

# First try the flash-attention backend; on any failure report it and retry
# the load with PyTorch SDPA attention.
try:
    tts_model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
        tts_model_path,
        attn_implementation="flash_attention_2",
        **_tts_load_kwargs,
    )
except Exception as e:
    print(f"Flash attention not available, falling back to SDPA: {e}")
    tts_model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
        tts_model_path,
        attn_implementation="sdpa",
        **_tts_load_kwargs,
    )

# Switch the module to evaluation mode.
tts_model.eval()

# Rebuild the noise scheduler from its own config, overriding the solver type
# and beta schedule, then fix the number of inference steps to 5.
_scheduler = tts_model.model.noise_scheduler
tts_model.model.noise_scheduler = _scheduler.from_config(
    _scheduler.config,
    algorithm_type="sde-dpmsolver++",
    beta_schedule="squaredcos_cap_v2",
)
tts_model.set_ddpm_inference_steps(num_steps=5)

print("\u2705 Loaded VibeVoice TTS model")

def select_host_voices(voice_names, default_voice="en-Carter_man"):
    """Choose the default voices for the two hosts.

    Args:
        voice_names: Ordered list of available voice preset names.
        default_voice: Name used for both hosts when no presets are available.

    Returns:
        A ``(host1, host2)`` tuple. Host 1 is the first available voice.
        Host 2 is the first *other* voice whose name suggests a female speaker
        ("woman", "emma" or "grace"); failing that, the next available voice;
        failing that (only one voice), the same voice as host 1.
    """
    if not voice_names:
        return default_voice, default_voice

    host1 = voice_names[0]
    # Never consider host 1's own voice for host 2, otherwise both hosts can
    # end up sounding identical even though other voices exist.
    candidates = [name for name in voice_names if name != host1]
    if not candidates:
        return host1, host1

    female_markers = ("woman", "emma", "grace")
    for name in candidates:
        lowered = name.lower()
        if any(marker in lowered for marker in female_markers):
            return host1, name
    return host1, candidates[0]


# Find available voices
voice_dir = Path("/content/VibeVoice/demo/voices/streaming_model")
voice_presets = {}
if voice_dir.exists():
    # Sort so the default voice choice does not depend on directory listing order.
    for pt_path in sorted(voice_dir.glob("*.pt")):
        voice_presets[pt_path.stem] = pt_path
    print(f"\u2705 Found {len(voice_presets)} voices: {list(voice_presets.keys())}")
else:
    print("\u26a0\ufe0f Voice directory not found")

# Select two different voices for hosts (preferring a female voice for host 2
# for variety, when one other than host 1's voice is available)
voice_list = list(voice_presets.keys())
HOST1_VOICE, HOST2_VOICE = select_host_voices(voice_list)

print(f"\nHost 1 (Alex) voice: {HOST1_VOICE}")
print(f"Host 2 (Sam) voice: {HOST2_VOICE}")

# Cache loaded voice presets
voice_cache = {}

## Step 4: Define Core Functions

In [None]:
import copy
import threading
from vibevoice.modular.streamer import AudioStreamer

def load_voice_preset(voice_name):
    """Load a voice preset, caching it for subsequent calls.

    Args:
        voice_name: Key into ``voice_presets``. An unknown name falls back to
            ``HOST1_VOICE`` (a message is printed).

    Returns:
        The object deserialized from the preset's ``.pt`` file.

    Raises:
        KeyError: If neither ``voice_name`` nor the ``HOST1_VOICE`` fallback
            has a preset file or a cached entry.
    """
    if voice_name in voice_cache:
        return voice_cache[voice_name]

    if voice_name not in voice_presets:
        print(f"Voice {voice_name} not found, using {HOST1_VOICE}")
        voice_name = HOST1_VOICE
        # The fallback voice may already be loaded; reuse it instead of
        # reading the file again.
        if voice_name in voice_cache:
            return voice_cache[voice_name]
        if voice_name not in voice_presets:
            raise KeyError(f"No voice preset available for fallback voice {voice_name!r}")

    voice_path = voice_presets[voice_name]
    # SECURITY NOTE: weights_only=False performs a full pickle load, which can
    # execute arbitrary code. Only load preset files from a trusted source.
    prefilled_outputs = torch.load(voice_path, map_location="cuda", weights_only=False)
    voice_cache[voice_name] = prefilled_outputs
    return prefilled_outputs


def parse_dialogue_lines(response, alex_voice, sam_voice):
    """Extract speaker turns from raw script text.

    Args:
        response: Script text, one turn per line, each turn starting with
            ``ALEX:`` or ``SAM:``. Other lines are ignored.
        alex_voice: Voice name attached to Alex's turns.
        sam_voice: Voice name attached to Sam's turns.

    Returns:
        A list of ``(speaker, text, voice)`` tuples in script order. Turns
        that are empty, contain ``[`` (bracketed placeholders), or consist of
        the literal word "dialogue" are skipped.
    """
    speakers = (("ALEX:", "Alex", alex_voice), ("SAM:", "Sam", sam_voice))
    conversation = []
    for raw_line in response.split('\n'):
        line = raw_line.strip()
        for prefix, speaker, voice in speakers:
            if not line.startswith(prefix):
                continue
            # Strip only the leading tag; a later "ALEX:"/"SAM:" inside the
            # sentence is part of what the host says.
            text = line[len(prefix):].strip()
            if text and '[' not in text and text.lower() != 'dialogue':
                conversation.append((speaker, text, voice))
            break
    return conversation


def generate_conversation(topic, num_exchanges=5):
    """Generate a podcast conversation between two hosts about a topic.

    Args:
        topic: Subject of the conversation, inserted into the LLM prompt.
        num_exchanges: Number of back-and-forth exchanges requested in the prompt.

    Returns:
        A list of ``(speaker, text, voice)`` tuples using ``HOST1_VOICE`` for
        Alex and ``HOST2_VOICE`` for Sam. The list is empty when the model
        output contains no parsable turns.
    """
    prompt = f"""Write a natural podcast conversation between two hosts about: {topic}

Host 1 is named Alex - enthusiastic, curious, asks great questions.
Host 2 is named Sam - knowledgeable, provides insights, shares anecdotes.

Write exactly {num_exchanges} back-and-forth exchanges.
Each line must start with either ALEX: or SAM: followed by their actual dialogue.
Do NOT include any placeholders, brackets, or stage directions.
Just write what they actually say out loud.
Keep each response to 2-3 sentences maximum.
Make it engaging and conversational.
"""

    messages = [{"role": "user", "content": prompt}]
    # return_dict=True yields input_ids together with the attention mask, and
    # the tensors are moved to wherever the model actually lives.
    encoded = llm_tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        return_dict=True,
    ).to(llm_model.device)

    with torch.no_grad():
        outputs = llm_model.generate(
            **encoded,
            max_new_tokens=1024,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=llm_tokenizer.eos_token_id
        )

    # generate() returns prompt + completion. Decode only the completion:
    # otherwise the first generated turn is glued to the end of the prompt
    # text and is dropped by the line parser.
    prompt_length = encoded["input_ids"].shape[-1]
    response = llm_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return parse_dialogue_lines(response, HOST1_VOICE, HOST2_VOICE)


def clean_text_for_tts(text):
    """Prepare a line of dialogue for TTS.

    Drops every parenthesized span (stage directions such as "(laughs)") and
    collapses all runs of whitespace into single spaces, with no leading or
    trailing whitespace in the result.
    """
    import re
    without_directions = re.sub(r'\([^)]*\)', '', text)
    # split() with no arguments discards leading/trailing whitespace and
    # splits on any whitespace run, so joining yields a fully trimmed string.
    words = without_directions.split()
    return ' '.join(words)


def text_to_speech(text, voice_name):
    """Convert text to speech using VibeVoice.

    Generation runs in a background thread that pushes audio chunks into an
    ``AudioStreamer``; this function consumes the stream and joins the chunks.

    Args:
        text: Dialogue to speak. Parenthesized stage directions are removed first.
        voice_name: Voice preset name passed to ``load_voice_preset``.

    Returns:
        ``(audio, 24000)`` where ``audio`` is a flat float32 numpy array, or
        ``(None, None)`` when the cleaned text is empty, generation raised,
        no audio was produced, or any other error occurred (errors are
        printed, not raised).
    """
    # Clean the text first
    text = clean_text_for_tts(text)

    if not text or not text.strip():
        return None, None

    try:
        # Load voice preset
        prefilled_outputs = load_voice_preset(voice_name)

        # Normalize the typographic apostrophe (U+2019) to a plain ASCII one.
        # The previous replace("'", "'") replaced a character with itself.
        text = text.strip().replace("\u2019", "'")

        # Prepare inputs using the processor
        processed = tts_processor.process_input_with_cached_prompt(
            text=text,
            cached_prompt=prefilled_outputs,
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )

        # Move to device (values without a .to() method are passed through)
        inputs = {k: v.to("cuda") if hasattr(v, 'to') else v for k, v in processed.items()}

        # Setup streaming
        audio_streamer = AudioStreamer(batch_size=1, stop_signal=None, timeout=None)
        stop_event = threading.Event()
        errors = []

        def run_generation():
            """Run generation, recording any exception and always ending the stream."""
            try:
                tts_model.generate(
                    **inputs,
                    max_new_tokens=None,
                    cfg_scale=1.5,
                    tokenizer=tts_processor.tokenizer,
                    generation_config={"do_sample": False, "temperature": 1.0, "top_p": 1.0},
                    audio_streamer=audio_streamer,
                    stop_check_fn=stop_event.is_set,
                    verbose=False,
                    refresh_negative=True,
                    # A deep copy keeps the cached preset untouched
                    # (presumably generate() mutates it - TODO confirm).
                    all_prefilled_outputs=copy.deepcopy(prefilled_outputs),
                )
            except Exception as e:
                errors.append(e)
                import traceback
                traceback.print_exc()
            finally:
                # End the stream even on failure so the consumer loop below terminates.
                audio_streamer.end()

        # Start generation in thread
        gen_thread = threading.Thread(target=run_generation, daemon=True)
        gen_thread.start()

        # Collect audio chunks
        all_audio = []
        try:
            stream = audio_streamer.get_stream(0)
            for audio_chunk in stream:
                if torch.is_tensor(audio_chunk):
                    audio_chunk = audio_chunk.detach().cpu().to(torch.float32).numpy()
                else:
                    audio_chunk = np.asarray(audio_chunk, dtype=np.float32)

                if audio_chunk.ndim > 1:
                    audio_chunk = audio_chunk.reshape(-1)

                # Scale down only chunks whose peak exceeds full scale (1.0)
                peak = np.max(np.abs(audio_chunk)) if audio_chunk.size else 0.0
                if peak > 1.0:
                    audio_chunk = audio_chunk / peak

                all_audio.append(audio_chunk.astype(np.float32))
        finally:
            # Ask the generator to stop and give the worker up to 30 s to finish.
            stop_event.set()
            audio_streamer.end()
            gen_thread.join(timeout=30)

        if errors:
            print(f"Generation error: {errors[0]}")
            return None, None

        if all_audio:
            audio = np.concatenate(all_audio)
            return audio, 24000
        else:
            print("No audio chunks collected")
            return None, None

    except Exception as e:
        print(f"TTS Error: {e}")
        import traceback
        traceback.print_exc()
        return None, None


print("\u2705 Core functions defined")


## Step 5: Test TTS (Optional)

In [None]:
# Smoke-test the TTS pipeline: synthesize one short phrase with the Host 1
# voice and, on success, report its duration and show an inline audio player.
print("Testing TTS with Host 1 voice...")
test_audio, sr = text_to_speech("Hello, welcome to our podcast!", HOST1_VOICE)

if test_audio is None:
    print("\u274c TTS test failed")
else:
    print(f"\u2705 TTS working! Audio length: {len(test_audio)/sr:.2f} seconds")
    # Play audio in Colab
    from IPython.display import Audio, display
    player = Audio(test_audio, rate=sr)
    display(player)

## Step 6: Launch Podcast Generator UI

In [None]:
import gradio as gr
import subprocess
import threading
import re
import time

def generate_podcast(topic, num_exchanges, host1_voice, host2_voice, progress=gr.Progress()):
    """Generate a podcast script and its audio for the Gradio UI.

    Args:
        topic: Podcast topic entered by the user.
        num_exchanges: Requested number of exchanges (converted with ``int``).
        host1_voice: Voice name used for Alex.
        host2_voice: Voice name used for Sam.
        progress: Gradio progress tracker.

    Returns:
        ``(markdown_text, audio)`` where ``audio`` is ``(sample_rate, samples)``
        for ``gr.Audio``, or ``None`` when nothing could be synthesized.
    """
    # Do not spend a full LLM generation on an empty prompt.
    topic = (topic or "").strip()
    if not topic:
        return "Please enter a podcast topic.", None

    progress(0, desc="Generating conversation script...")

    # Generate the conversation with selected voices
    conversation = generate_conversation_with_voices(topic, int(num_exchanges), host1_voice, host2_voice)

    if not conversation:
        return "Failed to generate conversation. Please try again.", None

    # Display the script
    script = "\n\n".join([f"**{speaker}:** {text}" for speaker, text, _ in conversation])

    progress(0.2, desc="Generating audio...")

    # Generate audio for each line; lines whose synthesis fails are skipped.
    all_audio = []
    sample_rate = 24000

    for i, (speaker, text, voice) in enumerate(conversation):
        progress((0.2 + 0.7 * (i / len(conversation))), desc=f"Generating audio for {speaker} ({i+1}/{len(conversation)})...")

        audio, sr = text_to_speech(text, voice)
        if audio is None:
            continue

        sample_rate = sr
        if all_audio:
            # 0.5 s pause between turns only (no trailing silence). float32
            # keeps the concatenated track from being upcast to float64.
            all_audio.append(np.zeros(int(sample_rate * 0.5), dtype=np.float32))
        all_audio.append(audio)

    if not all_audio:
        return f"Audio generation failed.\n\n---\n\n{script}", None

    progress(0.95, desc="Finalizing...")

    # Combine all audio
    combined_audio = np.concatenate(all_audio)

    # Normalize to a peak of 0.9; an all-zero track is left untouched.
    peak = np.abs(combined_audio).max()
    if peak > 0:
        combined_audio = combined_audio / peak * 0.9

    # Return as tuple for Gradio Audio (sample_rate, audio_array)
    progress(1.0, desc="Done!")
    return f"**Podcast Generated!**\n\n---\n\n{script}", (sample_rate, combined_audio)


def parse_script_lines(response, alex_voice, sam_voice):
    """Extract speaker turns from raw script text.

    Args:
        response: Script text, one turn per line, each turn starting with
            ``ALEX:`` or ``SAM:``. Other lines are ignored.
        alex_voice: Voice name attached to Alex's turns.
        sam_voice: Voice name attached to Sam's turns.

    Returns:
        A list of ``(speaker, text, voice)`` tuples in script order. Turns
        that are empty, contain ``[`` (bracketed placeholders), or consist of
        the literal word "dialogue" are skipped.
    """
    speakers = (("ALEX:", "Alex", alex_voice), ("SAM:", "Sam", sam_voice))
    conversation = []
    for raw_line in response.split('\n'):
        line = raw_line.strip()
        for prefix, speaker, voice in speakers:
            if not line.startswith(prefix):
                continue
            # Strip only the leading tag; a later "ALEX:"/"SAM:" inside the
            # sentence is part of what the host says.
            text = line[len(prefix):].strip()
            if text and '[' not in text and text.lower() != 'dialogue':
                conversation.append((speaker, text, voice))
            break
    return conversation


def generate_conversation_with_voices(topic, num_exchanges, host1_voice, host2_voice):
    """Generate a full podcast script using the selected voices.

    Args:
        topic: Subject of the episode, inserted into the LLM prompt.
        num_exchanges: Number of exchanges requested in the prompt.
        host1_voice: Voice name attached to Alex's turns.
        host2_voice: Voice name attached to Sam's turns.

    Returns:
        A list of ``(speaker, text, voice)`` tuples; empty when the model
        output contains no parsable turns.
    """
    prompt = f"""You are a podcast script writer. Write a complete podcast episode script about: {topic}

The podcast has two hosts:
- ALEX: The main host who introduces the show, welcomes listeners, and asks questions
- SAM: The co-host who provides expert insights and explanations

IMPORTANT STRUCTURE:
1. ALEX must START by welcoming listeners to the podcast and introducing today's topic
2. They discuss the topic naturally for {num_exchanges} exchanges total
3. ALEX must END by thanking Sam and the listeners, and signing off

FORMAT RULES:
- Each line starts with ALEX: or SAM: followed by what they say
- Write ONLY their spoken words, no stage directions or brackets
- Keep each turn to 1-3 sentences
- Make it sound natural and conversational

Example opening:
ALEX: Hey everyone, welcome back to the show! Today we're diving into something really exciting.
SAM: That's right, and I can't wait to break this down for our listeners.

Now write the complete podcast script about: {topic}
"""

    messages = [{"role": "user", "content": prompt}]
    # return_dict=True yields input_ids together with the attention mask, and
    # the tensors are moved to wherever the model actually lives.
    encoded = llm_tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        return_dict=True,
    ).to(llm_model.device)

    with torch.no_grad():
        outputs = llm_model.generate(
            **encoded,
            max_new_tokens=1500,
            do_sample=True,
            temperature=0.8,
            top_p=0.9,
            pad_token_id=llm_tokenizer.eos_token_id
        )

    # generate() returns prompt + completion. Decode only the completion:
    # decoding the prompt too would feed the two "Example opening" lines above
    # to the parser (so every podcast would begin with them) and would glue
    # the first generated turn to the end of the prompt text, dropping it.
    prompt_length = encoded["input_ids"].shape[-1]
    response = llm_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return parse_script_lines(response, host1_voice, host2_voice)


# Voice names offered in the dropdowns; a single fallback entry is used when
# no presets were discovered.
voice_options = list(voice_presets) or ["en-Carter_man"]

# Build the Gradio interface: inputs in the left column, results in the right.
with gr.Blocks(title="AI Podcast Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # AI Podcast Generator
    
    Generate engaging podcast conversations between two AI hosts!
    Enter a topic, pick voices for each host, and create your podcast.
    """)

    with gr.Row():
        # Left column: topic, length and voice controls.
        with gr.Column(scale=1):
            topic_input = gr.Textbox(
                lines=3,
                label="Podcast Topic",
                placeholder="e.g., 'The future of AI', 'Why coffee is amazing', 'Space exploration'",
            )

            num_exchanges = gr.Slider(
                label="Number of Exchanges",
                info="How many back-and-forth exchanges between hosts",
                minimum=2,
                maximum=10,
                step=1,
                value=4,
            )

            gr.Markdown("### Voice Selection")

            with gr.Row():
                host1_voice = gr.Dropdown(
                    label="Alex's Voice (Host 1)",
                    info="Curious, enthusiastic host",
                    choices=voice_options,
                    value=HOST1_VOICE,
                )
                host2_voice = gr.Dropdown(
                    label="Sam's Voice (Host 2)",
                    info="Knowledgeable, insightful host",
                    choices=voice_options,
                    value=HOST2_VOICE,
                )

            generate_btn = gr.Button("Generate Podcast", variant="primary", size="lg")

        # Right column: the generated audio and the script.
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                type="numpy",
                label="Generated Podcast",
                autoplay=False,
                interactive=False,
            )
            script_output = gr.Markdown(label="Conversation Script")

    gr.Markdown("*Powered by Mistral 7B + VibeVoice TTS*")

    # generate_podcast returns (script markdown, audio), in that order.
    generate_btn.click(
        fn=generate_podcast,
        outputs=[script_output, audio_output],
        inputs=[topic_input, num_exchanges, host1_voice, host2_voice],
    )

print("Starting server...")

# Launch Gradio in a background thread so this cell can go on to start the
# tunnel, then give the server a moment to start listening.
threading.Thread(target=lambda: demo.launch(server_name="0.0.0.0", server_port=7860, share=False, quiet=True), daemon=True).start()
time.sleep(3)

# Start cloudflared tunnel. The command is passed as an argument list (no
# shell), so a missing ./cloudflared binary raises FileNotFoundError here
# instead of failing silently inside a shell.
cf = subprocess.Popen(
    ["./cloudflared", "tunnel", "--url", "http://localhost:7860", "--no-autoupdate"],
    stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1
)

url_pattern = re.compile(r"(https://[a-z0-9-]+\.trycloudflare\.com)")
print("Looking for public URL...")

public_url = None
try:
    # Keep reading cloudflared's output for the whole session, not just until
    # the URL shows up: an undrained pipe eventually fills and blocks the
    # child process, which would stall the tunnel.
    for line in cf.stdout:
        if public_url is not None:
            continue
        m = url_pattern.search(line)
        if m:
            public_url = m.group(1)
            print(f"\nYour Podcast Generator is live at:\n\n{public_url}\n")
            print("--- Press Stop to end ---")

    # Reaching this point means cloudflared closed its output, i.e. it exited.
    exit_code = cf.wait()
    if public_url is None:
        print(f"cloudflared exited (code {exit_code}) before reporting a public URL")
    else:
        print(f"cloudflared tunnel closed (exit code {exit_code})")
except KeyboardInterrupt:
    print("Stopping...")
finally:
    # Do not leave the tunnel process or the web server running after the
    # cell stops, so the cell can be re-run on the same port.
    if cf.poll() is None:
        cf.terminate()
        try:
            cf.wait(timeout=5)
        except subprocess.TimeoutExpired:
            cf.kill()
    demo.close()
