<a href="https://colab.research.google.com/github/tonykipkemboi/research-paper-to-podcast/blob/main/research_paper_to_podcast_crew.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Research Paper to Podcast Generator

This notebook converts research papers into engaging podcast conversations using AI Agents. Follow the steps below to generate your podcast!

## Setup
First, we'll install the required dependencies and set up our environment.

In [None]:
# Install the third-party packages this notebook imports (IPython magic;
# --quiet suppresses pip's progress output).
%pip install --quiet crewai crewai-tools elevenlabs python-dotenv pydub pydantic

# Mount Google Drive to access your PDF files (optional)
# NOTE(review): the cells shown in this notebook obtain the PDF through
# files.upload() and never read from /content/drive, so this mount looks
# skippable — confirm before relying on it.
from google.colab import drive
drive.mount('/content/drive')

## Environment Variables
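You'll need to set up your API keys and store them as Colab Secrets (the next cell reads them with `userdata.get`, using the names shown in that cell). Create the keys at:
- ElevenLabs: https://elevenlabs.io
- Serper Dev: https://serper.dev
- OpenAI: https://platform.openai.com
- Anthropic: https://www.anthropic.com
- Cerebras: https://cloud.cerebras.ai

You also need two ElevenLabs voice IDs, stored as `BEN_VOICE_ID` and `CLAUDIA_VOICE_ID`.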

In [None]:
import os
from google.colab import userdata

# Copy each Colab secret into the process environment under the same name.
# The first five are API keys; the last two are the ElevenLabs voice IDs
# used for the two podcast hosts.
for secret_name in (
    'OPENAI_API_KEY',
    'ELEVENLABS_API_KEY',
    'ANTHROPIC_API_KEY',
    'CEREBRAS_API_KEY',
    'SERPER_API_KEY',
    'BEN_VOICE_ID',
    'CLAUDIA_VOICE_ID',
):
    os.environ[secret_name] = userdata.get(secret_name)

## Tools for our Agents to use

In [None]:
import os
from typing import Dict, List, Optional, Any, Type
from datetime import datetime
from pydub import AudioSegment
from crewai.tools import BaseTool
from pydantic import Field, BaseModel, ConfigDict
from elevenlabs.client import ElevenLabs

In [None]:
class VoiceConfig(BaseModel):
    """Voice configuration settings."""
    # Per-voice synthesis settings. PodcastAudioGenerator._run forwards
    # stability, similarity_boost, style and use_speaker_boost as the
    # `voice_settings` dict, and model_id / output_format as keyword
    # arguments, to `client.text_to_speech.convert`.
    stability: float = 0.45  # Slightly lower for more natural variation
    similarity_boost: float = 0.85  # Higher to maintain consistent voice character
    style: float = 0.65  # Balanced expressiveness
    use_speaker_boost: bool = True
    model_id: str = "eleven_multilingual_v2"
    output_format: str = "mp3_44100_128"
    # NOTE(review): this field is not passed to the convert() call visible in
    # this file — confirm whether it is meant to be forwarded.
    apply_text_normalization: str = "auto"  # 'auto', 'on', or 'off'

class AudioConfig(BaseModel):
    """Audio processing configuration."""
    # Shared post-processing settings held by both audio tools.
    # PodcastAudioGenerator._run reads `format`, `bitrate`, `sample_rate`
    # and `normalize` when it re-exports each segment.
    format: str = "mp3"
    sample_rate: int = 48000  # Higher for better quality
    channels: int = 2
    bitrate: str = "256k"     # Higher bitrate for clearer audio
    normalize: bool = True    # Normalize audio levels
    # NOTE(review): channels, target_loudness and compression_ratio are not
    # referenced by any code visible in this file — confirm whether they are
    # consumed elsewhere or are placeholders.
    target_loudness: float = -14.0  # Standard podcast loudness (LUFS)
    compression_ratio: float = 2.0   # Light compression for voice

class Dialogue(BaseModel):
    """Dialogue for the podcast audio generation tool."""
    # One spoken line; element type of PodcastAudioGeneratorInput.dialogue.
    speaker: str  # looked up in PodcastAudioGenerator.voice_configs
    text: str     # the words sent to text-to-speech

class PodcastAudioGeneratorInput(BaseModel):
    """Input for the podcast audio generation tool."""
    # Used as PodcastAudioGenerator.args_schema; the single argument is the
    # list of dialogue lines to synthesize.
    dialogue: List[Dialogue]

In [None]:
class PodcastAudioGenerator(BaseTool):
    """Enhanced podcast audio generation tool."""

    name: str = "PodcastAudioGenerator"
    description: str = "Synthesizes podcast voices using ElevenLabs API."

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Defaults to the ELEVENLABS_API_KEY environment variable, read when the
    # tool is instantiated; __init__ rejects a missing/empty key.
    api_key: str = Field(default_factory=lambda: os.getenv("ELEVENLABS_API_KEY"))
    # Maps speaker name -> {"voice_id": str, "config": VoiceConfig};
    # populated through add_voice().
    voice_configs: Dict[str, Dict] = Field(default_factory=dict)
    audio_config: AudioConfig = Field(default_factory=AudioConfig)
    # Directory that receives one audio file per dialogue line.
    output_dir: str = Field(default="output/audio-files")
    # ElevenLabs client, created in __init__.
    client: Any = Field(default=None)
    args_schema: Type[BaseModel] = PodcastAudioGeneratorInput

    def __init__(self, **data):
        """Validate the fields and create the ElevenLabs client.

        Raises:
            ValueError: if no API key was passed and ELEVENLABS_API_KEY is
                not set.
        """
        super().__init__(**data)
        if not self.api_key:
            raise ValueError("ELEVENLABS_API_KEY environment variable not set")
        self.client = ElevenLabs(api_key=self.api_key)

    def add_voice(self, name: str, voice_id: str, config: Optional[VoiceConfig] = None) -> None:
        """Register (or replace) the voice used for speaker ``name``.

        Args:
            name: Speaker name exactly as it appears in the dialogue.
            voice_id: ElevenLabs voice identifier.
            config: Synthesis settings; a default VoiceConfig is used when
                omitted.
        """
        self.voice_configs[name] = {
            "voice_id": voice_id,
            "config": config or VoiceConfig()
        }

    def _segment_text(self, segment: Any, key: str) -> str:
        """Return the stripped string stored under ``key`` in a segment.

        A segment may be a plain dict or a model object exposing the value as
        an attribute (e.g. ``Dialogue``). Missing or ``None`` values yield "".
        """
        if isinstance(segment, dict):
            value = segment.get(key, '')
        else:
            value = getattr(segment, key, '')
        return (value or '').strip()

    def _run(self, dialogue: List[Dialogue]) -> List[str]:
        """Generate one audio file per dialogue segment.

        Segments with an empty speaker/text, or with a speaker that has no
        registered voice, are skipped with a printed message. A failure while
        synthesizing or post-processing one segment is printed and does not
        stop the remaining segments (best effort).

        Args:
            dialogue: Ordered segments, each a dict or an object with
                ``speaker`` and ``text``.

        Returns:
            Paths of the files that were written, in dialogue order.
        """
        os.makedirs(self.output_dir, exist_ok=True)

        audio_files = []
        for index, segment in enumerate(dialogue):
            speaker = self._segment_text(segment, 'speaker')
            text = self._segment_text(segment, 'text')

            if not speaker or not text:
                print(f"Skipping segment {index}: missing speaker or text")
                continue

            voice_config = self.voice_configs.get(speaker)
            if not voice_config:
                print(f"Skipping unknown speaker: {speaker}")
                continue

            settings = voice_config['config']
            try:
                audio_stream = self.client.text_to_speech.convert(
                    text=text,
                    voice_id=voice_config["voice_id"],
                    model_id=settings.model_id,
                    output_format=settings.output_format,
                    voice_settings={
                        "stability": settings.stability,
                        "similarity_boost": settings.similarity_boost,
                        "style": settings.style,
                        "use_speaker_boost": settings.use_speaker_boost
                    }
                )

                # The client yields the audio in chunks; collect them into
                # a single bytes object.
                audio_bytes = b''.join(audio_stream)

                filename = f"{self.output_dir}/{index:03d}_{speaker}.{self.audio_config.format}"
                with open(filename, "wb") as out:
                    out.write(audio_bytes)

                # Basic audio normalization, re-encoding the file in place.
                if self.audio_config.normalize:
                    audio = AudioSegment.from_file(filename)
                    # NOTE(review): a +4 dB gain applied after peak
                    # normalization may clip — confirm this is intended.
                    normalized = audio.normalize() + 4

                    # export() hands back the open output file; close it so
                    # the handle is not leaked.
                    normalized.export(
                        filename,
                        format=self.audio_config.format,
                        bitrate=self.audio_config.bitrate,
                        parameters=["-ar", str(self.audio_config.sample_rate)]
                    ).close()

                audio_files.append(filename)
                print(f'Audio content written to file "{filename}"')

            except Exception as e:
                print(f"Error processing segment {index}: {str(e)}")
                continue

        # Files were appended in dialogue order. Sorting the names would
        # break that order once the index needs more than three digits.
        return audio_files

In [None]:
class PodcastMixer(BaseTool):
    """Enhanced audio mixing tool for podcast production."""

    name: str = "PodcastMixer"
    description: str = "Mixes multiple audio files with effects into final podcast."

    audio_config: AudioConfig = Field(default_factory=AudioConfig)
    # Directory that receives the final mixed file.
    output_dir: str = Field(default="output/podcast")

    def _run(
        self,
        audio_files: List[str],
        crossfade: int = 50
    ) -> str:
        """Concatenate the given audio files into ``podcast_final.mp3``.

        A 200 ms pause is inserted before every segment after the first, and
        consecutive segments are joined with a crossfade.

        Args:
            audio_files: Paths of the segments, in playback order.
            crossfade: Requested crossfade in milliseconds. It is clamped to
                the range the two segments being joined can support, so an
                oversized value no longer aborts the whole mix.

        Returns:
            Path of the mixed file, or "" if mixing failed (the error is
            printed).

        Raises:
            ValueError: if ``audio_files`` is empty.
        """
        if not audio_files:
            raise ValueError("No audio files provided to mix")

        try:
            # Create output directory if it doesn't exist
            os.makedirs(self.output_dir, exist_ok=True)

            # Same pause for every join, so build it once.
            silence = AudioSegment.silent(duration=200)

            mixed = AudioSegment.from_file(audio_files[0])
            for audio_file in audio_files[1:]:
                next_segment = silence + AudioSegment.from_file(audio_file)
                # A crossfade longer than either side is rejected by
                # append(); keep it within both lengths and non-negative.
                overlap = max(0, min(crossfade, len(mixed), len(next_segment)))
                mixed = mixed.append(next_segment, crossfade=overlap)

            output_file = os.path.join(self.output_dir, "podcast_final.mp3")

            mixed.export(
                output_file,
                format="mp3",
                parameters=[
                    "-q:a", "0",  # Highest quality
                    # Sample rate comes from the shared audio config
                    # (48000 by default), matching the segment export.
                    "-ar", str(self.audio_config.sample_rate)
                ]
            )

            print(f"Successfully mixed podcast to: {output_file}")
            return output_file

        except Exception as e:
            print(f"Error mixing podcast: {str(e)}")
            return ""

## Setup Output Directory Structure

In [None]:
def setup_directories():
    """Create a fresh, timestamped output tree and return its paths.

    The tree lives under ``outputs/<YYYYmmdd_HHMMSS>`` relative to the
    current working directory.

    Returns:
        dict with keys 'BASE', 'SEGMENTS', 'FINAL' and 'DATA' mapping to the
        run folder and its ``segments``, ``podcast`` and ``data`` subfolders.
    """
    run_root = 'outputs/' + datetime.now().strftime('%Y%m%d_%H%M%S')

    layout = {'BASE': run_root}
    for key, leaf in (('SEGMENTS', 'segments'), ('FINAL', 'podcast'), ('DATA', 'data')):
        layout[key] = f'{run_root}/{leaf}'

    for path in layout.values():
        os.makedirs(path, exist_ok=True)

    return layout

## Upload your PDF

Use this cell to upload your research paper PDF or any PDF.



In [None]:
import os
import shutil
from google.colab import files

# Make sure the 'knowledge' folder exists (no error if it is already there)
os.makedirs('knowledge', exist_ok=True)

# Upload the PDF file
uploaded = files.upload()
if not uploaded:
    # Nothing came back from the upload dialog; fail with a clear message
    # instead of an IndexError on the lookup below.
    raise ValueError("No file was uploaded. Re-run this cell and choose a PDF.")

# Only the first uploaded file is used as the paper
pdf_filename = next(iter(uploaded))

# Move the uploaded file to the 'knowledge' folder
shutil.move(pdf_filename, os.path.join('knowledge', pdf_filename))

## Setup Agents & Tasks


In [None]:
from crewai import Agent, Task, Crew, Process, LLM
from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource
from crewai_tools import SerperDevTool
from pydantic import BaseModel, Field
from typing import List
from datetime import datetime

In [None]:
# Pass only the filename to PDFKnowledgeSource (not the 'knowledge/...' path
# the file was moved to) — presumably the library resolves it relative to the
# 'knowledge' folder; verify against the installed crewai version.
research_paper = PDFKnowledgeSource(file_paths=pdf_filename)

In [None]:
# --- Pydantic Models definitions ---
class PaperSummary(BaseModel):
    """Summary of a research paper."""
    # Structured output schema for summary_task (passed as output_pydantic).
    # Every field is required (Field(...)).
    title: str = Field(..., description="Title of the research paper")
    main_findings: List[str] = Field(..., description="Key findings as a list of strings")
    methodology: str = Field(..., description="Research methodology as a single text block")
    key_implications: List[str] = Field(..., description="Implications as a list of strings")
    limitations: List[str] = Field(..., description="Limitations as a list of strings")
    future_work: List[str] = Field(..., description="Future research directions as a list")
    # NOTE(review): required with no default, so the value must come from
    # whatever populates the model — confirm that is acceptable for a timestamp.
    summary_date: datetime = Field(..., description="Timestamp of summary creation")

class DialogueLine(BaseModel):
    """Dialogue line for a podcast script."""
    # Element type of PodcastScript.dialogue. Has the same two fields as the
    # `Dialogue` model defined with the audio tools.
    speaker: str = Field(..., description="Name of the speaker (Julia or Guido)")
    text: str = Field(..., description="The actual dialogue line")

class PodcastScript(BaseModel):
    """Podcast script with dialogue lines."""
    # Structured output schema for both podcast_task and enhance_script_task.
    dialogue: List[DialogueLine] = Field(..., description="Ordered list of dialogue lines")

class AudioGeneration(BaseModel):
    """Audio generation result with metadata."""
    # Structured output schema for audio_task.
    segment_files: List[str] = Field(..., description="List of generated audio segment files")
    final_podcast: str = Field(..., description="Path to the final mixed podcast file")

## Configure the LLMs the Agents Will Use

In [None]:
# --- LLM Setup ---
# Four LLM handles, one per stage of the pipeline. The model strings are
# written as "<provider>/<model>"; the providers are presumably authenticated
# through the *_API_KEY environment variables set earlier — verify.
# NOTE(review): confirm that the o1-preview model accepts a non-default
# temperature; some reasoning models reject the parameter.
summary_llm = LLM(
    model="openai/o1-preview",
    temperature=0.0,
)

script_llm = LLM(
    model="openai/o1-preview",
    temperature=0.3,
)

script_enhancer_llm = LLM(
    model="anthropic/claude-3-5-sonnet-20241022",
    temperature=0.7,
)

audio_llm = LLM(
    model="cerebras/llama3.3-70b",
    temperature=0.0,
)

## Set Up ElevenLabs Voices for the Podcast Hosts

In [None]:
# Create and configure tools
# Fresh timestamped output folders for this run; segments go to their own dir.
dirs = setup_directories()
audio_generator = PodcastAudioGenerator(output_dir=dirs['SEGMENTS'])

# Register one voice per host: (speaker name, env var holding the ElevenLabs
# voice ID, synthesis settings).
#   Julia – enthusiastic expert: lower stability for more natural variation.
#   Guido – engaged and curious: slightly more stable, a bit less stylized.
for host_name, voice_env_var, voice_kwargs in (
    (
        "Julia",
        "CLAUDIA_VOICE_ID",
        {"stability": 0.35, "similarity_boost": 0.75, "style": 0.65, "use_speaker_boost": True},
    ),
    (
        "Guido",
        "BEN_VOICE_ID",
        {"stability": 0.4, "similarity_boost": 0.75, "style": 0.6, "use_speaker_boost": True},
    ),
):
    audio_generator.add_voice(
        host_name,
        os.getenv(voice_env_var),
        VoiceConfig(**voice_kwargs),
    )

# Remaining tools: the mixer writes into the run's final folder; the search
# tool is created with its defaults.
podcast_mixer = PodcastMixer(output_dir=dirs['FINAL'])
search_tool = SerperDevTool()

## Agents

In [None]:
# --- Agents ---
# Produces the paper summary (runs summary_task). Uses the LLM configured
# for summarization rather than the one configured for the audio stage.
researcher = Agent(
    role="Research Analyst",
    goal="Create comprehensive yet accessible research paper summaries",
    backstory="""You're a PhD researcher with a talent for breaking down complex
    academic papers into clear, understandable summaries. You excel at identifying
    key findings and their real-world implications.""",
    verbose=True,
    llm=summary_llm
)

# Gathers supporting material (runs supporting_research_task). This is the
# only agent given the web search tool.
research_support = Agent(
    role="Research Support Specialist",
    goal="Find current context and supporting materials relevant to the paper's topic",
    backstory="""You're a versatile research assistant who excels at finding
    supplementary information across academic fields. You have a talent for
    connecting academic research with real-world applications, current events,
    and practical examples, regardless of the field. You know how to find
    credible sources and relevant discussions across various domains.""",
    verbose=True,
    tools=[search_tool],
    llm=audio_llm
)

# Writes the first draft of the two-host script (runs podcast_task). The host
# names in the backstory match the speaker names registered on the audio tool.
script_writer = Agent(
    role="Podcast Script Writer",
    goal="Create engaging and educational podcast scripts about technical topics",
    backstory="""You're a skilled podcast writer who specializes in making technical
    content engaging and accessible. You create natural dialogue between two hosts:
    Julia (a knowledgeable expert who explains concepts clearly) and Guido (an informed
    co-host who asks thoughtful questions and helps guide the discussion).""",
    verbose=True,
    llm=script_llm
)

# Polishes the draft script (runs enhance_script_task). Uses the LLM that is
# configured specifically for script enhancement.
script_enhancer = Agent(
    role="Podcast Script Enhancer",
    goal="Enhance podcast scripts to be more engaging while maintaining educational value",
    backstory="""You're a veteran podcast producer who specializes in making technical
    content both entertaining and informative. You excel at adding natural humor,
    relatable analogies, and engaging banter while ensuring the core technical content
    remains accurate and valuable. You've worked on shows like Lex Fridman's podcast,
    Hardcore History, and the Joe Rogan Experience, bringing their signature blend of
    entertainment and education.""",
    verbose=True,
    llm=script_enhancer_llm
)

# Turns the final script into audio (runs audio_task). It holds both audio
# tools — the per-line synthesizer and the mixer — and has delegation
# switched off.
audio_generator_agent = Agent(
    role="Audio Generation Specialist",
    goal="Generate high-quality podcast audio with natural-sounding voices",
    backstory="""You are an expert in audio generation and processing. You understand
    how to generate natural-sounding voices and create professional podcast audio. You
    consider pacing, tone, and audio quality in your productions.""",
    verbose=True,
    allow_delegation=False,
    tools=[audio_generator, podcast_mixer],
    llm=audio_llm
)

## Tasks

In [None]:
# --- Tasks ---
# Step 1: the researcher agent summarizes the paper into a PaperSummary.
# `{paper}` in the description is a template placeholder — presumably filled
# from the inputs handed to crew.kickoff(); verify it is supplied at run time.
# The output_file given here is a default; the Run cell reassigns it.
summary_task = Task(
    description="""Read and analyze the provided research paper: {paper}.

    Create a comprehensive summary that includes:
    1. Main findings and conclusions
    2. Methodology overview
    3. Key implications for the field
    4. Limitations of the study
    5. Suggested future research directions

    Make the summary accessible to an educated general audience while maintaining accuracy.""",
    expected_output="A structured summary of the research paper with all key components.",
    agent=researcher,
    output_pydantic=PaperSummary,
    output_file="output/metadata/paper_summary.json"
)

# Step 2: the research_support agent collects outside material, with the
# summary task's result as context. No output_pydantic is set, so the result
# is not tied to one of the schemas defined above. The output_file given here
# is a default; the Run cell reassigns it.
supporting_research_task = Task(
    description="""After analyzing the paper summary, find recent and relevant supporting
    materials that add context and real-world perspective to the topic.

    Research Approach:
    1. Topic Analysis:
        • Identify key themes and concepts from the paper
        • Determine related fields and applications
        • Note any specific claims or findings to verify

    2. Current Context:
        • Recent developments in the field
        • Latest practical applications
        • Industry or field-specific news
        • Related ongoing research

    3. Supporting Evidence:
        • Academic discussions and debates
        • Industry reports and white papers
        • Professional forum discussions
        • Conference presentations
        • Expert opinions and analyses

    4. Real-world Impact:
        • Practical implementations
        • Case studies
        • Success stories or challenges
        • Market or field adoption

    5. Different Perspectives:
        • Alternative approaches
        • Critical viewpoints
        • Cross-disciplinary applications
        • Regional or cultural variations

    Focus on finding information that:
    • Is recent (preferably within last 2 years)
    • Comes from credible sources
    • Adds valuable context to the paper's topic
    • Provides concrete examples or applications
    • Offers different viewpoints or approaches""",
    expected_output="A structured collection of relevant supporting materials and examples",
    agent=research_support,
    context=[summary_task],
    output_file="output/metadata/supporting_research.json"
)

# Step 3: the script_writer agent drafts the Julia/Guido conversation as a
# PodcastScript, using the summary and the supporting research as context.
# NOTE(review): the prompt's example line mentions RAG and Julia's
# "experiences with AI and tech" — confirm these are meant to apply to
# papers on any topic. The output_file given here is a default; the Run cell
# reassigns it.
podcast_task = Task(
    description="""Using the paper summary and supporting research, create an engaging and informative podcast conversation
    between Julia and Guido. Make it feel natural while clearly distinguishing between paper findings and supplementary research.

    Source Attribution Guidelines:
    • For Paper Content:
        - "According to the paper..."
        - "The researchers found that..."
        - "In their study, they discovered..."
        - "The paper's methodology showed..."

    • For Supporting Research:
        - "I recently read about..."
        - "There's some interesting related work by..."
        - "This reminds me of a recent case study..."
        - "Building on this, other researchers have found..."

    Host Dynamics:
    - Julia: A knowledgeable but relatable expert who:
        • Explains technical concepts with enthusiasm
        • Sometimes playfully challenges Guido's assumptions
        • Clearly distinguishes between paper findings and broader context
        • Occasionally plays devil's advocate on certain points
        • Admits when she's uncertain about specific aspects
        • Shares relevant personal experiences with AI and tech
        • Can connect the research to broader tech trends
        • Uses casual expressions and shows genuine excitement

    - Guido: An engaged and curious co-host who:
        • Asks insightful questions and follows interesting threads
        • Occasionally disagrees based on his practical experience
        • Brings up relevant external examples and research
        • Respectfully pushes back on theoretical claims with real-world examples
        • Helps find middle ground in discussions
        • Helps make connections to practical applications
        • Naturally guides the conversation back to main topics

    Example Flow with Attribution:
    Julia: "The paper's findings show that RAG is superior for factual queries."
    Guido: "That's interesting, because I recently read about a case study where..."
    Julia: "Oh, that's a great point! While the researchers found X, these real-world examples show Y..."

    Disagreement Guidelines:
    • Keep disagreements friendly and constructive
    • Use phrases like:
        - "I see what the paper suggests, but in practice..."
        - "While the study found X, other research shows..."
        - "That's an interesting finding, though recent developments suggest..."
    • Always find common ground or learning points
    • Use disagreements to explore nuances
    • Resolve differences with mutual understanding

    Conversation Flow:
    1. Core Discussion: Focus on the research and findings
    2. Natural Tangents with Clear Attribution:
        • "Building on the paper's findings..."
        • "This relates to some recent developments..."
        • "While not covered in the paper, there's interesting work on..."
    3. Smooth Returns: Natural ways to bring the conversation back:
        • "Coming back to what the researchers found..."
        • "This actually connects to the paper's methodology..."
        • "That's a great example of what the study was trying to solve..."

    Writing Guidelines:
    1. Clearly distinguish paper findings from supplementary research
    2. Use attribution phrases naturally within the conversation
    3. Connect different sources of information meaningfully
    4. Keep technical content accurate but conversational
    5. Maintain engagement through relatable stories
    6. Include occasional friendly disagreements
    7. Show how different perspectives and sources enrich understanding

    Note: Convey reactions through natural language rather than explicit markers like *laughs*.""",
    expected_output="A well-balanced podcast script that clearly distinguishes between paper content and supplementary research.",
    agent=script_writer,
    context=[summary_task, supporting_research_task],
    output_pydantic=PodcastScript,
    output_file="output/metadata/podcast_script.json"
)

# Step 4: the script_enhancer agent rewrites the draft into a livelier
# PodcastScript. Its context is the summary and the draft script; the
# supporting research is not passed directly. The prompt insists on keeping
# the host names unchanged — the audio tool looks voices up by speaker name.
# The output_file given here is a default; the Run cell reassigns it.
enhance_script_task = Task(
    description="""Take the initial podcast script and enhance it to be more engaging
    and conversational while maintaining its educational value.

    IMPORTANT RULES:
    1. NEVER change the host names - always keep Julia and Guido exactly as they are
    2. NEVER add explicit reaction markers like *chuckles*, *laughs*, etc.
    3. NEVER add new hosts or characters

    Enhancement Guidelines:
    1. Add Natural Elements:
        • Include natural verbal reactions ("Oh that's fascinating", "Wow", etc.)
        • Keep all dialogue between Julia and Guido only
        • Add relevant personal anecdotes or examples that fit their established roles:
            - Julia as the knowledgeable expert
            - Guido as the engaged and curious co-host
        • Express reactions through words rather than action markers

    2. Improve Flow:
        • Ensure smooth transitions between topics
        • Add brief casual exchanges that feel natural
        • Include moments of reflection or connection-making
        • Balance technical depth with accessibility

    3. Maintain Quality:
        • Keep all technical information accurate
        • Ensure added content supports rather than distracts
        • Preserve the core findings and insights
        • Keep the overall length reasonable

    4. Add Engagement Techniques:
        • Include thought-provoking analogies by both hosts
        • Add relatable real-world examples
        • Express enthusiasm through natural dialogue
        • Include collaborative problem-solving moments
        • Inject humor where appropriate and it has to be funny

    Natural Reaction Examples:
    ✓ RIGHT: "Oh, that's fascinating!"
    ✓ RIGHT: "Wait, that doesn't make sense!"
    ✓ RIGHT: "Wait, really? I hadn't thought of it that way."
    ✓ RIGHT: "That's such a great point."
    ✗ WRONG: *chuckles* or *laughs* or any other action markers
    ✗ WRONG: Adding new speakers or changing host names

    The goal is to make the content feel like a conversation between Julia and Guido
    who are genuinely excited about the topic, while ensuring listeners learn
    something valuable.""",
    expected_output="An enhanced version of the podcast script that's more engaging and natural",
    agent=script_enhancer,
    context=[summary_task, podcast_task],
    output_pydantic=PodcastScript,
    output_file="output/metadata/enhanced_podcast_script.json"
)

# Step 5: the audio_generator_agent voices the enhanced script and mixes the
# segments, reporting the result as an AudioGeneration. Only the enhanced
# script is passed as context.
# NOTE(review): the prompt asks for fades and pacing; the tools visible in
# this file offer normalization, a fixed pause and a crossfade — confirm the
# prompt matches what the tools can actually do.
# The output_file given here is a default; the Run cell reassigns it.
audio_task = Task(
    description="""Generate high-quality audio for the podcast script and create the final podcast.

    The script will be provided in the context as a list of dialogue entries, each with:
    - speaker: Either "Julia" or "Guido"
    - text: The line to be spoken

    Process:
    1. Generate natural-sounding audio for each line of dialogue using appropriate voices
    2. Apply audio processing for professional quality:
       - Normalize audio levels
       - Add subtle fade effects between segments
       - Apply appropriate pacing and pauses
    3. Mix all segments into a cohesive final podcast

    Voice Assignments:
    - For Julia's lines: Use configured Julia voice
    - For Guido's lines: Use configured Guido voice

    Quality Guidelines:
    - Ensure consistent audio levels across all segments
    - Maintain natural pacing and flow
    - Create smooth transitions between speakers
    - Verify audio clarity and quality""",
    expected_output="A professional-quality podcast audio file with natural-sounding voices and smooth transitions",
    agent=audio_generator_agent,
    context=[enhance_script_task],
    output_pydantic=AudioGeneration,
    output_file="output/metadata/audio_generation_meta.json"
)

## Put the Agent Crew Together

In [None]:
# --- Crew and Process ---
# Assemble the pipeline: five agents and their five tasks, listed in the
# order summary -> supporting research -> script -> enhanced script -> audio,
# run with the sequential process. The uploaded PDF is attached as a
# crew-level knowledge source.
crew = Crew(
    agents=[researcher, research_support, script_writer, script_enhancer, audio_generator_agent],
    tasks=[summary_task, supporting_research_task, podcast_task, enhance_script_task, audio_task],
    process=Process.sequential,
    knowledge_sources=[research_paper],
    verbose=True
)

## Run

In [None]:
if __name__ == "__main__":
    # Point every task's output file at this run's timestamped data folder,
    # replacing the default "output/metadata/..." paths set on the tasks.
    data_dir = dirs['DATA']
    for task, output_name in (
        (summary_task, "paper_summary.json"),
        (supporting_research_task, "supporting_research.json"),
        (podcast_task, "podcast_script.json"),
        (enhance_script_task, "enhanced_podcast_script.json"),
        (audio_task, "audio_generation_meta.json"),
    ):
        task.output_file = os.path.join(data_dir, output_name)

    # Run the podcast generation process. summary_task's description contains
    # a `{paper}` placeholder; supply its value so the prompt names the
    # uploaded file instead of carrying the raw placeholder text.
    results = crew.kickoff(inputs={"paper": pdf_filename})