In [35]:
# imports

import os
from functools import lru_cache
from pathlib import Path
from typing import List
from urllib.parse import parse_qs, urlparse

import google.generativeai
import gradio as gr
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from llama_index.core import VectorStoreIndex, Document, Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.openai import OpenAIEmbedding
from openai import OpenAI
from youtube_transcript_api import YouTubeTranscriptApi

In [36]:
pip install youtube-transcript-api

Note: you may need to restart the kernel to use updated packages.


In [37]:
pip install llama-index

Note: you may need to restart the kernel to use updated packages.


In [38]:
# Read API keys from a local .env file. override=True lets values from .env
# replace variables that are already set in the process environment.
load_dotenv(override=True)
openai_api_key = os.getenv('OPENAI_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')

# Report key presence first, so a missing key produces a readable message
# instead of an exception further down.
if openai_api_key:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
else:
    print("OpenAI API Key not set")

if google_api_key:
    print(f"Google API Key exists and begins {google_api_key[:8]}")
else:
    print("Google API Key not set")

# Only export the key when it exists: os.environ accepts strings only, so
# assigning the None returned by os.getenv() for a missing key raises TypeError.
if openai_api_key:
    os.environ["OPENAI_API_KEY"] = openai_api_key

# Embedding model shared by the index-building code; also registered as the
# llama-index default through Settings.
embed_model = OpenAIEmbedding()
Settings.embed_model = embed_model

OpenAI API Key exists and begins sk-proj-
Google API Key exists and begins AIzaSyAt


In [65]:
def _extract_video_id(video_url):
    """Return the YouTube video ID contained in ``video_url``.

    Recognised forms: ``watch?v=ID`` (the ``v`` query parameter in any
    position), ``youtu.be/ID``, and ``/shorts/ID``, ``/embed/ID``, ``/live/ID``
    and ``/v/ID`` paths. Raises ``ValueError`` when no ID can be found.
    """
    url = video_url.strip()
    # urlparse only populates the host when a scheme is present.
    parsed = urlparse(url if "://" in url else f"https://{url}")

    # Standard watch URLs: take the real ``v`` parameter rather than the first
    # textual occurrence of "v=" (which could belong to e.g. "rev=1").
    from_query = parse_qs(parsed.query).get("v")
    if from_query and from_query[0]:
        return from_query[0]

    host = (parsed.hostname or "").lower()
    if host.startswith("www."):
        host = host[4:]
    segments = [part for part in parsed.path.split("/") if part]

    if host == "youtu.be" and segments:
        return segments[0]
    if len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
        return segments[1]

    # Last resort: the original textual split, so any input that used to work
    # (e.g. a bare "v=ID" string) keeps working.
    if "v=" in url:
        legacy_id = url.split("v=")[1].split("&")[0]
        if legacy_id:
            return legacy_id

    raise ValueError(f"could not find a video ID in URL: {video_url!r}")


def fetch_youtube_subtitles(video_url, language='en'):
    """Fetch a video's transcript and return it as newline-separated text.

    Args:
        video_url: URL of the YouTube video (watch, youtu.be, shorts, embed
            or live form).
        language: Language code of the transcript to request.

    Returns:
        The transcript lines joined with ``"\\n"``. On any failure the function
        does not raise; it returns a string starting with
        ``"An error occurred: "`` followed by the exception text.
    """
    try:
        video_id = _extract_video_id(video_url)

        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            # Older youtube-transcript-api releases: static helper returning
            # a list of {"text": ...} dicts.
            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
        else:
            # Newer releases expose an instance-based fetch() instead;
            # to_raw_data() yields the same list-of-dicts shape.
            transcript = YouTubeTranscriptApi().fetch(video_id, languages=[language]).to_raw_data()

        return '\n'.join(entry['text'] for entry in transcript)

    except Exception as e:
        return f"An error occurred: {str(e)}"

In [59]:
@lru_cache(maxsize=8)
def _build_podcast_index(transcript_text):
    """Chunk and embed ``transcript_text`` into a new vector index.

    Memoised on the transcript text, so asking several questions about the
    same transcript embeds it only once. Exceptions are not cached. Call
    ``_build_podcast_index.cache_clear()`` after changing ``embed_model``.
    """
    # Wrap the raw text in a single Document
    doc = Document(
        text=transcript_text,
        metadata={"title": "Podcast Transcript"}
    )

    # Split into overlapping sentence-based chunks before embedding
    parser = SentenceSplitter(chunk_size=384, chunk_overlap=72)
    return VectorStoreIndex.from_documents(
        [doc],
        embed_model=embed_model,
        transformations=[parser]
    )


def load_podcast_index(transcript_text):
    """Load and process podcast transcript data from provided text.

    Args:
        transcript_text: Full transcript as a single string.

    Returns:
        A vector index over the transcript, or ``None`` if the transcript is
        empty or the index could not be built (the error is printed). Repeated
        calls with identical text return the same cached index object.
    """
    try:
        if not transcript_text or not transcript_text.strip():
            raise ValueError("transcript text is empty")
        return _build_podcast_index(transcript_text)

    except Exception as e:
        print(f"Error loading podcast transcript: {str(e)}")
        return None

In [69]:
def _build_user_prompt(query, context_texts):
    """Assemble the user message: retrieved excerpts, the question, and answer guidance."""
    excerpts = ' '.join(context_texts)
    return f"""Please provide a detailed and comprehensive answer to the following question based on the provided excerpts from the podcast transcript:

{excerpts}

Question: {query}

In your answer, please ensure you:
- Clearly identify and explain all relevant main topics and sub-topics.
- Detail the key arguments and reasoning presented by the speakers.
- Include any supporting evidence, examples, statistics, or anecdotes mentioned.
- Attribute specific points or opinions to the speakers if possible.
- Highlight any nuances, disagreements, or points of emphasis.
- Summarize any significant conclusions or takeaways from the discussion related to the question.

Aim for a response that is informative, thorough, and demonstrates a deep understanding of the podcast content."""


def _recent_history(chat_history, limit):
    """Return at most ``limit`` trailing user/assistant turns as plain role/content dicts."""
    turns = []
    for turn in chat_history or []:
        if not isinstance(turn, dict):
            continue
        role, content = turn.get("role"), turn.get("content")
        if role in ("user", "assistant") and isinstance(content, str) and content:
            turns.append({"role": role, "content": content})
    # Guard limit <= 0 explicitly: turns[-0:] would return the whole list.
    return turns[-limit:] if limit > 0 else []


def query_podcast(query, transcript_text, chat_history=None,
                  model="gpt-3.5-turbo-16k", top_k=8, max_history_messages=6):
    """Query the podcast transcript and generate a detailed response using an LLM.

    Args:
        query: The user's question.
        transcript_text: Full transcript text to index and search.
        chat_history: Optional list of ``{"role": ..., "content": ...}`` dicts
            from earlier turns. The most recent ``max_history_messages`` of
            them are sent to the model so follow-up questions keep their context.
        model: Chat-completions model name.
            NOTE(review): confirm the default model is still offered for your account.
        top_k: Number of transcript chunks to retrieve as context.
        max_history_messages: Upper bound on prior messages forwarded to the model.

    Returns:
        The model's answer as a string. Failures never raise; they come back as
        a string starting with ``"Error"``.
    """
    try:
        index = load_podcast_index(transcript_text)
        if not index:
            return "Error: Could not load podcast transcript data"

        # Retrieve the chunks most similar to the question. Only the retriever
        # is needed here, so no llama-index query engine (and its default LLM)
        # is constructed.
        retriever = VectorIndexRetriever(
            index=index,
            similarity_top_k=top_k,
        )
        retrieval_results = retriever.retrieve(query)
        context_texts = [result.node.get_content() for result in retrieval_results]
        if not context_texts:
            # Without excerpts the model could only guess.
            return "Error: No relevant transcript excerpts were found for this question"

        # --- More Detailed System Prompt ---
        system_prompt = """You are a highly skilled AI assistant specialized in providing detailed and comprehensive summaries of podcast transcripts. Pay close attention to all aspects of the discussion, including main topics, sub-topics, arguments, evidence, examples, speaker identities, and any conclusions. Your goal is to provide thorough and insightful responses that capture the depth and breadth of the podcast content relevant to the user's query."""

        # system prompt -> earlier turns -> current question with its excerpts
        messages = [{"role": "system", "content": system_prompt}]
        messages.extend(_recent_history(chat_history, max_history_messages))
        messages.append({"role": "user", "content": _build_user_prompt(query, context_texts)})

        client = OpenAI()
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            stream=False
        )

        return response.choices[0].message.content

    except Exception as e:
        return f"Error processing query: {str(e)}"

In [70]:
import gradio as gr

# Starter prompts offered to the user; every prompt is wrapped in its own
# single-item list, one row per example.
example_questions = [
    [prompt]
    for prompt in (
        "What were the main topics discussed?",
        "Give me the key takeaways or insights in dot points",
        "Who were the hosts, guests or organisations in the podcast or video?",
        "Were any significant statistics or data points mentioned?",
        "Can you list any resources or references if mentioned?",
        "Do you have any criticism of the discussions or topics covered?",
    )
]

with gr.Blocks() as app:
    gr.Markdown("# TL;DListen: Podcast Transcript Chat")

    # YouTube URL input
    youtube_url = gr.Textbox(
        label="YouTube Video URL",
        placeholder="Enter the YouTube video URL here..."
    )

    # Instructional text below the YouTube URL input
    gr.Markdown("*enter the YouTube video URL above before asking questions*")

    with gr.Row():
        with gr.Column():
            msg = gr.Textbox(
                label="Ask about the podcast",
                placeholder="Ask a question about the podcast content...",
                lines=2
            )
            submit_btn = gr.Button("Send")
            clear_btn = gr.Button("Clear All")
            gr.Examples(
                examples=example_questions,
                inputs=msg,
                label="Example Questions"
            )
            gr.Markdown("""
            ### Tips:
            - Ask specific questions about the podcast content
            - Try using the example questions provided above
            - You can request summaries of specific segments
            """)
        # Sibling of the column inside the row, so the chat sits beside the inputs
        chatbot = gr.Chatbot(
            label="Conversation",
            height=600,
            type='messages'
        )

    # Per-session conversation history, stored as role/content message dicts
    chat_state = gr.State([])

    def respond(youtube_url, message, chat_history):
        """Answer one question about the video and update the conversation.

        Returns a ``(textbox value, chatbot messages, stored history)`` triple.
        On success the textbox is cleared and the question/answer pair is
        appended to the history. On failure the user's text is left in the
        textbox and the error is shown in the chat panel without being saved
        to the history.
        """
        # Work on a copy; the updated list is handed back through the outputs
        # instead of relying on in-place mutation of the state object.
        history = list(chat_history or [])

        def notice(text):
            """Show ``text`` as a one-off assistant message; keep input and history."""
            return message, history + [{"role": "assistant", "content": text}], history

        if not (youtube_url or "").strip():
            return notice("Error: Please enter a YouTube video URL.")
        if not (message or "").strip():
            return "", history, history

        subtitles = fetch_youtube_subtitles(youtube_url.strip())
        # The fetcher reports failures as "An error occurred: ...". Match that
        # exact prefix: a substring test for "Error" misses it (different case)
        # and wrongly rejects any real transcript that contains the word.
        if subtitles.startswith("An error occurred:"):
            return notice(subtitles)

        bot_response = query_podcast(message, subtitles, history)

        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": bot_response})

        return "", history, history

    def clear_all(youtube_url, chat_history):
        """Clears the conversation, history, and YouTube URL."""
        return "", [], []

    # Set up interactions: the Send button and pressing Enter in the question
    # box run the same handler with the same inputs and outputs.
    for trigger in (submit_btn.click, msg.submit):
        trigger(
            respond,
            inputs=[youtube_url, msg, chat_state],
            outputs=[msg, chatbot, chat_state]
        )

    clear_btn.click(
        clear_all,
        inputs=[youtube_url, chat_state],
        outputs=[youtube_url, chatbot, chat_state]
    )

# Start the Gradio server when this cell/script runs as the main module.
# share=True additionally requests a public share link, so anyone who has that
# link can use the app (and the API key behind it) while the link is active.
if __name__ == "__main__":
    app.launch(share=True)

* Running on local URL:  http://127.0.0.1:7909
* Running on public URL: https://91fbad71f323fa3b32.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
