In [10]:
# imports

from IPython.display import Markdown, display, update_display
import os
import requests
from bs4 import BeautifulSoup
from typing import List
from dotenv import load_dotenv
from openai import OpenAI
import google.generativeai
from youtube_transcript_api import YouTubeTranscriptApi
import gradio as gr
from llama_index.core import VectorStoreIndex, Document, Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from pathlib import Path

In [4]:
pip install youtube-transcript-api

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-1.0.3-py3-none-any.whl.metadata (23 kB)
Downloading youtube_transcript_api-1.0.3-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-1.0.3
Note: you may need to restart the kernel to use updated packages.


In [9]:
pip install llama-index

Collecting llama-index
  Downloading llama_index-0.12.26-py3-none-any.whl.metadata (12 kB)
Collecting llama-index-agent-openai<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_agent_openai-0.4.6-py3-none-any.whl.metadata (727 bytes)
Collecting llama-index-cli<0.5.0,>=0.4.1 (from llama-index)
  Downloading llama_index_cli-0.4.1-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.13.0,>=0.12.26 (from llama-index)
  Downloading llama_index_core-0.12.27-py3-none-any.whl.metadata (2.6 kB)
Collecting llama-index-embeddings-openai<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_embeddings_openai-0.3.1-py3-none-any.whl.metadata (684 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.6.9-py3-none-any.whl.metadata (3.6 kB)
Collecting llama-index-llms-openai<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_llms_openai-0.3.29-py3-none-any.whl.metadata (3.3 kB)
Collec

In [20]:
# Read API keys from a local .env file into the process environment.
load_dotenv(override=True)
openai_api_key = os.getenv('OPENAI_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')

# Report key status first, so a missing key produces a readable message
# before anything below tries to use it.
if openai_api_key:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
else:
    print("OpenAI API Key not set")

if google_api_key:
    print(f"Google API Key exists and begins {google_api_key[:8]}")
else:
    print("Google API Key not set")

# NOTE: the key is deliberately NOT copied back into os.environ here.
# os.getenv() already read it from os.environ, so the copy was a no-op when
# the key exists and raised TypeError (None is not a str) when it does not.
embed_model = OpenAIEmbedding()
# Make this embedding model the default for all llama_index components.
Settings.embed_model = embed_model

OpenAI API Key exists and begins sk-proj-
Google API Key exists and begins AIzaSyAt


In [24]:
def _extract_video_id(video_url):
    """Return the YouTube video ID found in ``video_url``, or '' if there is none.

    Recognised shapes: a ``v=`` query parameter (``watch?v=ID``), short links
    (``youtu.be/ID``) and path-style links (``/embed/ID``, ``/shorts/ID``,
    ``/live/ID``, ``/v/ID``) on youtube.com / youtube-nocookie.com hosts.
    """
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(video_url.strip())

    # Standard watch URLs carry the ID in the query string; parse_qs also
    # keeps a trailing "#fragment" out of the ID.
    query_ids = parse_qs(parsed.query).get('v')
    if query_ids:
        return query_ids[0]

    host = (parsed.hostname or '').lower()
    segments = [part for part in parsed.path.split('/') if part]

    if host == 'youtu.be':
        return segments[0] if segments else ''

    if host.endswith('youtube.com') or host.endswith('youtube-nocookie.com'):
        if len(segments) >= 2 and segments[0] in ('embed', 'shorts', 'live', 'v'):
            return segments[1]

    # Legacy fallback: keep accepting anything the old split-based parser did.
    if 'v=' in video_url:
        return video_url.split('v=')[1].split('&')[0]

    return ''


def fetch_youtube_subtitles(video_url, language='en'):
    """Fetch a YouTube video's transcript and return it as newline-joined text.

    Args:
        video_url: URL of the video (watch, youtu.be, embed, shorts or live form).
        language: Transcript language code requested from the API.

    Returns:
        The transcript text, one caption entry per line. On any failure
        (unparseable URL, transcript unavailable, network error, ...) no
        exception is raised; a string starting with "An error occurred: "
        is returned instead, so callers must check for that prefix.
    """
    try:
        video_id = _extract_video_id(video_url)
        if not video_id:
            raise ValueError(f"could not find a video ID in {video_url!r}")

        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
        return '\n'.join(entry['text'] for entry in transcript)

    except Exception as e:
        return f"An error occurred: {str(e)}"

In [16]:
# Smoke-test the transcript fetcher against a known video and show the text.
video_url = 'https://www.youtube.com/watch?v=Q7Abm5BBZMM'
subtitles = fetch_youtube_subtitles(video_url, language='en')
print(subtitles)

hey Al Peter Zion coming to you from
Colorado you are getting this message so
that you can be informed on April 9
we're doing something called question
time which is basically when our patreon
members can Grill me live about whatever
is going on in the world and the primary
topic for the event this April 9 is
going to be Trump's tariffs which come
out the week before and their effect on
the American and the global economy
moving forward so sign up for our
patreon system now join the fun bring
your question and we'll see you April 9
at noon Eastern if you can't make it
that's okay you can get a recording as
long as you sign up hey everybody Peter
Z here coming to you from the home
office apologize for being inside but
there's 70 mph winds outside and
recording is just not possible uh today
is the 17th of March and the news is
that American defense secretary Pete
hegi just cancelled defense talks with
the South Koreans uh he had a really
good reason for doing it the South
Koreans functio

In [25]:
def load_podcast_index(transcript_text):
    """Build a vector index over a single podcast transcript.

    The transcript is wrapped in one ``Document`` (titled "Podcast Transcript")
    and passed through a ``SentenceSplitter`` configured with chunk_size=512
    and chunk_overlap=50; the module-level ``embed_model`` is handed to the
    index builder.

    Returns:
        The ``VectorStoreIndex``, or ``None`` if anything raised (the error is
        printed rather than propagated).
    """
    try:
        transcript_doc = Document(
            text=transcript_text,
            metadata={"title": "Podcast Transcript"},
        )
        splitter = SentenceSplitter(chunk_size=512, chunk_overlap=50)
        podcast_index = VectorStoreIndex.from_documents(
            [transcript_doc],
            embed_model=embed_model,
            transformations=[splitter],
        )
    except Exception as err:
        print(f"Error loading podcast transcript: {str(err)}")
        return None

    return podcast_index

In [26]:
# Recently built indexes keyed by transcript text. Building an index embeds
# the whole transcript, so it is reused across questions about the same
# transcript instead of being rebuilt for every query.
_PODCAST_INDEX_CACHE = {}
_PODCAST_INDEX_CACHE_MAX = 4


def _get_podcast_index(transcript_text):
    """Return the index for ``transcript_text``, building it only on first use."""
    cached = _PODCAST_INDEX_CACHE.get(transcript_text)
    if cached is not None:
        return cached

    index = load_podcast_index(transcript_text)
    if index is not None:  # failed builds are not cached, so they are retried
        # Dicts keep insertion order, so the first key is the oldest entry.
        while len(_PODCAST_INDEX_CACHE) >= _PODCAST_INDEX_CACHE_MAX:
            _PODCAST_INDEX_CACHE.pop(next(iter(_PODCAST_INDEX_CACHE)))
        _PODCAST_INDEX_CACHE[transcript_text] = index
    return index


def query_podcast(query, transcript_text, chat_history=None):
    """Answer a question about a podcast transcript.

    Args:
        query: The user's question.
        transcript_text: Full transcript text to index and search.
        chat_history: Accepted for interface compatibility; currently unused,
            so each question is answered independently of earlier turns.

    Returns:
        The query engine's response as a string. Failures are reported as a
        string beginning with "Error" rather than raised.
    """
    try:
        # Reuse the index for this transcript if it has already been built.
        index = _get_podcast_index(transcript_text)
        if index is None:
            return "Error: Could not load podcast transcript data"

        # Retrieve the 5 most similar chunks as context for the answer.
        retriever = VectorIndexRetriever(
            index=index,
            similarity_top_k=5,
        )

        # Create query engine with custom retriever
        query_engine = RetrieverQueryEngine.from_args(retriever=retriever)

        # Query and return the response as plain text
        response = query_engine.query(query)
        return str(response)

    except Exception as e:
        return f"Error processing query: {str(e)}"

In [30]:
import gradio as gr

# Define example questions
example_questions = [
    ["What were the main topics discussed?"],
    ["Give me the key takeaways or insights in dot points"],
    ["Who were the hosts, guests or organisations in the podcast or video?"],
    ["Were any significant statistics or data points mentioned?"],
    ["Can you list any resources or references if mentioned?"]
]

with gr.Blocks() as app:
    gr.Markdown("# TL;DListen: Podcast Transcript Chat")

    # YouTube URL input
    youtube_url = gr.Textbox(
        label="YouTube Video URL",
        placeholder="Enter the YouTube video URL here..."
    )

    # Instructional text below the YouTube URL input
    gr.Markdown("*enter the YouTube video URL above before asking questions*")

    # Chat interface
    chatbot = gr.Chatbot(
        label="Conversation",
        height=400,
        type='messages'
    )

    # Query input
    msg = gr.Textbox(
        label="Ask about the podcast",
        placeholder="Ask a question about the podcast content...",
        lines=2
    )

    # Submit button
    submit_btn = gr.Button("Send")

    # Example questions
    gr.Examples(
        examples=example_questions,
        inputs=msg,
        label="Example Questions"
    )

    # Chat history state
    chat_state = gr.State([])

    def respond(video_url, message, chat_history):
        """Answer one user question about the video at ``video_url``.

        Args:
            video_url: Text currently in the YouTube URL box.
            message: The user's question.
            chat_history: List of {"role", "content"} dicts from ``chat_state``.

        Returns:
            A 3-tuple ``(textbox value, chatbot messages, chat state)``. The
            textbox is cleared after a handled message.

        Raises:
            gr.Error: If no URL was entered or the transcript could not be
                fetched. Raising (instead of returning the error text) leaves
                the user's question in the textbox rather than overwriting it.
        """
        history = list(chat_history or [])

        if not video_url.strip():
            raise gr.Error("Please enter a YouTube video URL.")
        if not message.strip():
            return "", history, history

        # Extract subtitles from the provided YouTube URL.
        # fetch_youtube_subtitles reports failure by returning a string that
        # starts with "An error occurred" rather than raising.
        subtitles = fetch_youtube_subtitles(video_url.strip())
        if subtitles.startswith("An error occurred"):
            raise gr.Error(subtitles)
        if not subtitles.strip():
            raise gr.Error("No transcript text was returned for this video.")

        # Get response from query engine
        bot_response = query_podcast(message, subtitles, history)

        # Build the updated history as a new list (no in-place mutation of
        # the stored state) and hand it back to both the chatbot and the state.
        history = history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": bot_response},
        ]

        return "", history, history

    # Set up interactions: the button and pressing Enter share one handler.
    # chat_state is listed as an output so the history is persisted explicitly.
    submit_btn.click(
        respond,
        inputs=[youtube_url, msg, chat_state],
        outputs=[msg, chatbot, chat_state]
    )

    msg.submit(
        respond,
        inputs=[youtube_url, msg, chat_state],
        outputs=[msg, chatbot, chat_state]
    )

    gr.Markdown("""
    ### Tips:
    - Ask specific questions about the podcast content
    - Try using the example questions provided above
    - You can request summaries of specific segments
    """)

if __name__ == "__main__":
    # NOTE(review): share=True publishes a public URL; anyone who has it can
    # run queries billed to the configured OpenAI key. Confirm this is intended.
    app.launch(share=True)


* Running on local URL:  http://127.0.0.1:7876
* Running on public URL: https://fd9b9269a13964cf76.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
