# Import Libraries

In [1]:
from urllib.parse import parse_qs, urlparse

from dotenv import load_dotenv
from youtube_transcript_api import YouTubeTranscriptApi
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

In [2]:
# Load variables from a .env file into the process environment before any
# OpenAI client is constructed.
load_dotenv()
# Module-level embedding model; create_db_from_youtube_video_url reads this
# global when it builds the FAISS index.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
embeddings = OpenAIEmbeddings()

# Get Transcript from YouTube URL

In [3]:
def _extract_video_id(video_url: str) -> str:
    """Return the YouTube video ID contained in ``video_url``.

    Recognised forms:

    * ``.../watch?v=<id>`` with any extra query parameters or fragment
    * ``youtu.be/<id>`` short links
    * ``.../embed/<id>``, ``.../shorts/<id>``, ``.../live/<id>``, ``.../v/<id>``

    Any other input falls back to the legacy rule (text after the last
    ``v=``, or the whole string), so a bare video ID is returned unchanged.

    Raises:
        ValueError: if ``video_url`` is empty or whitespace only.
    """
    candidate = video_url.strip()
    if not candidate:
        raise ValueError("video_url must not be empty")

    # urlparse only recognises the host when a scheme or a leading "//" is present.
    parsed = urlparse(candidate if "://" in candidate else f"//{candidate}")

    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        return query_ids[0]

    segments = [segment for segment in parsed.path.split("/") if segment]
    if parsed.netloc.lower() in {"youtu.be", "www.youtu.be"} and segments:
        return segments[0]
    if len(segments) >= 2 and segments[0] in {"embed", "shorts", "live", "v"}:
        return segments[1]

    # Legacy behaviour for anything unrecognised (e.g. a bare video ID).
    return candidate.split("v=")[-1]


def get_transcript_text(video_url: str, languages=("en",)) -> str:
    """Fetch a YouTube video's transcript and return it as one string.

    Args:
        video_url: A YouTube watch/short/embed URL, or a bare video ID.
        languages: Language codes passed to ``find_transcript``. A single
            string such as ``"en"`` is accepted and treated as one code.

    Returns:
        The text of every transcript snippet, joined with single spaces.

    Raises:
        ValueError: if ``video_url`` is empty.
        Errors raised by ``YouTubeTranscriptApi`` propagate unchanged.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(languages, str):
        languages = [languages]

    video_id = _extract_video_id(video_url)
    ytt = YouTubeTranscriptApi()
    transcript_list = ytt.list(video_id)
    transcript = transcript_list.find_transcript(list(languages))
    fetched = transcript.fetch()
    return " ".join(snippet.text for snippet in fetched)

In [4]:
# Example video; this URL is reused below to build the vector database.
video_url = "https://www.youtube.com/watch?v=Pqb19IaWwOE"
# Sanity check: fetch the transcript and print the full text.
print(get_transcript_text(video_url))

Now we are at 480 trillion... Is there still time to sleep? When do you get up, when do you go to sleep? I think first of all the education system 
needs to adapt... Can we still trust our eyes while scrolling through the internet? Okay let's be honest, what has been happening in the world of tech over the last few years is absolutely insane. Smartphones are turning into smart glasses we are seeing tech startups pop out of nowhere that either fail completely or go on to change the entire world. Artificial intelligence is THE tech topic for years now and I got to talk all about it with this man right here, Sunda Pichai, the CEO of Google  he told me what he really thinks about AI answering questions like  Are we about to 
be replaced in our jobs? Or how is this actually going to work out here on social media?  And a quick heads up for my German speaking community:  I've decided to make this entire video in English, but you can turn on the German subtitles  that I've added for you right 

# Create FAISS Vector Database

In [5]:
def create_db_from_youtube_video_url(
    video_url: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 100,
) -> FAISS:
    """Build a FAISS vector store from a YouTube video's transcript.

    The transcript is fetched with ``get_transcript_text``, wrapped in a single
    ``Document`` whose metadata records the source URL, split into overlapping
    chunks, and embedded with the module-level ``embeddings`` object.

    Args:
        video_url: URL of the video whose transcript should be indexed.
        chunk_size: Value forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Value forwarded to ``RecursiveCharacterTextSplitter``
            as ``chunk_overlap``.

    Returns:
        A ``FAISS`` store containing one entry per transcript chunk.

    Raises:
        ValueError: if the transcript produces no chunks (e.g. it is empty).
    """
    transcript_text = get_transcript_text(video_url)

    docs = [Document(page_content=transcript_text, metadata={"source": video_url})]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    split_docs = text_splitter.split_documents(docs)

    # Fail with a clear message instead of an opaque error from index construction.
    if not split_docs:
        raise ValueError(f"Transcript for {video_url!r} is empty; nothing to index.")

    return FAISS.from_documents(split_docs, embeddings)

In [11]:
# Build the vector store for the example video.
db = create_db_from_youtube_video_url(video_url)
print("FAISS DB Summary:")
# Use the public index/docstore API instead of the private ``docstore._dict``.
print(f"- Total chunks: {db.index.ntotal}")
print(f"- Metadata example: {db.docstore.search(db.index_to_docstore_id[0]).metadata}")

FAISS DB Summary:
- Total chunks: 17
- Metadata example: {'source': 'https://www.youtube.com/watch?v=Pqb19IaWwOE'}


# Run Q&A on Transcript

In [8]:
def get_response_from_query(db, query, k=4, model_name="gpt-3.5-turbo", temperature=0):
    """Answer ``query`` using the transcript chunks most similar to it.

    Args:
        db: Vector store exposing ``similarity_search(query, k=...)``, e.g.
            the FAISS store built by ``create_db_from_youtube_video_url``.
        query: The question to answer.
        k: Number of chunks to retrieve and place in the prompt.
        model_name: Chat model name passed to ``ChatOpenAI``.
        temperature: Sampling temperature passed to ``ChatOpenAI``.

    Returns:
        The model's reply text with surrounding whitespace stripped.
    """
    docs = db.similarity_search(query, k=k)
    # All retrieved chunks are concatenated into one context string.
    docs_page_content = " ".join(d.page_content for d in docs)

    llm = ChatOpenAI(model_name=model_name, temperature=temperature)

    prompt = ChatPromptTemplate.from_template("""
    You are a helpful assistant that can answer questions about YouTube videos 
    based on the video's transcript.

    Question: {question}
    Transcript: {docs}

    Give a short, concise answer in 1-2 sentences only.
    Only use factual information from the transcript.
    If the transcript doesn't have enough info, say "I don't know".
    """)

    # Pipe the formatted prompt into the chat model.
    chain = prompt | llm

    result = chain.invoke({"question": query, "docs": docs_page_content})
    return result.content.strip()

In [9]:
# Ask a question against the vector store built above (default k=4 chunks).
query = "What are the main topics discussed in the video?"
response = get_response_from_query(db, query)

# Show the question next to the model's answer.
print(f"Question: {query}")
print(f"Answer: { response}")

Question: What are the main topics discussed in the video?
Answer: The main topics discussed in the video are the implications of technology on education, the role of artificial intelligence, the importance of journalism, and the future of personalized learning.
