In [107]:
# Q&A with Youtube Transcript using LangChain and HuggingFace

# Starting with an MVP (Minimum Viable Product) - Essential features only [Enhancements to be added later]

# Steps to be followed:
# 1. Importing the required libraries
# 2. Writing a function to fetch transcript from a given Youtube URL
# 3. Setting up the language model and embedding model using HuggingFace API
# 4. Text Splitting - Splitting the transcript into manageable chunks
# 5. Creating a vector store to hold the embeddings
# 6. Setting up the Retriever
# 7. Creating a chain
# 8. Creating a UI using Streamlit

In [108]:
# 1. Importing the required libraries



In [109]:
# 2. Writing a function to fetch transcript from a given Youtube URL

from youtube_transcript_api import YouTubeTranscriptApi

def fetch_youtube_transcript(video_id: str, languages=("en",)) -> str:
    """Fetch the transcript of a YouTube video and return it as one string.

    Args:
        video_id: The YouTube video ID (not the full URL).
        languages: Language code(s) forwarded to ``YouTubeTranscriptApi.fetch``.
            Accepts an iterable of codes or a single code string. Defaults to
            English only, which matches the previously hard-coded behaviour.

    Returns:
        The text of every fetched snippet, joined with single spaces.
    """
    # A bare string such as "en" would otherwise be split into ["e", "n"].
    if isinstance(languages, str):
        languages = [languages]

    ytt_api = YouTubeTranscriptApi()
    # TODO: translate to English at a later stage when no transcript exists
    # in the requested language(s).
    fetched_transcript = ytt_api.fetch(video_id, languages=list(languages))

    # Collapse the individual snippets into a single string.
    return " ".join(snippet.text for snippet in fetched_transcript)

# Hard-coded video for the MVP. TODO: write a helper that extracts the ID from a full YouTube URL.
video_id = "zxQyTK8quyY"
transcript = fetch_youtube_transcript(video_id)

# Sanity check: total size plus the opening and closing 100 characters.
print(
    f"Length of the transcript: {len(transcript)} characters",
    f"First 100 characters of the transcript: {transcript[:100]}",
    f"Last 100 characters of the transcript: {transcript[-100:]}",
    sep="\n",
)

Length of the transcript: 28785 characters
First 100 characters of the transcript: [Music] translation it's done with a transform ER stat Quest hello I'm Josh starmer and welcome to s
Last 100 characters of the transcript: t or a hoodie or just donate the links are in the description below alright until next time Quest on


In [110]:
# 3. Setting up the language model and embedding model using HuggingFace API

from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingFaceEndpointEmbeddings
from dotenv import load_dotenv

# Read environment variables from a local .env file
# (presumably this is where the Hugging Face API token lives - confirm).
load_dotenv()

# Language model: a text-generation endpoint wrapped in the chat interface.
llm = HuggingFaceEndpoint(
    task="text-generation",
    repo_id='deepseek-ai/DeepSeek-V3.1',
)
model = ChatHuggingFace(llm=llm)
# Optional smoke test: print(model.invoke("What is the capital of India").content)

# Embedding model used to vectorise transcript chunks and queries.
embedding_model = HuggingFaceEndpointEmbeddings(
    task="feature-extraction",
    repo_id='sentence-transformers/all-MiniLM-L6-v2',
)
# Optional smoke test: print(len(embedding_model.embed_query("Hello, how are you?")))

In [111]:
# 4. Text Splitting - Splitting the transcript into manageable chunks

from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter configuration: chunk size 1000 with an overlap of 150, trying the
# listed separators from coarsest ("\n\n") to finest ("").
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
    separators=["\n\n", "\n", " ", ""],
)

chunks = text_splitter.create_documents([transcript])

print(f"Number of chunks: {len(chunks)}")
# Preview at most three chunks. Slicing (rather than indexing chunks[1] and
# chunks[2]) avoids an IndexError when a short transcript yields fewer chunks.
chunk_preview = "\n".join(f"{chunk}" for chunk in chunks[:3])
print(f"First few chunks: {chunk_preview}")

Number of chunks: 34
First few chunks: page_content='[Music] translation it's done with a transform ER stat Quest hello I'm Josh starmer and welcome to statquest today we're going to talk about Transformer neural networks and they're going to be clearly explained Transformers are more fun when you build them in the cloud with lightning bam right now people are going bonkers about something called chat GPT for example our friend statsquatch might type something into chat GPT like right and awesome song in the style of statquest translation it's done with a transform ER anyway there's a lot to be said about how chat GPT works but fundamentally it is based on something called a Transformer so in this stat Quest we're going to show you how a Transformer works one step at a time specifically we're going to focus on how a Transformer neural network can translate a simple English sentence let's go into Spanish vamos now since a Transformer is a type of neural network and neural networks usual

In [112]:
# 5. Creating a vector store to hold the embeddings
from langchain.vectorstores import FAISS
# Embed every chunk with the embedding model and index the vectors with FAISS.
vectorstore = FAISS.from_documents(chunks, embedding_model)
faiss_index = vectorstore.index
print(f"Number of vectors in the vector store: {faiss_index.ntotal}")

# Reconstruct the vector stored at position 0 once, then report its shape and values.
first_vector = faiss_index.reconstruct(0)
print(f"First vector shape: {first_vector.shape}")
print(f"First vector: {first_vector}")


Number of vectors in the vector store: 34
First vector shape: (384,)
First vector: [-1.39453992e-01 -5.08732013e-02  2.45066546e-02 -5.01772389e-02
 -4.73994203e-02  1.66481528e-02  8.81869271e-02  5.52349910e-02
  1.00835539e-01 -3.35111506e-02 -3.68931983e-03  6.09022416e-02
 -2.07618438e-02  1.50899962e-02  2.31710915e-02 -2.51774266e-02
 -2.41745561e-02  2.98863649e-02 -9.63017568e-02 -4.59887348e-02
  9.84680057e-02  1.63164362e-02 -2.84353625e-02  3.84797715e-02
  8.52074549e-02  5.33244908e-02 -2.03639455e-03 -3.83457989e-02
  2.58153630e-03 -2.88394317e-02 -4.28568535e-02  5.61851449e-02
 -2.67245974e-02  6.48040995e-02 -1.64009333e-01  2.52955537e-02
 -3.29103023e-02 -5.35792187e-02 -5.45955934e-02  2.41573937e-02
 -1.77636202e-02 -6.43353909e-02  4.56412062e-02 -3.78277414e-02
 -9.98049509e-04 -2.37403624e-02 -5.18258959e-02 -3.74019369e-02
 -5.49683049e-02  1.54513046e-02 -5.85036948e-02 -5.82067594e-02
 -2.87286509e-02  1.48616314e-01 -2.43626963e-02  9.22958255e-02
  2.285

In [113]:
# 6. Setting up the Retriever using multi-query retrieval with Maximal Marginal Relevance (MMR) as base-retriever

from langchain.retrievers import MultiQueryRetriever

# Base retriever: MMR search over the vector store, returning 5 documents per query.
mmr_search_options = {"k": 5}
retriever = vectorstore.as_retriever(
    search_kwargs=mmr_search_options,
    search_type="mmr",
)

# Multi-query wrapper: the chat model is handed to the retriever alongside the base retriever.
multiquery_retriever = MultiQueryRetriever.from_llm(
    llm=model,
    retriever=retriever,
)

# Try the multi-query retriever on a sample query; the bare call displays the documents.
sample_query = "Most important points of the video?"
multiquery_retriever.invoke(sample_query)


[Document(id='577f29f9-41e7-4f9d-952d-78e8d94dce8f', metadata={}, page_content="neural networks with hidden layers to both the encoder and decoder bam now it's time for some Shameless self-promotion if you want to review statistics and machine learning offline check out the statquest PDF study guides in my book the stat Quest Illustrated guide to machine learning at stackwest.org there's something for everyone hooray we've made it to the end of another exciting stat Quest if you like this stack Quest and want to see more please subscribe and if you want to support stackquest consider contributing to my patreon campaign becoming a channel member buying one or two of my original songs or a t-shirt or a hoodie or just donate the links are in the description below alright until next time Quest on"),
 Document(id='62af0744-20b3-4cbc-91dc-32992e276013', metadata={}, page_content="to pizza or potentially it could refer to the word oven Josh I've heard of good tasting pizza but never a good ta

In [114]:
# 7. Creating a chain by combining the retriever and the language model

from langchain_core.prompts import PromptTemplate

# Prompt text for transcript-grounded answers. The {context} and {question}
# placeholders are the template's two input variables.
qa_template_text = """
You are a knowledgeable assistant answering questions about a YouTube video.

Use ONLY the information from the transcript context below.
- If the context does not contain the answer, reply with: "I don't know based on the transcript."
- Do NOT use outside knowledge.
- Prefer concise, factual answers.
- If multiple relevant points exist, summarize them in bullet points.
- If transcript timestamps are available in the context, include them in your answer.

---
Transcript context:
{context}
---
Question: {question}

"""

prompt = PromptTemplate(
    input_variables=['context', 'question'],
    template=qa_template_text,
)

In [115]:
# Manual check of the base `retriever` (not the multi-query wrapper) on a sample question.
question = "is the topic of attention discussed in this video? if yes then what was discussed"
retrieved_docs = retriever.invoke(question)
# Bare trailing expression so the notebook displays the retrieved documents.
retrieved_docs

[Document(id='7d89403a-4fa9-449e-aafa-f95f8675730d', metadata={}, page_content="of how words are related within a sentence however since We're translating a sentence we also need to keep track of the relationships between the input sentence and the output for example if the input sentence was don't eat the delicious looking and smelling pizza then when translating it's super important to keep track of the very first word don't if the translation focuses on other parts of the sentence and omits the don't then we'll end up with eat the delicious looking and smelling Pizza and these two sentences have completely opposite meanings so it's super important for the decoder to keep track of the significant words in the input so the main idea of encoder decoder attention is to allow the decoder to keep track of the significant words in the input now that we know the main idea behind encoder decoder attention here are the details first to give us a little more room let's consolidate the math and

In [116]:
# Join the retrieved chunks into one context string, separated by blank lines.
page_texts = [doc.page_content for doc in retrieved_docs]
context_text = "\n\n".join(page_texts)
context_text

"of how words are related within a sentence however since We're translating a sentence we also need to keep track of the relationships between the input sentence and the output for example if the input sentence was don't eat the delicious looking and smelling pizza then when translating it's super important to keep track of the very first word don't if the translation focuses on other parts of the sentence and omits the don't then we'll end up with eat the delicious looking and smelling Pizza and these two sentences have completely opposite meanings so it's super important for the decoder to keep track of the significant words in the input so the main idea of encoder decoder attention is to allow the decoder to keep track of the significant words in the input now that we know the main idea behind encoder decoder attention here are the details first to give us a little more room let's consolidate the math and the diagrams now just like we did for self-attention we create two new values\

In [119]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

def format_docs(retrieved_docs):
    """Join the ``page_content`` of each document, separated by blank lines.

    Args:
        retrieved_docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string with the contents joined by two newlines
        (an empty string when there are no documents).
    """
    page_texts = [doc.page_content for doc in retrieved_docs]
    return "\n\n".join(page_texts)

# Fan the incoming question out into the two prompt inputs:
#   context  -> multi-query retrieval, then the documents joined by format_docs
#   question -> the original input, passed through unchanged
parallel_chain = RunnableParallel(
    context=multiquery_retriever | RunnableLambda(format_docs),
    question=RunnablePassthrough(),
)

# Converts the chat model's message into a plain string.
parser = StrOutputParser()

# Full pipeline: build inputs -> fill prompt -> call chat model -> parse to str.
main_chain = parallel_chain | prompt | model | parser

result = main_chain.invoke('Can you summarize the video for a 15 year old?')

In [120]:
from IPython.display import Markdown
# Wrap the chain's string output in Markdown so the notebook renders its bullets and bold text.
Markdown(result)

Based on the transcript, here's a summary for a 15-year-old:

This video explains how Transformer neural networks work, focusing on how they can translate a simple English sentence into Spanish.

*   **The Core Idea:** It introduces Transformers as the fundamental technology behind things like ChatGPT.
*   **Step-by-Step Translation:** The video breaks down the translation process into steps, explaining how a neural network (which only understands numbers) handles words.
*   **Key Concepts:**
    *   **Positional Encoding:** How the Transformer keeps track of the order of words in a sentence using sine and cosine "squiggles" to give each word a unique position value.
    *   **Self-Attention:** A mechanism that helps the Transformer understand the context of a sentence. For example, it calculates how similar each word is to all the others to correctly figure out what the word "it" refers to (like associating "it" with "pizza" and not "oven").
    *   **Encoder-Decoder Attention:** This allows the part of the network generating the translation (the decoder) to keep track of and focus on the most important words from the original input sentence to ensure the meaning isn't lost (like remembering the word "don't").
*   **Extra Details:** The video also mentions that real-world Transformers are more complex, using techniques like normalizing values and scaling calculations to handle long and complicated sentences.

In [122]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.49.1-py3-none-any.whl.metadata (9.5 kB)
Collecting altair!=5.4.0,!=5.4.1,<6,>=4.0 (from streamlit)
  Using cached altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.5.0 (from streamlit)
  Using cached blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting cachetools<7,>=4.0 (from streamlit)
  Downloading cachetools-6.2.0-py3-none-any.whl.metadata (5.4 kB)
Collecting click<9,>=7.0 (from streamlit)
  Using cached click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting protobuf<7,>=3.20 (from streamlit)
  Downloading protobuf-6.32.1-cp310-abi3-win_amd64.whl.metadata (593 bytes)
Collecting pyarrow>=7.0 (from streamlit)
  Using cached pyarrow-21.0.0-cp310-cp310-win_amd64.whl.metadata (3.4 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Using cached toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Using cached watchdog-6.0.0-py3-none-win_amd64.whl.metadata (44 kB)
C

In [123]:
# 8. Creating a UI using Streamlit to interact with the chatbot

import streamlit as st

# NOTE: Streamlit expects to be launched with `streamlit run <script>`. Executed
# inside the notebook kernel, this cell only logs warnings such as
# "Session state does not function when running a script without `streamlit run`".
st.title("YouTube Video Q&A Chatbot")

# Strip the input so a whitespace-only entry does not trigger retrieval and an LLM call.
question = st.text_input("Ask a question about the video:").strip()
if question:
    # The chain runs retrieval and a model call, so show progress while waiting.
    with st.spinner("Searching the transcript..."):
        result = main_chain.invoke(question)
    st.markdown(result)
    





2025-09-15 21:15:02.787 
  command:

    streamlit run c:\Users\Daman\anaconda3\envs\yt_env\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-09-15 21:15:02.792 Session state does not function when running a script without `streamlit run`
