In [3]:
# For Gemini
!pip install -q google-generativeai langchain-google-genai

# # For Hugging Face
!pip install -q huggingface_hub langchain_community

!pip install -q youtube-transcript-api faiss-cpu tiktoken python-dotenv python-docx

In [4]:
import os

# For Gemini
os.environ["GOOGLE_API_KEY"] = "AIzaSyDpTq8_qyP6DytbbKIW7p2hEU_BwWVoXDs"  # Get it from https://ai.google.dev/

# For Hugging Face
# os.environ["HUGGINGFACEHUB_API_TOKEN"] = "your-hf-token"  # Get it from https://huggingface.co/settings/tokens


In [5]:
# Common imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate


In [6]:
# Gemini imports
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI

# Hugging Face imports (uncomment for HF)
# from langchain_community.embeddings import HuggingFaceHubEmbeddings
# from langchain_community.chat_models import ChatHuggingFace


Download YouTube Video Subtitles

We use `yt-dlp` to Automatically download **English (`en`) subtitles** in `.vtt` format

In [7]:
!pip install -q yt-dlp


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.4/175.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
!yt-dlp --write-auto-sub --sub-lang en --skip-download --output "%(id)s.%(ext)s" https://www.youtube.com/watch?v=qJeaCHQ1k2w


[youtube] Extracting URL: https://www.youtube.com/watch?v=qJeaCHQ1k2w
[youtube] qJeaCHQ1k2w: Downloading webpage
[youtube] qJeaCHQ1k2w: Downloading tv client config
[youtube] qJeaCHQ1k2w: Downloading player 010fbc8d-main
[youtube] qJeaCHQ1k2w: Downloading tv player API JSON
[youtube] qJeaCHQ1k2w: Downloading ios player API JSON
[youtube] qJeaCHQ1k2w: Downloading m3u8 information
[info] qJeaCHQ1k2w: Downloading subtitles: en
[info] qJeaCHQ1k2w: Downloading 1 format(s): 399+251
[info] Writing video subtitles to: qJeaCHQ1k2w.en.vtt
[download] Destination: qJeaCHQ1k2w.en.vtt
[K[download] 100% of  167.75KiB in [1;37m00:00:00[0m at [0;32m1.01MiB/s[0m


In [41]:
# Example: Use your chosen LLM (Gemini or Hugging Face) to generate summaries
# Assume `llm` is your language model instance already defined (ChatGoogleGenerativeAI or ChatHuggingFace)

def get_summary(text, llm):
    prompt = f"Please summarize the following text in a concise paragraph:\n\n{text}"
    response = llm.invoke(prompt)
    return response

# Full transcript summary
full_summary = get_summary(transcript, llm)
print("Full Transcript Summary:\n", full_summary)

# Chapterization: Split transcript into chunks by N characters or by time (if available)
chunk_size = 3000  # or number of chars approx. 5 mins
transcript_chunks_str = [transcript[i:i+chunk_size] for i in range(0, len(transcript), chunk_size)]

chapter_summaries = []
for idx, chunk in enumerate(transcript_chunks_str):
    summary = get_summary(chunk, llm)
    chapter_summaries.append(summary)
    print(f"Summary of chunk {idx+1}:\n{summary}\n")

Full Transcript Summary:
 content='Generative AI creates new data from scratch, distinguishing it from traditional AI that processes existing data. Variational Autoencoders (VAEs) are a fundamental technique for this, particularly for image generation, addressing the limitations of standard autoencoders whose disorganized latent spaces prevent coherent new outputs. VAEs achieve this by learning a structured, continuous latent space, typically a Gaussian distribution, from which new data can be sampled. An encoder maps input data to the parameters of this distribution, and a decoder reconstructs images from sampled latent points, with training optimized by balancing reconstruction quality and latent space regularization via the reparameterization trick. While VAEs can generate novel images, create variations, and seamlessly blend concepts, their main drawback is often producing blurry outputs compared to more recent models like GANs and diffusion models, though advanced VAE variants are

We use `webvtt` to:
- Read the `.vtt` file
- Extract caption text, start time, and end time
- Create a **full transcript string**
- Store captions as `Document` objects with metadata (timestamps)

In [11]:
# Install webvtt-py if not installed
!pip install -q webvtt-py
from langchain_core.documents import Document

import webvtt

vtt_filename = "qJeaCHQ1k2w.en.vtt" # Corrected filename based on yt-dlp output

transcript_text = []

for caption in webvtt.read(vtt_filename):
    transcript_text.append(caption.text)

transcript = " ".join(transcript_text)
print(transcript[:100])  # print first 1000 chars

captions = []
for caption in webvtt.read(vtt_filename):
    captions.append({
        "text": caption.text,
        "start": caption.start,  # e.g., '00:03:25.100'
        "end": caption.end
    })


chunks = [
    Document(
        page_content=cap["text"],
        metadata={"start": cap["start"], "end": cap["end"]}
    )
    for cap in captions
]

Generative artificial intelligence is a Generative artificial intelligence is a
buzzword you will he


In [39]:
!pip install yt-dlp
video_id = "qJeaCHQ1k2w"
!yt-dlp --write-auto-sub --sub-lang en --skip-download --output "%(id)s.%(ext)s" https://www.youtube.com/watch?v={video_id}


[youtube] Extracting URL: https://www.youtube.com/watch?v=qJeaCHQ1k2w
[youtube] qJeaCHQ1k2w: Downloading webpage
[youtube] qJeaCHQ1k2w: Downloading tv client config
[youtube] qJeaCHQ1k2w: Downloading tv player API JSON
[youtube] qJeaCHQ1k2w: Downloading ios player API JSON
[youtube] qJeaCHQ1k2w: Downloading m3u8 information
[info] qJeaCHQ1k2w: Downloading subtitles: en
[info] Testing format 617
[info] Testing format 234
[info] qJeaCHQ1k2w: Downloading 1 format(s): 617+234
Deleting existing file qJeaCHQ1k2w.en.vtt
[info] Writing video subtitles to: qJeaCHQ1k2w.en.vtt
[download] Destination: qJeaCHQ1k2w.en.vtt
[K[download] 100% of  167.75KiB in [1;37m00:00:00[0m at [0;32m1.87MiB/s[0m


In [14]:
from rag_youtube import build_pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter

pipeline = build_pipeline()
chunks = pipeline["grouped_chunks"]  # Document objects

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(chunks)

print(f"Number of chunks: {len(chunks)}")


Number of chunks: 107


In [16]:
# When splitting, store metadata:
transcript_lines = [] # Initialize the list
for caption in webvtt.read(vtt_filename):
    transcript_lines.append({'text': caption.text, 'start': caption.start, 'end': caption.end})

# When answering, retrieve start/end with the text to cite where info came from!

Generate Embeddings and Store in FAISS

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

Build Retriever and Language Model


In [30]:
from langchain_google_genai import ChatGoogleGenerativeAI

retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.2)

In [21]:
# 1. Define the quiz generator function before the prompt + chain
def generate_quiz_gemini(llm, text, n_questions=5):
    prompt = f"""
You are a creative quiz maker.
From the following context, create {n_questions} multiple choice questions with 4 options each and mark the correct answer.
Format:

Q1. Question?
A. Option 1
B. Option 2
C. Option 3
D. Option 4

Only output the quiz.

Context: {text}
"""
    quiz = llm.invoke(prompt)
    return quiz


In [31]:
quiz_text = generate_quiz_gemini(llm, transcript, n_questions=3)
print(quiz_text.content)


Q1. According to the context, what is the key difference between Generative AI and traditional AI?
A. Generative AI is a buzzword, while traditional AI is not.
B. Generative AI processes existing data, whereas traditional AI creates new data from scratch.
C. Generative AI creates new data from scratch, whereas traditional AI processes existing data.
D. Generative AI is synonymous with large language models like ChatGPT, unlike traditional AI.

Q2. Why do standard autoencoders typically fail to produce useful new data when randomly sampling points from their latent space?
A. The latent space in standard autoencoders is always too small to contain meaningful variations.
B. Standard autoencoders do not have a decoder to reconstruct images from latent representations.
C. The latent space of standard autoencoders is disorganized and irregular, causing large areas to produce meaningless decoded images.
D. Standard autoencoders are only designed for data compression, not generation.

Q3. What

In [32]:
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel, RunnableLambda, RunnablePassthrough

prompt = PromptTemplate(
    template="""
You are a helpful assistant.
Answer ONLY from the provided transcript context.
If the context is insufficient, just say you don't know.

Answer in {language}.

{context}
Question: {question}
""",
    input_variables=["context", "question","language"]
)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Define the chain to get context by explicitly calling get_relevant_documents
get_context_chain = RunnableLambda(lambda x: x["question"]) | RunnableLambda(lambda x: retriever.get_relevant_documents(x)) | RunnableLambda(format_docs)

# Define the main chain
main_chain = RunnableParallel({
    "context": get_context_chain,
    "question": RunnableLambda(lambda x: x["question"]),
    "language": RunnableLambda(lambda x: x["language"])
}) | prompt | llm | StrOutputParser()

In [33]:
question = "Summarize the main points of the video in detail. Please provide at least 5 bullet points and explain each one."
language='French'
answer = main_chain.invoke({"question": question, "language": language})
print(answer)

Voici les points principaux de la vidéo :

*   **Transformation par l'encodeur :** L'encodeur transforme les données d'entrée en paramètres d'une distribution gaussienne, à savoir la moyenne (mu) et la variance (sigma).
*   **Représentation de l'image dans l'espace latent :** Au lieu d'être représentée comme un point unique dans l'espace latent, une image est représentée comme une distribution gaussienne.
*   **Processus de décodage :** Des points sont échantillonnés aléatoirement à partir de cette distribution latente, et le décodeur reconvertit ces points échantillonnés en images.
*   **Problème de la structure de l'espace latent :** L'espace latent est souvent mal structuré et irrégulier. Cela signifie que des points proches ne correspondent pas à des variations significatives de l'image originale, et de vastes zones de cet espace ne produiront pas d'images décodées significatives.
*   **Conséquence de la mauvaise structure :** Tenter d'échantillonner des points dans le voisinage de

In [34]:
relevant_chunks = retriever.get_relevant_documents(question)
# Each chunk is a Document object with .metadata["start"]

def timestamp_link(start, video_id):
    parts = start.split(':')
    seconds = int(parts[0])*3600 + int(parts[1])*60 + int(float(parts[2]))
    return f"https://www.youtube.com/watch?v={video_id}&t={seconds}s", f"{int(parts[0]):02}:{int(parts[1]):02}:{int(float(parts[2])):02}"

video_id = "qJeaCHQ1k2w"

for chunk in relevant_chunks:
    start_time = chunk.metadata.get("start")
    if start_time is not None:
        ts_url, ts_label = timestamp_link(start_time, video_id)
        print(f"At [{ts_label}]({ts_url}): {chunk.page_content[:120]}...\n")
    else:
        # fallback if no timestamp
        print(f"No timestamp metadata: {chunk.page_content[:120]}...\n")




At [00:11:34](https://www.youtube.com/watch?v=qJeaCHQ1k2w&t=694s): Essentially, the encoder transforms the Essentially, the encoder transforms the
input data into the parameters of this i...

At [00:02:21](https://www.youtube.com/watch?v=qJeaCHQ1k2w&t=141s): The latent space is so poorly structured The latent space is so poorly structured
that even nearby points often don't th...

At [00:01:59](https://www.youtube.com/watch?v=qJeaCHQ1k2w&t=119s): results, mainly because the latent space results, mainly because the latent space
is disorganized and irregular. So, lar...

At [00:18:22](https://www.youtube.com/watch?v=qJeaCHQ1k2w&t=1102s): explore each of these models in future explore each of these models in future
videos, so stay tuned if you want to video...



In [25]:
from langchain_core.documents import Document

grouped_chunks = []
group = []
char_count = 0
max_chars = 1000  # set your preferred chunk size in characters

for cap in captions:
    group.append(cap)
    char_count += len(cap["text"])
    if char_count >= max_chars:
        # Form one Document chunk with combined text and time range
        grouped_chunks.append(
            Document(
                page_content=" ".join(c["text"] for c in group),
                metadata={
                    "start": group[0]["start"],  # Start of first caption in group
                    "end": group[-1]["end"]      # End of last caption in group
                }
            )
        )
        group = []
        char_count = 0

# If any leftovers at the end, add them as a chunk too
if group:
    grouped_chunks.append(
        Document(
            page_content=" ".join(c["text"] for c in group),
            metadata={
                "start": group[0]["start"],
                "end": group[-1]["end"]
            }
        )
    )

# Use grouped_chunks for vector store creation:
vector_store = FAISS.from_documents(grouped_chunks, embeddings)

In [26]:
for c in grouped_chunks[:3]:
    print(c.metadata)


{'start': '00:00:02.070', 'end': '00:00:21.439'}
{'start': '00:00:21.439', 'end': '00:00:45.270'}
{'start': '00:00:45.270', 'end': '00:01:18.149'}


In [27]:
vector_store = FAISS.from_documents(grouped_chunks, embeddings)
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [28]:
def search_transcript(query, retriever, video_id):
    results = retriever.get_relevant_documents(query)
    snippets = []
    for doc in results:
        start_time = doc.metadata.get("start", None)
        if start_time:
            # Helper: Convert 'hh:mm:ss.mmm' to YouTube link/timestamp text
            parts = start_time.split(':')
            seconds = int(parts[0])*3600 + int(parts[1])*60 + int(float(parts[2]))
            ts_url = f"https://www.youtube.com/watch?v={video_id}&t={seconds}s"
            ts_label = f"{int(parts[0]):02}:{int(parts[1]):02}:{int(float(parts[2])):02}"
            snippet = f"At [{ts_label}]({ts_url}): {doc.page_content[:180].replace(chr(10), ' ')}..."
        else:
            snippet = doc.page_content[:180].replace(chr(10), " ") + "..."
        snippets.append(snippet)
    return snippets

# Usage example in a notebook cell. (Place after retriever/vector store setup.)
query = "autoencoder"
video_id = "qJeaCHQ1k2w"  # Your video ID
snippets = search_transcript(query, retriever, video_id)
for s in snippets:
    print(s)


  results = retriever.get_relevant_documents(query)


At [00:01:36](https://www.youtube.com/watch?v=qJeaCHQ1k2w&t=96s): where we cover the basics of where we cover the basics of autoenccoders. autoenccoders. autoenccoders. Now, if we have a trained autoenccoder Now, if we have a trained autoenccoder...
At [00:11:11](https://www.youtube.com/watch?v=qJeaCHQ1k2w&t=671s): and then decodes it back into the input and then decodes it back into the input space. In a standard autoenccoder, this space. In a standard autoenccoder, this space. In a standard...
At [00:10:49](https://www.youtube.com/watch?v=qJeaCHQ1k2w&t=649s): distribution shape on our latent space. Now that we've got the theory down, Now that we've got the theory down, Now that we've got the theory down, let's move on to the practicalit...
At [00:00:45](https://www.youtube.com/watch?v=qJeaCHQ1k2w&t=45s): in modern architectures such as stable in modern architectures such as stable diffusion. diffusion. diffusion. But why do we even need such models? Why But why do we even need such..

In [29]:
!pip install fpdf
from fpdf import FPDF

def export_text_to_pdf(text, filename="output.pdf"):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)
    for line in text.split("\n"):
        pdf.multi_cell(0, 10, line)
    pdf.output(filename)
    print(f"Saved PDF as {filename}")

# Example usage:
export_text_to_pdf(transcript, "transcript.pdf")


Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=16f2175c9064a7803e7c7470d1a5522fe57936ff82d0bc2b17558409aabb9883
  Stored in directory: /root/.cache/pip/wheels/65/4f/66/bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2
Saved PDF as transcript.pdf


In [35]:
from docx import Document as DocxDoc

def export_text_to_docx(text, filename="output.docx"):
    doc = DocxDoc()
    doc.add_paragraph(text)
    doc.save(filename)
    print(f"Saved DOCX as {filename}")

# Usage: export_text_to_docx(full_summary, "summary.docx")


In [42]:
!pip install gtts
from gtts import gTTS
import IPython.display as ipd

def text_to_speech(text, filename="output.mp3", lang="en"):
    tts = gTTS(text=text, lang=lang)
    tts.save(filename)
    print(f"Audio saved as {filename}")
    return ipd.Audio(filename)

# Example usage after getting answer from Gemini:
audio = text_to_speech(full_summary.content)  # Change text as needed
audio  # Plays inline in notebook

# For user-selected languages, set lang argument accordingly (e.g., 'hi' for Hindi, 'fr' for French)

Audio saved as output.mp3


In [43]:
# Install Streamlit and other requirements
!pip install streamlit langchain-google-genai langchain-core langchain-community webvtt-py fpdf python-docx gtts faiss-cpu

# Upload your app.py and rag_youtube.py to Colab
from google.colab import files
files.upload()  # Choose both app.py and rag_youtube.py

# Create a public tunnel for Streamlit
!pip install pyngrok
from pyngrok import ngrok

# Run Streamlit app
public_url = ngrok.connect(8501)
print(f"Streamlit App URL: {public_url}")
!streamlit run app.py --server.port 8501


Collecting streamlit
  Downloading streamlit-1.48.0-py3-none-any.whl.metadata (9.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.48.0-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInst

Saving app.py to app (1).py
Saving Rag_youtube.ipynb to Rag_youtube (1).ipynb
Saving rag_youtube.py to rag_youtube (1).py
Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.3.0


ERROR:pyngrok.process.ngrok:t=2025-08-10T17:04:52+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:pyngrok.process.ngrok:t=2025-08-10T17:04:52+0000 lvl=eror msg="session closing" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:pyngrok.process.ngrok:t=2025-08-10T17:04:52+0000 lvl=eror msg="terminating with error" obj=app err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your aut

PyngrokNgrokError: The ngrok process errored on start: authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n.

In [44]:
!pip install pyngrok
from pyngrok import ngrok

# Replace YOUR_AUTHTOKEN with the one from your ngrok dashboard
!ngrok config add-authtoken 2zDpOzvEL9Re7FuJm536UwTMjjU_2nckXRZgdR3gPRHP1p9w9

public_url = ngrok.connect(8501)
print(f"Streamlit App URL: {public_url}")
!streamlit run app.py --server.port 8501


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Streamlit App URL: NgrokTunnel: "https://a4f3c0685eca.ngrok-free.app" -> "http://localhost:8501"

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.172.51.165:8501[0m
[0m
Saved PDF as transcript.pdf
Saved PDF as transcript.pdf
  RunnableLambda(lambda x: retriever.get_relevant_documents(x["question"])) |
Saved PDF as transcript.pdf
Saved PDF as transcript.pdf
Saved PDF as transcript.pdf
Saved PDF as transcript.pdf
Saved PDF as transcript.pdf
Saved PDF as transcript.pdf
Saved PDF as transcript.pdf
[34m  Stopping...[0m
[34m  Stopping...[0m
