# This section:

# *1. Loads the saved embedding model, FAISS index, and transcript metadata,*

# *2. Encodes a user-provided question into a vector and retrieves the most semantically similar transcript chunks,*

# *3. Formats the results with video timestamps and clickable YouTube links for direct navigation to relevant segments.*


#### This part enables semantic question-answering from processed transcripts. It loads the previously built FAISS index and metadata, then allows the user to input a natural language question. The question is turned into a vector and compared against the transcript chunk embeddings to find the most semantically similar answers.The result is a short list of relevant transcript segments, each linked to the exact point in the original YouTube video where the answer can be found.

In [1]:
# --- Part 4: Question Answering with Qdrant Retrieval ---

from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from sentence_transformers import SentenceTransformer
import pandas as pd
from datetime import timedelta




In [2]:
# --- Load sentence embedding model (same as Part 3) ---
model = SentenceTransformer("all-MiniLM-L6-v2")

# --- Connect to Qdrant ---
collection_name = "video_chunks"
client = QdrantClient(host="localhost", port=6333)

# --- Function: Retrieve top-k similar chunks ---
def retrieve_similar_chunks(question, top_k=5):
    query_vector = model.encode([question]).astype('float32')[0]

    results = client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k,
        with_payload=True
    )

    chunks = []
    for hit in results:
        payload = hit.payload
        chunks.append({
            "video_id": payload.get("video_id"),
            "chunk_id": payload.get("chunk_id"),
            "text": payload.get("text"),
            "start": payload.get("start"),
            "topic": payload.get("topic"),
            "score": hit.score
        })

    return pd.DataFrame(chunks)


In [3]:
# --- Example usage ---
question = "What is reinforcement learning?"
results_df = retrieve_similar_chunks(question, top_k=5)

# --- Display results with YouTube timestamps ---
for _, row in results_df.iterrows():
    video_id = row["video_id"]
    start_sec = int(row["start"])
    formatted_time = str(timedelta(seconds=start_sec))
    url = f"https://www.youtube.com/watch?v={video_id}&t={start_sec}s"

    print(f"\n [Video ID: {video_id}] — Chunk {row['chunk_id']} — Start: {formatted_time}")
    print(f" Jump to: {url}")
    print(f"Text: {row['text']}")


 [Video ID: t9zxmEHGT1s] — Chunk 333 — Start: 0:16:34
 Jump to: https://www.youtube.com/watch?v=t9zxmEHGT1s&t=994s
Text: called reinforcement learning

 [Video ID: t9zxmEHGT1s] — Chunk 333 — Start: 0:16:34
 Jump to: https://www.youtube.com/watch?v=t9zxmEHGT1s&t=994s
Text: called reinforcement learning

 [Video ID: gkflVEhnA5s] — Chunk 259 — Start: 0:13:58
 Jump to: https://www.youtube.com/watch?v=gkflVEhnA5s&t=838s
Text: kind of reinforcement learning

 [Video ID: gkflVEhnA5s] — Chunk 259 — Start: 0:13:58
 Jump to: https://www.youtube.com/watch?v=gkflVEhnA5s&t=838s
Text: kind of reinforcement learning

 [Video ID: gkflVEhnA5s] — Chunk 32 — Start: 0:01:29
 Jump to: https://www.youtube.com/watch?v=gkflVEhnA5s&t=89s
Text: reinforcement learning kind of


  results = client.search(
