# YouTube Chatbot Prototype
This notebook contains the prototype code for a YouTube chatbot that utilizes OpenAI's language models to answer questions based on video transcripts.

## Install Libraries
Make sure to install the necessary libraries before running the code.

In [ ]:
# Install the OpenAI SDK and its LangChain integration.
!pip install openai langchain-openai
# Install (quietly, via -q) the transcript, vector-store, tokenizer and dotenv packages.
!pip install -q youtube-transcript-api langchain-community chromadb tiktoken python-dotenv
# Install yt-dlp, which the indexing step invokes from the shell to download subtitles.
!pip install yt-dlp

## Make the necessary imports

In [ ]:
import os
from openai import OpenAI
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from langchain_core.prompts import PromptTemplate
import re
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
import chromadb.utils.embedding_functions as embedding_functions
from datetime import datetime
from chromadb.config import Settings

## Set the OpenAI API Key

In [ ]:
# Keep a key that is already exported in the environment; only fall back to the
# placeholder (replace it with your own key) when none is set, so running this
# cell never overwrites a working key with the dummy value.
os.environ.setdefault("OPENAI_API_KEY", "your_openai_api_key_here")

## Step 1: Indexing
### Download the video transcript

In [ ]:
# Download only the auto-generated English subtitles (no video) for the talk.
# The URL is quoted so the shell does not treat `?` as a glob character.
!yt-dlp --write-auto-sub --sub-lang en --skip-download "https://www.youtube.com/watch?v=Ks-_Mh1QhMc"

### Clean up the transcript, keeping only the spoken text as one big string

In [ ]:
def extract_clean_subtitles(vtt_file):
    """Collapse a WebVTT subtitle file into one plain-text string.

    Blank lines, header lines (``WEBVTT``, ``Kind:``, ``Language:``), cue
    timing lines (anything containing ``-->``) and single-word bracketed
    annotations such as ``[Music]`` are dropped.  Inline
    ``<hh:mm:ss.mmm><c>...</c>`` word-timing tags are unwrapped to their text,
    and a line is kept only the first time it appears in the file.

    Args:
        vtt_file: Path of the ``.vtt`` file; it is read as UTF-8.

    Returns:
        The surviving lines, in file order, joined by single spaces.
    """
    header_prefixes = ("WEBVTT", "Kind:", "Language:")
    annotation_re = re.compile(r"^\[\w+\]$")  # [Music], [Applause], etc.
    timed_word_re = re.compile(r"<\d{2}:\d{2}:\d{2}\.\d{3}><c>(.*?)</c>")

    def is_noise(text):
        """Return True for lines that carry no spoken text."""
        if not text:
            return True
        if text.startswith(header_prefixes):
            return True
        if "-->" in text:
            return True
        return annotation_re.match(text) is not None

    # A dict keeps first-seen order and rejects repeats in one structure.
    unique_lines = {}
    with open(vtt_file, 'r', encoding='utf-8') as handle:
        for raw in handle:
            stripped = raw.strip()
            if is_noise(stripped):
                continue
            # Unwrap <timestamp><c>word</c> pairs, then record the line once.
            unique_lines.setdefault(timed_word_re.sub(r"\1", stripped), None)

    return " ".join(unique_lines)

# Example usage
# NOTE(review): presumably this is the subtitle file written by the yt-dlp cell
# above (same video id and `.en.vtt` suffix) — adjust if your file name differs.
file_path = "Your Body Language May Shape Who You Are ｜ Amy Cuddy ｜ TED [Ks-_Mh1QhMc].en.vtt"
# Flatten the subtitle file into a single cleaned string.
transcript = extract_clean_subtitles(file_path)

In [ ]:
# Preview the first 100 characters of the cleaned transcript.
print(transcript[:100])

In [ ]:
# Report the transcript length in characters, then preview its first 100 characters.
print(len(transcript))
print(transcript[:100])

### Split the transcript into smaller chunks

In [ ]:
# Split the transcript into overlapping chunks: target size 1000 with an overlap of 200
# (presumably character counts, per the splitter's default length function — confirm in the LangChain docs).
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# Wrap each chunk in a document object; the text is read back later via `.page_content`.
chunks = splitter.create_documents([transcript])

In [ ]:
# Show how many chunks were produced and what the first one looks like.
print(len(chunks))
print(chunks[0])

## Step 2: Embedding Generation and Storing in Vector Store

In [ ]:
# LangChain wrapper around the OpenAI "text-embedding-3-small" embedding model.
# NOTE(review): `embeddings` is not referenced by any later cell shown here — the
# Chroma collection uses `openai_ef` instead.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [ ]:
# Chroma embedding function backed by the same OpenAI model; the API key is
# looked up from the OPENAI_API_KEY environment variable named below.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                model_name="text-embedding-3-small",
                api_key_env_var="OPENAI_API_KEY"
            )

In [ ]:
# Create a persistent client so the vector embeddings are stored on disk under ./chroma_db.
# Passing only Settings(persist_directory=...) to chromadb.Client does not enable
# persistence by itself; PersistentClient is the documented way to persist to a path.
persistent_client = chromadb.PersistentClient(path="./chroma_db")

# get_or_create_collection keeps this cell re-runnable: once the collection exists
# (a second run, or a restart with persisted data) create_collection would raise.
collection = persistent_client.get_or_create_collection(
    name="yt_collection",
    embedding_function=openai_ef,
    metadata={
        "description": "my first Chroma collection",
        "created": str(datetime.now()),
    },
)

In [ ]:
# Store every chunk's text in the collection, using the chunk's position
# (as a string) as its id.
collection.add(
    ids=[str(position) for position, _ in enumerate(chunks)],
    documents=[doc.page_content for doc in chunks],
)

## Step 3: Retrieval

In [ ]:
# Ask the collection for the 4 stored chunks that best match the question.
results = collection.query(
    query_texts="How does my body language shape who I am?",
    n_results=4,
)

# Echo the raw query result as the cell output.
results

## Step 4: Augmentation

In [ ]:
# Prompt template with two placeholders: {context} (retrieved transcript text)
# and {query} (the user's question). The instructions tell the model to answer
# only from the context and to say it doesn't know when the context is insufficient.
prompt = PromptTemplate(
        template="""
            You are a helpful assistant.
            You are answering people's query regarding content of a youtube video, therefore answer only from the context given below.
            If context is insufficient, just say that you don't know.

            Context: {context}
            Query: {query}
        """,
        input_variables=['context', 'query']
)

In [ ]:
# Alternative question kept for experimentation (uncomment to try it):
# query = "Is the topic of aliens discussed in this video? If yes, summarize that part"
query = "Is the topic of body posture in this video? If yes, summarize that part"
# Fetch the 4 chunks that best match the question.
retrieved_docs = collection.query(
    query_texts=query,
    n_results=4,
)

In [ ]:
# Echo the raw query result as the cell output.
retrieved_docs

In [ ]:
# Join the documents returned for the first (only) query, separated by blank lines.
context_text = "\n\n".join(retrieved_docs["documents"][0])
print(context_text)

In [ ]:
# Fill the template's {context} and {query} placeholders.
final_prompt = prompt.invoke({"context": context_text, "query": query})

In [ ]:
# Echo the filled-in prompt as the cell output.
final_prompt

## Generation

In [ ]:
# Chat model used for generation: gpt-4o-mini with temperature set to 0.2.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2)

In [ ]:
# Send the filled-in prompt to the model.
response = llm.invoke(final_prompt)

In [ ]:
# Echo only the `content` attribute of the model response.
response.content

In [ ]:
# Echo the full response object as the cell output.
response

## Step 5: Building a Chain

In [ ]:
from langchain_core.runnables import RunnableLambda, RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [ ]:
def join_docs(retrieved_docs):
    """Merge the documents of the first query in a result into one context string.

    Args:
        retrieved_docs: Mapping whose ``"documents"`` entry is a list of
            per-query document lists; only the first list is used.

    Returns:
        Those documents joined with a blank line between each.
    """
    first_query_docs = retrieved_docs["documents"][0]
    return "\n\n".join(first_query_docs)

In [ ]:
def query_collection(question):
    """Query the module-level ``collection`` for the chunks matching a question.

    Args:
        question: Text passed to the collection as ``query_texts``.

    Returns:
        The result of ``collection.query`` limited to 4 results.
    """
    top_k = 4
    return collection.query(n_results=top_k, query_texts=question)

In [ ]:
# Feed the incoming question to two branches and collect both results in a dict:
#   'context' - query the collection, then join the retrieved documents into one string
#   'query'   - pass the question through unchanged
# The keys match the prompt template's input variables.
parallel_chain = RunnableParallel({
    'context': RunnableLambda(query_collection) | RunnableLambda(join_docs),
    'query': RunnablePassthrough()
})

In [ ]:
# Try the retrieval branch on its own to inspect the context/query dict it produces.
parallel_chain.invoke("Who is Amy Cuddy?")

In [ ]:
# Output parser placed at the end of the chain, after the model call.
parser = StrOutputParser()

In [ ]:
# Full pipeline: build {context, query} -> fill the prompt -> call the model -> parse the output.
main_chain = parallel_chain | prompt | llm | parser

In [ ]:
# Run the whole chain end to end on a sample question.
main_chain.invoke("Can you summarize the video?")