# YouTube Video Summarizer

Uses the following tools:
- LangChain
- OpenAI
- LLMs (large language models)
- RAG (vector DB)

In [None]:
!pip3 install --quiet langchain langchain-community langchain-openai chromadb 
!pip3 install --quiet python-dotenv
!pip3 install --quiet openai 
!pip3 install youtube-transcript-api
!pip3 install -U langchain-chroma

In [23]:
#Imports
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain.evaluation import load_evaluator
from langchain_chroma import Chroma 
from langchain_core.runnables import RunnablePassthrough 
from langchain_core.prompts import ChatPromptTemplate 
from langchain_core.pydantic_v1 import BaseModel, Field

from dotenv import load_dotenv

# Other modules and packages
import os
import hashlib

In [2]:
# Load variables from a local .env file into the process environment.
# The value shown as the cell output is load_dotenv()'s return value.
load_dotenv()

True

In [3]:
# Look up the OpenAI key in the environment (None when the variable is unset).
OPEN_AI_API_KEY = os.getenv("OPENAI_API_KEY")

# Ask the user which video to load and what they want to know about it.
url = input("What is the YT video Url?")
question = input("Prompt/question about the video")

## Define LLM

In [4]:
# Chat model shared by the rest of the notebook.
llm = ChatOpenAI(model = "gpt-4o-mini") # no key is passed here; the API key will be fetched from env variables
# Quick smoke test (disabled):
# print(llm.invoke("Tell me a joke"))

In [None]:
# #prompt template
# EXAMPLE_PROMPT_TEMPLATE = """
# You are an assistsnt. If you dont know something say that you don't know. DON'T MAKE UP ANYTHING.

# Answer the Question: {question}
# """

# example_prompt_template = ChatPromptTemplate.from_template(EXAMPLE_PROMPT_TEMPLATE)

## Generate prompt and get responses EXAMPLES

Ways to format a prompt and pass it to the LLM

In [None]:
# prompt = example_prompt_template.format(question = "Generate an ai project topic for a beginner. Dont go in detail. Only give 1 title and one line description.")

# print(prompt)
# llm.invoke(prompt)

In [49]:
# chain = (
#     {"question": RunnablePassthrough()} | example_prompt_template | llm
# )

# chain.invoke("Generate an ai project topic for a beginner. Dont go in detail. Only give 1 title and one line description.")

AIMessage(content='**Title:** "Simple Chatbot using Python"  \n**Description:** Create a basic text-based chatbot that can answer simple questions and engage in conversation with users.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 33, 'prompt_tokens': 59, 'total_tokens': 92, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_709714d124', 'finish_reason': 'stop', 'logprobs': None}, id='run-99b2aaed-9265-43c7-85c8-c5c553812149-0', usage_metadata={'input_tokens': 59, 'output_tokens': 33, 'total_tokens': 92, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

## Get transcript from YouTube

In [5]:
def get_transcript(url: str):
    """Load the transcript of the YouTube video at ``url``.

    Builds a ``YoutubeLoader`` from the URL and returns whatever its
    ``load()`` call produces.
    """
    return YoutubeLoader.from_youtube_url(url).load()

# print(get_transcript(url))

## Split Transcript and return the list of document chunks

In [6]:
def split_documents(docList, chunk_size : int = 1000, chunk_overlap: int = 100):
    """Split the given documents into chunks.

    Args:
        docList: documents to split; handed unchanged to the splitter.
        chunk_size: maximum chunk length, measured with ``len``.
        chunk_overlap: overlap between consecutive chunks, measured with ``len``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter_options = dict(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Separators are tried in this order: paragraph break, line break, space.
        separators=["\n\n", "\n", " "],
    )
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    return chunker.split_documents(docList)

# Load the transcript for the requested video and split it into chunks
# using the default chunk size and overlap.
yt_Transcript = split_documents(get_transcript(url))

## Embedding Documents and queries
We embed both the document chunks and the queries; the chunk embeddings are stored in the vector DB.

A query is matched to documents based on how close their embeddings are to the query's embedding in the vector DB (using the Euclidean distance). The smaller the distance, the higher the relevance.

In [7]:
def get_embedding_function(model: str = "text-embedding-ada-002"):
    """Create the embedding object used for both documents and queries.

    Args:
        model: name of the OpenAI embedding model. Defaults to
            ``"text-embedding-ada-002"``, the value that was previously
            hard-coded, so existing calls behave exactly as before.
            NOTE(review): a vector store built with one model should be
            queried with the same model - confirm before changing this for
            an existing database.

    Returns:
        An ``OpenAIEmbeddings`` instance configured with ``model``.
    """
    return OpenAIEmbeddings(model=model)

# Single embedding object reused for building and querying the vector store.
embedding_function = get_embedding_function()
# Optional sanity check of the embedding vector length (disabled):
# test_vector = embedding_function.embed_query("Test string")
# len(test_vector)

1536

Check embedding distance between two values

In [31]:
# evalauator = load_evaluator(evaluator="embedding_distance", embedding = embedding_function)
# evalauator.evaluate_strings(prediction="immigration", reference="Canada")

{'score': 0.22374569889973261}

## Create a vector db

In [16]:
def create_db(yt_Transcript, embedding_function, directoryName):
    """Open (or create) a Chroma store and add the transcript chunks not yet in it.

    Every chunk is keyed by the MD5 hex digest of its text, so running this
    again with the same transcript does not insert the same text twice.

    Args:
        yt_Transcript: iterable of document chunks exposing ``page_content``.
        embedding_function: embedding object handed to ``Chroma``.
        directoryName: directory passed to ``Chroma`` as ``persist_directory``.

    Returns:
        The ``Chroma`` vector store instance.
    """
    vectorStore = Chroma(persist_directory=directoryName, embedding_function=embedding_function)

    def hash_text(text):
        # Content hash used as a stable document ID (not for security).
        return hashlib.md5(text.encode()).hexdigest()

    # Read the stored IDs once and keep them in a set; querying the store for
    # every chunk re-fetched the whole collection on each loop iteration.
    known_ids = set(vectorStore.get()['ids'])

    new_texts = []
    new_ids = []

    for doc in yt_Transcript:
        doc_id = hash_text(doc.page_content)
        if doc_id in known_ids:
            continue
        # Also remember IDs queued in this run: two identical chunks would
        # otherwise put the same ID twice into a single add_texts call.
        known_ids.add(doc_id)
        new_texts.append(doc.page_content)
        new_ids.append(doc_id)

    if new_texts:
        vectorStore.add_texts(new_texts, ids=new_ids)
        # The Chroma class from langchain_chroma has no persist() method, so
        # an unconditional call raises AttributeError. Call it only when the
        # store actually provides one.
        persist = getattr(vectorStore, "persist", None)
        if callable(persist):
            persist()

    return vectorStore

# Build (or update) the store kept in the local "vectorDB" directory.
vectorStore = create_db(yt_Transcript, embedding_function, "vectorDB")

## Query Vector store

In [17]:
# Load the vector DB: open the store from the "vectorDB" directory with the
# same embedding function that is used everywhere else in this notebook.
vectorStore = Chroma(persist_directory="vectorDB", embedding_function=embedding_function)

In [18]:
# For Q&A, the default search type is fine.
# To change how many chunks are returned, pass e.g. search_kwargs={'k': 6}
# to as_retriever().
retriever = vectorStore.as_retriever()
# Fetch the chunks the retriever considers relevant to the user's question.
relavantChunks = retriever.invoke(question)

### Use relevant chunks as context

In [11]:
# Prompt text with two placeholders: {context} receives the retrieved
# transcript text and {question} receives the user's question.
PROMPT_TEMPLATE = """
You are an assistsnt. If you dont know something say that you don't know. DON'T MAKE UP ANYTHING.
Use the following context.
{context}

Answer the Question using the above context: {question}
"""

# Wrap the raw text in a chat prompt template so it can be used in a chain.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

In [25]:
# Define the answer structure that is passed to llm.with_structured_output().
class Response(BaseModel):
    # Single text field carrying the answer.
    Answer : str = Field(description="The response from llm")

In [26]:
# Chain and use the template with context
def format_doc_chunks(chunks):
    """Join the ``page_content`` of every chunk, separated by a blank line."""
    texts = [chunk.page_content for chunk in chunks]
    return "\n\n".join(texts)


'''
retriever returns the chunks to format_doc_chunks which is then passed to context
Runnable passthrough returns the question provided when invoking the rag_chain
the dictionary is passed to the prompt_template
the prompt is then passed to llm
'''
# The last step asks the model for output matching the Response schema
# instead of a free-form message.
rag_chain = (
    {"context": retriever | format_doc_chunks, "question": RunnablePassthrough()}
    | prompt_template
    | llm.with_structured_output(Response, strict=True)
)


# Run the chain on the user's question; the cell displays the resulting Response.
rag_chain.invoke(question)



Response(Answer='The video discusses a project focused on information retrieval and AI, specifically highlighting the process of extracting structured information from unstructured data using tools like Docker and the Lang chain library. \n\nKey learnings include:  \n1. **Information Retrieval Challenges:** The video addresses the tedious and time-consuming nature of extracting insights from reports and documents, which is commonplace in many office jobs.  \n2. **Role of AI:** As AI evolves, its potential in automating information retrieval and organizing data into structured formats (like Excel) is emphasized.  \n3. **Lang Chain Framework:** Understanding the Lang chain framework and its core components can open up numerous opportunities in creating intelligent applications.  \n4. **Practical Guidance:** The presenter plans to guide viewers through various modules and provide coding examples within VS Code, making it accessible even for those unfamiliar with Python.  \n5. **Resources 