<a href="https://colab.research.google.com/github/Akataruka/YouTube_ChatBot_RAG/blob/main/Youtube_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Installing the libraries
!pip install -q youtube-transcript-api langchain-community langchain faiss-cpu tiktoken python-dotenv sentence-transformers langchain_groq

In [12]:
# Importing the Dependencies
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [4]:
import re
def extract_youtube_video_id(url):
    """
    Extract the 11-character video id from a YouTube video link.

    Recognised forms: ``youtube.com/watch?v=<id>`` (with ``v`` anywhere in the
    query string), ``/embed/<id>``, ``/v/<id>``, ``/shorts/<id>``,
    ``/live/<id>`` (also on ``youtube-nocookie.com``) and ``youtu.be/<id>``.

    Args:
        url: The link text to search.

    Returns:
        The video id as a string, or None when ``url`` is not a string or
        contains no recognised YouTube link.
    """
    if not isinstance(url, str):
        # re.search raises TypeError on None / non-text input; report
        # "no id found" instead so callers only need one check.
        return None
    pattern = (
        r"(?:youtube(?:-nocookie)?\.com/"
        # `v=` may follow other query parameters, e.g. watch?feature=share&v=...
        r"(?:watch\?(?:[^#\s]*?&)?v=|embed/|v/|shorts/|live/)"
        r"|youtu\.be/)"
        r"([a-zA-Z0-9_-]{11})"
    )
    match = re.search(pattern, url)
    return match.group(1) if match else None

In [17]:
# Get the transcript by using the video id
def get_transcript(video_id, languages=("en",)):
  """
  Fetch a video's transcript and flatten it to one plain-text string.

  Args:
    video_id: YouTube video id to look up.
    languages: Language codes handed to the transcript API. Defaults to
      English only.

  Returns:
    The transcript snippet texts joined by single spaces, or None when the
    transcript could not be fetched (a message is printed in that case).
  """
  try:
    if hasattr(YouTubeTranscriptApi, "fetch"):
      # Newer youtube-transcript-api releases: instance-based API whose
      # result iterates over snippet objects exposing `.text`.
      fetched = YouTubeTranscriptApi().fetch(video_id, languages=list(languages))
      texts = [snippet.text for snippet in fetched]
    else:
      # Older releases: static helper returning a list of dicts.
      transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=list(languages))
      texts = [chunk["text"] for chunk in transcript_list]

    # Flatten it to plain text
    return " ".join(texts)

  except (TranscriptsDisabled, NoTranscriptFound):
    print("No transcript available for this video.")
    return None
  except Exception as e:
    # Best-effort: report any other failure (network, parsing, ...) and let
    # the caller handle the None.
    print("An error occurred:", e)
    return None

In [6]:
def get_chunks(transcript, chunk_size = 1000, overlap = 200):
  """
  Split the transcript text into documents with RecursiveCharacterTextSplitter.

  Args:
    transcript: The text to split.
    chunk_size: Passed to the splitter as ``chunk_size``.
    overlap: Passed to the splitter as ``chunk_overlap``.

  Returns:
    Whatever the splitter's ``create_documents`` returns for the transcript;
    the number of items is printed first.
  """
  documents = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=overlap,
  ).create_documents([transcript])
  print("No of chuncks : ", len(documents))
  return documents

In [7]:
def format_docs(retrieved_docs):
  """
  Merge the retrieved documents into a single context string.

  Each document's ``page_content`` is taken in order and the pieces are
  separated by one blank line.
  """
  passages = [document.page_content for document in retrieved_docs]
  return "\n\n".join(passages)

In [None]:
import os
import sys
from getpass import getpass

# Ask for the video link. A blank answer stops the cell right away: without a
# video there is no retriever, and the chat cell below would fail on it.
video_url = input("Enter the Url").strip()
if not video_url:
  print("Enter valid URL")
  sys.exit()

video_id = extract_youtube_video_id(video_url)
if video_id is None:
  print("Invalid URL")
  sys.exit()
print("Video id is : ", video_id)

transcript = get_transcript(video_id)  # Get the transcript of the video
if transcript is None:
  # get_transcript has already printed why it failed.
  sys.exit()
print("Transcript Extracted")

chunks = get_chunks(transcript)  # Split the transcript into chunks
print("Chunks Created")
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  # Create the embedding model class

# Create the FAISS vector store using the free embeddings
vector_store = FAISS.from_documents(chunks, embeddings)   # embed the chunks and index them
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3}) # retriever returning the 3 most similar chunks

# Resolve the Groq API key without hardcoding it: keep a value defined by an
# earlier cell if there is one, otherwise read the GROQ_API_KEY environment
# variable, otherwise prompt for it (hidden input).
try:
  GROQ_API_KEY
except NameError:
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or getpass("Enter your Groq API key: ")

# Initialise the llm model class
llm = ChatGroq(
    groq_api_key=GROQ_API_KEY,
    model_name="llama3-70b-8192",       # Groq's LLaMA3 model
    temperature=0.2
)

# Create a prompt template
prompt = PromptTemplate(
    template="""
      You are a helpful assistant.
      Answer ONLY from the provided transcript context.
      If the context is insufficient, just say you don't know.

      {context}
      Question: {question}
    """,
    input_variables = ['context', 'question']
)

print("You are all set now u can ask AI to assist you")


In [None]:
# Build the chain once; it does not change between questions.
# The question is sent to the retriever (whose documents are joined by
# format_docs) and is also passed through unchanged; both fill the prompt,
# which goes to the LLM, and the reply is parsed to a plain string.
parallel_chain = RunnableParallel({
  'context': retriever | RunnableLambda(format_docs),
  'question': RunnablePassthrough()
})
parser = StrOutputParser()
main_chain = parallel_chain | prompt | llm | parser

# Keep answering questions until the user enters "q".
while True:
  question = input("ASK AI : ").strip()
  if not question:
    # input() returns "" (never None) for a blank line; ask again rather
    # than sending an empty query to the model.
    continue
  if question.lower() == "q":
    break

  result = main_chain.invoke(question)
  print(result)