In [None]:
# --- Standard library ---
import os
import re
import operator
from typing import TypedDict, Annotated, Literal, Sequence

# --- LangGraph ---
from langgraph.graph import StateGraph, START, END, add_messages
from langgraph.checkpoint.memory import MemorySaver
from langgraph.prebuilt import tools_condition, ToolNode

# --- LangChain Core ---
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage, BaseMessage
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# --- LangChain Community ---
from langchain_community.document_loaders import ArxivLoader, PyMuPDFLoader, WebBaseLoader
from langchain_community.vectorstores import FAISS

# --- LangChain OpenAI ---
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# --- LangChain Tools ---
from langchain.tools.retriever import create_retriever_tool

# --- LangChain Hub ---
from langchain import hub

# --- LangChain Text Splitters ---
from langchain_text_splitters import RecursiveCharacterTextSplitter

# --- Other third-party ---
from pydantic import BaseModel, Field
from dotenv import load_dotenv
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled


In [None]:
video_id = "7ARBJQn6QkM"
PREVIEW_CHARS = 500  # number of cleaned-transcript characters echoed by the preview print

# Fetch the English transcript for `video_id` and reduce it to one cleaned string.
# On any failure the error is printed and `transcript_text` is set to "" so the
# name is always defined for later cells.
try:
    # Initialize the API
    ytt_api = YouTubeTranscriptApi()

    # Fetch the transcript
    fetched_transcript = ytt_api.fetch(video_id, languages=['en'])

    # Keep snippets that have text and do not open with a bracketed marker
    spoken_snippets = (
        snippet.text
        for snippet in fetched_transcript
        if snippet.text.strip() and not snippet.text.strip().startswith('[')
    )
    transcript_text = " ".join(spoken_snippets)

    # Collapse whitespace (incl. non-breaking spaces and newlines) so a bracketed
    # marker can never span a line break, then drop the markers. Markers are replaced
    # by a space and whitespace is collapsed once more so that removing
    # "a [x] b" yields "a b" rather than "a  b".
    transcript_text = re.sub(r'[\xa0\n]+', ' ', transcript_text)
    transcript_text = re.sub(r'\s+', ' ', transcript_text)
    transcript_text = re.sub(r'\[.*?\]', ' ', transcript_text)  # Remove non-speech markers
    transcript_text = re.sub(r'\s+', ' ', transcript_text).strip()

    # Only show an ellipsis when the preview really is truncated
    preview = transcript_text[:PREVIEW_CHARS]
    if len(transcript_text) > PREVIEW_CHARS:
        preview += "..."

    print(f"Cleaned transcript (first {PREVIEW_CHARS} chars):", preview)
    print("Transcript length (chars):", len(transcript_text))
except TranscriptsDisabled:
    print("Transcripts are disabled for this video.")
    transcript_text = ""
except Exception as e:
    # Best-effort: report the problem and continue with an empty transcript
    print(f"An error occurred: {str(e)}")
    transcript_text = ""

An error occurred: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=7ARBJQn6QkM! This is most likely caused by:

YouTube is blocking requests from your IP. This usually is due to one of the following reasons:
- You have done too many requests and your IP has been blocked by YouTube
- You are doing requests from an IP belonging to a cloud provider (like AWS, Google Cloud Platform, Azure, etc.). Unfortunately, most IPs from cloud providers are blocked by YouTube.

There are two things you can do to work around this:
1. Use proxies to hide your IP address, as explained in the "Working around IP bans" section of the README (https://github.com/jdepoix/youtube-transcript-api?tab=readme-ov-file#working-around-ip-bans-requestblocked-or-ipblocked-exception).
2. (NOT RECOMMENDED) If you authenticate your requests using cookies, you will be able to continue doing requests for a while. However, YouTube will eventually permanently ban the account that you have used to 

In [None]:
# fetched_transcript[:5]

In [None]:
# Split the cleaned transcript into overlapping chunks (size 1000, overlap 200)
# wrapped as Document objects for the vector store.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
transcript_texts = [transcript_text]
chunks = splitter.create_documents(transcript_texts)

In [None]:
# Sanity check: number of chunks produced by the splitter (0 means the transcript was empty).
len(chunks)

0

In [None]:
# chunks[0].page_content

In [None]:
# Building a FAISS index from an empty chunk list fails with an opaque
# "IndexError: list index out of range"; fail early with an actionable message instead.
if not chunks:
    raise ValueError(
        "No transcript chunks to index: the transcript is empty "
        "(the transcript fetch probably failed). Resolve that before building the vector store."
    )

# Embed every chunk and store the vectors in an in-memory FAISS index.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectore_store = FAISS.from_documents(chunks, embeddings)

IndexError: list index out of range

In [None]:
# Pick the docstore id of one indexed chunk for manual inspection.
# Position 5 is used when it exists; shorter transcripts fall back to the last
# indexed position instead of failing on a missing key.
sample_position = min(5, len(vectore_store.index_to_docstore_id) - 1)
index_id = vectore_store.index_to_docstore_id[sample_position]

In [None]:
# vectore_store.get_by_ids([index_id])

In [None]:
# Expose the vector store as a retriever: similarity search, top 4 matches per query.
retriver = vectore_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

In [None]:
retriver  # the retriever is a Runnable, so we can call .invoke() on it
# It takes the user query as input and returns a list of Documents (4 here, because k=4).

In [None]:
# output = retriver.invoke("what this podcast is about")

In [None]:
# output[2].page_content

In [None]:
from langchain_openai import ChatOpenAI

In [None]:
# Chat model used to answer questions; low temperature (0.2) is set explicitly.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.2,
)

In [None]:
# Prompt for the answering step. `context` receives the retrieved transcript text and
# `question` the user's query. The fallback instruction is spelled out as an exact
# sentence so the model has an unambiguous reply when the context lacks the answer.
prompt = PromptTemplate(
    template="""
      You are a helpful assistant.
      Answer ONLY from the provided transcript context.
      If the context is insufficient, just say: "Nothing about this is mentioned in the provided context."

      {context}
      Question: {question}
    """,
    input_variables = ['context', 'question']
)

In [None]:
# question          = "tell me about sam altman"
# retrieved_docs    = retriver.invoke(question)

In [None]:
# retrieved_docs[0].page_content

In [None]:
# context_text = " ".join(doc.page_content for doc in retrieved_docs)
# context_text

In [None]:
# final_prompt = prompt.invoke({"context": context_text, "question": question})

In [None]:
# final_prompt

In [None]:
# answer = llm.invoke(final_prompt)

In [None]:
# answer.content

# **Chains**

In [None]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [None]:
def format_docs(retrives_docs):
    """Merge retrieved documents into a single context string.

    Args:
        retrives_docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values joined by a single space, in input order
        (an empty string when there are no documents).
    """
    page_texts = [doc.page_content for doc in retrives_docs]
    return " ".join(page_texts)

In [None]:
# Fan the incoming query out into the two prompt inputs:
#   context  -> retrieve matching chunks, then flatten them into one string
#   question -> the query itself, passed through unchanged
context_branch = retriver | RunnableLambda(format_docs)
question_branch = RunnablePassthrough()

parallel_chain = RunnableParallel({
    "context": context_branch,
    "question": question_branch,
})

In [None]:
# parallel_chain.invoke("what is gpt-5")

In [None]:
# Output parser used as the last step of the chain, after the LLM.
parser = StrOutputParser()

In [None]:
# End-to-end pipeline: build {context, question} -> fill the prompt -> call the LLM -> parse the output.
main_chain = (
    parallel_chain
    | prompt
    | llm
    | parser
)

In [None]:
# query_refinement_prompt = PromptTemplate(
#     template="""
# You are helping rewrite vague questions to be clearer and more specific.

# Context: The user is asking about a YouTube video transcript.
# If possible, rewrite the question so that it's directly answerable, includes any important details or time references, and avoids ambiguity.

# User Question: {original_question}

# Refined Question:

# """,
#     input_variables=["original_question"]
# )


In [None]:
# Question sent through the chain in the final cell.
user_query = "explain this video for me"

In [None]:
# llm_ready_query = query_refinement_prompt.invoke({'original_question':user_query})

In [None]:
# llm_ready_query

In [None]:
# refined_query = llm.invoke(llm_ready_query)

In [None]:
# refined_query.content

In [None]:
# Run the full chain on the user's question; as the cell's last expression, the result is displayed.
main_chain.invoke(user_query)