In [None]:
from dotenv import load_dotenv
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings,ChatOpenAI
from langchain_groq import ChatGroq
from langchain_core.documents import Document
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from typing_extensions import TypedDict
from typing import TypedDict, Dict
import os
import streamlit as st

In [2]:
# Load key/value pairs from a local .env file into the process environment so
# the API keys read in the following cells are available.
load_dotenv()

True

In [3]:
# Fail fast, with a readable message, when a required API key is absent.
# Re-assigning os.environ[k] = os.getenv(k) was a no-op when the key existed and
# raised an opaque "str expected, not NoneType" TypeError when it did not.
_missing_env_keys = [
    key for key in ("OPENAI_API_KEY", "YOUTUBE_API_KEY") if os.getenv(key) is None
]
if _missing_env_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_keys)
        + ". Add them to the .env file or export them before running this notebook."
    )


In [4]:
# YouTube Data API v3 client; the developer key is read from the environment.
youtube = build(
    'youtube',
    'v3',
    developerKey=os.environ.get("YOUTUBE_API_KEY"),
)

# Embedding model handed to FAISS when the transcript index is built.
embedder = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

In [16]:
# Disabled alternative: a Groq-hosted Mixtral chat model with the same temperature.
# llm = ChatGroq(
#     model_name="mixtral-8x7b-32768",
#     temperature=0.7,
#     groq_api_key=os.getenv("GROQ_API_KEY")
# )
# Chat model shared by the RAG chain and by both validation prompts below.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.7)

In [6]:
# class Search_Result:
#     def __init__(self, search_result) -> None:
#         self.video_id = search_result['id']['videoId']
#         self.title = search_result['snippet']['title']
#         self.description = search_result['snippet']['description']
#         self.thumbnails = search_result['snippet']['thumbnails']['default']['url']
#         self.transcript = self._get_transcript()  # New property
#         print(self.video_id)
        
#     def _get_transcript(self):
#         """Retrieve YouTube transcript using youtube-transcript-api"""
#         try:
#             transcript_list = YouTubeTranscriptApi.get_transcript(self.video_id)
#             return " ".join([item['text'] for item in transcript_list])
#         except Exception as e:
#             print(f"Error getting transcript for {self.video_id}: {str(e)}")
#             return ""
# class Search_Response:
#     def __init__(self, search_response) -> None:
#         self.prev_page_token = search_response.get('prevPageToken')
#         self.next_page_token = search_response.get('nextPageToken')
#         self.search_results = [Search_Result(item) for item in search_response.get('items', [])]

# def search_yt(query, max_results=5, page_token=None):
#     request = youtube.search().list(
#         part="snippet",
#         maxResults=max_results,
#         pageToken=page_token,
#         q=query,
#         videoCaption='closedCaption',
#         type='video',
#     )
#     return Search_Response(request.execute())      
        

In [17]:

class SearchResult:
    """One YouTube search hit together with its transcript text.

    Built from a single entry of the `items` list of a search response.

    Attributes:
        video_id: Value of `search_result['id']['videoId']`.
        title: Value of `search_result['snippet']['title']`.
        transcript: Transcript segments joined with single spaces, or "" when
            the transcript could not be retrieved.
    """

    def __init__(self, search_result):
        self.video_id = search_result['id']['videoId']
        self.title = search_result['snippet']['title']
        # Fetched eagerly: constructing a SearchResult performs the transcript lookup.
        self.transcript = self._get_transcript()

    def _get_transcript(self):
        """Return the video's transcript as one string, or "" on failure."""
        try:
            transcript_list = YouTubeTranscriptApi.get_transcript(self.video_id)
            return " ".join(item['text'] for item in transcript_list)
        except Exception:
            # Best-effort: a video without a retrievable transcript is simply
            # skipped by the caller. Catching Exception (not a bare `except:`)
            # lets KeyboardInterrupt / SystemExit still stop the notebook.
            return ""

def search_yt(query, max_results=3):
    """Search YouTube for closed-captioned videos matching `query`.

    Args:
        query: Free-text search string.
        max_results: Value sent as `maxResults` in the search request.

    Returns:
        The `items` list of the search response, or an empty list when the
        response carries no `items` key.
    """
    search_params = {
        "part": "snippet",
        "maxResults": max_results,
        "q": query,
        "videoCaption": 'closedCaption',
        "type": 'video',
    }
    response = youtube.search().list(**search_params).execute()
    return response.get('items', [])

# ---- FAISS Indexing ----
# Transcripts are cut into overlapping chunks before being embedded; the two
# knobs are named so they can be tuned in one place.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)

def create_faiss_index(items):
    """Build a FAISS index over the transcripts of the given search hits.

    Each item is wrapped in a `SearchResult` (which fetches its transcript);
    hits with an empty transcript are skipped. The title and transcript are
    combined into one text, split with `text_splitter`, and every chunk becomes
    a `Document` tagged with the video's id and title.

    Args:
        items: Iterable of raw search-result items.

    Returns:
        A FAISS store built with `embedder`, or None when no chunk was produced.
    """
    docs = []
    for raw_item in items:
        video = SearchResult(raw_item)
        if video.transcript:
            combined = f"Title: {video.title}\nTranscript: {video.transcript}"
            docs.extend(
                Document(
                    page_content=piece,
                    metadata={"video_id": video.video_id, "title": video.title},
                )
                for piece in text_splitter.split_text(combined)
            )

    if not docs:
        return None
    return FAISS.from_documents(docs, embedder)


In [18]:
# def create_faiss_index(items):
#     documents = []
#     for item in items:
#         result = SearchResult(item)
#         if not result.transcript:
#             continue
            
#         full_text = f"Title: {result.title}\nTranscript: {result.transcript}"
#         chunks = text_splitter.split_text(full_text)
        
#         for chunk in chunks:
#             documents.append(Document(
#                 page_content=chunk,
#                 metadata={
#                     "video_id": result.video_id,
#                     "title": result.title
#                 }
#             ))
    
#     return FAISS.from_documents(documents, embedder) if documents else None

In [19]:
# def search_yt(query, max_results=5, page_token=None):
#     request = youtube.search().list(
#         part="snippet",
#         maxResults=max_results,
#         pageToken=page_token,
#         q=query,
#         videoCaption='closedCaption',
#         type='video',
#     )
#     return Search_Response(request.execute())

In [20]:
class State(TypedDict, total=False):
    """Dictionary shape shared by the pipeline steps in this notebook.

    `total=False` makes every key optional, so the dict can start empty and be
    filled in step by step.
    """
    # Inputs returned by get_user_input.
    topic: str
    author: str
    question: str
    # Answer text stored by YouTube_search.
    youtube_summary: str
    # NOTE(review): validate_user_input stores a bool here, which disagrees
    # with the `str` annotation.
    objective_check: str
    # The remaining keys are presumably reserved for later pipeline steps —
    # TODO confirm against the code that is meant to fill them.
    youtube_check: str
    recent_fact_checks: str
    generate_summary: str
    check_author_style: str
    final_summary: str

In [27]:
def get_author_style(author: str) -> str:
    """Describe the writing style to imitate for `author`.

    A couple of well-known authors have hand-written style notes; every other
    name gets a generic description that mentions the author.
    """
    # Illustrative entries only; authors are not required to appear here.
    known_styles = {
        "Robert Kiyosaki": "Focus on cashflow, assets vs liabilities, financial education principles",
        "Yuval Noah Harari": "Historical perspective, societal impact analysis, future speculation",
    }
    if author in known_styles:
        return known_styles[author]
    return f"Analytical style with {author}'s known communication patterns"

def get_style_keywords(author: str) -> str:
    """Give short style keywords for `author`, or a generic tone description."""
    fallback = "authoritative yet engaging tone matching the author's typical style"
    keywords_by_author = {
        "Robert Kiyosaki": "provocative financial advice, personal anecdotes",
        "Yuval Noah Harari": "long-term historical context, interdisciplinary analysis",
    }
    try:
        return keywords_by_author[author]
    except KeyError:
        return fallback

In [None]:
def YouTube_search(state: State):
    """Answer `state["question"]` from YouTube transcripts and store the answer.

    Searches YouTube for "<topic> <author>", indexes the transcripts of the
    hits with FAISS (saved locally under "youtube_index"), then runs a
    retrieval-augmented prompt that answers the question in the author's voice.

    Args:
        state: Must contain "topic", "author" and "question".

    Returns:
        The same `state` object with "youtube_summary" set. When none of the
        hits has a transcript, the summary is the prompt's own fallback
        sentence, "I need more information.", and no model call is made.
    """

    def _format_docs(docs):
        # Feed the prompt plain transcript text rather than the repr of
        # Document objects (which would also drag their metadata along).
        return "\n\n".join(doc.page_content for doc in docs)

    # Single quotes inside the f-string keep this parseable before Python 3.12.
    search_response = search_yt(f"{state['topic']} {state['author']}", max_results=3)

    # Create and save FAISS index
    faiss_index = create_faiss_index(search_response)
    if faiss_index is None:
        # create_faiss_index returns None when no transcript was available;
        # calling save_local on it would raise AttributeError.
        state["youtube_summary"] = "I need more information."
        print(state["youtube_summary"])
        return state
    faiss_index.save_local("youtube_index")

    template = """
    Analyze and respond as {author} would. Rules:
    1. Use ONLY provided context
    2. Maintain {author}'s philosophical/historical style
    3. If context is insufficient, state "I need more information"
    
    Context: {context}
    
    Question: {question}
    
    {author}'s analysis:"""

    prompt = ChatPromptTemplate.from_template(template)

    # Create RAG chain: the invoked question goes to the retriever (for
    # context) and is passed through unchanged as {question}.
    rag_chain = (
        {
            "context": faiss_index.as_retriever() | _format_docs,
            "question": RunnablePassthrough(),
            "author": lambda _: state["author"],
        }
        | prompt
        | llm
        | StrOutputParser()
    )

    state["youtube_summary"] = rag_chain.invoke(state["question"])
    print(state["youtube_summary"])
    return state

In [22]:
## get user input
def get_user_input(state: State):
    """Return the hard-coded demo inputs as a fresh dict.

    `state` is accepted for signature parity with the other steps but is not
    read or modified.
    """
    # Alternative off-topic probe question: 'Do Yuval Noah Harari like dog'
    return dict(
        topic='Artificial Intelligence revolution & Humans',
        author='Yuval Noah Harari',
        question='Will humans be replaced by AI in the future?',
    )

In [23]:
def validate_user_input(state: State):
    """Ask the model whether the question is relevant to the topic.

    Args:
        state: Must contain "author", "question", "topic" and
            "youtube_summary" (so the YouTube step has to run first).

    Returns:
        The same `state` object with "objective_check" set to True when the
        model's reply is "true" and False for any other reply.
    """
    reply = llm.invoke(f"""
                                 
        you are the expert analyst of {state['author']}'s work. 
        before even we answer the user question, we need to check if the user question is stated in {state['question']} is relevent to the 
         {state['topic']} to double confimr you can also use autor's work {state['youtube_summary']} as context.
       if the question is relevent to the topic, return "True" else return "False"., 
        do not add any other information or explanation.
       

    """).content

    # The prompt itself shows the answer wrapped in quotes and followed by a
    # period, so models often echo `"True".`; drop that decoration before
    # comparing instead of requiring the bare word.
    verdict = reply.strip().strip("\"'.").strip().lower()

    # NOTE(review): this stores a bool although State annotates the key as str.
    state["objective_check"] = verdict == "true"
    print(state["objective_check"])
    return state

In [24]:
def validate_yt_summary(state: State):
    """Ask the model whether the YouTube summary answers the question.

    Args:
        state: Must contain "author", "question" and "youtube_summary".

    Returns:
        The same `state` object with "youtube_check" set to the model's raw
        reply: "yes" when the question was answered, otherwise the list of
        keywords the prompt asks for.
    """
    reply = llm.invoke(f"""
                                 
        you are the expert analyst of {state['author']}'s work. based on the {state['youtube_summary']} of the youtube video,
        you need to make sure is user question stated in  {state['question']} was answered in {state['youtube_summary']} accuratedly and completely.

       if the question was answered, return "yes", 
       if not return give me a list of key words on what more information you need be so that you we can use the keywords to search for more information and then answer the user question.
       be specific and accurated in answering the user question.

    """).content
    print(reply)

    # Stored under "youtube_check", the State key declared for this step.
    # Writing it to "objective_check" overwrote the relevance verdict that
    # validate_user_input had just stored there.
    state["youtube_check"] = reply

    return state

In [25]:
if __name__ == "__main__":
    # Seed the shared state with the demo inputs, then run each step in order.
    # Every step mutates `state` in place, so the return values are not needed.
    state = get_user_input({})
    for step in (YouTube_search, validate_user_input, validate_yt_summary):
        step(state)

APIStatusError: Request Header Fields Too Large

In [86]:
# Show the answer that YouTube_search stored in the shared state.
print(state["youtube_summary"])

I need more information.


In [64]:
# NOTE(review): this re-declares State with the same fields as the earlier
# definition in this notebook; running the cell only rebinds the name.
class State(TypedDict, total=False):
    topic: str
    author: str
    question: str
    youtube_summary: str
    objective_check: str
    youtube_check: str
    recent_fact_checks: str
    generate_summary: str
    check_author_style: str
    final_summary: str

In [None]:
from langchain_community.tools import DuckDuckGoSearchRun

In [None]:
# Create the DuckDuckGo search tool and try it out with a sample query.
search = DuckDuckGoSearchRun()
search.invoke("Kamal hassan about God")

In [None]:
# Register the function itself. The previous form called it immediately with
# three positional names (topic, author, question) that are not defined at
# notebook level, against a function that takes a single `state` argument.
tools = [YouTube_search]