### AGENTIC RAG: For Research Papers

In [1]:
!pip install -Uqq langgraph langchain-google-genai llama-index-embeddings-huggingface llama-index streamlit requests beautifulsoup4 arxiv scholarly langchain-core langchain-community chromadb llama-index-vector-stores-chroma google-ai-generativelanguage sentence-transformers free-proxy feedparser retry

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.6/55.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.9/147.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [2]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

In [15]:
!rm -f papers_cache.json # Clearing cache to avoid stale data

## Step-by_step Implementation

In [17]:
##@ All imports here:
from scholarly import scholarly
import arxiv
import time
import os
import json
import requests
from llama_index.core import VectorStoreIndex, Document, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
import chromadb
from langgraph.graph import StateGraph, START, END
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from typing import TypedDict, Sequence
from langchain_core.messages import BaseMessage, HumanMessage
from IPython.display import Image, display
from retry import retry
import logging

In [16]:
#@ Setting up logging
logging.basicConfig(filename="paper_fetcher.log", level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

#! Embedding model ...
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [18]:
##@ fetching papers from multiple sources
#! Arxiv First well for others Im going to use google scholar to fetch other pubs papers as well: like IEE and research gate
@retry(tries=1, delay=1, backoff=2)  # Single retry
def fetch_arxiv_papers(query="AI agents", max_results=10):
    try:
        start_time = time.time()
        logging.info(f"Starting arXiv fetch for query: {query}")
        client = arxiv.Client()
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )
        papers = []
        for result in client.results(search):
            papers.append({
                "title": result.title,
                "abstract": result.summary,
                "url": result.pdf_url,
                "published": result.published.isoformat(),  # Convert datetime to string
                "source": "arXiv"
            })
        logging.info(f"Fetched {len(papers)} papers from arXiv in {time.time() - start_time:.2f} seconds")
        return papers
    except Exception as e:
        logging.error(f"Error fetching arXiv papers: {e}")
        return []

@retry(tries=1, delay=1, backoff=2)  # Single retry
def fetch_ieee_papers(query="AI agents", max_results=10):
    try:
        start_time = time.time()
        logging.info(f"Starting IEEE fetch for query: {query}")
        # Use requests with proxy
        proxies = {
            'http': 'http://123.45.67.89:8080',  # Replace with a working proxy
            'https': 'https://123.45.67.89:8080'
        }
        session = requests.Session()
        session.proxies.update(proxies)
        search_query = scholarly.search_pubs(f"{query} site:*.ieee.org", timeout=10)  # 10s timeout
        papers = []
        for i, pub in enumerate(search_query):
            if i >= max_results:
                break
            time.sleep(10)  # 10s sleep
            # Ensuring the result is from IEEE
            if "ieee.org" in pub.get("pub_url", "").lower():
                papers.append({
                    "title": pub["bib"]["title"],
                    "abstract": pub.get("abstract", ""),
                    "url": pub.get("pub_url", ""),
                    "source": "IEEE"
                })
        logging.info(f"Fetched {len(papers)} papers from IEEE in {time.time() - start_time:.2f} seconds")
        return papers
    except Exception as e:
        logging.error(f"Error fetching IEEE papers: {e}")
        return []

@retry(tries=1, delay=1, backoff=2)  # Single retry
def fetch_researchgate_papers(query="AI agents", max_results=10):
    try:
        start_time = time.time()
        logging.info(f"Starting ResearchGate fetch for query: {query}")
        # Use requests with proxy
        proxies = {
            'http': 'http://123.45.67.89:8080',
            'https': 'https://123.45.67.89:8080'
        }
        session = requests.Session()
        session.proxies.update(proxies)
        search_query = scholarly.search_pubs(f"{query} site:*.researchgate.net", timeout=10)  # 10s timeout
        papers = []
        for i, pub in enumerate(search_query):
            if i >= max_results:
                break
            time.sleep(10)  # 10s sleep
            # Ensuring the result is from ResearchGate
            if "researchgate.net" in pub.get("pub_url", "").lower():
                papers.append({
                    "title": pub["bib"]["title"],
                    "abstract": pub.get("abstract", ""),
                    "url": pub.get("pub_url", ""),
                    "source": "ResearchGate"
                })
        logging.info(f"Fetched {len(papers)} papers from ResearchGate in {time.time() - start_time:.2f} seconds")
        return papers
    except Exception as e:
        logging.error(f"Error fetching ResearchGate papers: {e}")
        return []

In [19]:
##@ Combining all those to make a single fetching function
def fetch_all_papers(query="AI agents", max_results=10):
    try:
        start_time = time.time()
        logging.info(f"Starting fetch_all_papers for query: {query}")

        # Check cache
        cache_file = "papers_cache.json"
        if os.path.exists(cache_file):
            with open(cache_file, "r") as f:
                cached_papers = json.load(f)
                if cached_papers.get("query") == query and cached_papers.get("max_results") == max_results:
                    logging.info(f"Returning cached papers in {time.time() - start_time:.2f} seconds")
                    return cached_papers["papers"]

        # Fetch papers from all sources
        arxiv_papers = fetch_arxiv_papers(query, max_results)
        logging.info(f"Completed arXiv fetch in {time.time() - start_time:.2f} seconds")
        ieee_papers = fetch_ieee_papers(query, max_results)
        logging.info(f"Completed IEEE fetch in {time.time() - start_time:.2f} seconds")
        researchgate_papers = fetch_researchgate_papers(query, max_results)
        logging.info(f"Completed ResearchGate fetch in {time.time() - start_time:.2f} seconds")

        all_papers = arxiv_papers + ieee_papers + researchgate_papers

        # Deduplicating by title (incase the papers are published in multiple sources)
        seen_titles = set()
        unique_papers = []
        for paper in all_papers:
            if paper["title"].lower() not in seen_titles:
                seen_titles.add(paper["title"].lower())
                unique_papers.append(paper)

        # Cache results
        with open(cache_file, "w") as f:
            json.dump({"query": query, "max_results": max_results, "papers": unique_papers}, f)
        logging.info(f"Cached {len(unique_papers)} papers in {time.time() - start_time:.2f} seconds")

        return unique_papers
    except Exception as e:
        logging.error(f"Error combining papers: {e}")
        # Fallback to arXiv only
        arxiv_papers = fetch_arxiv_papers(query, max_results)
        all_papers = arxiv_papers
        seen_titles = set()
        unique_papers = []
        for paper in all_papers:
            if paper["title"].lower() not in seen_titles:
                seen_titles.add(paper["title"].lower())
                unique_papers.append(paper)
        with open(cache_file, "w") as f:
            json.dump({"query": query, "max_results": max_results, "papers": unique_papers}, f)
        logging.info(f"Fallback: Cached {len(unique_papers)} arXiv papers in {time.time() - start_time:.2f} seconds")
        return unique_papers

## Indexing the papers: Using LlamaIndex and backend using Chromadb

In [20]:
@retry(tries=1, delay=1, backoff=2)  # Single retry
def download_pdf(url, filename):
    try:
        start_time = time.time()
        logging.info(f"Starting PDF download: {url}")
        response = requests.get(url, stream=True, timeout=5)  # 5-second timeout
        response.raise_for_status()
        with open(filename, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
        logging.info(f"Downloaded PDF {filename} in {time.time() - start_time:.2f} seconds")
    except Exception as e:
        logging.error(f"Failed to download PDF {url}: {e}")
        raise

def index_papers(papers, collection_name="ai_papers"):
    ##! Initializing chromaindex
    try:
        start_time = time.time()
        logging.info(f"Starting indexing for collection: {collection_name}")
        chroma_client = chromadb.PersistentClient(path="./chroma_db")
        chroma_collection = chroma_client.get_or_create_collection(name=collection_name)

        ##! Initializing the chromavectorstore for llamaindex
        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

        documents = []
        os.makedirs("./papers", exist_ok=True)

        for i, paper in enumerate(papers):
            # Validate paper data
            if not paper.get("title") or not paper.get("abstract"):
                logging.warning(f"Skipping paper {i} due to missing title or abstract")
                continue

            doc = Document(
                text=paper["abstract"],
                metadata={
                    "title": paper["title"],
                    "url": paper["url"],
                    "source": paper["source"]
                }
            )
            documents.append(doc)

            ## Now optionally we index the arxiv paper pdfs as well
            ## And yea found out only arxiv provides the pdf_url :)
            if paper["source"] == "arXiv" and paper["url"]:
                filename = f"./papers/paper_{i}.pdf"
                try:
                    download_pdf(paper["url"], filename)
                    pdf_docs = SimpleDirectoryReader(input_files=[filename]).load_data()
                    for pdf_doc in pdf_docs:
                        pdf_doc.metadata.update(doc.metadata)
                        documents.append(pdf_doc)
                except Exception as e:
                    logging.error(f"Failed to process PDF {filename}: {e}")

        ## Finally: vectorstoreindex with chromadb
        from llama_index.core import StorageContext
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(
            documents=documents,
            storage_context=storage_context
        )
        logging.info(f"Indexed {len(documents)} documents in collection {collection_name} in {time.time() - start_time:.2f} seconds")
        return index, chroma_collection
    except Exception as e:
        logging.error(f"Error indexing papers: {e}")
        return None, None

In [21]:
from google.colab import userdata
gemini_key = userdata.get('GOOGLE_API_KEY')
os.environ["GOOGLE_API_KEY"] = gemini_key

In [22]:
##@ Okay now time for langgraph:
from langgraph.graph import StateGraph, START, END
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from typing import TypedDict, Sequence
from langchain_core.messages import BaseMessage, HumanMessage
from IPython.display import Image, display
# Define state
class AgentState(TypedDict):
    messages: Sequence[BaseMessage]
    documents: list

# Initialize LLM
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0)

# Retrieval tool
def retrieve(state):
    query = state["messages"][0].content
    retriever = index.as_retriever(similarity_top_k=3)
    docs = retriever.retrieve(query)
    logging.info(f"Retrieved {len(docs)} documents for query: {query}")
    return {"documents": [doc.text for doc in docs], "messages": state["messages"]}

# Grade documents
def grade_documents(state):
    docs = state["documents"]
    query = state["messages"][0].content
    prompt = PromptTemplate(
        input_variables=["query", "docs"],
        template="Are these documents relevant to the query '{query}'? Answer 'yes' or 'no'."
    )
    chain = prompt | llm | StrOutputParser()
    response = chain.invoke({"query": query, "docs": "\n".join(docs)})
    logging.info(f"Document grading result: {response}")
    return {"documents": docs if response.lower() == "yes" else [], "messages": state["messages"]}

# Generate response
def generate(state):
    docs = state["documents"]
    query = state["messages"][0].content
    prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="Answer the question based on the context:\nContext: {context}\nQuestion: {question}"
    )
    chain = prompt | llm | StrOutputParser()
    response = chain.invoke({"context": "\n".join(docs), "question": query})
    logging.info(f"Generated response for query: {query}")
    return {"messages": [HumanMessage(content=response)]}

# Web search fallback (optional)
def web_search(state):
    query = state["messages"][0].content
    logging.warning("Web search not implemented (placeholder)")
    return {"documents": [], "messages": state["messages"]}

# Conditional routing
def route(state):
    docs = state["documents"]
    return "generate" if docs else "web_search"

# Define graph
workflow = StateGraph(AgentState)
workflow.add_node("retrieve", retrieve)
workflow.add_node("grade", grade_documents)
workflow.add_node("generate", generate)
workflow.add_node("web_search", web_search)

workflow.add_edge(START, "retrieve")
workflow.add_edge("retrieve", "grade")
workflow.add_conditional_edges("grade", route, {"generate": "generate", "web_search": "web_search"})
workflow.add_edge("web_search", "generate")
workflow.add_edge("generate", END)

graph = workflow.compile()

# display(Image(graph.get_graph().draw_mermaid_png()))

In [23]:
query = "AI agents"
papers = fetch_all_papers(query, max_results=5)
for paper in papers:
    print(f"Title: {paper['title']}")
    print(f"Abstract: {paper['abstract'][:100]}...")
    print(f"URL: {paper['url']}")
    print(f"Source: {paper['source']}")
    print("---")

ERROR:root:Error fetching IEEE papers: _Scholarly.search_pubs() got an unexpected keyword argument 'timeout'
ERROR:root:Error fetching ResearchGate papers: _Scholarly.search_pubs() got an unexpected keyword argument 'timeout'


Title: Generalized Neighborhood Attention: Multi-dimensional Sparse Attention at the Speed of Light
Abstract: Many sparse attention mechanisms such as Neighborhood Attention have
typically failed to consistentl...
URL: http://arxiv.org/pdf/2504.16922v1
Source: arXiv
---
Title: OptimAI: Optimization from Natural Language Using LLM-Powered AI Agents
Abstract: Optimization plays a vital role in scientific research and practical
applications, but formulating a...
URL: http://arxiv.org/pdf/2504.16918v1
Source: arXiv
---
Title: Tracing Thought: Using Chain-of-Thought Reasoning to Identify the LLM Behind AI-Generated Text
Abstract: In recent years, the detection of AI-generated text has become a critical
area of research due to co...
URL: http://arxiv.org/pdf/2504.16913v1
Source: arXiv
---
Title: Building A Secure Agentic AI Application Leveraging A2A Protocol
Abstract: As Agentic AI systems evolve from basic workflows to complex multi agent
collaboration, robust proto...
URL: http://arxiv.or

In [24]:
## Testing the indexing with Chromadb
# Test indexing
index, chroma_collection = index_papers(papers, collection_name="ai_papers_test")
if index:
    print("Indexing complete. Collection:", chroma_collection.name)

    # Test retrieval
    retriever = index.as_retriever(similarity_top_k=3)
    docs = retriever.retrieve("What are AI agents?")
    for doc in docs:
        print(f"Text: {doc.text[:100]}...")
        print(f"Metadata: {doc.metadata}")
        print("---")
else:
    print("Indexing failed.")

Indexing complete. Collection: ai_papers_test
Text: Fig. 1. Maestro Architecture - 7 Layers
• AI-Specific Threats: Focuses on the unique threats arising...
Metadata: {'page_label': '5', 'file_name': 'paper_3.pdf', 'file_path': 'papers/paper_3.pdf', 'file_type': 'application/pdf', 'file_size': 1761759, 'creation_date': '2025-04-24', 'last_modified_date': '2025-04-24', 'title': 'Building A Secure Agentic AI Application Leveraging A2A Protocol', 'url': 'http://arxiv.org/pdf/2504.16902v1', 'source': 'arXiv'}
---
Text: of research, we propose to integrate a multi-agent architecture into our system design to
enable mor...
Metadata: {'page_label': '6', 'file_name': 'paper_1.pdf', 'file_path': 'papers/paper_1.pdf', 'file_type': 'application/pdf', 'file_size': 1098105, 'creation_date': '2025-04-24', 'last_modified_date': '2025-04-24', 'title': 'OptimAI: Optimization from Natural Language Using LLM-Powered AI Agents', 'url': 'http://arxiv.org/pdf/2504.16918v1', 'source': 'arXiv'}
---
Text: REFER