In [None]:
!pip install langgraph langchain-core langchain-community langchain_groq transformers sentence-transformers pillow easyocr beautifulsoup4 arxiv pymupdf

In [1]:
import os
from typing import TypedDict, Annotated
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage
from langchain_groq import ChatGroq
from langchain_community.document_loaders import ArxivLoader
from sentence_transformers import SentenceTransformer
import numpy as np
from PIL import Image
import easyocr

In [None]:
from google.colab import userdata
import os
GROQ_API_KEY = userdata.get('GROQ_API_KEY')
os.environ['GROQ_API_KEY'] = GROQ_API_KEY

In [3]:
llm = ChatGroq(model="llama3-70b-8192", temperature=0.1)
text_embedder = SentenceTransformer('all-MiniLM-L6-v2')
ocr_reader = easyocr.Reader(['en'])

In [4]:
arxiv_queries = [
    "attention is all you need",  # Transformer paper
    "retrieval augmented generation", # RAG papers
    "large language models"  # LLM papers
]

print("Loading ArXiv papers...")
all_arxiv_docs = []

for query in arxiv_queries:
    loader = ArxivLoader(
        query=query,
        load_max_docs=2,
        doc_content_chars_max=10000
    )
    docs = loader.load()
    all_arxiv_docs.extend(docs)
    print(f"Loaded {len(docs)} papers for query: '{query}'")

Loading ArXiv papers...
Loaded 3 papers for query: 'attention is all you need'
Loaded 3 papers for query: 'retrieval augmented generation'
Loaded 3 papers for query: 'large language models'


In [5]:
knowledge_base = {
    "text_docs": [doc.page_content[:2000] for doc in all_arxiv_docs],
    "metadata": [doc.metadata for doc in all_arxiv_docs],
    "image_descriptions": [
        "Transformer architecture diagram with attention mechanism",
        "RAG pipeline showing retrieval and generation components",
        "Large language model neural network structure"
    ]
}

text_embeddings = text_embedder.encode(knowledge_base["text_docs"])
print(f"Created embeddings for {len(all_arxiv_docs)} ArXiv papers")

Created embeddings for 9 ArXiv papers


In [6]:
class ArxivBiModalState(TypedDict):
    messages: Annotated[list[BaseMessage], add_messages]
    text_results: Annotated[list[str], lambda x, y: [y]]
    image_results: Annotated[list[str], lambda x, y: [y]]
    final_response: str
    arxiv_sources: list

In [7]:
def arxiv_text_retrieval_agent(state: ArxivBiModalState) -> ArxivBiModalState:
    """Performs semantic search on ArXiv papers"""
    query = state["messages"][-1].content

    query_embedding = text_embedder.encode([query])
    similarities = np.dot(query_embedding, text_embeddings.T)[0]
    best_match_idx = np.argmax(similarities)

    best_doc = knowledge_base["text_docs"][best_match_idx]
    metadata = knowledge_base["metadata"][best_match_idx]
    similarity_score = similarities[best_match_idx]

    title = metadata.get('Title', 'Unknown Title')
    authors = metadata.get('Authors', 'Unknown Authors')
    published = metadata.get('Published', 'Unknown Date')
    arxiv_url = metadata.get('entry_id', 'No URL')

    result = f"""ArXiv Paper Match (Score: {similarity_score:.3f})
Title: {title}
Authors: {authors}
Published: {published}
ArXiv ID: {arxiv_url}
Content Preview: {best_doc[:400]}..."""

    return {
        **state,
        "text_results": [result]  # Return as list
    }

In [8]:
def arxiv_image_analysis_agent(state: ArxivBiModalState) -> ArxivBiModalState:
    """Handles image analysis with ArXiv paper context"""
    query = state["messages"][-1].content

    ocr_text = "Mathematical equations and technical diagrams from research paper"

    query_lower = query.lower()
    matching_descriptions = [
        desc for desc in knowledge_base["image_descriptions"]
        if any(word in desc.lower() for word in query_lower.split())
    ]

    best_match = matching_descriptions[0] if matching_descriptions else knowledge_base["image_descriptions"][0]

    result = f"ArXiv Image Analysis - OCR: {ocr_text} | Academic Figure: {best_match}"

    return {
        **state,
        "image_results": [result]  # Return as list
    }

In [17]:
def arxiv_fusion_agent(state: ArxivBiModalState) -> ArxivBiModalState:
    """Combines insights from ArXiv papers and images"""

    text_info = state.get("text_results", ["No text results"])[0]
    image_info = state.get("image_results", ["No image results"])[0]

    arxiv_sources = [meta.get('entry_id', '') for meta in knowledge_base["metadata"]]

    context = f"""
    Query: {state["messages"][-1].content}

    ArXiv Paper Analysis: {text_info}
    Academic Image Analysis: {image_info}

    ArXiv Sources: {arxiv_sources[:3]}
    """

    fusion_prompt = f"""
    Based on the analysis of ArXiv research papers and academic images below, provide a comprehensive response:

    {context}

    Provide a scholarly answer that leverages information from peer-reviewed research papers.
    Include proper academic citations where relevant.
    """

    response = llm.invoke([HumanMessage(content=fusion_prompt)])

    return {
        **state,
        "final_response": response.content,
        "arxiv_sources": arxiv_sources[:3],
        "messages": [response]
    }

In [10]:
def build_arxiv_bimodal_graph_sequential():
    """Build ArXiv-focused bi-modal RAG graph with sequential execution"""

    graph = StateGraph(ArxivBiModalState)

    graph.add_node("arxiv_text_agent", arxiv_text_retrieval_agent)
    graph.add_node("arxiv_image_agent", arxiv_image_analysis_agent)
    graph.add_node("arxiv_fusion_agent", arxiv_fusion_agent)

    graph.add_edge(START, "arxiv_text_agent")
    graph.add_edge("arxiv_text_agent", "arxiv_image_agent")
    graph.add_edge("arxiv_image_agent", "arxiv_fusion_agent")
    graph.add_edge("arxiv_fusion_agent", END)

    return graph.compile()

arxiv_bimodal_agent = build_arxiv_bimodal_graph_sequential()

In [11]:
def test_arxiv_agent(query: str):
    """Test the ArXiv bi-modal RAG agent"""

    initial_state = {
        "messages": [HumanMessage(content=query)],
        "text_results": [],
        "image_results": [],
        "final_response": "",
        "arxiv_sources": []
    }

    result = arxiv_bimodal_agent.invoke(initial_state)

    print("=" * 70)
    print(f"Query: {query}")
    print("=" * 70)
    print(f"ArXiv Text Results: {result['text_results'][0][:300]}...")
    print("-" * 50)
    print(f"Academic Image Results: {result['image_results'][0]}")
    print("-" * 50)
    print(f"ArXiv Sources: {result['arxiv_sources']}")
    print("-" * 50)
    print(f"Scholarly Response: {result['final_response']}")
    print("=" * 70)

In [22]:
print("Testing ArXiv-Based Bi-Modal RAG Agent")
print("="*70)

test_arxiv_agent("Explain the transformer architecture and attention mechanism")
print("\n" + "="*80 + "\n")

test_arxiv_agent("How does retrieval-augmented generation work?")
print("\n" + "="*80 + "\n")

Testing ArXiv-Based Bi-Modal RAG Agent
Query: Explain the transformer architecture and attention mechanism
ArXiv Text Results: [[['ArXiv Paper Match (Score: 0.417)\nTitle: RITA: Group Attention is All You Need for Timeseries Analytics\nAuthors: Jiaming Liang, Lei Cao, Samuel Madden, Zachary Ives, Guoliang Li\nPublished: 2023-06-02\nArXiv ID: No URL\nContent Preview: RITA: Group Attention is All You Need for Timeseries Analytics\nJiaming Liang\nUniversity of Pennsylvania\nPhiladelphia, PA, USA\nliangjm@seas.upenn.edu\nLei Cao∗\nMassachusetts Institute of Technology\nCambridge, MA, USA\nlcao@csail.mit.edu\nSamuel Madden\nMassachusetts Institute of Technology\nCambridge, MA, USA\nmadden@csail.mit.edu\nZachary Ives\nUniversity of Pennsylvania\nPhiladelphia, PA, USA\nzives@cis.up...']]]...
--------------------------------------------------
Academic Image Results: [['ArXiv Image Analysis - OCR: Mathematical equations and technical diagrams from research paper | Academic Figure: Transformer a