# Overview

add overview

# Motivation

add motivation


# Key components

add key components

# Method details

add method details

# Conclusion

add conclusion


# Import necessary libraries

In [1]:
%pip install --upgrade --quiet arxiv google-search-results>=2.4.2

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import json
from langchain import hub
from langchain.agents import AgentExecutor, create_react_agent
from langchain_openai import ChatOpenAI

from dotenv import load_dotenv
load_dotenv()

True

# Build agent tools

In [3]:
from langchain_core.tools import tool
from langchain_community.utilities.arxiv import ArxivAPIWrapper
from langchain.pydantic_v1 import BaseModel, Field, root_validator 
from serpapi import GoogleScholarSearch
from typing import Any

# Write Google Scholar API Wrapper
# NOTE: LangChain ships a built-in Google Scholar tool, but it doesn't work
class GoogleScholarAPIWrapper(BaseModel):
    """Thin wrapper that runs Google Scholar queries through SerpApi.

    The wrapper pages through the results until ``top_k_results`` entries
    were collected (or the engine stops returning results) and renders them
    as a plain-text digest.
    """

    # Number of results to collect for a query.
    top_k_results: int = Field(description = "top k results obtained by running a query on GoogleScholarSearch")
    # SerpApi key; when omitted, the SERP_API_KEY environment variable is used.
    serp_api_key: str | None = None
    # Search class used by `run`; filled in by `validate_env`.
    search_engine: Any | None = None

    @root_validator(pre = True)
    def validate_env(cls, values: dict) -> dict:
        """Resolve the SerpApi key and attach the search engine class.

        Raises:
            KeyError: if no key was passed and SERP_API_KEY is not set.
        """
        serp_api_key = values.get('serp_api_key')
        if serp_api_key is None:
            serp_api_key = os.environ["SERP_API_KEY"]
        # The key is stored on the serpapi class itself, i.e. it is shared by
        # every wrapper instance in the process.
        GoogleScholarSearch.SERP_API_KEY = serp_api_key
        values['search_engine'] = GoogleScholarSearch
        return values

    def run(self, query: str) -> str:
        """Search Google Scholar and return a text digest of the hits.

        Args:
            query: Free-text search query.

        Returns:
            One paragraph per result (title, authors, summary, citation count
            and link), separated by blank lines, or a fixed message when
            nothing was found.
        """
        # The original code capped `num` at 20 per request, so larger requests
        # have to be assembled from several pages.
        page_size = 20
        all_results = []
        offset = 0
        while offset < self.top_k_results:
            # The last page only needs whatever is still missing.
            wanted = min(page_size, self.top_k_results - offset)
            results = (
                self.search_engine(
                    {"q": query, "start": offset, "hl": "en", "num": wanted, "lr": "lang_en"}
                )
                .get_dict()
                .get("organic_results", [])
            )
            if not results:
                break
            all_results.extend(results)
            offset += page_size

        if not all_results:
            return "No good Google Scholar Result was found"

        docs = []
        for result in all_results[: self.top_k_results]:
            publication_info = result.get("publication_info", {})
            # Skip author entries without a name instead of failing the join.
            author_names = [
                author["name"]
                for author in publication_info.get("authors", [])
                if author.get("name")
            ]
            # `resources` may be missing or an empty list.
            resources = result.get("resources") or [{}]
            total_citations = result.get("inline_links", {}).get("cited_by", {}).get("total", "")
            docs.append(
                f"Title: {result.get('title', '')}\n"
                f"Authors: {','.join(author_names)}\n"
                f"Summary: {publication_info.get('summary', '')}\n"
                f"Total-Citations: {total_citations}\n"
                f"Link: {resources[0].get('link', '')}"
            )
        return "\n\n".join(docs)

# Argument schema of the `google-scholar-search` tool. The descriptions are
# what the agent reads when deciding how to call the tool, so they must
# describe Google Scholar rather than arxiv.
class GoogleScholarSearchInput(BaseModel):
    query: str = Field(
        description="The query to search for on Google Scholar, e.g. a paper title, author names or a research topic."
    )
    max_papers: int = Field(
        description="The maximum number of papers to return. It defaults to 1, but you can increase it up to 10 in case you need to perform a more comprehensive search.",
        default=1,
        ge=1,
        le=10,
    )


# Argument schema of the `arxiv-search` tool: a single arxiv identifier.
class ArxivSearchInput(BaseModel):
    id: str = Field(
        description="The arxiv id of the paper to search for.",
    )


In [5]:
@tool("google-scholar-search", args_schema=GoogleScholarSearchInput)
def google_scholar_search(input: str) -> str:
    """Academic search on google scholar.

    Example:
    {"query": "Attention is all you need", "max_papers": 1}

    Returns:
        A list of the relevant papers found with the corresponding summaries.
    """
    # NOTE: the docstring above doubles as the tool description given to the
    # agent, so maintainer notes live in comments instead.
    #
    # `input` is the raw action-input text produced by the agent. A JSON
    # object is the expected form; a bare or JSON-quoted string is accepted as
    # the query itself so a slightly off-format action input still searches.
    # Every failure is reported back as text so the agent can react to it.
    try:
        try:
            payload = json.loads(input)
        except json.JSONDecodeError:
            payload = input.strip()
        if isinstance(payload, str):
            payload = {"query": payload}
        elif not isinstance(payload, dict):
            # e.g. a number or a list: fall back to the text as it was written.
            payload = {"query": input.strip()}
        input_obj = GoogleScholarSearchInput(**payload)
        return GoogleScholarAPIWrapper(top_k_results=input_obj.max_papers).run(input_obj.query)
    except Exception as e:
        return f"Error performing google scholar search: {e}"

@tool("arxiv-search", args_schema=ArxivSearchInput)
def arxiv_search(input: str) -> str:
    """Search for one specific scientific paper on arxiv.

    Example:
    {"id": "2411.05749"}

    Returns:
        A list of the summaries of the papers.
    """
    # NOTE: the docstring above doubles as the tool description given to the
    # agent, so maintainer notes live in comments instead.
    #
    # `input` is the raw action-input text produced by the agent. A JSON
    # object is the expected form; anything else is treated as the id itself.
    # Every failure is reported back as text so the agent can react to it.
    try:
        try:
            payload = json.loads(input)
        except json.JSONDecodeError:
            payload = None
        if not isinstance(payload, dict):
            # Use the original text, not the decoded value: a bare id such as
            # 2411.05750 decodes as a JSON number and would lose its trailing zero.
            payload = {"id": input.strip().strip("'\"")}
        input_obj = ArxivSearchInput(**payload)
        arxiv_api_wrapper = ArxivAPIWrapper(
            top_k_results=1,
            load_max_docs=100,
            ARXIV_MAX_QUERY_LENGTH=1000,  # NOTE: can be chosen better based on the model. we can also summarize it in the tool to make it more scalable
            doc_content_chars_max=10000,  # NOTE: can be chosen better based on the model. if the overall content length is too long, we should handle it somehow  # ? summarization? truncate?
            load_all_available_meta=True,
        )
        if not arxiv_api_wrapper.is_arxiv_identifier(input_obj.id):
            raise ValueError("The arxiv id is not valid")

        return arxiv_api_wrapper.run(input_obj.id)
    except Exception as e:
        return f"Error performing arxiv search: {e}"


In [8]:
# TODO: Write agent prompt
# TODO: Move to langgraph

# Tools the agent is allowed to call.
tools = [google_scholar_search, arxiv_search]

# Chat model driving the agent, sampled at temperature 0.0.
llm = ChatOpenAI(temperature=0.0, model="gpt-4o-mini")

# ReAct prompt pulled from the LangChain hub.
prompt = hub.pull("hwchase17/react")

agent = create_react_agent(llm, tools, prompt)

# The executor runs the agent loop, bounded both by step count and wall time.
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    max_iterations=20,
    max_execution_time=180,
    handle_parsing_errors=True,
    verbose=False,
)



In [11]:
# Smoke-test prompts: one arxiv lookup by id, one author lookup, one
# multi-paper search.
test_inputs = [
    "what is the arxiv paper with id 1409.2329 about? provide a detailed summary of the content",
    "list all the authors of the paper 'Attention is all you need'?",
    "can you find me 8 papers on quantum machine learning?",
]

# Run every prompt through the agent and print the question/answer pairs.
for test_input in test_inputs:
    out = agent_executor.invoke({"input": test_input})
    print(
        f"**Input:** {test_input}",
        f"**Output:** {out['output']}",
        "--------------------------------------------",
        sep="\n",
    )


**Input:** what is the arxiv paper with id 1409.2329 about? provide a detailed summary of the content
**Output:** The arXiv paper with ID 1409.2329 is titled "Recurrent Neural Network Regularization" and was published on February 19, 2015. It presents a regularization technique specifically for Recurrent Neural Networks (RNNs) that utilize Long Short-Term Memory (LSTM) units. The authors, Wojciech Zaremba, Ilya Sutskever, and Oriol Vinyals, discuss the limitations of the dropout technique, which is commonly used for regularizing neural networks, in the context of RNNs and LSTMs. They propose a method for correctly applying dropout to LSTMs, demonstrating that this approach significantly reduces overfitting across various tasks, including language modeling, speech recognition, image caption generation, and machine translation.
--------------------------------------------
**Input:** list all the authors of the paper 'Attention is all you need'?
**Output:** The authors of the paper "Atten