In [11]:
import requests
import pandas as pd
import json
import xml.etree.ElementTree as ET

# Namespace for ArXiv's Atom-based XML format.
ARXIV_NAMESPACE = '{http://www.w3.org/2005/Atom}'

def extract_from_arxiv(search_query='cat:cs.AI', max_results=100, json_file_path='files/arxiv_dataset.json'):
    """
    Fetches papers from the ArXiv API based on a search query, saves them as JSON, 
    and returns a pandas DataFrame.

    Args:
        search_query (str): The search query for ArXiv (default is 'cat:cs.AI').
        max_results (int): The maximum number of results to retrieve (default is 100).
        json_file_path (str): File path where JSON data will be saved.

    Returns:
        pd.DataFrame: DataFrame containing the extracted paper information.
    """
    
    # Construct the URL for the API request.
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&max_results={max_results}'
    
    # Send a GET request to the ArXiv API.
    response = requests.get(url)
    
    # Parse the XML response.
    root = ET.fromstring(response.content)
    
    papers = []
    
    # Loop through each "entry" in the XML, representing a single paper.
    for entry in root.findall(f'{ARXIV_NAMESPACE}entry'):
        title = entry.find(f'{ARXIV_NAMESPACE}title').text.strip()
        summary = entry.find(f'{ARXIV_NAMESPACE}summary').text.strip()

        # Get the authors of the paper.
        author_elements = entry.findall(f'{ARXIV_NAMESPACE}author')
        authors = [author.find(f'{ARXIV_NAMESPACE}name').text for author in author_elements]

        # Get the paper's URL.
        paper_url = entry.find(f'{ARXIV_NAMESPACE}id').text
        arxiv_id = paper_url.split('/')[-1]

        # Check for the PDF link.
        pdf_link = next((link.attrib['href'] for link in entry.findall(f'{ARXIV_NAMESPACE}link') 
                         if link.attrib.get('title') == 'pdf'), None)

        papers.append({
            'title': title,
            'summary': summary,
            'authors': authors,
            'arxiv_id': arxiv_id,
            'url': paper_url,
            'pdf_link': pdf_link
        })
    
    # Convert list into a pandas DataFrame.
    df = pd.DataFrame(papers)
    
    # Save the DataFrame to a JSON file.
    with open(json_file_path, 'w', encoding='utf-8') as f:
        json.dump(papers, f, ensure_ascii=False, indent=4)
        print(f'Data saved to {json_file_path} ...')
    
    return df


In [12]:
df = extract_from_arxiv(max_results=20)

Data saved to files/arxiv_dataset.json ...


In [13]:
import json

file_name = "files/arxiv_dataset.json"
with open(file_name, "r") as f:
    data = json.load(f)
print(data)

[{'title': 'Dynamic Backtracking', 'summary': 'Because of their occasional need to return to shallow points in a search\ntree, existing backtracking methods can sometimes erase meaningful progress\ntoward solving a search problem. In this paper, we present a method by which\nbacktrack points can be moved deeper in the search space, thereby avoiding this\ndifficulty. The technique developed is a variant of dependency-directed\nbacktracking that uses only polynomial space while still providing useful\ncontrol information and retaining the completeness guarantees provided by\nearlier approaches.', 'authors': ['M. L. Ginsberg'], 'arxiv_id': '9308101v1', 'url': 'http://arxiv.org/abs/cs/9308101v1', 'pdf_link': 'http://arxiv.org/pdf/cs/9308101v1'}, {'title': 'A Market-Oriented Programming Environment and its Application to\n  Distributed Multicommodity Flow Problems', 'summary': 'Market price systems constitute a well-understood class of mechanisms that\nunder certain conditions provide effec

In [16]:
import pandas as pd
df = pd.DataFrame(data)
df.sample(5)

Unnamed: 0,title,summary,authors,arxiv_id,url,pdf_link
5,Decidable Reasoning in Terminological Knowledg...,Terminological knowledge representation system...,"[M. Buchheit, F. M. Donini, A. Schaerf]",9312101v1,http://arxiv.org/abs/cs/9312101v1,http://arxiv.org/pdf/cs/9312101v1
6,Teleo-Reactive Programs for Agent Control,A formalism is presented for computing and org...,[N. Nilsson],9401101v1,http://arxiv.org/abs/cs/9401101v1,http://arxiv.org/pdf/cs/9401101v1
18,Operations for Learning with Graphical Models,This paper is a multidisciplinary review of em...,[W. L. Buntine],9412102v1,http://arxiv.org/abs/cs/9412102v1,http://arxiv.org/pdf/cs/9412102v1
17,Wrap-Up: a Trainable Discourse Module for Info...,The vast amounts of on-line text now available...,"[S. Soderland, Lehnert. W]",9412101v1,http://arxiv.org/abs/cs/9412101v1,http://arxiv.org/pdf/cs/9412101v1
0,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,http://arxiv.org/pdf/cs/9308101v1


In [19]:
import pandas as pd
import requests
import os

def download_pdfs(df, download_folder="files"):
    """
    Downloads PDFs from URLs listed in the DataFrame and saves them to a specified folder. 
    The file names are stored in a new column 'pdf_file_name' in the DataFrame.
    
    Args:
        df (pd.DataFrame): DataFrame containing a 'pdf_link' column with URLs to download.
        download_folder (str): Path to the folder where PDFs will be saved (default is 'files').
    
    Returns:
        pd.DataFrame: The original DataFrame with an additional 'pdf_file_name' column containing 
                      the paths of the downloaded PDF files or None if the download failed."""
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    pdf_file_names = []

    for index, row in df.iterrows():
        pdf_link = row["pdf_link"]
       
        try:
            response = requests.get(pdf_link)
            response.raise_for_status()
        
            file_name = os.path.join(download_folder, pdf_link.split("/")[-1]) + ".pdf"
            pdf_file_names.append(file_name)

            with open(file_name, "wb") as f:
                f.write(response.content)
            
            print(f"PDF downloaded successfully and saved as {file_name}")
        
        except requests.exceptions.RequestException as e:
           print(f"Failed to download the PDF: {e}")
           pdf_file_names.append(None)
    
    df["pdf_file_name"] = pdf_file_names
    
    return df

In [20]:
df = download_pdfs(df)
df

PDF downloaded successfully and saved as files/9308101v1.pdf
PDF downloaded successfully and saved as files/9308102v1.pdf
PDF downloaded successfully and saved as files/9309101v1.pdf
PDF downloaded successfully and saved as files/9311101v1.pdf
PDF downloaded successfully and saved as files/9311102v1.pdf
PDF downloaded successfully and saved as files/9312101v1.pdf
PDF downloaded successfully and saved as files/9401101v1.pdf
PDF downloaded successfully and saved as files/9402101v1.pdf
PDF downloaded successfully and saved as files/9402102v1.pdf
PDF downloaded successfully and saved as files/9402103v1.pdf
PDF downloaded successfully and saved as files/9403101v1.pdf
PDF downloaded successfully and saved as files/9406101v1.pdf
PDF downloaded successfully and saved as files/9406102v1.pdf
PDF downloaded successfully and saved as files/9408101v1.pdf
PDF downloaded successfully and saved as files/9408102v1.pdf
PDF downloaded successfully and saved as files/9408103v1.pdf
PDF downloaded successfu

Unnamed: 0,title,summary,authors,arxiv_id,url,pdf_link,pdf_file_name
0,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,http://arxiv.org/pdf/cs/9308101v1,files/9308101v1.pdf
1,A Market-Oriented Programming Environment and ...,Market price systems constitute a well-underst...,[M. P. Wellman],9308102v1,http://arxiv.org/abs/cs/9308102v1,http://arxiv.org/pdf/cs/9308102v1,files/9308102v1.pdf
2,An Empirical Analysis of Search in GSAT,We describe an extensive study of search in GS...,"[I. P. Gent, T. Walsh]",9309101v1,http://arxiv.org/abs/cs/9309101v1,http://arxiv.org/pdf/cs/9309101v1,files/9309101v1.pdf
3,The Difficulties of Learning Logic Programs wi...,As real logic programmers normally use cut (!)...,"[F. Bergadano, D. Gunetti, U. Trinchero]",9311101v1,http://arxiv.org/abs/cs/9311101v1,http://arxiv.org/pdf/cs/9311101v1,files/9311101v1.pdf
4,Software Agents: Completing Patterns and Const...,To support the goal of allowing users to recor...,"[J. C. Schlimmer, L. A. Hermens]",9311102v1,http://arxiv.org/abs/cs/9311102v1,http://arxiv.org/pdf/cs/9311102v1,files/9311102v1.pdf
5,Decidable Reasoning in Terminological Knowledg...,Terminological knowledge representation system...,"[M. Buchheit, F. M. Donini, A. Schaerf]",9312101v1,http://arxiv.org/abs/cs/9312101v1,http://arxiv.org/pdf/cs/9312101v1,files/9312101v1.pdf
6,Teleo-Reactive Programs for Agent Control,A formalism is presented for computing and org...,[N. Nilsson],9401101v1,http://arxiv.org/abs/cs/9401101v1,http://arxiv.org/pdf/cs/9401101v1,files/9401101v1.pdf
7,Learning the Past Tense of English Verbs: The ...,Learning the past tense of English verbs - a s...,[C. X. Ling],9402101v1,http://arxiv.org/abs/cs/9402101v1,http://arxiv.org/pdf/cs/9402101v1,files/9402101v1.pdf
8,Substructure Discovery Using Minimum Descripti...,The ability to identify interesting and repeti...,"[D. J. Cook, L. B. Holder]",9402102v1,http://arxiv.org/abs/cs/9402102v1,http://arxiv.org/pdf/cs/9402102v1,files/9402102v1.pdf
9,Bias-Driven Revision of Logical Domain Theories,The theory revision problem is the problem of ...,"[M. Koppel, R. Feldman, A. M. Segre]",9402103v1,http://arxiv.org/abs/cs/9402103v1,http://arxiv.org/pdf/cs/9402103v1,files/9402103v1.pdf


In [21]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_and_chunk_pdf(pdf_file_name, chunk_size=512):
    """
    Loads a PDF file and splits its content into chunks of a specified size.

    Args:
        file (str): Path to the PDF file to be loaded.
        chunk_size (int): The maximum size of each chunk in characters (default is 512).

    Returns:
        List[Document]: A list of document chunks.
    """
    print(f"Loading and splitting into chunks: {pdf_file_name}")

    loader = PyPDFLoader(pdf_file_name)
    data = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=64)
    chunks = text_splitter.split_documents(data)
    
    return chunks

In [44]:
def expand_df(df):
    """
    Expands each row in the DataFrame by splitting PDF documents into chunks.

    Args:
        df (pd.DataFrame): DataFrame containing 'pdf_file_name', 'arxiv_id', 'title', 'summary', 
                           'authors', and 'url' columns.

    Returns:
        pd.DataFrame: A new DataFrame where each row represents a chunk of the original document, 
                      with additional metadata such as chunk identifiers and relationships to 
                      adjacent chunks.
    """

    expanded_rows = []

    for idx, row in df.iterrows():
        try:
            chunks = load_and_chunk_pdf(row["pdf_file_name"])
        except Exception as e:
            print(f"Error processing file {row['pdf_file_name']}: {e}")
            continue

        for i, chunk in enumerate(chunks):
            prechunk_id = i-1 if i > 0 else ""
            postchunk_id = i+1 if i < len(chunks)-1 else ""

            expanded_rows.append({
                "id": f"{row['arxiv_id']}#{i}",
                "title": row["title"],
                "summary": row["summary"],
                "authors": row["authors"],
                "arxiv_id": row["arxiv_id"],
                "url": row["url"],
                "chunk": chunk.page_content,
                "prechunk_id": "" if i == 0 else f"{row['arxiv_id']}#{prechunk_id}",
                "postchunk_id": "" if i == len(chunks)-1 else f"{row['arxiv_id']}#{postchunk_id}"
            })

    return pd.DataFrame(expanded_rows)

In [45]:
expanded_df = expand_df(df)

Loading and splitting into chunks: files/9308101v1.pdf
Loading and splitting into chunks: files/9308102v1.pdf
Loading and splitting into chunks: files/9309101v1.pdf
Loading and splitting into chunks: files/9311101v1.pdf
Loading and splitting into chunks: files/9311102v1.pdf
Loading and splitting into chunks: files/9312101v1.pdf
Loading and splitting into chunks: files/9401101v1.pdf
Loading and splitting into chunks: files/9402101v1.pdf
Loading and splitting into chunks: files/9402102v1.pdf
Loading and splitting into chunks: files/9402103v1.pdf
Loading and splitting into chunks: files/9403101v1.pdf
Loading and splitting into chunks: files/9406101v1.pdf
Loading and splitting into chunks: files/9406102v1.pdf
Loading and splitting into chunks: files/9408101v1.pdf
Loading and splitting into chunks: files/9408102v1.pdf
Loading and splitting into chunks: files/9408103v1.pdf
Loading and splitting into chunks: files/9409101v1.pdf
Loading and splitting into chunks: files/9412101v1.pdf
Loading an

In [36]:
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [31]:
import os
from getpass import getpass
from semantic_router.encoders import OpenAIEncoder

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY") or getpass("OpenAI API key: ")

encoder = OpenAIEncoder(name="text-embedding-3-small")

In [40]:
dims = len(encoder("hello hallo hola salut")[0])
dims

1536

In [39]:
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

from pinecone import Pinecone, ServerlessSpec

api_key = os.getenv("PINECONE_API_KEY") or getpass("Pinecone API key: ")

pc = Pinecone(api_key=api_key)

spec = ServerlessSpec(
    cloud="aws",
    region="us-east-1"
)

In [42]:
import time

index_name = "langgraph-research-agent"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=dims,
        metric="cosine",
        spec=spec
    )

    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)


index = pc.Index(index_name)

time.sleep(1)

index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

NameError: name 'expanded_df' is not defined

In [52]:
from tqdm.auto import tqdm

data = expanded_df
batch_size = 64

for i in tqdm(range(0, len(data), batch_size)):
    i_end = min(len(data), i + batch_size)
    batch = data[i:i_end].to_dict(orient="records")

    metadata = [{
        'arxiv_id': r["arxiv_id"],
        'title': r["title"],
        'summary': r['chunk']
    } for r in batch]

    ids = [r["id"] for r in batch]

    chunks = [r["chunk"] for r in batch]
    
    embeds = encoder(chunks)

    index.upsert(vectors=zip(ids, embeds, metadata))

  0%|          | 0/73 [00:00<?, ?it/s]

In [53]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 4608}},
 'total_vector_count': 4608,
 'vector_type': 'dense'}

In [54]:
import requests

arxiv_id = '1706.03762'

res = requests.get(f"https://arxiv.org/abs/{arxiv_id}")
res.text

'<!DOCTYPE html>\n<html lang="en">\n\n<head>  <title>[1706.03762] Attention Is All You Need</title>\n  <meta name="viewport" content="width=device-width, initial-scale=1">\n  <link rel="apple-touch-icon" sizes="180x180" href="/static/browse/0.3.4/images/icons/apple-touch-icon.png">\n  <link rel="icon" type="image/png" sizes="32x32" href="/static/browse/0.3.4/images/icons/favicon-32x32.png">\n  <link rel="icon" type="image/png" sizes="16x16" href="/static/browse/0.3.4/images/icons/favicon-16x16.png">\n  <link rel="manifest" href="/static/browse/0.3.4/images/icons/site.webmanifest">\n  <link rel="mask-icon" href="/static/browse/0.3.4/images/icons/safari-pinned-tab.svg" color="#5bbad5">\n  <meta name="msapplication-TileColor" content="#da532c">\n  <meta name="theme-color" content="#ffffff">\n  <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/arXiv.css?v=20241206" />\n  <link rel="stylesheet" type="text/css" media="print" href="/static/browse/0.3.4/css/a

In [55]:
import re

abstract_pattern = re.compile(
    r'<blockquote class="abstract mathjax">\s*<span class="descriptor">Abstract:</span>\s*(.*?)\s*</blockquote>',
    re.DOTALL
)

re_match = abstract_pattern.search(res.text)

if re_match:
    print(re_match.group(1))
else:
    print("Abstract not found.")

The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transforme

In [56]:
from langchain_core.tools import tool
import requests
import re

abstract_pattern = re.compile(
    r'<blockquote class="abstract mathjax">\s*<span class="descriptor">Abstract:</span>\s*(.*?)\s*</blockquote>',
    re.DOTALL
)

@tool("fetch_arxiv")
def fetch_arxiv(arxiv_id: str) -> str:
    """
    Fetches the abstract from an ArXiv paper 
    """
    res = requests.get(f"https://arxiv.org/abs/{arxiv_id}")

    re_match = abstract_pattern.search(res.text)

    return re_match.group(1) if re_match else "Abstract not found."

In [57]:
arxiv_id = '1706.03762'
output = fetch_arxiv.invoke(input={'arxiv_id': arxiv_id})
print(output)

The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transforme

In [61]:
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [65]:
from serpapi import GoogleSearch
import os
from getpass import getpass

serpapi_params = {
    "engine": "google",
    "api_key": os.getenv("SERPAPI_KEY") or getpass("SerpAPI key:")
}

search = GoogleSearch({
    **serpapi_params, 
    'q': 'water',
    'num': 5
})

results = search.get_dict().get('organic_results', [])

formatted_results = "\n---\n".join(
    ['\n'.join([x['title'], x['snippet'], x['link']]) for x in results]
)

print(formatted_results)

Water
Water is an inorganic compound with the chemical formula H 2 O. It is a transparent, tasteless, odorless, [c] and nearly colorless chemical substance
https://en.wikipedia.org/wiki/Water
---
Waters Corporation | Laboratory Instruments ...
Waters is the leading provider of lab equipment, supplies and software for scientists across the world. Easily research and order everything your lab needs!
https://www.waters.com/nextgen/us/en.html?srsltid=AfmBOoqYd8yOakHzeE7BmFKnnOpw96OKebnHEh6ebUtPbmGdEsD2ULRh
---
Primo Water: Water Delivery for Your Home & Business
Stay refreshed anywhere with our reliable beverage and water delivery to your home and office. Experience premium taste and service from Primo Water.
https://www.water.com/
---
Water | H2O | CID 962
Water (chemical formula: H2O) is a transparent fluid which forms the world's streams, lakes, oceans and rain, and is the major constituent of the fluids of organisms. As a chemical compound, a water molecule contains one oxygen and two 

In [67]:
from serpapi import GoogleSearch

@tool("web_search")
def web_search(query: str) -> str:
    """
    Finds general knowledge information using a Google search.

    Args:
        query (str): The search query string.

    Rerturns:
        str: A formatted string of the top search results, including title, snippet, and link.
    """

    search = GoogleSearch({
    **serpapi_params, 
    'q': 'water',
    'num': 5
    })

    return formatted_results if results else "No results found."


output = web_search.invoke(input={"query": "water on mars"})
print(output)

Water
Water is an inorganic compound with the chemical formula H 2 O. It is a transparent, tasteless, odorless, [c] and nearly colorless chemical substance
https://en.wikipedia.org/wiki/Water
---
Waters Corporation | Laboratory Instruments ...
Waters is the leading provider of lab equipment, supplies and software for scientists across the world. Easily research and order everything your lab needs!
https://www.waters.com/nextgen/us/en.html?srsltid=AfmBOoqYd8yOakHzeE7BmFKnnOpw96OKebnHEh6ebUtPbmGdEsD2ULRh
---
Primo Water: Water Delivery for Your Home & Business
Stay refreshed anywhere with our reliable beverage and water delivery to your home and office. Experience premium taste and service from Primo Water.
https://www.water.com/
---
Water | H2O | CID 962
Water (chemical formula: H2O) is a transparent fluid which forms the world's streams, lakes, oceans and rain, and is the major constituent of the fluids of organisms. As a chemical compound, a water molecule contains one oxygen and two 

In [68]:
def format_rag_contexts(matches: list) -> str:
    """
    Formats the retrieved context matches into a readable string format.

    Args:
        matches (list): A list of matched documents with metadata.

    Returns:
        str: A formatted string of document titles, chunks, and ArXiv IDs.
    """

    formatted_results = []

    for x in matches:
        text = (
            f"Title: {x['metadata']['title']}\n"
            f"Chunk: {x['metadata']['chunk']}\n"
            f"ArXiv ID: {x['metadata']['arxiv_id']}\n"
        )
        formatted_results.append(text)
    
    return "\n---\n".join(formatted_results)



In [69]:
from langchain_core.tools import tool

@tool
def rag_search_filter(query: str, arxiv_id: str) -> str:
    """
    Finds information from the ArXiv database using a natural language query and a specific ArXiv ID.

    Args:
        query (str): The search query in natural language.
        arxiv_id (str): The ArXiv ID of the specific paper to filter by.

    Returns:
        str: A formatted string of relevant document contexts.
    """

    xq = encoder([query])
    xc = index.query(query=xq, top_k=6, include_metadata=True, filter={"arxiv_id": arxiv_id})

    return format_rag_contexts(xc['matches'])

In [70]:
@tool('rag_search')
def rag_search(query: str) -> str:
    """
    Finds specialist information on AI using a natural language query.

    Args:
        query (str): The search query in natural language.
    
    Returns:
        str: A formatted string of relevant document contexts.
    """
    xq = encoder([query])
    xc = index.query(query=xq, top_k=5, include_metadata=True)

    return format_rag_contexts(xc['matches'])

In [71]:
from langchain_core.tools import tool

@tool
def final_answer(
    introduction: str,
    research_steps: str or list,
    main_body: str,
    conclusion: str,
    sources: str or list
) -> str:
    """
    Returns a natural language response in the form of a research report.

    Args:
        introduction (str): A short paragraph introducing the user's question and the topic.
        research_steps (str or list): Bullet points or text explaining the steps taken for research.
        main_body (str): The bulk of the answer, 3-4 paragraphs long, providing high-quality information.
        conclusion (str): A short paragraph summarizing the findings.
        sources (str or list): A list or text providing the sources referenced during the research.

    Returns:
        str: A formatted research report string.
    """

    if isinstance(research_steps, list):
        research_steps = "\n".join([f"- {r}" for r in research_steps])
    
    if isinstance(sources, list):
        sources = "\n".join([f"- {s}" for s in sources])
    
    return f"{introduction}\n\nResearch Steps:\n{research_steps}\n\nMain Body:n{main_body}\n\nConclusion:\n{conclusion}\n\nSources:\n{sources}"

In [73]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

system_prompt = (
    """
    You are the oracle, the great AI decision-maker.
    Given the user's query, you must decide what to do with it based on the 
    list of tools provided to you.

    If you see that a tool has been used (in the scratchpad) with a particular
    query, do NOT use that same tool with the same query again. Also, do NOT use
    any tool more than twice (i.e., if the tool appears in the scratchpad twice, do 
    not use it again).

    You should aim to collect information from a diverse range of sources before
    providing the answer to the user. Once you have collected plenty of information
    to answer the user's question (stored in the scratchpad), use the final_answer tool.
    """
)

prompt = ChatPromptTemplate.from_messages([
    ('system', system_prompt),
    
    MessagesPlaceholder(variable_name='chat_history'),
    
    ('user', '{input}'),
    
    ('assistant', 'scratchpad: {scratchpad}'),
])

In [None]:
from langchain_core.messages import ToolCall, ToolMessage
from langchain_openai import ChatOpenAI
import os

llm = ChatOpenAI(
    model="gpt-4o",
    openai_api_key=os.environ["OPENAI_API_KEY"],
    temperature=0
)

tools = [
    rag_search_filter,
    rag_search,
    fetch_arxiv,
    web_search,
    final_answer
]

def create_scratchpad(intermediate_steps: list[ToolCall]) -> str:
    research_steps = []

    for i, action in enumerate(intermediate_steps):
        if action.log != "TBD":
            research_steps.append(
                f"Tool: {action.tool}, input: {action.tool_input}\n"
                f"Output: {action.log}"
            )

    return "\n---\n".join(research_steps)