## Multi Agent RAG
- Researcher: 논문에서 데이터를 추출하는 agent
- Verifier: 추출된 데이터를 검사하고 출력하는 agent

In [1]:
from dotenv import load_dotenv
import getpass
import os


def _set_if_undefined(var: str):
    # 주어진 환경 변수가 설정되어 있지 않다면 사용자에게 입력을 요청하여 설정합니다.
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"Please provide your {var}")


# OPENAI_API_KEY 환경 변수가 설정되어 있지 않으면 사용자에게 입력을 요청합니다.
_set_if_undefined("OPENAI_API_KEY")
# LANGCHAIN_API_KEY 환경 변수가 설정되어 있지 않으면 사용자에게 입력을 요청합니다.
_set_if_undefined("LANGCHAIN_API_KEY")

# LangSmith 추적 기능을 활성화합니다. (선택적)
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "Multi-agent Collaboration"

In [2]:
os.environ["OPENAI_API_KEY"] ="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"

# os.environ['OPENAI_API_KEY'] ="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"

In [3]:
from langchain_teddynote import logging

# 프로젝트 이름을 입력합니다.
logging.langsmith("Multi-Agent-RAG")

LangSmith 추적을 시작합니다.
[프로젝트명]
Multi-Agent-RAG


### Agent 생성

In [4]:
import json

from langchain_core.messages import (
    BaseMessage,
    FunctionMessage,
    HumanMessage,
)

from langchain.tools.render import format_tool_to_openai_function
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langgraph.graph import END, StateGraph
from langgraph.prebuilt.tool_executor import ToolExecutor, ToolInvocation
from langchain_core.output_parsers import JsonOutputParser


def create_agent(llm, tools, system_message: str):
    # 에이전트를 생성합니다.
    functions = [format_tool_to_openai_function(t) for t in tools]

    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are a helpful AI assistant, collaborating with other assistants."
                " Use the provided tools to progress towards answering the question."
                " If you are unable to fully answer, that's OK, another assistant with different tools "
                " will help where you left off. Execute what you can to make progress."
                " If you or any of the other assistants have the final answer or deliverable,"
                " You have access to the following tools: {tool_names}.\n{system_message}",
            ),
            MessagesPlaceholder(variable_name="messages"),
        ]
    )
    prompt = prompt.partial(system_message=system_message)
    prompt = prompt.partial(tool_names=", ".join(
        [tool.name for tool in tools]))
    return prompt | llm.bind_functions(functions)

### Tool 정의

In [5]:
from langchain_core.tools import tool
from langchain_core.vectorstores.base import VectorStoreRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

def embedding_file(file_folder: str, file_name: str) -> VectorStoreRetriever:
    """문서를 청크 단위로 분할하고 임베딩 모델(text-embedding-ada-002)을 통해 임베딩하여 vector store에 저장합니다. 이후 vector store를 기반으로 검색하는 객체를 생성합니다.

    Args:
        file (str): pdf 문서 경로

    Returns:
        VectorStoreRetriever: 검색기
    """
    ## 긴 텍스트를 작은 청크로 나누는 데 사용되는 클래스
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1000,         ## 최대 청크 길이 정의
        chunk_overlap=100,      ## 청크 간 겹침 길이 정의
        separators=["\n\n"]     ## 텍스트를 나눌 때 사용할 구분자를 지정 (문단)
    )

    ## PDF 파일 불러오기
    loader = PyPDFLoader(f"{file_folder}/{file_name}.pdf")
    docs = loader.load_and_split(text_splitter=splitter)

    ## Embedding 생성 및 vector store에 저장
    embeddings = OpenAIEmbeddings()
    vector_store = FAISS.from_documents(
        documents=docs,         ## 벡터 저장소에 추가할 문서 리스트
        embedding=embeddings    ## 사용할 임베딩 함수
    )

    ## 검색기로 변환: 현재 벡터 저장소를 기반으로 VectorStoreRetriever 객체를 생성하는 기능을 제공
    retriever = vector_store.as_retriever(
        search_type="similarity",    ## 어떻게 검색할 것인지? default가 유사도
        search_kwargs={"k": 8}
    )

    return retriever

In [6]:
retriever = embedding_file(
    file_folder="../../data/input_data/", 
    file_name="paper_008")

from langchain.tools import Tool

retriever_tool = Tool(
    name="retriever",
    func=retriever.get_relevant_documents,
    description="Retrieve relevant documents based on a query."
)

### Graph 생성

#### 상태 정의

In [7]:
import operator
from typing import Annotated, Sequence, Tuple, TypedDict, Union
from langchain_openai import ChatOpenAI
from typing_extensions import TypedDict

# 각 에이전트와 도구에 대한 다른 노드를 생성할 것입니다. 이 클래스는 그래프의 각 노드 사이에서 전달되는 객체를 정의합니다.
class AgentState(TypedDict):
    messages: Annotated[Sequence[BaseMessage], operator.add]
    sender: str

#### Agent node 정의

In [24]:
researcher_system_prompt = """You are an expert assistant specializing in extracting information from research papers related to battery technology. Your role is to carefully analyze the provided PDF and extract key data in a structured JSON format. Follow these instructions strictly:

1. **Domain-Specific Focus**:
   - Focus exclusively on content related to battery technology (e.g., materials, synthesis methods, properties, performance metrics).
   - Ignore irrelevant sections or general references outside the battery-related content.

2. **Extraction Guidelines**:
   - Use the JSON structure provided as a template.
   - Replace placeholders with values found in the PDF.
   - If a field is not mentioned in the PDF, write "None" instead of removing it.

3. **Clarity and Precision**:
   - Extract numerical data (e.g., ratios, temperatures, durations) with maximum precision.
   - For descriptive fields, summarize the relevant information concisely without adding interpretations.

4. **Structure Adherence**:
   - Maintain the given JSON structure and formatting exactly.
   - Do not modify or rearrange the JSON schema.

5. **External Reference Exclusion**:
   - Only use information from the provided PDF.
   - Ignore any supplementary information or external references not contained in the PDF.

Your task is to ensure that the extracted data is complete, accurate, and formatted according to the requirements.

Below are instructions for filling out items by referring to the examples.
[ 
    {
        "CAM (Cathode Active Material)": {
            "Stoichiometry information": {
                "NCM-622": {
                    "Li ratio": "1",
                    "Ni ratio": "0.6",
                    "Co ratio": "0.2",
                    "Mn ratio": "0.2",
                    "O ratio": "2"
                },
                "ZrO2-NCM-622 (Z622)": {
                    "Li ratio": "0.98",
                    "Ni ratio": "0.6",
                    "Co ratio": "0.2",
                    "Mn ratio": "0.2",
                    "O ratio": "2"
                }
            },
            "Whether or not commercial NCM was used for each sample (Stoichiometry information in order)": [
                "yes",
                "no"
            ],
            "Lithium source": "LiOH",
            "Synthesis Method for NCM": "co-precipitation",
            "Describe the crystallization method, such as Hydrothermal, Sintering, or any other technique used during the process.": "Hydrothermal",
            "What is the Crystallization final temperature in degree of Celcius used in the process? (e.g., calcination or sintering) mentioned for the crystallization stage.": "100",
            "What is the time duration for the final crystallization process, including any calcination or sintering stages? Specify the hours.": "12",
            "Doping": "Zr4+",
            "Coating": "ZrO2",
            "Additional treatment": "None"
        }
    }
]
"""

verifier_system_prompt = """You are a meticulous verifier agent specializing in the domain of battery technology.
Your primary task is to check the accuracy of information extracted from research papers on batteries, formatted into JSON by another agent. Your responsibilities include validating the following:

Accuracy:
Cross-check the extracted values against the provided PDF. Ensure every field matches the battery-related content in the PDF accurately.

Completeness:
Confirm that all fields in the JSON structure are either filled with accurate values from the battery-related sections of the PDF or marked as "None" if not mentioned in the document.

Consistency:
Verify that the JSON structure, format, and data types adhere strictly to the required schema for battery-related research data.

Corrections:
Identify and highlight any errors, including inaccurate values, missing data, or structural inconsistencies, providing clear and actionable feedback for correction.
For any issues found, specify:

The field in question.
The nature of the issue (e.g., incorrect value, missing data, formatting error).
Suggestions or corrections to resolve the issue.

Final Output:
If the JSON is entirely correct, confirm its validity and output the JSON structure exactly as provided.
After confirming, you should include the phrase `### Final Output` as a heading before printing the JSON. This ensures the output is clearly marked and easy to locate.

Focus exclusively on battery-related content extracted from the PDF.
Ignore any reference content or information outside the provided document."""

In [25]:
import functools

def agent_node(state, agent, name):
    result = agent.invoke(state)
    if isinstance(result, FunctionMessage):
        pass
    else:
        result = HumanMessage(**result.dict(exclude={"type", "name"}), name=name)
    return {
        "messages": [result],
        "sender": name,
    }


llm = ChatOpenAI(model="gpt-4o", temperature=0.6)

# Research agent and node
research_agent = create_agent(
    llm,
    [retriever_tool],
    system_message=researcher_system_prompt,
)
research_node = functools.partial(agent_node, agent=research_agent, name="Researcher")

# Data_Verifier
verifier_agent = create_agent(
    llm,
    [retriever_tool],
    system_message=verifier_system_prompt,
)
verifier_node = functools.partial(agent_node, agent=verifier_agent, name="Data_Verifier")

# Json_Processor
json_processor_system_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are a JSON Processor Agent. Your sole responsibility is to process the response generated by an LLM and ensure the accurate extraction of the JSON content within the response. Follow these instructions precisely:

### Instructions:
1. **Extract JSON Only**:
   - Identify the ```json``` block within the provided response.
   - Extract and output the content within the ```json``` block exactly as it appears.

2. **No Modifications**:
   - Do not modify, add, or remove any part of the JSON content.
   - Preserve the original structure, field names, and values without alteration.

3. **No Hallucination**:
   - Do not interpret, infer, or generate additional content.

4. **Output Format**:
   - Respond with the extracted JSON content only.
   - Do not include any explanations, comments, or surrounding text.
   - The output must be a clean, valid JSON.

### Your Role:
Ensure the integrity and consistency of the JSON data by strictly adhering to these instructions. Your output should always be concise and compliant with the above rules."""
        ),
        MessagesPlaceholder(variable_name="messages"),
    ]
)

json_processor_agent = json_processor_system_prompt | ChatOpenAI(model="gpt-4o-mini", temperature=0.1) | JsonOutputParser()

def json_processor_agent_node(state, agent, name):
    result = agent.invoke(
        {
            "messages": [
                HumanMessage(content=f"""Convert 'Fianl Output' in the given response into a JSON format.: {state["messages"][-1].content}""")
            ]
        }
    )
    return {"messages": result, "name": name}

json_processor_node = functools.partial(json_processor_agent_node, agent=json_processor_agent, name="Json_Processor")

#### Tool Node 정의

In [26]:
tools = [retriever_tool]
tool_executor = ToolExecutor(tools)

def tool_node(state):
    # 그래프에서 도구를 실행하는 함수입니다.
    # 에이전트 액션을 입력받아 해당 도구를 호출하고 결과를 반환합니다.
    messages = state["messages"]
    # 계속 조건에 따라 마지막 메시지가 함수 호출을 포함하고 있음을 알 수 있습니다.
    last_message = messages[-1]
    # ToolInvocation을 함수 호출로부터 구성합니다.
    tool_input = json.loads(
        last_message.additional_kwargs["function_call"]["arguments"]
    )
    # 단일 인자 입력은 값으로 직접 전달할 수 있습니다.
    if len(tool_input) == 1 and "__arg1" in tool_input:
        tool_input = next(iter(tool_input.values()))
    tool_name = last_message.additional_kwargs["function_call"]["name"]
    action = ToolInvocation(
        tool=tool_name,
        tool_input=tool_input,
    )
    # 도구 실행자를 호출하고 응답을 받습니다.
    response = tool_executor.invoke(action)
    # 응답을 사용하여 FunctionMessage를 생성합니다.
    function_message = FunctionMessage(
        content=f"{tool_name} response: {str(response)}", name=action.tool
    )
    # 기존 리스트에 추가될 리스트를 반환합니다.
    return {"messages": [function_message]}

  tool_executor = ToolExecutor(tools)


#### edge 정의

In [27]:
def router(state):
    # 상태 정보를 기반으로 다음 단계를 결정하는 라우터 함수
    messages = state["messages"]
    last_message = messages[-1]
    if "function_call" in last_message.additional_kwargs:
        # 이전 에이전트가 도구를 호출함
        return "call_tool"
    if "Final Output" in last_message.content:
        # 어느 에이전트든 작업이 끝났다고 결정함
        return "process_output"
    return "continue"

#### graph 정의

In [28]:
workflow = StateGraph(AgentState)

workflow.add_node("Researcher", research_node)
workflow.add_node("Data_Verifier", verifier_node)
workflow.add_node("call_tool", tool_node)
workflow.add_node("Json_Processor", json_processor_node)

workflow.add_edge("Json_Processor", END)
workflow.add_conditional_edges(
    "Researcher",
    router,
    {"continue": "Data_Verifier", "call_tool": "call_tool"},
)
workflow.add_conditional_edges(
    "Data_Verifier",
    router,
    {"continue": "Researcher", "call_tool": "call_tool", "process_output": "Json_Processor"},
)
workflow.add_conditional_edges(
    "call_tool",
    lambda x: x["sender"],
    {
        "Researcher": "Researcher",
        "Data_Verifier": "Data_Verifier",
    },
)

workflow.set_entry_point("Researcher")
graph = workflow.compile()    
# graph.get_graph().draw_mermaid_png(output_file_path="graph_multi_agent.png")

### response

In [29]:
# from utils import load_question
# questions = load_question()
# print(questions[0])

In [30]:
result = graph.invoke(
    {
        "messages": [
            HumanMessage(
                content="""Please fill out the following JSON structure by referring to the PDF. Verify accurate values for each field, replacing the placeholders. If the information is not mentioned in the PDF, write "None".

[
    {
        "CAM (Cathode Active Material)": {
            "Stoichiometry information": {
                "": {
                    
                }
            },
            "Whether or not commercial NCM was used for each sample (Stoichiometry information in order)": [
                
            ],
            "Lithium source": ,
            "Synthesis Method for NCM": ,
            "Describe the crystallization method, such as Hydrothermal, Sintering, or any other technique used during the process.": ,
            "What is the Crystallization final temperature in degree of Celcius used in the process? (e.g., calcination or sintering) mentioned for the crystallization stage.": ,
            "What is the time duration for the final crystallization process, including any calcination or sintering stages? Specify the hours.": ,
            "Doping": ,
            "Coating": ,
            "Additional treatment": 
        }
    }
]""",
                name="Researcher"  # Ensure the name is valid, here "Researcher" is used
            )
        ]
    }, {"recursion_limit": 30}
)

/var/folders/nz/glbk0zl17d35x39jy85r5vhc0000gn/T/ipykernel_32907/474617219.py:8: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  result = HumanMessage(**result.dict(exclude={"type", "name"}), name=name)
  action = ToolInvocation(
/var/folders/nz/glbk0zl17d35x39jy85r5vhc0000gn/T/ipykernel_32907/474617219.py:8: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  result = HumanMessage(**result.dict(exclude={"type", "name"}), name=name)
/var/folders/nz/glbk0zl17d35x39jy85r5vhc0000gn/T/ipykernel_32907/474617219.py:8: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydan

In [31]:
result["messages"][-1]

{'CAM (Cathode Active Material)': {'Stoichiometry information': {'NCM111': {'Li ratio': '1.02',
    'Ni ratio': '0.33',
    'Co ratio': '0.33',
    'Mn ratio': '0.33',
    'O ratio': '2'},
   'NCM523': {'Li ratio': '1.02',
    'Ni ratio': '0.5',
    'Co ratio': '0.2',
    'Mn ratio': '0.3',
    'O ratio': '2'},
   'NCM622': {'Li ratio': '1.02',
    'Ni ratio': '0.6',
    'Co ratio': '0.2',
    'Mn ratio': '0.2',
    'O ratio': '2'},
   'NCM721': {'Li ratio': '1.02',
    'Ni ratio': '0.7',
    'Co ratio': '0.2',
    'Mn ratio': '0.1',
    'O ratio': '2'},
   'NCM811': {'Li ratio': '1.02',
    'Ni ratio': '0.8',
    'Co ratio': '0.1',
    'Mn ratio': '0.1',
    'O ratio': '2'},
   'NCM851005': {'Li ratio': '1.02',
    'Ni ratio': '0.85',
    'Co ratio': '0.1',
    'Mn ratio': '0.05',
    'O ratio': '2'}},
  'Whether or not commercial NCM was used for each sample (Stoichiometry information in order)': ['yes',
   'yes',
   'yes',
   'yes',
   'yes',
   'yes'],
  'Lithium source': 'None',
 

In [32]:
result["messages"]

[HumanMessage(content='Please fill out the following JSON structure by referring to the PDF. Verify accurate values for each field, replacing the placeholders. If the information is not mentioned in the PDF, write "None".\n\n[\n    {\n        "CAM (Cathode Active Material)": {\n            "Stoichiometry information": {\n                "": {\n                    \n                }\n            },\n            "Whether or not commercial NCM was used for each sample (Stoichiometry information in order)": [\n                \n            ],\n            "Lithium source": ,\n            "Synthesis Method for NCM": ,\n            "Describe the crystallization method, such as Hydrothermal, Sintering, or any other technique used during the process.": ,\n            "What is the Crystallization final temperature in degree of Celcius used in the process? (e.g., calcination or sintering) mentioned for the crystallization stage.": ,\n            "What is the time duration for the final crystall