In [59]:
import os
import warnings
warnings.filterwarnings("ignore")

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from retriever.retriever_handler import get_retriever
from utils.utils import format_docs
from langchain_core.output_parsers import CommaSeparatedListOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder

In [60]:
# Pull variables from the local .env file into the process environment.
load_dotenv(dotenv_path=".env")

# Read the API keys from the environment (None when a variable is not set).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
LANGCHAIN_API_KEY = os.environ.get("LANGCHAIN_API_KEY")

# Turn on LangSmith tracing (optional).
os.environ["LANGCHAIN_TRACING_V2"] = "true"

### SampleNameSearcher

In [61]:
# Retriever over the source paper; the chunking/search settings are passed
# straight through to the project helper `get_retriever`.
retriever = get_retriever(
    file_folder="./data/raw",
    file_number=11,
    chunk_size=500,
    chunk_overlap=100,
    search_k=10,
)

model = ChatOpenAI(model_name="gpt-4o", temperature=0.1)

# System prompt: the retrieved document text fills {context}; the parser's
# format instructions fill {format_instructions} so the model is told to answer
# in the comma-separated form that `output_parser` expects.
sample_name_retriever_prompt = """
You are an expert assistant specializing in extracting information from research papers related to battery technology. Your role is to carefully analyze the provided document.

Document:
{context}

Output format:
{format_instructions}
"""

output_parser = CommaSeparatedListOutputParser()
format_instructions = output_parser.get_format_instructions()

# Pre-fill the format instructions so callers only have to supply the question.
prompt = ChatPromptTemplate.from_messages([
    ("system", sample_name_retriever_prompt),
    ("human", "{sample_name_question}")
]).partial(format_instructions=format_instructions)

# The chain input (the question string) is sent both to the retriever (whose
# documents are formatted into {context}) and, unchanged, to the human message.
sample_name_searcher_chain = (
    {
        "context": retriever | format_docs,
        "sample_name_question": RunnablePassthrough()
    }
    | prompt
    | model
    | output_parser
)

In [62]:
# Question for the sample-name chain. The final example is quoted on both sides
# ('NCM622') so the model sees a well-formed literal of the desired output.
sample_name_question = """
Use all of the NCM cathode sample names (e.g., 'NCM-622', 'pristine NCM', 'M-NCM') provided in the electrochemical performance section. You just output sample names. Do Not output like '- NCM622' , just output 'NCM622'.
"""

In [63]:
# Run the sample-name chain; the question string is used both as the retriever
# input and as the human message, and the parser returns a list of names.
sample_names = sample_name_searcher_chain.invoke(sample_name_question)

In [64]:
# Display the extracted sample names.
sample_names

['NR0', 'NR1', 'NR3', 'NR5']

In [65]:
# Sub-question 1: composition and synthesis of the cathode active material.
# NOTE: `sample_names` is a list, so it is interpolated as its Python repr
# (e.g. "['NR0', 'NR1', ...]").
question_c1 = f"""
Extract the following information for {sample_names}:
    - The exact chemical composition (stoichiometry) of the cathode active material (CAM) used in the study.
    - The type of commercially used NCM (e.g., LiNixCoyMnzO2), including code names or abbreviations (e.g., NCM811, N-NCM, SC-NCM) if applicable.
    - The lithium source used for synthesizing the NCM.
    - The synthesis method used for preparing the NCM samples (e.g., co-precipitation, sol-gel), including intermediate steps if reported.
    - The crystallization method used (e.g., solid-state sintering, hydrothermal), and the specific final temperatures and durations used in the process.
    - Whether any doping technique was applied to the CAM, and if so, which element(s) were doped.
    - Whether any coating layer was applied to the CAM, and if so, what material was used for coating.
"""

In [66]:
# Sub-question 2: electrode formulation and electrolyte details.
# NOTE: `sample_names` is a list, so it is interpolated as its Python repr.
question_c2 = f"""
Extract the following information for {sample_names}:
    - The electrode composition, including the mass ratios of active material, conductive additive (e.g., Super P), and binder (e.g., PVDF).
    - The type of electrolyte solvent used (e.g., EC/DEC, EC/EMC/DEC), including volume or mass ratios if available.
    - The lithium salt used in the electrolyte (e.g., LiPF6), and its concentration (e.g., 1 M or 1 mol/L).
    - Any additives used in the electrolyte, including their names and concentrations if reported.
    - The mass loading (in mg/cm²) of the NCM active material used in the electrode.
"""

In [67]:
# Sub-question 3: morphology and structure (SEM/TEM and structural analysis).
# NOTE: `sample_names` is a list, so it is interpolated as its Python repr.
question_c3 = f"""
Extract the following information for {sample_names}:
    - Particle size information from SEM or TEM images, including both secondary and primary particles if available.
    - Descriptions of the particle shape observed in SEM or TEM data.
    - Observations on particle distribution or uniformity reported in the SEM or TEM analysis.
    - If a coating was applied, the reported properties and thickness of the coating layer as observed in SEM or TEM.
    - The crystal structure and lattice characteristics (e.g., crystal plane spacing, presence of layered structure) from structural analysis.
"""

In [68]:
# Sub-question 4: electrochemical test conditions and discharge capacity per C-rate.
# NOTE: `sample_names` is a list, so it is interpolated as its Python repr.
question_c4 = f"""
Extract the following information for {sample_names}:
    - The voltage range and temperature used in electrochemical tests
    - The specific discharge capacities reported at various C-rates:
        - 0.1C
        - 0.2C
        - 0.5C
        - 1.0C
        - 2.0C
        - Any additional C-rate values and performance data reported
"""

In [69]:
# Collect the four sub-questions in order (c1..c4).
sub_questions = [question_c1, question_c2, question_c3, question_c4]

In [70]:
# Display the rendered sub-question strings (full text of all four prompts).
sub_questions

["\nExtract the following information for ['NR0', 'NR1', 'NR3', 'NR5']:\n    - The exact chemical composition (stoichiometry) of the cathode active material (CAM) used in the study.\n    - The type of commercially used NCM (e.g., LiNixCoyMnzO2), including code names or abbreviations (e.g., NCM811, N-NCM, SC-NCM) if applicable.\n    - The lithium source used for synthesizing the NCM.\n    - The synthesis method used for preparing the NCM samples (e.g., co-precipitation, sol-gel), including intermediate steps if reported.\n    - The crystallization method used (e.g., solid-state sintering, hydrothermal), and the specific final temperatures and durations used in the process.\n    - Whether any doping technique was applied to the CAM, and if so, which element(s) were doped.\n    - Whether any coating layer was applied to the CAM, and if so, what material was used for coating.\n",
 "\nExtract the following information for ['NR0', 'NR1', 'NR3', 'NR5']:\n    - The electrode composition, inclu

In [71]:
from retriever.retriever_handler import get_retriever
from langchain.tools import Tool
from langgraph.prebuilt.tool_executor import ToolExecutor, ToolInvocation
import json
from langchain_core.messages import FunctionMessage

## Retriever setup
# NOTE: this rebuilds the retriever with the same arguments as the earlier
# SampleNameSearcher cell and rebinds the module-level name `retriever`.
retriever = get_retriever(
    file_folder="./data/raw",
    file_number=11,
    chunk_size=500,
    chunk_overlap=100,
    search_k=10
)

# Expose the retriever as a single-input tool. `invoke` is the Runnable entry
# point (the retriever is already used as a Runnable when it is piped into
# `format_docs`); it replaces the deprecated `get_relevant_documents` method.
retriever_tool = Tool(
    name="retriever",
    func=retriever.invoke,
    description="Retrieve relevant documents based on a query."
)

## Tool registry and executor
tools = [retriever_tool]
tool_executor = ToolExecutor(tools)

def tool_node(state):
    """Execute the tool requested by the most recent message in the graph state.

    Reads the ``function_call`` entry from the last message's
    ``additional_kwargs``, runs the named tool through ``tool_executor`` and
    wraps the result in a ``FunctionMessage``.

    Args:
        state: Mapping with a ``"messages"`` sequence. The last message must
            carry ``additional_kwargs["function_call"]`` with ``"name"`` and a
            JSON-encoded ``"arguments"`` string.

    Returns:
        ``{"messages": [function_message]}`` - a one-element list meant to be
        appended to the existing message list.
    """
    history = state["messages"]
    opening_msg, latest_msg = history[0], history[-1]

    # Decode the requested call: arguments first, then the tool name.
    call_spec = latest_msg.additional_kwargs["function_call"]
    payload = json.loads(call_spec["arguments"])
    tool_name = call_spec["name"]

    # For the retriever, prefix the query with the content of the first message.
    if tool_name == "retriever":
        payload["__arg1"] = (
            f"Context: {opening_msg.content} | Query: {payload.get('__arg1', '')}"
        )

    # A lone "__arg1" argument is handed to the tool as a bare value.
    if len(payload) == 1 and "__arg1" in payload:
        payload = next(iter(payload.values()))

    action = ToolInvocation(tool=tool_name, tool_input=payload)

    # Run the tool and wrap its response.
    tool_result = tool_executor.invoke(action)
    reply = FunctionMessage(
        content=f"{tool_name} response: {str(tool_result)}",
        name=action.tool,
    )
    return {"messages": [reply]}

In [72]:
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain_core.messages import BaseMessage, HumanMessage
from langchain_openai import ChatOpenAI

def create_agent(llm: ChatOpenAI, tools: list, system_prompt: str):
    """Build an ``AgentExecutor`` around an OpenAI-tools agent.

    Args:
        llm: Chat model that drives the agent.
        tools: Tools the agent may call; also given to the executor.
        system_prompt: Text used as the system message template of the prompt.

    Returns:
        An ``AgentExecutor`` wrapping the agent. Its prompt consists of the
        system message followed by the ``messages`` and ``agent_scratchpad``
        placeholders.
    """
    message_templates = [
        ("system", system_prompt),
        MessagesPlaceholder(variable_name="messages"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
    agent_prompt = ChatPromptTemplate.from_messages(message_templates)

    return AgentExecutor(
        agent=create_openai_tools_agent(llm, tools, agent_prompt),
        tools=tools,
    )

def agent_node(state, agent, name):
    """Run ``agent`` on ``state`` and return its output as a named message.

    Args:
        state: Graph state passed unchanged to ``agent.invoke``.
        agent: Object whose ``invoke(state)`` returns a mapping with an
            ``"output"`` entry.
        name: Name attached to the resulting ``HumanMessage``.

    Returns:
        ``{"messages": [HumanMessage]}`` holding the agent output.
    """
    answer_text = agent.invoke(state)["output"]
    reply = HumanMessage(content=answer_text, name=name)
    return {"messages": [reply]}

In [74]:
from langchain_openai import ChatOpenAI
# Chat model used by the supervisor and researcher agents. This is a separate
# instance from `model` in the SampleNameSearcher cell (which sets
# temperature=0.1); no temperature is specified here.
llm = ChatOpenAI(model="gpt-4o")

In [83]:
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# Worker nodes the supervisor can hand off to.
members = ["Researcher1", "Researcher2", "Researcher3", "Researcher4"]

# Every routing decision the supervisor may return: stop, or pick a worker.
options = ["FINISH", *members]

# Supervisor instructions; {members} is filled in through .partial() below.
system_prompt = (
    "You are a supervisor tasked with managing a conversation between the"
    " following workers:  {members}. Given the following user request,"
    " respond with the worker to act next. Each worker will perform a"
    " task and respond with their results and status. When finished,"
    " respond with FINISH."
)

# Function schema the model is forced to call; "next" must be one of `options`.
function_def = dict(
    name="route",
    description="Select the next role.",
    parameters=dict(
        title="routeSchema",
        type="object",
        properties=dict(
            next=dict(title="Next", anyOf=[dict(enum=options)]),
        ),
        required=["next"],
    ),
)

# Prompt layout: instructions, the conversation so far, then the routing question.
# NOTE: this rebinds the module-level name `prompt`.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder(variable_name="messages"),
        (
            "system",
            "Given the conversation above, who should act next?"
            " Or should we FINISH? Select one of: {options}",
        ),
    ]
).partial(members=", ".join(members), options=str(options))

# prompt -> model bound to the "route" function -> parsed function arguments.
supervisor_chain = (
    prompt
    | llm.bind_functions(function_call="route", functions=[function_def])
    | JsonOutputFunctionsParser()
)

In [84]:
import operator
from typing import Annotated, Any, Dict, List, Optional, Sequence, TypedDict
import functools
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langgraph.graph import StateGraph, END

class AgentState(TypedDict):
    """State dictionary passed between the nodes of the graph."""

    # Conversation history. Annotated with `operator.add`; presumably LangGraph
    # uses it to concatenate the lists returned by nodes onto the existing
    # history (nodes here return one-element lists) - TODO confirm.
    messages: Annotated[Sequence[BaseMessage], operator.add]
    # Routing decision; read by the conditional edge leaving "supervisor".
    next: str

# System prompt shared by all four researcher agents.
researcher_system_prompt = """You are a Researcher Agent, responsible for extracting structured and detailed information from scientific papers, specifically in the field of lithium-ion battery materials. Your task is to answer a specific sub-question assigned by the Supervisor by retrieving information from the given paper.
  ## Your Responsibilities:
    1. Extract information strictly from the paper without making assumptions or hallucinations.
    2. Write the response as a single, coherent technical paragraph (not bullet points or lists).
    3. Use precise scientific language, such as:
        - "was synthesized by..."
        - "was reported to exhibit..."
        - "was not mentioned in the paper..."
    4. If a detail is not reported in the paper, explicitly state it (e.g., “The composition data was not mentioned.”).
    5. Maintain logical flow and consistency in your paragraph.

  If your response contains incorrect, missing, or unclear information, you must revise and resubmit it."""

# One agent executor per researcher (same model, tools and prompt for each).
researcher1_agent = create_agent(llm, tools, researcher_system_prompt)
researcher2_agent = create_agent(llm, tools, researcher_system_prompt)
researcher3_agent = create_agent(llm, tools, researcher_system_prompt)
researcher4_agent = create_agent(llm, tools, researcher_system_prompt)

# Graph node callables: `agent_node` with the agent and its display name bound.
researcher1_node = functools.partial(agent_node, agent=researcher1_agent, name="Researcher1")
researcher2_node = functools.partial(agent_node, agent=researcher2_agent, name="Researcher2")
researcher3_node = functools.partial(agent_node, agent=researcher3_agent, name="Researcher3")
researcher4_node = functools.partial(agent_node, agent=researcher4_agent, name="Researcher4")

# Register the worker nodes and the supervisor, in that order.
workflow = StateGraph(AgentState)
for node_name, node_fn in {
    "Researcher1": researcher1_node,
    "Researcher2": researcher2_node,
    "Researcher3": researcher3_node,
    "Researcher4": researcher4_node,
    "supervisor": supervisor_chain,
}.items():
    workflow.add_node(node_name, node_fn)

# Every worker reports back to the supervisor when it is done.
for member in members:
    workflow.add_edge(member, "supervisor")

# The supervisor's "next" value selects a worker by name, or ends on FINISH.
conditional_map = {**dict(zip(members, members)), "FINISH": END}

workflow.add_conditional_edges("supervisor", lambda state: state["next"], conditional_map)
workflow.set_entry_point("supervisor")

graph = workflow.compile()

In [None]:
# Print the compiled graph as Mermaid source text.
# Alternative kept for reference: render the diagram to a PNG file instead.
# graph.get_graph().draw_mermaid_png(output_file_path="./multiagentragtext_graph.png")
print(graph.get_graph().draw_mermaid())

%%{init: {'flowchart': {'curve': 'linear'}}}%%
graph TD;
	__start__([<p>__start__</p>]):::first
	Researcher1(Researcher1)
	Researcher2(Researcher2)
	Researcher3(Researcher3)
	Researcher4(Researcher4)
	supervisor(supervisor)
	__end__([<p>__end__</p>]):::last
	Researcher1 --> supervisor;
	Researcher2 --> supervisor;
	Researcher3 --> supervisor;
	Researcher4 --> supervisor;
	__start__ --> supervisor;
	supervisor -.-> Researcher1;
	supervisor -.-> Researcher2;
	supervisor -.-> Researcher3;
	supervisor -.-> Researcher4;
	supervisor -. &nbsp;FINISH&nbsp; .-> __end__;
	classDef default fill:#f2f0ff,line-height:1.2
	classDef first fill-opacity:0
	classDef last fill:#bfb6fc



어쨌든 평가해야 할 요소는 RAG의 결과, 즉 데이터를 정확히 추출했는지 여부이므로,
굳이 문장으로 뽑지 않고 데이터를 CSV 형태로 구축한 후 각 열별로 평가하는 것이 좋지 않을까?
text를 값으로 받는 열의 경우 