## 1. Sample Name을 추출하는 Rag LLM 만들기

In [1]:
import os
from dotenv import load_dotenv
from pprint import pprint

from langchain_teddynote import logging

import sys
sys.path.append("../../")

from graph_relevancerag import RelevanceRAG
from graph_ensemblerag import EnsembleRAG
from graph_multiagentrag import MultiAgentRAG
from utils import *

# .env 파일 로드
load_dotenv(dotenv_path=".env")

# API 키 가져오기
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")

# LangSmith 추적 기능을 활성화합니다. (선택적)
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "Multi-agent Collaboration"

# 프로젝트 이름을 입력합니다.
logging.langsmith("RAG-Experiment")

LangSmith 추적을 시작합니다.
[프로젝트명]
RAG-Experiment


##### system prompt 수정

In [5]:
system_prompt = load_system_prompt(config_folder="../../config", category_number=1, rag_method="relevance-rag")

## ../../config/relevance-rag/c1-system-prompt.yaml를 불러왔습니다.
## ../../config/relevance-rag/c1-question.yaml를 불러왔습니다.


In [6]:
pprint(system_prompt)

{'llm_answer_system_prompt': 'You are an expert assistant specializing in '
                             'extracting information from research papers '
                             'related to battery technology. Your role is to '
                             'carefully analyze the provided PDF and extract '
                             'key data in a structured JSON format. Follow '
                             'these instructions strictly:\n'
                             '\n'
                             '1. **Domain-Specific Focus**:\n'
                             '  - Focus exclusively on content related to '
                             'battery technology (e.g., materials, synthesis '
                             'methods, properties, performance metrics).\n'
                             '  - Ignore irrelevant sections or general '
                             'references outside the battery-related content.\n'
                             '\n'
                             '2. *

##### Sample Name Retriever

In [None]:
# ## crew.py
# from tools import embedding_file
# from langchain_openai import ChatOpenAI
# from langchain_core.documents.base import Document
# from langchain.prompts import PromptTemplate
# from langchain_core.output_parsers import CommaSeparatedListOutputParser
# from langchain_core.runnables import RunnablePassthrough

# ## retriever 호출
# retriever = embedding_file(
#     file_folder="../../data/input_data", 
#     file_name="paper_022", 
#     rag_method="relevance-rag", 
#     chunk_size=1000, 
#     chunk_overlap=100, 
#     search_k=10
# )

# ## Sample Name Retriever LLM Prompt
# sample_name_retriever_prompt = """
#   You are an expert assistant specializing in extracting information from research papers related to battery technology. Your role is to carefully analyze the provided document.

#   Document:
#   {context}

#   Question:
#   {question}

#   Answer:
#   """
  
# sample_name_retriever_question = """Use all of the NCM cathode sample names (e.g., 'NCM-622', 'pristine NCM', 'M-NCM') provided in the electrochemical performance section. You just output sample names."""

# llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

# # 문서에서 검색하여 관련성 있는 문서를 찾습니다.
# retrieved_docs = retriever.invoke(sample_name_retriever_question)

# def format_docs(docs: list[Document]) -> str:
#     """문시 리스트에서 텍스트를 추출하여 하나의 문자로 합치는 기능을 합니다.

#     Args:
#         docs (list[Document]): 여러 개의 Documnet 객체로 이루어진 리스트

#     Returns:
#         str: 모든 문서의 텍스트가 하나로 합쳐진 문자열을 반환
#     """
#     return "\n\n".join(doc.page_content for doc in docs)


# # 검색된 문서를 형식화합니다.(프롬프트 입력으로 넣어주기 위함)
# retrieved_docs = format_docs(retrieved_docs)

# output_parser = CommaSeparatedListOutputParser()
# format_instructions = output_parser.get_format_instructions()

# # prompt 설정
# prompt = PromptTemplate(
#     template=sample_name_retriever_prompt,
#     input_variables=["context", "question"],
#     partial_variables={"format_instructions": format_instructions},
#     )

# # 체인 호출
# chain = prompt | llm | output_parser 
# response = chain.invoke(
#     {
#         "question": sample_name_retriever_question,
#         "context": retrieved_docs,
#     }
# )

##       paper_022 retriever를 생성했습니다.
##          - chunk_size    :1000
##          - chunk_overlap :100
##          - retrieve_k    :10


In [15]:
from crew import Crew
sample_name_searcher_chain = Crew(file_folder="../../data/input_data", file_number=22).sample_name_searcher()
sample_names = sample_name_searcher_chain.invoke("""Use all of the NCM cathode sample names (e.g., 'NCM-622', 'pristine NCM', 'M-NCM') provided in the electrochemical performance section. You just output sample names. Do Not output like '- NCM622' , just output 'NCM622. """)

##       paper_022 retriever를 생성했습니다.
##          - chunk_size    :1000
##          - chunk_overlap :100
##          - retrieve_k    :10


In [39]:
print(f"## Sample Names: {sample_names}")

## Sample Names: ['pristine', 'V-0.005', 'V-0.01', 'V-0.02']


##### invoke_input 수정

In [97]:

def load_invoke_input(config_folder:str="./config", category_number:int=1, rag_method:str="multiagent-rag", sample_names:list=None) -> Union[tuple, dict]:  
    """
    질문 파일을 불러오고, 주어진 RAG 방식에 따라 적절한 입력 형식을 반환합니다.

    Args:
        config_folder (str, optional): 설정 파일이 저장된 폴더 경로. 기본값은 "./config".
        category_number (int, optional): 불러올 질문 파일의 카테고리 번호. 기본값은 1.
        rag_method (str, optional): RAG 방식 (예: "multiagent-rag", "relevance-rag", "ensemble-rag"). 기본값은 "multiagent-rag".

    Raises:
        KeyError: 지원되지 않는 RAG 방식이 입력된 경우 예외 발생.

    Returns:
        Union[tuple, dict]: RAG 방식에 따라 적절히 구성된 입력 데이터.
    """     
    category_names = ["CAM (Cathode Active Material)", "Electrode (half-cell)", "Morphological Properties", "Cathode Performance"]

    question_file_name = f"c{category_number}-question.yaml"
    question_path = f"{config_folder}/{rag_method}/{question_file_name}"
    with open(question_path, 'r', encoding="utf-8") as file:
        question = yaml.safe_load(file)
    print(f"## {question_path}를 불러왔습니다.")
    
    example_file_name = f"c{category_number}-example.json"
    example_path = f"{config_folder}/{rag_method}/{example_file_name}"
    with open(example_path, 'r', encoding="utf-8") as file:
        json_example = json.load(file)
    print(f"## {example_path}를 불러왔습니다.")
    
    if rag_method == "multiagent-rag": 
        invoke_input = (
            {"messages": [HumanMessage(content=question["question"], name="Researcher")]}, 
            {"recursion_limit": 30}
        )
    
    elif rag_method == "relevance-rag" or rag_method == "ensemble-rag":
        for i, sample_name in enumerate(sample_names):
            if category_number == 1:
                question["template"][category_names[category_number-1]]["Stoichiometry information"][sample_name] = {}
                question["template"][category_names[category_number-1]]["Commercial NCM used"][sample_name] = {}
            elif category_number == 3:
                temp_template = question["template"][category_names[category_number-1]]
                for k in temp_template.keys():
                    question["template"][category_names[category_number-1]][k][sample_name] = None
            elif category_number == 4:
            #     question["template"][category_names[category_number-1]][""]                   
                temp_performance = question["template"]["Cathode Performance"][""]
                question["template"]["Cathode Performance"].update({sample_name:temp_performance})
                if i == len(sample_names)-1:
                    del question["template"]["Cathode Performance"][""]
                 
        config = RunnableConfig(
            recursion_limit=30, 
            configurable={"thread_id": random_uuid()}
            )
        
        invoke_input = {
            "input": {
                "question":f"{question['question_text']}  \n {question['template']}", 
                "example":json_example
            }, 
            "config": config
        }
        print(invoke_input)
    else: 
        raise KeyError(f"Unsupported rag_method: {rag_method}. Please use one of ['multiagent-rag', 'relevance-rag', 'ensemble-rag'].")
    
    return question


In [98]:
question = load_invoke_input(config_folder="../../config", category_number=4, rag_method="relevance-rag", sample_names=sample_names)

## ../../config/relevance-rag/c4-question.yaml를 불러왔습니다.
## ../../config/relevance-rag/c4-example.json를 불러왔습니다.
{'input': {'question': "Fill out in the `null`, `None`, `[]` and `{}` values based on the example format:\n  \n {'Cathode Performance': {'pristine': [{'Voltage range': None, 'Temperature': None, 'C-rate and Specific capacity': [{'C-rate': '0.1', 'Capacity': None}, {'C-rate': '0.2', 'Capacity': None}, {'C-rate': '0.5', 'Capacity': None}, {'C-rate': '1.0', 'Capacity': None}, {'C-rate': '2.0', 'Capacity': None}, {'C-rate': '4.0', 'Capacity': None}, {'Other C-rates and performance': [{'C-rate': None, 'Capacity': None}]}]}], 'V-0.005': [{'Voltage range': None, 'Temperature': None, 'C-rate and Specific capacity': [{'C-rate': '0.1', 'Capacity': None}, {'C-rate': '0.2', 'Capacity': None}, {'C-rate': '0.5', 'Capacity': None}, {'C-rate': '1.0', 'Capacity': None}, {'C-rate': '2.0', 'Capacity': None}, {'C-rate': '4.0', 'Capacity': None}, {'Other C-rates and performance': [{'C-rate': None,

In [99]:
question

{'question_text': 'Fill out in the `null`, `None`, `[]` and `{}` values based on the example format:\n',
 'template': {'Cathode Performance': {'pristine': [{'Voltage range': None,
     'Temperature': None,
     'C-rate and Specific capacity': [{'C-rate': '0.1', 'Capacity': None},
      {'C-rate': '0.2', 'Capacity': None},
      {'C-rate': '0.5', 'Capacity': None},
      {'C-rate': '1.0', 'Capacity': None},
      {'C-rate': '2.0', 'Capacity': None},
      {'C-rate': '4.0', 'Capacity': None},
      {'Other C-rates and performance': [{'C-rate': None,
         'Capacity': None}]}]}],
   'V-0.005': [{'Voltage range': None,
     'Temperature': None,
     'C-rate and Specific capacity': [{'C-rate': '0.1', 'Capacity': None},
      {'C-rate': '0.2', 'Capacity': None},
      {'C-rate': '0.5', 'Capacity': None},
      {'C-rate': '1.0', 'Capacity': None},
      {'C-rate': '2.0', 'Capacity': None},
      {'C-rate': '4.0', 'Capacity': None},
      {'Other C-rates and performance': [{'C-rate': None,


In [None]:
def get_rag_instance(
    rag_method, 
    file_folder, 
    file_number, 
    # db_folder,
    chunk_size, 
    chunk_overlap,
    search_k,
    system_prompt, 
    model_name, 
    save_graph_png
):
    """
    RAG 클래스를 동적으로 받아서 인스턴스를 생성하는 함수
    
    Params:
        rag_method: RAG 방법 ("relevance-rag", "ensemble-rag", "multiagent-rag")
        file_folder: 논문 파일이 위치한 폴더 경로
        file_number: 처리할 논문 번호
        system_prompt: system prompt
        model_name: LLM 모델 명 ("gpt-4o", "gpt-4o-mini")
        save_graph_png: graph 저장 결정
        
    Return:
        생성된 RAG 모델 인스턴스
    """
    
    # RAG 모델 인스턴스 생성
    if rag_method == "relevance-rag":
        return RelevanceRAG(file_folder, file_number, chunk_size, chunk_overlap, search_k, system_prompt, model_name, save_graph_png)
        
    elif rag_method == "ensemble-rag":
        return EnsembleRAG(file_folder, file_number, chunk_size, chunk_overlap, search_k, system_prompt, model_name, save_graph_png)
        
    elif rag_method == "multiagent-rag":
        return MultiAgentRAG(file_folder, file_number, chunk_size, chunk_overlap, search_k, system_prompt, model_name, save_graph_png)


def main(
    data_folder:str="./data",
    file_num_list:list=[11],
    category_number:int=1, 
    chunk_size:int=500, 
    chunk_overlap:int=100, 
    search_k:int=10,       
    config_folder:str="./config",
    rag_method:str="multiagent-rag", 
    model_name:str="gpt-4o", 
    save_graph_png:bool=False, 
):
    category_names = ["CAM (Cathode Active Material)", "Electrode (half-cell)", "Morphological Properties", "Cathode Performance"]
    
    ## system_prompt 와 invoke_input 불러오기
    system_prompt = load_system_prompt(config_folder=config_folder, category_number=category_number, rag_method=rag_method)
    invoke_input = load_invoke_input(config_folder=config_folder, category_number=category_number, rag_method=rag_method)
    
    ## 각 논문에 대해 반복
    for i, file_number in enumerate(file_num_list):
        print(f"#####    {file_number}번째 논문    #####")
        print(f"##       rag method     : {rag_method}")
        print(f"##       category name  : {category_names[category_number-1]}")
        
        ## graph 호출
        voltai_graph = get_rag_instance(
            rag_method=rag_method, 
            file_folder=f"{data_folder}/input_data/", 
            file_number=file_number, 
            chunk_size=chunk_size, 
            chunk_overlap=chunk_overlap, 
            search_k=search_k, 
            system_prompt=system_prompt,
            model_name=model_name, 
            save_graph_png=save_graph_png,
        ).graph
        
        ## 질문이 딕셔너리 형태일 경우와 아닌 경우를 처리
        if isinstance(invoke_input, dict):
            result = voltai_graph.invoke(**invoke_input)
        else:
            result = voltai_graph.invoke(*invoke_input)

        ## RAG method에 따른 결과 확인
        if result.get("answer"):
            temp_answer = result["answer"][0][category_names[category_number-1]]
        elif result.get("discussion"):
            temp_answer = result["discussion"][category_names[category_number-1]]
        elif result.get("messages"):
            temp_answer = result["messages"][-1][category_names[category_number-1]]
        
        print(f"##       print {file_number} result")
        print("------------------------------------")
        pprint(temp_answer, sort_dicts=False)
        
        return temp_answer

In [110]:
retriever = embedding_file(
    file_folder="../../data/input_data", 
    file_name="paper_022", 
    rag_method="relevance-rag", 
    # db_folder=db_folder
    chunk_size=500, 
    chunk_overlap=100, 
    search_k=10
)

##       paper_022 retriever를 생성했습니다.
##          - chunk_size    :500
##          - chunk_overlap :100
##          - retrieve_k    :10


In [116]:
import json
import pandas as pd

In [None]:
rag_method = "multiagent-rag"
file_num_list = [42]
output_folder_path = []
for file_num in file_num_list:
    json_file_num = f"00{file_num}"[-3:]
    output_folder_path.append(f"../../output/json/{rag_method}/paper_{json_file_num}_output")

In [131]:
import yaml
with open("../../config/relevance-rag/c1-question.yaml", 'r', encoding="utf-8") as file:
    question = yaml.safe_load(file)


In [None]:
json.loads(question)

AttributeError: 'dict' object has no attribute 'read'