In [1]:
import os
from langchain_core.vectorstores.base import VectorStoreRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS


from dotenv import load_dotenv
from langchain_core.runnables import RunnableConfig
from langchain_teddynote.messages import random_uuid
import pprint
import argparse

from utils import load_question

from graph import DataExtractor
import json


In [2]:
# Load environment variables from the local .env file.
load_dotenv(dotenv_path=".env")

# Read the API keys from the environment (None when a key is not set).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
LANGCHAIN_API_KEY = os.environ.get("LANGCHAIN_API_KEY")

# Enable LangSmith tracing (optional) and set the project name.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_PROJECT": "Retrieval agent",
    }
)

In [6]:
!pip install frontend

Collecting frontend
  Downloading frontend-0.0.3-py3-none-any.whl.metadata (847 bytes)
Collecting starlette>=0.12.0 (from frontend)
  Downloading starlette-0.45.2-py3-none-any.whl.metadata (6.3 kB)
Collecting uvicorn>=0.7.1 (from frontend)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting itsdangerous>=1.1.0 (from frontend)
  Using cached itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting aiofiles (from frontend)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Downloading frontend-0.0.3-py3-none-any.whl (32 kB)
Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Downloading starlette-0.45.2-py3-none-any.whl (71 kB)
Downloading uvicorn-0.34.0-py3-none-any.whl (62 kB)
Downloading aiofiles-24.1.0-py3-none-any.whl (15 kB)
Installing collected packages: uvicorn, itsdangerous, aiofiles, starlette, frontend
Successfully installed aiofiles-24.1.0 frontend-0.0.3 itsdangerous-2.2.0 starlette-0.45.2 uvicorn-0.34.0


In [9]:
import uuid
from langchain.storage import InMemoryStore
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_community.document_loaders import PyPDFLoader



# Splitter built via from_tiktoken_encoder; it splits on paragraph breaks only.
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    separators=["\n\n"],  # separator used when splitting text (paragraphs)
    chunk_overlap=100,    # overlap length between neighbouring chunks
    chunk_size=500,       # maximum chunk length
)

# Load the PDF file and split it into chunks with the splitter above.
PDF_PATH = "data/input_data/paper_011.pdf"
loader = PyPDFLoader(PDF_PATH)
docs = loader.load_and_split(text_splitter=splitter)

# Create the embedding function and index the chunks in a FAISS vector store.
embeddings = OpenAIEmbeddings()
vector_store = FAISS.from_documents(
    embedding=embeddings,  # embedding function to use
    documents=docs,        # documents to add to the vector store
)

In [10]:
# Metadata key under which each split chunk records the ID of its source document.
id_key = "doc_id"

# Storage layer for the parent documents.
store = InMemoryStore()

In [11]:
# Retriever that ties the vector index to the parent-document store.
# Note: `vector_store` is not empty at this point — it already holds the chunks
# indexed when it was built with FAISS.from_documents; only `store` starts empty.
retriever = MultiVectorRetriever(
    id_key=id_key,
    byte_store=store,
    vectorstore=vector_store,
)

In [12]:
# One random UUID string per entry of `docs`, in the same order.
doc_ids = [str(uuid.uuid4()) for _ in range(len(docs))]

In [13]:
# Display the generated document IDs.
doc_ids

['28ac5517-de5d-4ca2-b354-c94451c104f4',
 'ea9b3fff-6cac-4f5f-b41f-116dc4e59b85',
 'fccb22ac-24fa-4ae5-a7ac-311a12cb1e8d',
 '7974f0cf-2a15-4792-9596-102216237238',
 '3e55bfaa-1b78-47ab-8154-dc6435b87e25',
 'f33ded5e-8e16-40d4-9b42-fe1a68ca6203',
 '220eaa6a-7776-4b17-b260-e2add13e60ab']

In [14]:
# Splitter used to create the smaller (child) chunks.
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=200)

# Splitter used to create the larger (parent) chunks.
parent_text_splitter = RecursiveCharacterTextSplitter(chunk_size=600)

In [15]:
# Split every loaded document with the parent splitter and tag each resulting
# piece with the ID of the document it came from.
parent_docs = []

for position, source_doc in enumerate(docs):
    # ID generated for the document at this position.
    source_id = doc_ids[position]
    # Parent-sized pieces of the current document.
    parent_pieces = parent_text_splitter.split_documents([source_doc])

    for piece in parent_pieces:
        # Record the source document ID in the piece's metadata.
        piece.metadata[id_key] = source_id
    parent_docs += parent_pieces

In [16]:
# Inspect the metadata of the first parent chunk; it should include the `doc_id` key.
parent_docs[0].metadata

{'source': 'data/input_data/paper_011.pdf',
 'page': 0,
 'doc_id': '28ac5517-de5d-4ca2-b354-c94451c104f4'}

In [17]:
# Split every loaded document with the child splitter and tag each resulting
# piece with the ID of the document it came from.
child_docs = []

for position, source_doc in enumerate(docs):
    # ID generated for the document at this position.
    source_id = doc_ids[position]
    # Child-sized pieces of the current document.
    child_pieces = child_text_splitter.split_documents([source_doc])

    for piece in child_pieces:
        # Record the source document ID in the piece's metadata.
        piece.metadata[id_key] = source_id
    child_docs += child_pieces

In [18]:
# Report how many parent and child chunks were produced, one count per line.
print(
    f"분할된 parent_docs의 개수: {len(parent_docs)}",
    f"분할된 child_docs의 개수: {len(child_docs)}",
    sep="\n",
)

분할된 parent_docs의 개수: 91
분할된 child_docs의 개수: 471


In [19]:
# 벡터 저장소에 parent + child 문서를 추가
retriever.vectorstore.add_documents(parent_docs)
retriever.vectorstore.add_documents(child_docs)

# docstore 에 원본 문서를 저장
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [30]:
# Run a similarity search directly against the retriever's vector store.
# NOTE(review): the query literal appears to contain an invisible soft hyphen
# just before "Li0.98..." — confirm whether that is intentional.
stoichiometry_query = "Stoichiometry information : ­Li0.98Ni0.6Mn0.2Co0.2O2"
relevant_chunks = retriever.vectorstore.similarity_search(stoichiometry_query)
print(f"검색된 문서의 개수: {len(relevant_chunks)}")

검색된 문서의 개수: 4


In [31]:
# Print each retrieved chunk followed by a 100-character separator line,
# with a blank line after the text and after the separator.
chunk_separator = ">" * 100
for chunk in relevant_chunks:
    print(chunk.page_content, chunk_separator, sep="\n\n", end="\n\n")

reaction is given below:
LiNO3 þ 0.33 Ni(NO3)2.6H2O þ 0.33 Co(NO3)2.6H2O þ 0.33
Mn(NO3)2.4H2O þ 2.33 CO(NH2)2 ¼ LiNi0.33Co0.33Mn0.33O2 þ 3.82
N2 þ 10H2O þ2.33 CO2

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

Mn(NO3)2.4H2O þ 2.33 CO(NH2)2 ¼ LiNi0.33Co0.33Mn0.33O2 þ 3.82
N2 þ 10H2O þ2.33 CO2
Different fractions of RGO (1, 3 and 5 wt %) were used to fabri-

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

synthesis of Li1.2Ni0.13Co0.13Mn0.54O2 cathode materials with superior elec-
trochemical performance for lithium-ion batteries, RSC Adv. 6 (2016)
79050e79057.

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

The cathode material with a chemical formula of LiNi1/3Co1/
3Mn1/3O2 was synthesized by the solution combustion followed by
the calcination at 850/C14 C for 15 h, as mentioned above. The XRD

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>