In [None]:
%pip install --upgrade --quiet langchain-pinecone langchain-openai langchain 

In [1]:
# Split the source document into chunks with RecursiveCharacterTextSplitter.
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

CHUNK_SIZE = 1500
# Overlap: neighbouring chunks share some text so that context is not lost
# at chunk boundaries.
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Load the .docx file and split it in one step; the result is the list of
# chunked documents used by the cells below.
loader = Docx2txtLoader('./tax2.docx')
document_list = loader.load_and_split(text_splitter)

In [2]:
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings

# Load settings from a .env file before any client is created
# (presumably this is where the OpenAI / Pinecone API keys come from - verify).
load_dotenv()

# Embedding model shared by the vector store for both indexing and querying.
embedding_model_name = 'text-embedding-3-large'
embedding = OpenAIEmbeddings(model=embedding_model_name)

In [None]:
# Spot-check a single chunk to eyeball the splitter output.
preview_index = 52
document_list[preview_index]

In [None]:
import os

from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore

index_name = 'tax-index'

# Fail fast with a readable message instead of passing api_key=None downstream.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to .env or export it before running this cell."
    )

# NOTE(review): `pc` is not handed to the vector store below, which is built
# from `index_name` alone; it is kept so any later cell that uses the client
# still works.
pc = Pinecone(api_key=pinecone_api_key)

# Create the vector store handle first; documents are uploaded afterwards.
database = PineconeVectorStore(index_name=index_name, embedding=embedding)

# Upload the chunks in batches of 100.
# Every chunk gets a deterministic ID derived from its position in
# `document_list`, so re-running this cell overwrites the same records instead
# of inserting a second copy of every chunk (without explicit IDs each run
# adds new records and similarity search starts returning duplicates).
id_prefix = 'tax2-chunk'
batch_size = 100
for i in range(0, len(document_list), batch_size):
    batch = document_list[i:i + batch_size]
    batch_ids = [f"{id_prefix}-{i + offset}" for offset in range(len(batch))]
    database.add_documents(batch, ids=batch_ids)

In [6]:
# Sample question (Korean): "How much income tax does an office worker with an
# annual salary of 50 million won pay?"
query = '연봉 5천만원인 직장인의 소득세는 얼마인가요?'

# Fetch the three stored chunks most similar to the question.
top_k = 3
retrieved_docs = database.similarity_search(query, k=top_k)


In [None]:
from langchain import hub
from langchain_openai import ChatOpenAI

# Chat model that generates the final answer.
llm = ChatOpenAI(model='gpt-4o')

# Pull the prompt published under "rlm/rag-prompt" from the hub.
prompt = hub.pull("rlm/rag-prompt")


In [8]:
from langchain.chains import RetrievalQA

# Wire the pieces together: the vector store acts as the retriever, and the
# pulled prompt is forwarded to the underlying chain via chain_type_kwargs.
prompt_kwargs = {"prompt": prompt}
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=database.as_retriever(),
    chain_type_kwargs=prompt_kwargs,
)


In [None]:
# Run the chain on the question held in `query`; the chain reads it from the
# "query" key of its input mapping.
qa_inputs = {"query": query}
ai_message = qa_chain.invoke(qa_inputs)

# Bare last expression so the notebook displays the chain's result.
ai_message