## law_1.docx, law_2.docx
- pinecone store 저장
    - index name: 임의 
- RetrievalQA 구현
    - prompt: rlm/rag-prompt
    - 질문: 전세사기    
    

In [2]:
import os

from dotenv import load_dotenv
from langchain import hub
from langchain_community.document_loaders import Docx2txtLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Pinecone


## 환경변수 읽어오기
load_dotenv()
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')


## 문서 읽고 분할

## 문서 파일 목록
doc_paths = ['law_1.docx', 'law_2.docx']

## 본서 로드(읽어오기)
documents = []


for path in doc_paths:
    loader = Docx2txtLoader(path)
    documents.append(loader.load())

    documents



## 청크 분할 설정

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
)

## 문서 분할
document_list = text_splitter.split_documents(documents)
len(documents)

## 임베딩 -> 벡터 스토어(데이터베이스)에 저장
## 임베딩 모델 지정
embedding = OpenAIEmbeddings(model='text-embedding-3-large')
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = 'law'

## 파인콘: 저장
database =  PineconeVectorStore.from_documents(
    documents=document_list,
    embedding=embedding,
    index_name=index_name,
)

## 저장된 인덱스 가져오기
## [방법 1]
# database = PineconeVectorStore(
#     index=pc.Index(index_name),
#     embedding=embedding,
# )

## [방법 2]
# database = PineconeVectorStore.from_existing_index(
#     index_name=index_name,
#     embedding=embedding,
# )



## RetrievalQA
llm = ChatOpenAI(model='gpt-4o')
prompt = hub.pull('rlm/rag-prompt')

def format_docs(docs):
    return '\n\n'.join(doc.page_content for doc in docs)

qa_chain = (
    {
        'context': database.as_retriever() | format_docs,
        'question': RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

# qa_chain.invoke('전세사기피해자 대상을 알려주세요.')
qa_chain.invoke('전세사기피해자 임대인을 알려주세요.')

AttributeError: 'list' object has no attribute 'page_content'

In [None]:
import os

from dotenv import load_dotenv
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone


## 환경변수 읽어오기
load_dotenv()
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')


## 벡터 스토어(데이터베이스)에 인덱스 가져오기
## 임베딩 모델 지정
embedding = OpenAIEmbeddings(model='text-embedding-3-large')
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = 'law'


## 저장된 인덱스 가져오기
database = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding,
)



## RetrievalQA
llm = ChatOpenAI(model='gpt-4o')
prompt = hub.pull('rlm/rag-prompt')

def format_docs(docs):
    return '\n\n'.join(doc.page_content for doc in docs)

qa_chain = (
    {
        'context': database.as_retriever() | format_docs,
        'question': RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

# qa_chain.invoke('전세사기피해자 대상을 알려주세요.')
qa_chain.invoke('전세사기피해자 임대인을 알려주세요.')

  from .autonotebook import tqdm as notebook_tqdm


ValueError: Index 'law' not found in your Pinecone project. Did you mean one of the following indexes: law-1-quiz03, law-index, law-2-index, lwa, law-2