In [16]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma, FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# Chat model shared by the per-document (map) chain and the final-answer chain.
llm = ChatOpenAI(
  temperature=0.1,
)

# Local file store rooted at ./.cache/; used below as the embedding cache backend.
cache_dir = LocalFileStore("./.cache/")

# Newline-separated splitter built via the tiktoken-based constructor,
# with chunk_size=600 and chunk_overlap=100.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader = UnstructuredFileLoader("./files/chapter_one.docx")

# Load the document and split it into chunks with the splitter above.
docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

# Check whether embeddings are already stored in the cache.
# If they are, read them from the cache; if not, compute them and store them in the cache.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
  embeddings, cache_dir
)

# Alternative vector store (Chroma), kept for reference:
# vectorstore = Chroma.from_documents(docs, cached_embeddings)
vectorstore = FAISS.from_documents(docs, cached_embeddings)

# Retriever interface over the vector store; feeds the 'documents' input of the map step.
retriever = vectorstore.as_retriever()

'''
When there are too many documents, the "stuff" approach cannot be used; map-reduce must be used instead:
1. list of docs
2. for doc in list of docs | prompt | llm
3. for response in list of llm responses | put them all together
4. final doc | prompt | llm
'''

# Map-step prompt: shown one chunk of the document ({context}) together with
# the user's question, the model is asked to quote any relevant text verbatim.
# The instruction wording was corrected ("any of the text", "relevant text")
# because the previous phrasing was garbled and is sent to the model as-is.
map_doc_prompt = ChatPromptTemplate.from_messages([
  ('system',
    """
    Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim.
    ---------
    {context}
    """
   ),
   ('human', '{question}')
])

# Per-document chain: format the prompt, then send it to the chat model.
map_doc_chain = map_doc_prompt | llm

def map_docs(inputs):
    """Run the map step over every retrieved document.

    Args:
        inputs: Mapping with a 'documents' entry (an iterable of objects
            exposing ``page_content``) and a 'question' entry.

    Returns:
        The ``.content`` of each ``map_doc_chain`` response, in document
        order, joined into one string with blank lines between entries.
    """
    documents = inputs['documents']
    question = inputs['question']

    extracts = []
    for doc in documents:
        # One model call per document, each seeing only that document's text.
        response = map_doc_chain.invoke(
            {"context": doc.page_content, "question": question}
        )
        extracts.append(response.content)

    return "\n\n".join(extracts)




# Map stage: the chain input (the question string) is sent to the retriever to
# produce 'documents' and passed through unchanged as 'question'; the resulting
# dict is then handed to map_docs.
map_chain = {'documents': retriever, "question": RunnablePassthrough()} | RunnableLambda(map_docs)

# Reduce-step prompt: {context} receives the joined per-document extracts
# produced by map_chain, and {question} the original user question.
final_prompt = ChatPromptTemplate.from_messages([
  ('system',
    '''
    Given the following extracted parts of a long document and a question, create a final answer.
    If you don't know the answer, just say that you don't know. Don't try to make up an answer.
    ---------
    {context}
    '''
  ),
  ('human', '{question}')
])

# Full pipeline: map_chain builds the context, the question is passed through
# unchanged, and both fill final_prompt before the result is sent to the chat model.
chain = {"context": map_chain, "question": RunnablePassthrough()} | final_prompt | llm

# Example query against the loaded document.
chain.invoke("Where does Winston go to work?")

AIMessage(content='Winston goes to work at the Ministry of Truth.')