In [1]:
from dotenv import load_dotenv

# Load settings from a .env file into the process environment before any
# client is constructed (presumably this supplies the OpenAI API key used
# by the cells below — confirm against the local .env).
load_dotenv()

True

In [15]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
# RunnableLambda: lets a plain function be called from anywhere inside a chain
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# Chat model shared by the map step and the final answer step.
llm = ChatOpenAI(temperature=0.1)

# Load the source document and cut it into chunks, splitting on newlines.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=600, chunk_overlap=100, separator="\n"
)
loader = UnstructuredFileLoader("./files/chapter_one.docx")
docs = loader.load_and_split(text_splitter=splitter)

# Embeddings are looked up in cache_dir first; OpenAIEmbeddings is only used
# for entries that are not cached there yet.
cache_dir = LocalFileStore("./.cache/")
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=embeddings,
    document_embedding_cache=cache_dir,
)

# Index every chunk in an in-memory FAISS vector store.
vectorstore = FAISS.from_documents(documents=docs, embedding=cached_embeddings)

# The retriever maps a question to the list of documents relevant to it
# (e.g. the chunks that describe Victory Mansions).
retriever = vectorstore.as_retriever()


# System template for the "map" step: one retrieved chunk is substituted for
# {context} and the model is asked to return any relevant text verbatim.
# The leading newline and the indentation are part of the template text.
_map_doc_system_template = """
        Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim.
        -------
        {context}
        """

map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _map_doc_system_template),
        ("human", "{question}"),
    ]
)

# Per-document chain: fill the prompt, then send it to the chat model.
map_doc_chain = map_doc_prompt | llm


def map_docs(inputs) -> str:
    """
    Run the per-document "map" step and merge the results into one text.

    :param inputs: mapping with two keys: "documents" (the documents to scan,
        each exposing ``page_content``) and "question" (the question text).
    :return: the ``content`` of every map_doc_chain response, in document
        order, joined by blank lines. Used as the context of the final chain.
    """
    doc_list = inputs['documents']
    user_question = inputs['question']

    # Map: ask the model, one document at a time, to extract whatever in that
    # document is relevant to the question.
    extracted = []
    for document in doc_list:
        response = map_doc_chain.invoke(
            {"context": document.page_content, "question": user_question}
        )
        extracted.append(response.content)

    # Combine: glue the individual responses into a single long document.
    return "\n\n".join(extracted)



# The map step needs two inputs, both derived from the value the chain is invoked with:
#   - "documents": obtained by passing that value to the retriever
#   - "question": that value forwarded unchanged, so it can be sent to the llm
# The resulting dict is handed to map_docs through RunnableLambda.
map_chain = {
    "documents": retriever,
    "question": RunnablePassthrough()
} | RunnableLambda(map_docs)


# Final step: final doc | prompt | llm. The combined text produced by the map
# step is substituted for {context} and the llm answers the question.
# The line breaks and indentation below are part of the template text.
_final_system_template = """Given the following extracted part of a long document and a question, create a final answer.
    If you don't know the answer, just say that you don't know. Don't try to make up an answer.
    ------
    {context}
    """

final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _final_system_template),
        ("human", "{question}"),
    ]
)

# On invoke, the output of map_chain becomes the context and the original
# input is passed through unchanged as the question.
chain = (
    {"context": map_chain, "question": RunnablePassthrough()}
    | final_prompt
    | llm
)

chain.invoke("Describe Victory Mansions.")

AIMessage(content='Victory Mansions is a dilapidated and run-down residential building located in London, specifically in Airstrip One, which is the third most populous province of Oceania. The building is described as having crumbling walls and broken elevators. The hallways are dimly lit and filled with the smell of boiled cabbage. It is constantly under surveillance by telescreens, which are large television screens that transmit propaganda and monitor the residents. The atmosphere is oppressive, with a sense of constant surveillance and control. The building has glass doors and a hallway that smells of boiled cabbage and old rag mats. There is a large colored poster on one end of the hallway, depicting the face of a man in his forties with a black mustache and ruggedly handsome features. The flat in which Winston Smith resides is located on the seventh floor, and the building has a non-functioning lift due to the electricity being cut off during daylight hours. On each landing, the