### Data Loaders and Splitters 
- 다양한 확장자의 파일 불러오기 -> UnstructuredFileLoader()
- Tip: `ModuleNotFoundError: No module named 'docx.text.hyperlink'` 에러가 발생한 경우, 아래 명령으로 python-docx를 업그레이드한다.
``` pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --upgrade python-docx ```

In [14]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (presumably the OpenAI API key used by the later cells -- confirm).
load_dotenv() 

True

In [5]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader

# Point the Unstructured-based loader at the sample PDF.
loader = UnstructuredFileLoader("./files/chapter_one.pdf")
# Parse the file; the result (a list of Document objects) is the cell output.
loader.load()

[Document(page_content="Part 1, Chapter 1\n\nPart One\n\n1\n\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into\n\nhis breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions,\n\nthough not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for\n\nindoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide:\n\nthe face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features.\n\nWinston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working,\n\nand at present the electric current was cut off during daylight hours. It was part of the economy drive in\n\npreparation for Hate Week. The flat was seven flights up, and Winston, who wa

In [6]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Recursive splitter configured for small chunks (size 200) that share up to
# 50 units with their neighbours. Sizes are presumably counted in characters,
# the splitter's default length function -- confirm.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=50
)
loader = UnstructuredFileLoader("./files/chapter_one.pdf")

# Load and split in one call; the list of chunk Documents is the cell output.
loader.load_and_split(text_splitter=splitter)

[Document(page_content='Part 1, Chapter 1\n\nPart One\n\n1\n\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into', metadata={'source': './files/chapter_one.pdf'}),
 Document(page_content='his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions,', metadata={'source': './files/chapter_one.pdf'}),
 Document(page_content='though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for', metadata={'source': './files/chapter_one.pdf'}),
 Document(page_content='indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide:', metadata={'source': './files/chapter_one.pdf'}),
 Document(page_content='the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features.', metadata={'s

In [7]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter

# Plain character splitter: break only on single newlines, then pack the
# pieces into chunks of size 600 with an overlap of 100.
splitter = CharacterTextSplitter(
    separator='\n',
    chunk_size=600,
    chunk_overlap=100
)
loader = UnstructuredFileLoader("./files/chapter_one.pdf")

# Load and split in one call; the list of chunk Documents is the cell output.
loader.load_and_split(text_splitter=splitter)

[Document(page_content='Part 1, Chapter 1\nPart One\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into\nhis breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions,\nthough not quickly enough to prevent a swirl of gritty dust from entering along with him.\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for\nindoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide:', metadata={'source': './files/chapter_one.pdf'}),
 Document(page_content='the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features.\nWinston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working,\nand at present the electric current was cut off during daylight hours. It was part of the economy drive in\npreparation for Hate Week

### Tiktoken & Vector Store

In [8]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter

# Same newline-based splitter, but built through the tiktoken constructor so
# that chunk_size / chunk_overlap are presumably measured in tokens rather
# than characters -- confirm against the LangChain text-splitter docs.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator='\n',
    chunk_size=600,
    chunk_overlap=100
)
# This cell only constructs the splitter and loader; nothing is loaded here.
loader = UnstructuredFileLoader("./files/chapter_one.pdf")


In [9]:
from langchain.embeddings import OpenAIEmbeddings

# Embedding client with default settings.
embedder = OpenAIEmbeddings()

# Embed a single query string; `vector` holds the embedding returned for "Hi".
vector = embedder.embed_query("Hi")


In [10]:
from langchain.embeddings import OpenAIEmbeddings

# Embedding client with default settings.
embedder = OpenAIEmbeddings()

# Embed four short texts in a single call; one vector comes back per text.
vector = embedder.embed_documents(["Hi", "how", "are", "you"])

# Show the number of vectors and the dimensionality of the first one.
print(f"{len(vector)} {len(vector[0])}")

4 1536


In [11]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore

# File-backed byte store under ./.cache/ used to persist embeddings.
cache_dir = LocalFileStore('./.cache/')

# Newline-based splitter built via the tiktoken constructor (size 600,
# overlap 100).
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator='\n',
    chunk_size=600,
    chunk_overlap=100
)
loader = UnstructuredFileLoader("./files/chapter_one.pdf")

# Load the PDF and split it into chunk Documents.
docs = loader.load_and_split(text_splitter=splitter)
embeddings = OpenAIEmbeddings()

# Wrap the embedding client so results are stored in / read from cache_dir.
# NOTE(review): no namespace is passed, so cache entries are presumably not
# separated per embedding model -- confirm this is acceptable.
cache_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings, cache_dir
)

# Embed every chunk (through the cache wrapper) and index it in Chroma.
vectorstore = Chroma.from_documents(docs, cache_embeddings)

In [12]:
# Query the vector store built in the previous cell for chunks similar to the
# question; the cell output shows that 4 results come back.
results = vectorstore.similarity_search("where does winston live")
len(results)

4

### RetrievalQA

In [13]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

# Chat model with default settings.
llm = ChatOpenAI()

# File-backed byte store under ./.cache/ used to persist embeddings.
cache_dir = LocalFileStore("./.cache/")

# Newline-based splitter built via the tiktoken constructor (size 600,
# overlap 100).
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
# Note: this cell reads the .txt version of the chapter, not the PDF.
loader = UnstructuredFileLoader("./files/chapter_one.txt")

docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

# Wrap the embedding client so results are stored in / read from cache_dir.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# Embed every chunk (through the cache wrapper) and index it in FAISS.
vectorstore = FAISS.from_documents(docs, cached_embeddings)

# Off-the-shelf retrieval QA chain over the vector store.
# NOTE(review): "map_rerank" presumably answers per document and keeps the
# best-scored answer -- confirm against the LangChain chain-type docs.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_rerank",
    retriever=vectorstore.as_retriever(),
)

# Ask a question; the answer string is the cell output.
chain.run("Describe Victory Mansions")



'Victory Mansions is a building with seven floors that is not very well maintained. The hallway smelled of boiled cabbage and old rag mats. A large colored poster with the face of a man about forty-five with a black moustache is displayed on the wall. The elevator rarely worked, and the building was part of an economy drive in preparation for Hate Week. The poster with the face had a caption that read "BIG BROTHER IS WATCHING YOU."'

### Stuff LCEL Chain

In [16]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough

# Low temperature keeps the answers close to the retrieved text.
llm = ChatOpenAI(temperature=0.1)

# File-backed byte store under ./.cache/ used to persist embeddings.
cache_dir = LocalFileStore("./.cache/")

# Newline-based splitter built via the tiktoken constructor (size 600,
# overlap 100).
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader = UnstructuredFileLoader("./files/chapter_one.txt")

docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

# Wrap the embedding client so results are stored in / read from cache_dir.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# Embed every chunk (through the cache wrapper) and index it in FAISS.
vectorstore = FAISS.from_documents(docs, cached_embeddings)

# Spelled `retriever` (previously `retriver`) to match the name used by the
# map-reduce cell further down.
retriever = vectorstore.as_retriever()

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant. Answer questions using only the context. If you don't know the answer, just say you don't know; don't make it up.:\n\n{context}"),
    ("human", "{question}")
])

# "Stuff" chain: the input question goes to the retriever, whose documents
# fill {context}, and is also passed through unchanged as {question}.
# NOTE(review): the retrieved Documents are inserted into the prompt as-is
# (presumably via their string representation) -- confirm that is intended.
chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | llm

# Ask a question; the resulting AIMessage is the cell output.
chain.invoke("Describe Victory Mansions")

AIMessage(content='Victory Mansions is a building with glass doors that has a hallway smelling of boiled cabbage and old rag mats. It is described as having a gritty and dusty entrance. Inside, there is a poster of a large face with a caption that reads "BIG BROTHER IS WATCHING YOU." The building has seven flights of stairs, and the flat is located on the seventh floor.')

### Map Reduce LCEL Chain

In [19]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# Low temperature keeps the answers close to the retrieved text.
llm = ChatOpenAI(temperature=0.1)

# File-backed byte store under ./.cache/ used to persist embeddings.
cache_dir = LocalFileStore("./.cache/")

# Newline-based splitter built via the tiktoken constructor (size 600,
# overlap 100).
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader = UnstructuredFileLoader("./files/chapter_one.txt")

docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

# Wrap the embedding client so results are stored in / read from cache_dir.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# Embed every chunk (through the cache wrapper) and index it in FAISS.
vectorstore = FAISS.from_documents(docs, cached_embeddings)

# Retriever used by the map step below.
retriever = vectorstore.as_retriever()

# Map-step prompt: applied to ONE document chunk at a time ({context}) and
# asks the model to quote only the text relevant to {question}.
map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim. If there is no relevant text, return : ''
            -------
            {context}
            """,
        ),
        ("human", "{question}"),
    ]
)

# Per-document chain: format the map prompt, then call the chat model.
map_doc_chain = map_doc_prompt | llm

def map_docs(inputs):
    """Run the map step over every retrieved document.

    Args:
        inputs: Mapping with a "documents" entry (iterable of objects exposing
            ``page_content``) and a "question" entry.

    Returns:
        The ``content`` of each ``map_doc_chain`` response, in document order,
        joined with blank lines.
    """
    retrieved = inputs["documents"]
    query = inputs["question"]

    extracts = []
    for document in retrieved:
        # One model call per document, each seeing only that document's text.
        reply = map_doc_chain.invoke(
            {"context": document.page_content, "question": query}
        )
        extracts.append(reply.content)

    return "\n\n".join(extracts)

# Map stage: the question goes to the retriever to fetch documents and is also
# passed through unchanged; map_docs then turns both into one string of
# per-document extracts.
map_chain = {"documents": retriever, "question": RunnablePassthrough()} | RunnableLambda(map_docs)

# Reduce-step prompt: receives the joined extracts as {context} and the
# original question, and asks for the final answer.
final_prompt = ChatPromptTemplate.from_messages(    
    [
        (
            "system",
            """
            Given the following extracted parts of a long document and a question, create a final answer. 
            If you don't know the answer, just say that you don't know. Don't try to make up an answer.
            ------
            {context}
            """,
        ),
        ("human", "{question}"),
    ]
)

# Full map-reduce chain: map_chain output fills {context}, the raw input
# fills {question}, then the prompt is sent to the chat model.
chain = {"context": map_chain, "question":RunnablePassthrough()} | final_prompt | llm

# Ask a question; the resulting AIMessage is the cell output.
chain.invoke("Describe Victory Mansions")

AIMessage(content='Victory Mansions is a run-down, dilapidated apartment building where Winston Smith resides in George Orwell\'s novel "1984." The building lacks basic amenities, with a kitchen containing only a hunk of dark-colored bread for breakfast. The living room has a telescreen in an unusual position, not commanding the whole room as normal. The hallway smells of boiled cabbage and old rag mats. A poster with an enormous face of a man with a black mustache and the caption "BIG BROTHER IS WATCHING YOU" hangs in the building. The surroundings include bombed sites, wooden dwellings, and a view of the Ministry of Truth from the roof. The atmosphere is bleak, oppressive, and reflects the oppressive regime under which the characters live.')