#6 R A G  -  Retrieval-Augmented Generation

#6.0 Introduction

#6.1 Data Loaders and Splitters

UnstructuredFileLoader  - supports loading of text files, powerpoints, html, pdfs, images and more.

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader, PyPDFLoader, UnstructuredFileLoader


# Pick a loader for the chapter file. TextLoader is the active choice; the
# commented-out lines are alternative loaders for the PDF version of the file.
loader = TextLoader(".././files/chapter_one.txt")
#loader = PyPDFLoader(".././files/chapter_one.pdf")
#loader =  UnstructuredFileLoader(".././files/chapter_one.pdf")     # any file can load

# Load once and reuse the result instead of reading the file twice.
docs = loader.load()

# Print the count explicitly: a notebook only displays a cell's last
# expression, so a bare len(...) in the middle of the cell shows nothing.
print(len(docs))

docs

RecursiveCharacterTextSplitter - general-purpose splitter: it splits where a sentence or paragraph ends (문장이나 단락이 끝날 때 나누어진다).

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders.onedrive_file import CHUNK_SIZE
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader


# Two-step version: load the whole file first, then split it.
loader = UnstructuredFileLoader(".././files/chapter_one.txt")
docs = loader.load()

# Splitter created with no arguments, i.e. with its default settings.
splitter = RecursiveCharacterTextSplitter()
splitter.split_documents(docs)

Same as above, the short way: `load_and_split` loads the file and splits it in one call.

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders.onedrive_file import CHUNK_SIZE
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader


# One-step version: hand the splitter to the loader and let
# load_and_split() do both jobs.
loader = UnstructuredFileLoader(".././files/chapter_one.txt")
splitter = RecursiveCharacterTextSplitter()

# Wrap the call in len(...) to see only how many chunks were produced.
loader.load_and_split(text_splitter=splitter)

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders.onedrive_file import CHUNK_SIZE
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader


loader = UnstructuredFileLoader(".././files/chapter_one.txt")

# Explicit chunk size and overlap instead of the splitter's defaults.
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)

# Wrap the call in len(...) to see only how many chunks were produced.
loader.load_and_split(text_splitter=splitter)

Character Separator - CharacterTextSplitter

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader


loader = UnstructuredFileLoader(".././files/chapter_one.txt")

# The text is split on the chosen separator (newlines here); changing the
# separator changes where the splits happen.
splitter = CharacterTextSplitter(separator="\n", chunk_size=600, chunk_overlap=100)

# Wrap the call in len(...) to see only how many chunks were produced.
loader.load_and_split(text_splitter=splitter)

#6.2 Tiktoken

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders.onedrive_file import CHUNK_SIZE
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader


loader = UnstructuredFileLoader(".././files/chapter_one.txt")

# length_function=len makes the measuring function explicit: chunk_size and
# chunk_overlap are compared against len(text), i.e. a character count.
splitter = CharacterTextSplitter(
    separator="\n", chunk_size=600, chunk_overlap=100, length_function=len
)

loader.load_and_split(text_splitter=splitter)

https://platform.openai.com/tokenizer

https://github.com/openai/tiktoken

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader


loader = UnstructuredFileLoader(".././files/chapter_one.txt")

# Built through from_tiktoken_encoder so that chunk lengths are measured with
# the tiktoken tokenizer rather than with len().
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n", chunk_size=600, chunk_overlap=100
)

loader.load_and_split(text_splitter=splitter)

#6.3 Vectors

https://turbomaze.github.io/word2vecjson/


https://www.youtube.com/watch?v=2eWuYf-aZE4

#6.4 Vector Store

In [None]:
from langchain.embeddings import OpenAIEmbeddings


# Embed a single piece of text.
embedder = OpenAIEmbeddings()

vector = embedder.embed_query("Hi")

# Print the vector length explicitly: a bare `len(vector)` in the middle of a
# cell is evaluated and discarded, because only the last expression is shown.
print(len(vector))
vector

In [9]:
from langchain.embeddings import OpenAIEmbeddings


embedder = OpenAIEmbeddings()

# Embed several texts in a single call.
vector = embedder.embed_documents(
    [
        "Hi",
        "how",
        "are",
        "you longer sentences because",
    ]
)

# First the number of vectors returned, then the length of the first vector.
print(len(vector), len(vector[0]))

4 1536


Save Embedding

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma


# Load the chapter and cut it into chunks.
loader = UnstructuredFileLoader(".././files/chapter_one.txt")
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n", chunk_size=600, chunk_overlap=100
)
docs = loader.load_and_split(text_splitter=splitter)

# Build a Chroma vector store from the chunks using OpenAI embeddings.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(docs, embeddings)

In [None]:
# Search the vector store for documents similar to the question.
# `vectorstore` is not defined in this cell; it must already exist from an
# earlier cell. The last line displays how many results came back.
results =  vectorstore.similarity_search("where does winson live")
len(results)

Cached Embedding - Chroma

In [6]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore


# Load the chapter and cut it into chunks.
loader = UnstructuredFileLoader("../files/chapter_one.txt")
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n", chunk_size=600, chunk_overlap=100
)
docs = loader.load_and_split(text_splitter=splitter)

# Wrap the OpenAI embedder with a cache backed by a local file store.
cache_dir = LocalFileStore("../.cache/")
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# Same Chroma construction as before, but through the cache-backed embedder.
vectorstore = Chroma.from_documents(docs, cached_embeddings)

#6.5 Langsmith

https://smith.langchain.com/    Register

#6.6 RetrievalQA

Off-the-shelf Document chain

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

llm = ChatOpenAI()

# Load the chapter and cut it into chunks.
loader = UnstructuredFileLoader("./files/chapter_one.txt")
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n", chunk_size=600, chunk_overlap=100
)
docs = loader.load_and_split(text_splitter=splitter)

# Cache-backed embeddings, this time stored in a FAISS vector store.
cache_dir = LocalFileStore("./.cache/")
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)
vectorstore = FAISS.from_documents(docs, cached_embeddings)

# Off-the-shelf question-answering chain over the retriever.
# Other chain_type values to experiment with: "refine", "map_reduce",
# "map_rerank".
chain = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    llm=llm,
)

chain.run("Describe Victory Mansions")

#6.8 Stuff LCEL Chain

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough

# Chat model used as the last step of the chain.
llm = ChatOpenAI(
    temperature=0.1,
)

# Local file store used as the embedding cache.
cache_dir = LocalFileStore("./.cache/")

# Load the chapter and cut it into chunks.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader = UnstructuredFileLoader("./files/chapter_one.txt")

docs = loader.load_and_split(text_splitter=splitter)

# Cache-backed embeddings stored in a FAISS vector store.
embeddings = OpenAIEmbeddings()

cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

vectorstore = FAISS.from_documents(docs, cached_embeddings)

retriever = vectorstore.as_retriever()

# The system message carries the retrieved context; the human message carries
# the user's question.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant. Answer questions using only the following context. If you don't know the answer just say you don't know, don't make it up:\n\n{context}",
        ),
        ("human", "{question}"),
    ]
)

# "Stuff" chain written by hand: the dict supplies the two prompt variables
# ({context} from the retriever, {question} passed through unchanged), and the
# filled-in prompt is piped to the model.
chain = (
    {
        "context": retriever,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
)

chain.invoke("Describe Victory Mansions")

#6.9 Map Reduce LCEL Chain

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# Chat model shared by the per-document (map) step and the final answer step.
llm = ChatOpenAI(
    temperature=0.1,
)

# Local file store used as the embedding cache.
cache_dir = LocalFileStore("./.cache/")

# Load the chapter and cut it into chunks.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader = UnstructuredFileLoader("./files/chapter_one.txt")

docs = loader.load_and_split(text_splitter=splitter)

# Cache-backed embeddings stored in a FAISS vector store.
embeddings = OpenAIEmbeddings()

cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

vectorstore = FAISS.from_documents(docs, cached_embeddings)

retriever = vectorstore.as_retriever()

# Plan for the map-reduce chain:
# list of docs
# for doc in list of docs | prompt | llm
# for response in list of llm responses | put them all together
# final doc | prompt | llm
# chain.invoke("How many ministries are mentioned")

# Prompt for the map step: applied to one document at a time, asking the
# model to return only the text relevant to the question.
map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim. If there is no relevant text, return : ''
            -------
            {context}
            """,
        ),
        ("human", "{question}"),
    ]
)

# Map-step chain: fill the prompt above and send it to the model.
map_doc_chain = map_doc_prompt | llm


def map_docs(inputs):
    """Run the map step on every document and merge the responses.

    Args:
        inputs: Mapping with two keys: "documents", an iterable of objects
            exposing ``page_content``, and "question", the user's question.

    Returns:
        A single string: the ``.content`` of each ``map_doc_chain`` response,
        in document order, separated by blank lines.
    """
    documents = inputs["documents"]
    question = inputs["question"]

    # One chain invocation per document; the generator feeds each response's
    # text straight into join() without building an intermediate list.
    return "\n\n".join(
        map_doc_chain.invoke(
            {"context": doc.page_content, "question": question}
        ).content
        for doc in documents
    )


# Map stage: supply the retriever's documents and the untouched question as a
# dict, then hand that dict to map_docs.
map_chain = {
    "documents": retriever,
    "question": RunnablePassthrough(),
} | RunnableLambda(map_docs)

# Prompt for the final (reduce) step: answer the question from the text
# extracted by the map stage.
final_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Given the following extracted parts of a long document and a question, create a final answer. 
            If you don't know the answer, just say that you don't know. Don't try to make up an answer.
            ------
            {context}
            """,
        ),
        ("human", "{question}"),
    ]
)

# Full chain: the map stage's output fills {context}, the original question
# fills {question}, and the filled-in prompt is sent to the model.
chain = {"context": map_chain, "question": RunnablePassthrough()} | final_prompt | llm

chain.invoke("How many ministries are mentioned")