# RAG For Document Chunk Retrieval
## 1. Environment Setup

In [None]:
%pip install -q langchain langchain-nvidia-ai-endpoints gradio rich
%pip install -q arxiv pymupdf faiss-cpu
%pip install -U langchain-community

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.2/322.2 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m71.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m57.5 MB/s[0m eta [36m0:0

In [None]:
import getpass
import os

## Never paste the key into the notebook source. Reuse a key that is already
## exported in the environment; only when it is missing (or still the literal
## placeholder) prompt for it, without echoing it into the cell output.
if os.environ.get("NVIDIA_API_KEY", "") in ("", "NVIDIA_API_KEY"):
    os.environ["NVIDIA_API_KEY"] = getpass.getpass("Enter your NVIDIA API key: ")

In [None]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings

## Embedding model: used below to vectorize chunks and queries for the FAISS stores
embedder = NVIDIAEmbeddings(
    model="nvidia/nv-embed-v1",
    truncate="END",
)

## Chat model that generates the final answer at the end of the chain
instruct_llm = ChatNVIDIA(
    model="mistralai/mixtral-8x22b-instruct-v0.1",
)

## 2.  Loading And Chunking Your Documents

In [None]:
import json

from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import ArxivLoader

## Splitter shared by every paper: chunks of at most 1000 characters with a
## 100-character overlap, breaking on the coarsest separator available.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", ";", ",", " "],
)

In [None]:
print("Loading Documents")
## One entry per paper, in this order; each entry is the list returned by ArxivLoader.load()
docs = [
    ArxivLoader(query=arxiv_id).load()
    for arxiv_id in (
        "1706.03762",  ## Attention Is All You Need Paper
        "1810.04805",  ## BERT Paper
        "2005.11401",  ## RAG Paper
        "2205.00445",  ## MRKL Paper
        "2310.06825",  ## Mistral Paper
        "2306.05685",  ## LLM-as-a-Judge
        "2210.03629",  ## ReAct Paper
        "2112.10752",  ## Latent Stable Diffusion Paper
        "2103.00020",  ## CLIP Paper
    )
]

Loading Documents


In [None]:
## Cut each paper at its bibliography: reference lists add retrieval noise
## without adding content the chatbot can usefully answer from.
for doc in docs:
    if not doc:
        continue  ## the loader returned nothing for this query
    ## Work on the raw text. Serializing it with json.dumps first would wrap it in
    ## quotes and escape every newline / non-ASCII character, so the "\n\n" and "\n"
    ## separators of the splitter could never match and the chunks would carry
    ## literal "\n" / "\uXXXX" sequences. It would also re-escape on every re-run.
    content = doc[0].page_content
    if "References" in content:
        doc[0].page_content = content[:content.index("References")]

print("Chunking Documents")
docs_chunks = [text_splitter.split_documents(doc) for doc in docs]
## Drop fragments of 200 characters or fewer; they are too short to be useful context
docs_chunks = [[c for c in dchunks if len(c.page_content) > 200] for dchunks in docs_chunks]

Chunking Documents


In [None]:
## Make some custom Chunks to give big-picture details:
##  - doc_string: a bulleted list of every available paper title
##  - doc_metadata: one stringified metadata dict per paper
doc_string = "Available Documents:"
doc_metadata = []
for chunks in docs_chunks:
    if not chunks:
        continue  ## no chunk survived the length filter, so there is nothing to describe
    metadata = getattr(chunks[0], 'metadata', {})
    ## A missing 'Title' would otherwise raise TypeError on string concatenation
    doc_string += "\n - " + str(metadata.get('Title', 'Untitled document'))
    doc_metadata += [str(metadata)]

extra_chunks = [doc_string] + doc_metadata

In [None]:
## Imported here so this cell runs on a freshly restarted kernel
from pprint import pprint

## Show the document overview, then per-paper chunk counts and metadata
pprint(doc_string)
for i, chunks in enumerate(docs_chunks):
    print(f"Document {i}")
    print(f" - # Chunks: {len(chunks)}")
    print(" - Metadata: ")
    ## A paper may have no chunks left after filtering; show empty metadata then
    pprint(chunks[0].metadata if chunks else {})
    print()

('Available Documents:\n'
 ' - Attention Is All You Need\n'
 ' - BERT: Pre-training of Deep Bidirectional Transformers for Language '
 'Understanding\n'
 ' - Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks\n'
 ' - MRKL Systems: A modular, neuro-symbolic architecture that combines large '
 'language models, external knowledge sources and discrete reasoning\n'
 ' - Mistral 7B\n'
 ' - Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena\n'
 ' - ReAct: Synergizing Reasoning and Acting in Language Models\n'
 ' - High-Resolution Image Synthesis with Latent Diffusion Models\n'
 ' - Learning Transferable Visual Models From Natural Language Supervision')
Document 0
 - # Chunks: 35
 - Metadata: 
{'Authors': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion '
            'Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin',
 'Published': '2023-08-02',
 'Summary': 'The dominant sequence transduction models are based on complex '
            'recurrent or\n'


## 3. Construct Your Document Vector Stores

In [None]:
%%time
print("Constructing Vector Stores")
## First store holds the big-picture chunks; then one store per paper, in order
vecstores = [
    FAISS.from_texts(extra_chunks, embedder),
    *(FAISS.from_documents(paper_chunks, embedder) for paper_chunks in docs_chunks),
]

Constructing Vector Stores
CPU times: user 1.31 s, sys: 162 ms, total: 1.47 s
Wall time: 38.9 s


In [None]:
from faiss import IndexFlatL2
from langchain_community.docstore.in_memory import InMemoryDocstore

## Embed one probe string to learn the vector size the index must be built with
embed_dims = len(embedder.embed_query("test"))

def default_FAISS():
    """Return a new, empty FAISS vectorstore backed by a flat L2 index.

    The store uses the notebook-level `embedder` and an index sized to
    `embed_dims`; it starts with an empty in-memory docstore and id mapping.
    """
    empty_index = IndexFlatL2(embed_dims)
    empty_docstore = InMemoryDocstore()
    return FAISS(
        embedding_function=embedder,
        index=empty_index,
        docstore=empty_docstore,
        index_to_docstore_id={},
        normalize_L2=False,
    )

In [None]:
def aggregate_vstores(vectorstores):
    """Combine several FAISS vectorstores into a single new one.

    Starts from an empty store created by `default_FAISS()` and merges each
    store of `vectorstores` into it, in iteration order. Returns the merged store.
    """
    merged = default_FAISS()
    for store in vectorstores:
        merged.merge_from(store)
    return merged

## Unintuitive optimization; merge_from seems to optimize constituent vector stores away
docstore = aggregate_vstores(vecstores)

## Count entries through the store's public index -> docstore-id mapping rather
## than reaching into the private `_dict` of the in-memory docstore.
print(f"Constructed aggregate docstore with {len(docstore.index_to_docstore_id)} chunks")

Constructed aggregate docstore with 571 chunks


## 4. Implement Your RAG Chain

In [None]:
from langchain.document_transformers import LongContextReorder
from langchain_core.runnables import RunnableLambda
from langchain_core.runnables.passthrough import RunnableAssign
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

import gradio as gr
from functools import partial
from operator import itemgetter

In [None]:
## Utility Runnables/Methods
def RPrint(preface=""):
    """Build a passthrough runnable that pretty-prints its input, then returns it.

    Useful for inspecting intermediate values inside a chain without changing
    what flows through it.

    Args:
        preface: optional text printed (without a newline) right before the value.

    Returns:
        A RunnableLambda whose output is its unmodified input.
    """
    ## Imported locally so the helper works on a fresh kernel regardless of
    ## which other cells have been run.
    from pprint import pprint

    def print_and_return(x, preface):
        if preface:
            print(preface, end="")
        pprint(x)
        return x

    return RunnableLambda(partial(print_and_return, preface=preface))

In [None]:
## Conversation memory: starts as an empty vectorstore; chat_gen stores each
## user/agent exchange in it via save_memory_and_get_output.
convstore = default_FAISS()

def save_memory_and_get_output(d, vstore):
    """Record one exchange in `vstore` and hand back the agent's reply.

    `d` is a mapping read with the keys 'input' (user message) and 'output'
    (agent reply). Both are written to `vstore` as two short text entries;
    the value stored under 'output' is returned.
    """
    user_turn = d.get('input')
    agent_turn = d.get('output')
    memories = [
        f"User previously responded with {user_turn}",
        f"Agent previously responded with {agent_turn}",
    ]
    vstore.add_texts(memories)
    return agent_turn

In [None]:
## Greeting shown as the first chatbot message; it lists the available papers
initial_msg = "".join([
    "Hello! I am a document chat agent here to help the user!",
    f" I have access to the following documents: {doc_string}",
    "\n\nHow can I help you?",
])

In [None]:
## Prompt with two messages: a system message that embeds the question together
## with the retrieved conversation history and document context, followed by the
## user's question itself. The input dict must therefore provide the keys
## 'input', 'history' and 'context'.
chat_prompt = ChatPromptTemplate.from_messages([("system",
    "You are a document chatbot. Help the user as they ask questions about documents."
    " User messaged just asked: {input}\n\n"
    " From this, we have retrieved the following potentially-useful info: "
    " Conversation History Retrieval:\n{history}\n\n"
    " Document Retrieval:\n{context}\n\n"
    " (Answer only from retrieval. Only cite sources that are used. Make your response conversational.)"
), ('user', '{input}')])

## Generation chain: fill the prompt, print the filled prompt (RPrint is a
## debugging passthrough), call the chat model, and parse its output with StrOutputParser.
stream_chain = chat_prompt| RPrint() | instruct_llm | StrOutputParser()

In [None]:

def docs2str(docs, title="Document"):
    """Flatten retrieved chunks into a single context string for the prompt.

    Each chunk is prefixed with "[Quote from <Title>]" so the model can cite
    its source. Chunks whose metadata has no 'Title' fall back to `title`.
    """
    out_str = ""
    for doc in docs:
        doc_name = getattr(doc, 'metadata', {}).get('Title', title)
        if doc_name:
            out_str += f"[Quote from {doc_name}] "
        out_str += getattr(doc, 'page_content', str(doc)) + "\n"
    return out_str

## Reorders retrieved chunks so the most relevant ones sit at the start and end
## of the context, where long-context models attend best.
long_reorder = RunnableLambda(LongContextReorder().transform_documents)

## Input: the raw user message. Output: a dict with
##   'input'   - the message itself
##   'history' - relevant earlier exchanges retrieved from convstore
##   'context' - relevant paper chunks retrieved from docstore
## Previously both 'history' and 'context' were hard-wired to None, so the
## model answered without any retrieved material.
retrieval_chain = (
    {'input' : (lambda x: x)}
    | RunnableAssign({'history' : itemgetter('input') | convstore.as_retriever() | long_reorder | docs2str})
    | RunnableAssign({'context' : itemgetter('input') | docstore.as_retriever() | long_reorder | docs2str})
)

In [None]:

def chat_gen(message, history=None, return_buffer=True):
    """Stream the agent's answer to `message`, then store the exchange in memory.

    Args:
        message: the user's question.
        history: chat history supplied by the UI; accepted for interface
            compatibility but not used (memory lives in `convstore`).
        return_buffer: if True, yield the full answer accumulated so far on
            every step (what a chat UI expects); if False, yield only the
            newly generated token.

    Yields:
        The accumulated answer or the latest token, depending on `return_buffer`.
    """
    buffer = ""

    ## First, run retrieval for this message
    retrieval = retrieval_chain.invoke(message)

    ## Then, stream the results of the stream_chain
    for token in stream_chain.stream(retrieval):
        buffer += token
        yield buffer if return_buffer else token

    ## Only runs once the stream has been fully consumed
    save_memory_and_get_output({'input':  message, 'output': buffer}, convstore)

In [None]:
## Start of Agent Event Loop
## Smoke test: stream one answer to the console before wiring up the gradio UI
test_question = "Tell me about Attention mechanism!"  ## <- modify as desired

for new_token in chat_gen(test_question, return_buffer=False):
    print(new_token, end='')

ChatPromptValue(messages=[SystemMessage(content='You are a document chatbot. Help the user as they ask questions about documents. User messaged just asked: Tell me about Attention mechanism!\n\n From this, we have retrieved the following potentially-useful info:  Conversation History Retrieval:\nNone\n\n Document Retrieval:\nNone\n\n (Answer only from retrieval. Only cite sources that are used. Make your response conversational.)', additional_kwargs={}, response_metadata={}), HumanMessage(content='Tell me about Attention mechanism!', additional_kwargs={}, response_metadata={})])
Sure, I'd be happy to explain! Attention mechanism is a concept in machine learning that is particularly useful in tasks like neural machine translation and image captioning, among others. It's a way for models to 'pay attention' to different parts of input data when producing outputs.

In simpler terms, imagine you're translating a long sentence from English to French. Without attention, your model would have 

## 5. Interact with Gradio

In [None]:
## Chat window pre-populated with the agent's greeting (no user message yet)
chatbot = gr.Chatbot(value = [[None, initial_msg]])
demo = gr.ChatInterface(chat_gen, chatbot=chatbot).queue()

try:
    demo.launch(debug=True, share=True, show_api=False)
except Exception as e:
    print(e)
    raise  ## bare raise keeps the original traceback
finally:
    ## Always release the server, whether launch returned or failed
    demo.close()

  chatbot = gr.Chatbot(value = [[None, initial_msg]])


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://aaad8ea025825c1fd1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


ChatPromptValue(messages=[SystemMessage(content='You are a document chatbot. Help the user as they ask questions about documents. User messaged just asked: what are Rag intensive tasks for nlp\n\n From this, we have retrieved the following potentially-useful info:  Conversation History Retrieval:\nNone\n\n Document Retrieval:\nNone\n\n (Answer only from retrieval. Only cite sources that are used. Make your response conversational.)', additional_kwargs={}, response_metadata={}), HumanMessage(content='what are Rag intensive tasks for nlp', additional_kwargs={}, response_metadata={})])
ChatPromptValue(messages=[SystemMessage(content='You are a document chatbot. Help the user as they ask questions about documents. User messaged just asked: how many parameters where used in bert pretraining\n\n From this, we have retrieved the following potentially-useful info:  Conversation History Retrieval:\nNone\n\n Document Retrieval:\nNone\n\n (Answer only from retrieval. Only cite sources that are us

In [None]:
## Save and compress your index
## save_local writes the index files into the docstore_index/ directory
docstore.save_local("docstore_index")
## Bundle that directory into a single gzip-compressed archive
!tar czvf docstore_index.tgz docstore_index

## Remove the uncompressed directory; only docstore_index.tgz is kept
!rm -rf docstore_index

docstore_index/
docstore_index/index.pkl
docstore_index/index.faiss


In [None]:
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from langchain_community.vectorstores import FAISS

# embedder = NVIDIAEmbeddings(model="nvidia/nv-embed-v1", truncate="END")
!tar xzvf docstore_index.tgz
new_db = FAISS.load_local("docstore_index", embedder, allow_dangerous_deserialization=True)
docs = new_db.similarity_search("Testing the index")
print(docs[0].page_content[:1000])

docstore_index/
docstore_index/index.pkl
docstore_index/index.faiss
. To demonstrate, we build an index using the DrQA [5]\nWikipedia dump from December 2016 and compare outputs from RAG using this index to the newer\nindex from our main results (December 2018). We prepare a list of 82 world leaders who had changed\n7\nTable 4: Human assessments for the Jeopardy\nQuestion Generation Task.\nFactuality\nSpeci\ufb01city\nBART better\n7.1%\n16.8%\nRAG better\n42.7%\n37.4%\nBoth good\n11.7%\n11.8%\nBoth poor\n17.7%\n6.9%\nNo majority\n20.8%\n20.1%\nTable 5: Ratio of distinct to total tri-grams for\ngeneration tasks.\nMSMARCO\nJeopardy QGen\nGold\n89.6%\n90.0%\nBART\n70.7%\n32.4%\nRAG-Token\n77.8%\n46.8%\nRAG-Seq.\n83.5%\n53.8%\nTable 6: Ablations on the dev set. As FEVER is a classi\ufb01cation task, both RAG models are equivalent.\nModel\nNQ\nTQA\nWQ\nCT\nJeopardy-QGen\nMSMarco\nFVR-3\nFVR-2\nExact Match\nB-1\nQB-1\nR-L\nB-1\nLabel Accuracy\nRAG-Token-BM25\n29.7\n41.5\n32.1\n33.1\n17.5\n22