# RAG Experiment

In [1]:
# imports

from rich import print
from langchain_community.document_loaders import UnstructuredFileLoader
from unstructured.cleaners.core import clean_extra_whitespace, group_broken_paragraphs
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [2]:
# One loader per source PDF. Every loader gets its own post-processor list
# (whitespace cleanup, then re-joining of broken paragraphs).
loaders = [
    UnstructuredFileLoader(
        pdf_path,
        post_processors=[clean_extra_whitespace, group_broken_paragraphs],
    )
    for pdf_path in (
        "/teamspace/studios/this_studio/data/2401.08406.pdf",
        "/teamspace/studios/this_studio/data/2401.00908.pdf",
    )
]

In [3]:
# Chunking setup: target size 1000 and overlap 300, both measured with `len`
# (i.e. in characters). Only blank-line separators are listed, and they are
# treated as literal strings rather than regular expressions.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=300,
    separators=["\n\n\n", "\n\n"],
    is_separator_regex=False,
    length_function=len,
)

# Load each PDF, split it with the splitter above, and gather all chunks
# into a single flat list.
docs = []
for loader in loaders:
    docs += loader.load_and_split(text_splitter=text_splitter)

In [4]:
# Spot-check: display the second element of the `docs` list built above.
print(docs[1])

In [6]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


In [10]:
# Embedding model configuration.
model_name = "BAAI/bge-large-en-v1.5"
# The model is placed on the 'cuda' device, so this cell needs a GPU runtime.
model_kwargs = {'device': 'cuda'}
# Must be the boolean True, not the string "True": a non-empty string is merely
# truthy, so even the string "False" would have switched normalisation on.
encode_kwargs = {'normalize_embeddings': True}

embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/92.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [13]:
# Build the FAISS vector store from the chunks in `docs`, using `embedding_model`
# as the embedding function.
db = FAISS.from_documents(documents = docs, embedding = embedding_model)

In [22]:
from sentence_transformers import CrossEncoder

# Cross-encoder used by `rerank_docs` to score (query, passage) pairs, created
# with max_length=512. The model id is passed positionally so the call does not
# depend on the keyword name of CrossEncoder's first parameter.
reranker_model = CrossEncoder("BAAI/bge-reranker-large", max_length=512)


def rerank_docs(query, retrieved_docs):
    """Score every retrieved document against the query and sort best-first.

    Args:
        query: Query string paired with each document's text.
        retrieved_docs: Iterable of objects exposing a ``page_content``
            attribute. It is materialised once, so one-shot iterators work.

    Returns:
        A list of ``(document, score)`` tuples ordered by descending score,
        or an empty list when there are no documents to score.
    """
    # Materialise first: the input is traversed twice (pair building and zip).
    retrieved_docs = list(retrieved_docs)
    if not retrieved_docs:
        # Nothing to rank; avoid handing the model an empty batch.
        return []

    query_and_docs = [(query, doc.page_content) for doc in retrieved_docs]
    scores = reranker_model.predict(query_and_docs)
    return sorted(zip(retrieved_docs, scores), key=lambda pair: pair[1], reverse=True)

config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

In [24]:
# Expose the vector store as a retriever; search_kwargs requests k=10 results per query.
retriever = db.as_retriever(search_kwargs={"k": 10})

In [15]:
def pretty_print_docs(results, *scores):
    """Print each result followed by a dashed separator.

    Args:
        results: Iterable of items to print. A tuple item is printed as its
            second element followed by its first; any other item is printed
            as-is.
        *scores: Optional extra positional values. When any are given,
            ``results`` is zipped with them and the resulting pairs are
            printed instead.
    """
    entries = zip(results, scores) if scores else results
    for entry in entries:
        # Tuples are shown with element 1 before element 0.
        lines = (entry[1], entry[0]) if isinstance(entry, tuple) else (entry,)
        for line in lines:
            print(line)
        print('\n--------\n')

In [17]:
# Evaluation questions used by the retrieval cells below.
query1 = "Where was the agricultural dataset collected for the USA?"
query2 = "Where was the agricultural dataset collected for the India?"
query3 = "How many pdf were used to collect dataset?"
query4 = "What are the metrics used to evaluate the answers?"
query5 = "how was the content and structure of available document augmented?"
query6 = "What was the answer generation process used in the Paper?"
query7 = "How many pdf data were collected from the USA?"
query8 = "What is the DocLLM architecture ?"

In [29]:
queries = [
    query1,
    query2,
    query3,
    query4,
    query5,
    query6,
    query7,
    query8,
]

# For every query: fetch candidates from the retriever, rerank them with
# `rerank_docs`, and show only the top-ranked chunk and its metadata.
for i, query in enumerate(queries, start=1):
    print(f"Example {i}: Query -> ", query)
    print('..' * 50)
    print('Retrieved document:')

    retrieved_documents = retriever.get_relevant_documents(query)
    reranked_documents = rerank_docs(query, retrieved_documents)

    print("--" * 50)
    if not reranked_documents:
        # Guard the [0] lookup below when the retriever returns nothing.
        print('No documents retrieved.')
        print('--' * 50)
        continue

    # rerank_docs yields (document, score) pairs sorted best-first.
    top_document = reranked_documents[0][0]
    print(top_document.page_content)
    print("--" * 50)
    print('metadata: ', top_document.metadata)
    print('--' * 50)


In [33]:
# Top-1 similarity search for query1. Only the query that is actually searched
# is printed, so the output is labelled correctly.
print(query1)
results = db.similarity_search_with_relevance_scores(query1, k=1)
pretty_print_docs(results)

In [31]:

# Top-1 similarity search for query2. The search uses the same query that is
# printed, so the displayed result belongs to the displayed question.
print(query2)
results = db.similarity_search_with_relevance_scores(query2, k=1)
pretty_print_docs(results)

In [34]:
# Show the single best vector-store hit (k=1) for query8.
print(query8)
results = db.similarity_search_with_relevance_scores(query8, k = 1)
for r in results:
    # Only element 0 of each result is printed; presumably each result is a
    # (document, score) pair — confirm against the vector store's API.
    print(r[0])

In [35]:
# Retrieve and rerank candidates for query8, then show the best (document, score) pair.
retrieved_documents = retriever.get_relevant_documents(query8)
# Assign to the same name that is printed below, so this cell shows its own
# result rather than a variable left over from an earlier cell.
reranked_documents = rerank_docs(query8, retrieved_documents)
print(reranked_documents[0])

In [1]:
from retriver import (
    RAGException,
    create_parent_retriever,
    load_embedding_model,
    load_pdf,
    load_reranker_model,
    retrieve_context
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [9]:
from client import RAGClient

In [10]:
# Create the project's RAG client for one PDF. What the constructor does with
# the path is defined in client.py, which is not shown in this notebook.
c = RAGClient("/teamspace/studios/this_studio/data/2401.08406.pdf")

In [11]:
# Ask a question through the client; the captured output below is a dict that
# includes a 'contexts' entry.
c.generate('what is rag')

{'contexts': [(Document(page_content='Listing 3: Prompt used to identify supporting context (i.e., list of locations and agronomic topics mentioned) from a document during question generation.\n\n2.4 Answer Generation\n\nWe employ Retrieval-Augmented Generation (RAG) (Lewis et al., 2020), which is an innovative approach that combines the power of retrieval and generation mechanisms, to create high-quality answers. RAG is particularly useful when dealing with large and complex datasets, as it can effectively recover relevant information associated with a query and use it to enhance the generation process.\n\nThe RAG pipeline begins by retrieving, for a given question, the most relevant documents or passages from our dataset. The retrieval system employs techniques such as BM25, Dense Retrieval (Reimers and Gurevych, 2019; Ni et al., 2022), and other advanced retrieval mechanisms. The retrieved documents serve as a knowledge source for the subsequent generation phase. Once the relevant p

In [12]:
# Print each piece yielded by c.stream as it arrives; end='' suppresses the
# newline so consecutive pieces are joined in the output.
for r in c.stream('what is rag'):
    print(r , end = '')

[32m2024-02-19 09:47:44.751[0m | [1mINFO    [0m | [36mclient[0m:[36mstream[0m:[36m46[0m - [1mListing 3: Prompt used to identify supporting context (i.e., list of locations and agronomic topics mentioned) from a document during question generation.

2.4 Answer Generation

We employ Retrieval-Augmented Generation (RAG) (Lewis et al., 2020), which is an innovative approach that combines the power of retrieval and generation mechanisms, to create high-quality answers. RAG is particularly useful when dealing with large and complex datasets, as it can effectively recover relevant information associated with a query and use it to enhance the generation process.

The RAG pipeline begins by retrieving, for a given question, the most relevant documents or passages from our dataset. The retrieval system employs techniques such as BM25, Dense Retrieval (Reimers and Gurevych, 2019; Ni et al., 2022), and other advanced retrieval mechanisms. The retrieved documents serve as a knowledge sou

Based on the context provided, RAG (Retrieval-Augmented Generation) is an approach that combines retrieval and generation mechanisms to create high-quality answers for given questions. It begins with retrieving relevant documents or passages using techniques like BM25, Dense Retrieval, and advanced retrieval mechanisms. The retrieved information serves as a knowledge source for the subsequent generation phase by guiding the LLM (Language Learning Model) that takes the question and the retrieved documents as inputs to generate contextually appropriate answers. This approach ensures accurate, relevant, and informative Q&A pairs by being guided by the context provided by the retrieved documents. The first step in this process involves computing embeddings from text chunks extracted from PDF documents using sentence transformers and creating a database of these embeddings using Facebook AI Similarity Search for efficient indexing and similarity search of vectors.