<a href="https://colab.research.google.com/github/ArushiAgrawal/RAG-Document-Referencing/blob/main/RAG_Referencing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Demo RAG pipeline for checking Referencing within documents

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer

In [None]:
# Dummy data
# Toy corpus used throughout the notebook.
# Layout: documents[<document name>]["sections"][<section name>] is a dict with
#   "content":    the section text
#   "references": list of cross-reference strings, written either as
#                 "Section X.Y" (a section of the same document) or as
#                 "Document N, Section X.Y" (a section of another document)
documents = {
    "Document A": {
        "sections": {
            "Section 1.1": {
                "content": "Elephants are the largest land animals. They live in grasslands and forests, primarily in Africa and Asia.",
                "references": []
            },
            "Section 1.2": {
                "content": "Elephants are herbivores, primarily eating grass, leaves, and fruits. For more information on carnivorous animals, refer to Section 1.3.",
                "references": ["Section 1.3"]
            },
            "Section 1.3": {
                "content": "Lions are carnivores that hunt herbivores like zebras and antelopes. For carnivorous birds, see Document B, Section 2.1.",
                "references": ["Document B, Section 2.1"]
            }
        }
    },
    "Document B": {
        "sections": {
            "Section 2.1": {
                "content": "Eagles are powerful carnivorous birds. They hunt small mammals, fish, and other birds. For more on carnivorous mammals, see Document A, Section 1.3.",
                "references": ["Document A, Section 1.3"]
            },
            "Section 2.2": {
                "content": "Eagles have keen eyesight and can spot prey from great distances. They are found in various habitats around the world.",
                "references": []
            }
        }
    },
    "Document C": {
        "sections": {
            "Section 3.1": {
                "content": "Dolphins are intelligent marine mammals. They are known for their complex social behaviors and communication.",
                "references": []
            },
            "Section 3.2": {
                "content": "Dolphins primarily eat fish and squid. For details on carnivorous birds that hunt fish, refer to Document B, Section 2.1.",
                "references": ["Document B, Section 2.1"]
            }
        }
    }
}

'''
Referencing -
Doc A 1.1 - NO
Doc A 1.2 - Doc A 1.3
Doc A 1.3 - Doc B 2.1

Doc B 2.1 - Doc A 1.3
Doc B 2.2 - NO

Doc C 3.1 - NO
Doc C 3.2 - Doc B 2.1

'''

'\nReferencing - \nDoc A 1.1 - NO\nDoc A 1.2 - Doc A 1.3\nDoc A 1.3 - Doc B 2.1\n\nDoc B 2.1 - Doc A 1.3\nDoc B 2.2 - NO\n\nDoc C 3.1 - NO\nDoc C 3.2 - Doc B 2.1\n\n'

In [None]:
# Pretrained tokenizer + encoder used to embed queries and sections.
# Both are loaded from the same checkpoint name so they cannot drift apart
# when the model is swapped.
MODEL_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Embedding model: mean-pooled final-layer token states of the loaded encoder
def generate_embedding(text):
    """Embed text as the average of the encoder's final-layer token vectors.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: A string, or a list of strings that is embedded as one padded
            batch.

    Returns:
        numpy.ndarray with one row per input text and one column per hidden
        unit; a single string gives a single-row 2-D array.
    """
    import torch  # local import: the notebook's import cell does not import torch itself

    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inference only, so skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Average over real tokens only: padding positions (mask == 0) must not
    # dilute the mean when several texts of different lengths are batched.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embedding = (hidden * mask).sum(dim=1) / token_counts
    return embedding.cpu().numpy()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# test embedding
# Smoke test: embed one sample query and print the shape of the returned array.
# `query` is reused by the retrieval test cell further down.
query = "What do elephants eat?"
embedding = generate_embedding(query)
print(embedding.shape)

(1, 768)


In [None]:
# Preprocess and create embeddings for each section.
# Keys have the form "<document name> - <section name>"; each value keeps the
# section's embedding, its raw text and its list of cross-reference strings.
# Built as a single expression so re-running the cell is idempotent and no loop
# variables (e.g. `embedding`, `content`) leak into the notebook namespace.
document_embeddings = {
    f"{doc_name} - {section_name}": {
        "embedding": generate_embedding(section_data["content"]),
        "content": section_data["content"],
        "references": section_data["references"],
    }
    for doc_name, doc_data in documents.items()
    for section_name, section_data in doc_data["sections"].items()
}

In [None]:
# Sanity check: number of indexed sections (one entry per section of every document)
len(document_embeddings)

7

In [None]:
# Vector Retrieval Function
def retrieve_relevant_sections(query_embedding, k=2):
    """Return the names of the k sections most similar to the query.

    Similarity is the cosine similarity between ``query_embedding`` and each
    embedding stored in the module-level ``document_embeddings``.

    Args:
        query_embedding: 2-D array with a single row, as returned by
            ``generate_embedding`` for one string.
        k: Maximum number of sections to return. Values <= 0 yield an empty
            list; values larger than the number of sections return them all.

    Returns:
        List of "<document name> - <section name>" keys, most similar first.
    """
    section_names = list(document_embeddings.keys())
    # Guard explicitly: slicing with [-0:] would otherwise return *every*
    # section for k == 0, and np.vstack rejects an empty list.
    if k <= 0 or not section_names:
        return []

    # Stack the single-row section embeddings into one matrix so the
    # similarities are computed in one call instead of one call per section.
    section_matrix = np.vstack([v["embedding"] for v in document_embeddings.values()])
    similarities = cosine_similarity(query_embedding, section_matrix)[0]

    # Ascending argsort -> keep the last k -> reverse to get best-first order.
    top_k_indices = np.argsort(similarities)[-k:][::-1]

    # Return the top-k relevant sections
    return [section_names[i] for i in top_k_indices]

In [None]:
# testing Retrieval Function
# Embeds the sample `query` defined in the embedding test cell above and shows
# the names of the two most similar sections.
query_embedding = generate_embedding(query)
retrieve_relevant_sections(query_embedding, k=2)

['Document A - Section 1.1', 'Document A - Section 1.2']

In [None]:
# Cross-Reference Expansion
def expand_with_references(relevant_sections):
    """Add the sections that the given sections directly reference.

    Reference strings are read from the module-level ``document_embeddings``.
    Only one hop is followed: references of the newly added sections are not
    expanded further. Referenced keys are not checked for existence here.

    Args:
        relevant_sections: Section keys of the form
            "<document name> - <section name>".

    Returns:
        List without duplicates: the input sections in their given order,
        followed by referenced sections in the order they were discovered.
        (A plain set would return them in an arbitrary, run-dependent order
        and lose the retrieval ranking.)
    """
    # A dict keeps insertion order, so it serves as an ordered set here.
    expanded_sections = dict.fromkeys(relevant_sections)

    # Retrieve all referenced sections
    for section in relevant_sections:
        own_doc_name = section.split(" - ")[0]

        for ref in document_embeddings[section]["references"]:
            parts = ref.split(", ")
            if len(parts) > 1:  # External reference: "Document N, Section X.Y"
                target = f"{parts[0]} - {parts[1]}"
            else:  # Same document reference: "Section X.Y"
                target = f"{own_doc_name} - {ref}"
            expanded_sections.setdefault(target)

    return list(expanded_sections)


In [None]:
# Same-document reference: Document A, Section 1.2 lists "Section 1.3"
expand_with_references(['Document A - Section 1.2'])

['Document A - Section 1.3', 'Document A - Section 1.2']

In [None]:
# Cross-document reference: Document A, Section 1.3 lists "Document B, Section 2.1"
expand_with_references(['Document A - Section 1.3'])

['Document A - Section 1.3', 'Document B - Section 2.1']

In [None]:
# Answer Generation - stand-in for a real LLM call: stitches section texts together
def generate_answer(relevant_sections):
    """Build the "answer" by concatenating the text of the given sections.

    Each section is rendered as "<section key>: <content>" using the content
    stored in the module-level ``document_embeddings``; the rendered entries
    are separated by a blank line and kept in the order given.
    """
    return "\n\n".join(
        f"{name}: {document_embeddings[name]['content']}"
        for name in relevant_sections
    )

In [None]:
def answer_user_query(query, k=2):
    """Run the full demo pipeline for one query and print every stage.

    Stages: embed the query, retrieve the top-k sections, expand them with
    their cross-references, then assemble the answer text.

    Args:
        query: The user's question as a string.
        k: Number of sections to retrieve before reference expansion.
            Defaults to 2, the value that was previously hard-coded.

    Returns:
        None. All results are printed, so a notebook cell that ends with this
        call does not echo the answer a second time.
    """
    print(f"User Query: {query}\n")

    # Step 1: Process query and generate embedding
    query_embedding = generate_embedding(query)

    # Step 2: Retrieve the most relevant sections
    relevant_sections = retrieve_relevant_sections(query_embedding, k=k)
    print(f"Top Relevant Sections: {relevant_sections}\n")

    # Step 3: Expand sections with cross-references
    expanded_sections = expand_with_references(relevant_sections)
    print(f"Expanded Sections (with references): {expanded_sections}\n")

    # Step 4: Generate the final answer
    answer = generate_answer(expanded_sections)

    print(f"Generated Answer:\n{answer}")


In [None]:
# Example 1: end-to-end run for a question about elephants' diet
user_query = "What do elephants eat?"
answer_user_query(user_query)

User Query: What do elephants eat?

Top Relevant Sections: ['Document A - Section 1.1', 'Document A - Section 1.2']

Expanded Sections (with references): ['Document A - Section 1.3', 'Document A - Section 1.2', 'Document A - Section 1.1']

Generated Answer:
Document A - Section 1.3: Lions are carnivores that hunt herbivores like zebras and antelopes. For carnivorous birds, see Document B, Section 2.1.

Document A - Section 1.2: Elephants are herbivores, primarily eating grass, leaves, and fruits. For more information on carnivorous animals, refer to Section 1.3.

Document A - Section 1.1: Elephants are the largest land animals. They live in grasslands and forests, primarily in Africa and Asia.


In [None]:
# Example 2: end-to-end run for a question about dolphins
# NOTE(review): in the recorded run the second retrieved section is about
# elephants (Document A - Section 1.2), so retrieval quality here is rough.
user_query = "What can you tell me about dolphins?"
answer_user_query(user_query)

User Query: What can you tell me about dolphins?

Top Relevant Sections: ['Document C - Section 3.2', 'Document A - Section 1.2']

Expanded Sections (with references): ['Document A - Section 1.3', 'Document A - Section 1.2', 'Document B - Section 2.1', 'Document C - Section 3.2']

Generated Answer:
Document A - Section 1.3: Lions are carnivores that hunt herbivores like zebras and antelopes. For carnivorous birds, see Document B, Section 2.1.

Document A - Section 1.2: Elephants are herbivores, primarily eating grass, leaves, and fruits. For more information on carnivorous animals, refer to Section 1.3.

Document B - Section 2.1: Eagles are powerful carnivorous birds. They hunt small mammals, fish, and other birds. For more on carnivorous mammals, see Document A, Section 1.3.

Document C - Section 3.2: Dolphins primarily eat fish and squid. For details on carnivorous birds that hunt fish, refer to Document B, Section 2.1.
