In [75]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama
from langchain_chroma import Chroma
import ollama
import shutil
import psutil
import stat
import os

In [76]:
# Folder holding the PDFs to index. This is machine-specific: replace it with
# the location of your own documents before running the notebook.
directory_path = "C:/Users/aryan/Documents/LLMs/LLM-PDF-Reader/Documents"

# Directory name handed to Chroma as its persist_directory.
chroma_path = "chroma"

In [77]:
def load_documents(directory_path):
    """Load the contents of a folder through ``PyPDFDirectoryLoader``.

    Args:
        directory_path: Folder passed straight to ``PyPDFDirectoryLoader``.

    Returns:
        The result of the loader's ``load()`` call.
    """
    return PyPDFDirectoryLoader(directory_path).load()

In [78]:
def split_documents(documents: list[Document]):
    """Cut documents into overlapping text chunks.

    Args:
        documents: Documents to split.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` for
        the settings below.
    """
    splitter_options = {
        "chunk_size": 800,       # upper bound per chunk, measured with len()
        "chunk_overlap": 80,     # amount shared between neighbouring chunks
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)

In [79]:
def get_embeddings():
    """Create the embedding model used for both indexing and querying.

    Returns:
        A new ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return embedding_model

In [80]:
def calculate_chunk_ids(chunks):
    """Stamp every chunk with an id shaped like ``<file name>:<page>:<index>``.

    ``<index>`` counts chunks within a run of consecutive chunks that share the
    same file name and page, starting at 0. The id is written to
    ``chunk.metadata["id"]``; the chunks are modified in place and nothing is
    returned.

    Args:
        chunks: Iterable of objects exposing a ``metadata`` dict. The "source"
            and "page" keys are read; a missing or empty source becomes
            "unknown".
    """
    previous_page_key = None
    index_on_page = 0
    for chunk in chunks:
        meta = chunk.metadata
        source_path = meta.get("source")
        page_number = meta.get("page")
        file_name = os.path.basename(source_path) if source_path else "unknown"
        page_key = f"{file_name}:{page_number}"

        # Keep counting while we stay on the same page, restart otherwise.
        index_on_page = index_on_page + 1 if page_key == previous_page_key else 0
        previous_page_key = page_key

        meta["id"] = f"{page_key}:{index_on_page}"

In [81]:
def add_to_db(chunks: list[Document], persist_directory: str):
    """Insert into the Chroma store only the chunks it does not already hold.

    Every chunk first receives an id via ``calculate_chunk_ids``. Chunks whose
    id is already present in the store are skipped, so running this again on
    the same documents adds nothing.

    Args:
        chunks: Chunks to index. Their ``metadata["id"]`` is (re)assigned in
            place by ``calculate_chunk_ids``.
        persist_directory: Directory passed to Chroma as ``persist_directory``.

    Returns:
        None. Progress is reported with ``print``.
    """
    db = Chroma(persist_directory=persist_directory, embedding_function=get_embeddings())

    # Assign unique IDs to chunks
    calculate_chunk_ids(chunks)

    # Only the ids are needed from the store, hence the empty include list.
    existing_items = db.get(include=[])
    existing_ids = set(existing_items["ids"])
    print(f"Number of Existing Chunks in DB: {len(existing_ids)}")

    new_chunks = [chunk for chunk in chunks if chunk.metadata["id"] not in existing_ids]

    if not new_chunks:
        print("No new chunks to add to the database.")
        return

    print(f"Number of New Chunks Added: {len(new_chunks)}")

    # Insert just the filtered chunks. Passing the full list here would
    # re-embed and rewrite everything already stored, and the count printed
    # above would no longer match what was written.
    new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
    db.add_documents(new_chunks, ids=new_chunk_ids)

In [82]:
def clear_database(persist_directory: str, reset=False):
    """Delete the Chroma collection, but only when explicitly asked to.

    Args:
        persist_directory: Directory passed to Chroma as ``persist_directory``.
        reset: Safety switch. When falsy (the default) the function does
            nothing; when truthy the collection is deleted.

    Returns:
        None in both cases.
    """
    if reset:
        # Open the store only when it is actually going to be wiped.
        store = Chroma(
            persist_directory=persist_directory,
            embedding_function=get_embeddings(),
        )
        store.delete_collection()
        print("Database cleared.")

In [83]:
def query_rag(query: str, persist_directory: str, k: int = 5, model_name: str = "mistral"):
    """Answer a question from the indexed chunks using a local Ollama model.

    The ``k`` chunks most similar to ``query`` are fetched from the Chroma
    store, joined into a context section, and sent to the model together with
    the question. The answer and the ids of the chunks used are printed.

    Args:
        query: The question to answer.
        persist_directory: Directory passed to Chroma as ``persist_directory``.
        k: Number of chunks to retrieve as context. Defaults to 5.
        model_name: Name of the Ollama model to call. Defaults to "mistral".

    Returns:
        The value returned by the model's ``invoke`` call (the answer text).
    """
    # Load the vector database (ChromaDB)
    db = Chroma(persist_directory=persist_directory, embedding_function=get_embeddings())

    # Search the DB.
    results = db.similarity_search_with_score(query, k=k)

    PROMPT_TEMPLATE = """
    You are an AI assistant. Use the provided context to answer the question accurately and concisely.

    Context:
    {context}

    If the context does not contain relevant information, respond with "I don't know."

    Question:
    {question}

    Answer:
    """

    # Retrieved chunks are separated by a visible rule inside the prompt.
    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query)

    model = Ollama(model=model_name)
    response_text = model.invoke(prompt)

    # Chunks without an "id" in their metadata are reported as None.
    sources = [doc.metadata.get("id") for doc, _score in results]
    print(f"Response: {response_text}\nSources: {sources}")

    return response_text

In [84]:
# Read the PDFs under directory_path, then cut the loaded documents into chunks.
documents = load_documents(directory_path)
chunks = split_documents(documents)

In [85]:
# reset is False here, so clear_database returns without touching the store.
# Pass True to wipe the collection before re-indexing.
clear_database(chroma_path, False)

In [86]:
# Spot-check one chunk: its text plus the metadata attached by the loader.
print(chunks[1])

page_content='unintended exposure could lead to severe conse-
quences, including breaches of private and sen-
sitive information. This paper presents a black-
box attack to force a RAG system to leak its
private knowledge base which, differently from
existing approaches, is adaptive and automatic.
A relevance-based mechanism and an attacker-
side open-source LLM favor the generation of
effective queries to leak most of the (hidden)
knowledge base. Extensive experimentation
proves the quality of the proposed algorithm
in different RAG pipelines and domains, com-
paring to very recent related approaches, which
turn out to be either not fully black-box, not
adaptive, or not based on open-source models.
The findings from our study remark the urgent
need for more robust privacy safeguards in the' metadata={'source': 'C:\\Users\\aryan\\Documents\\LLMs\\LLM-PDF-Reader\\Documents\\Pirates_of_the_RAG.pdf', 'page': 0, 'page_label': '1'}


In [87]:
# Tag every chunk with its "<file name>:<page>:<index>" id (written into
# chunk.metadata["id"]), then show one id as an example.
calculate_chunk_ids(chunks)
id_example = chunks[5].metadata["id"]
print(id_example)

Pirates_of_the_RAG.pdf:0:5


In [88]:
# Smoke-test the embedding model on a short string and show the first five
# components of the resulting vector.
embeddings = get_embeddings()
vector = embeddings.embed_query("Hello, world!")  # Generate an embedding
print(vector[:5])

[-0.038177136331796646, 0.0329110249876976, -0.005459396634250879, 0.01436989288777113, -0.04029098153114319]


In [89]:
# Index the chunks into the Chroma store at chroma_path; the function reports
# how many chunk ids the store already holds.
add_to_db(chunks, chroma_path)

Number of Existing Chunks in DB: 136
No new chunks to add to the database.


In [90]:
# query_rag prints the answer with its source ids and also returns the answer
# text, so as the last expression of the cell the answer is echoed a second time.
query_rag("Explain RAG", chroma_path)

Response:  RAG (Retrieval-Augmented Generation) is a system that enhances a language model's ability to generate responses by providing it with additional knowledge from a predefined database, often referred to as the knowledge base. This database can contain sensitive information and must be kept confidential to ensure privacy and security. RAG systems can be used in various applications such as customer support assistants, workflow streamlining within organizations, or medical support chatbots. The proposed algorithm allows a user with open-source tools to craft attacks on RAG systems, highlighting the need for more robust safeguards in their design. It's important to note that a RAG system is essentially an LLM (Language Learning Model) that generates text based on input prompts and retrieved information.
Sources: ['Pirates_of_the_RAG.pdf:1:1', 'Pirates_of_the_RAG.pdf:0:3', 'Pirates_of_the_RAG.pdf:9:1', 'Pirates_of_the_RAG.pdf:3:5', 'Pirates_of_the_RAG.pdf:1:6']


" RAG (Retrieval-Augmented Generation) is a system that enhances a language model's ability to generate responses by providing it with additional knowledge from a predefined database, often referred to as the knowledge base. This database can contain sensitive information and must be kept confidential to ensure privacy and security. RAG systems can be used in various applications such as customer support assistants, workflow streamlining within organizations, or medical support chatbots. The proposed algorithm allows a user with open-source tools to craft attacks on RAG systems, highlighting the need for more robust safeguards in their design. It's important to note that a RAG system is essentially an LLM (Language Learning Model) that generates text based on input prompts and retrieved information."