In [1]:
import argparse
import logging
import os
import shutil
from pathlib import Path
from typing import List

import gradio as gr
import torch
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import OllamaLLM
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import CrossEncoder

In [2]:
# Project root. Set the DRAGON_ROOT environment variable to run the notebook
# from another location; without it the original local path is used.
ROOT = Path(os.environ.get("DRAGON_ROOT", r"C:\Users\Archit\Documents\ML Projects\D-RAGon_System"))
DATA_DIR = ROOT / 'Data'      # directory the PDF loader reads from
CHROMA_DIR = ROOT / 'Chroma'  # passed to Chroma as persist_directory

In [3]:
# Show the resolved project paths, one per line.
print(ROOT, DATA_DIR, CHROMA_DIR, sep="\n")

C:\Users\Archit\Documents\ML Projects\D-RAGon_System
C:\Users\Archit\Documents\ML Projects\D-RAGon_System\Data
C:\Users\Archit\Documents\ML Projects\D-RAGon_System\Chroma


In [5]:
def load_docs():
    """Load the PDFs found under DATA_DIR.

    Returns:
        Whatever ``PyPDFDirectoryLoader(DATA_DIR).load()`` produces.
    """
    return PyPDFDirectoryLoader(DATA_DIR).load()

In [6]:
def filter_pages(docs, min_chars = 200, blacklist=None):
    """Drop pages that are too short or look like boilerplate.

    Args:
        docs: iterable of objects exposing a ``page_content`` string.
        min_chars: pages whose (lower-cased) text is shorter than this are
            discarded.
        blacklist: optional iterable of phrases; a page containing any of
            them (case-insensitively) is discarded. ``None`` selects the
            built-in list of front-matter phrases. Pass an empty iterable to
            disable phrase filtering.

    Returns:
        A new list with the surviving documents, in their original order.
    """
    if blacklist is None:
        phrases = (
            "all rights reserved",
            "copyright",
            "isbn",
            "table of contents",
        )
    else:
        # Page text is lower-cased below, so normalise the phrases the same way.
        phrases = tuple(phrase.lower() for phrase in blacklist)

    cleaned = []
    for d in docs:
        text = d.page_content.lower()

        if len(text) < min_chars:  # removes short pages
            continue
        if any(phrase in text for phrase in phrases):  # removes boilerplate pages
            continue
        cleaned.append(d)
    return cleaned

In [7]:
def split_docs(docs: List):
    """Split pages into overlapping chunks and discard the short ones.

    Uses a RecursiveCharacterTextSplitter configured for 800-character chunks
    with an 80-character overlap, then keeps only chunks whose text is longer
    than 200 characters.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    return [
        piece
        for piece in splitter.split_documents(docs)
        if len(piece.page_content) > 200
    ]

In [9]:
# Build the corpus: load the PDFs, drop short/boilerplate pages, then chunk.
docs = filter_pages(load_docs())
chunks = split_docs(docs)

print(chunks[0])

page_content='CONTENTS
INTRODUCTION
1. I SHOULD HAVE BEEN A STATISTIC
2. TRUTH HURTS
3. THE IMPOSSIBLE TASK
4. TAKING SOULS
5. ARMORED MIND
6. IT’S NOT ABOUT A TROPHY
7. THE MOST POWERFUL WEAPON
8. TALENT NOT REQUIRED
9. UNCOMMON AMONGST UNCOMMON
10. THE EMPOWERMENT OF FAILURE
11. WHAT IF?
ACKNOWLEDGMENTS
ABOUT THE AUTHOR' metadata={'producer': 'calibre (2.85.1) [https://calibre-ebook.com]', 'creator': 'calibre (2.85.1) [https://calibre-ebook.com]', 'creationdate': '2020-06-25T21:00:51+00:00', 'author': 'David Goggins', 'moddate': '2020-06-25T21:01:00+00:00', 'title': "Can't Hurt Me: Master Your Mind and Defy the Odds", 'source': 'C:\\Users\\Archit\\Documents\\ML Projects\\RAG-Based-PDF-QA-System\\Data\\Can_t-Hurt-Me-David-Goggins.pdf', 'total_pages': 303, 'page': 3, 'page_label': '4'}


In [10]:
# Number of pages kept after filtering, then number of chunks.
print(len(docs), len(chunks), sep="\n")

299
959


In [14]:
def calc_chunk_ids(chunks: List):
    """Stamp every chunk with an ID of the form ``<source>:<page>:<index>``.

    ``index`` counts chunks that follow one another with the same
    source/page pair and restarts at 0 whenever that pair changes, e.g.
    "data/monopoly.pdf:6:2". The ID is written to ``chunk.metadata["id"]``.

    Returns:
        The same list that was passed in (chunks are modified in place).
    """
    previous_page_key = None
    index_on_page = 0

    for item in chunks:
        meta = item.metadata
        page_key = f"{meta.get('source')}:{meta.get('page')}"

        # Consecutive chunks of one page get increasing indices.
        index_on_page = index_on_page + 1 if page_key == previous_page_key else 0

        meta["id"] = f"{page_key}:{index_on_page}"
        previous_page_key = page_key

    return chunks

In [8]:
def get_embeddings_function(device: str = None):
    """Build the BGE embedding model used for indexing and retrieval.

    Args:
        device: device string handed to the model (e.g. "cuda" or "cpu").
            When None, "cuda" is chosen if ``torch.cuda.is_available()``,
            otherwise "cpu".

    Returns:
        A ``HuggingFaceBgeEmbeddings`` instance for 'BAAI/bge-large-en-v1.5'
        with normalised embeddings.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    return HuggingFaceBgeEmbeddings(
        model_name='BAAI/bge-large-en-v1.5',
        # Use the resolved device; a hard-coded 'cuda' fails on CPU-only hosts.
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True},
    )

In [15]:
def add_to_chroma(chunks: List):
    """Insert chunks into the Chroma store, skipping those already present.

    Each chunk is given an ID by ``calc_chunk_ids``; chunks whose ID is
    already in the collection at CHROMA_DIR are not added again.

    Args:
        chunks: chunk documents to index. Their metadata gains an "id" key.
    """
    # Open the store at CHROMA_DIR.
    db = Chroma(persist_directory=CHROMA_DIR,
                embedding_function=get_embeddings_function())

    chunks_with_ids = calc_chunk_ids(chunks)

    # include=[] asks for no payload fields; only the 'ids' entry is read here.
    existing_ids = set(db.get(include=[])['ids'])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # Only add chunks that are not in the DB yet.
    new_chunks = [
        chunk for chunk in chunks_with_ids
        if chunk.metadata['id'] not in existing_ids
    ]

    if new_chunks:
        print(f'Adding new documents: {len(new_chunks)}')
        new_chunk_ids = [chunk.metadata['id'] for chunk in new_chunks]
        db.add_documents(new_chunks, ids=new_chunk_ids)
    else:
        print("No New Documents to add")

In [23]:
# Shared objects used by the query functions below.

# Embedding model and the vector store persisted at CHROMA_DIR
emb_fxn = get_embeddings_function()
db = Chroma(persist_directory=CHROMA_DIR, embedding_function=emb_fxn)

# LLM that generates the answers
llm = OllamaLLM(model='llama3.1')

# Cross-encoder used by rerank() to score (query, passage) pairs
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: BAAI/bge-large-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/105 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: cross-encoder/ms-marco-MiniLM-L-6-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [17]:
# Prompt template for short, extractive answers. It has two placeholders,
# {context} and {question}, which query_rag() fills in before calling the LLM.
Base_PROMPT_strict = """
You must answer using ONLY the exact words from the context.

Rules:
- Do NOT explain.
- Do NOT rephrase.
- Do NOT add extra information.
- Return ONLY the answer phrase.

Context:
{context}

Question:
{question}

Answer:
"""

In [18]:
def rerank(query, docs, top_n=4):
    """Re-order retrieved chunks by cross-encoder score and keep the best ones.

    Args:
        query: the question string.
        docs: list of (Document, score) pairs, as returned by
            ``db.similarity_search_with_score``. The retrieval score is
            ignored here.
        top_n: maximum number of documents to return.

    Returns:
        A list of at most ``top_n`` Document objects (without scores), from
        highest to lowest cross-encoder score. Empty when ``docs`` is empty.
    """
    if not docs:
        # Nothing was retrieved; avoid calling the model on an empty batch.
        return []

    pairs = [(query, doc.page_content) for doc, _ in docs]
    ce_scores = cross_encoder.predict(pairs)

    ranked = sorted(zip(docs, ce_scores), key=lambda item: item[1], reverse=True)

    # Unpack the original (doc, retrieval_score) pair and return the doc only.
    return [doc for (doc, _retrieval_score), _ce_score in ranked[:top_n]]

In [19]:
def query_rag(query_text: str, return_context=False, return_sources=False):
    """Answer a single question from the indexed PDFs (no chat history).

    Retrieves 10 candidates from ``db``, re-ranks them with ``rerank``,
    joins the surviving chunks into a context and asks ``llm`` using
    ``Base_PROMPT_strict``.

    Args:
        query_text: the user's question.
        return_context: also return the context string sent to the LLM.
        return_sources: also return the list of chunk IDs (``metadata['id']``)
            of the chunks used as context.

    Returns:
        The response text alone, or a tuple starting with the response text
        followed by the context and/or sources (in that order) when requested.
    """
    results = db.similarity_search_with_score(query_text, k=10)
    reranked_docs = rerank(query_text, results)

    context = "\n\n---\n\n".join(doc.page_content for doc in reranked_docs)

    prompt_template = ChatPromptTemplate.from_template(Base_PROMPT_strict)
    prompt = prompt_template.format(context=context, question=query_text)

    response_text = llm.invoke(prompt)
    sources = [doc.metadata.get('id', None) for doc in reranked_docs]

    extras = []
    if return_context:
        extras.append(context)
    if return_sources:
        extras.append(sources)

    if extras:
        return (response_text, *extras)
    return response_text

In [24]:
# Single-turn check of the pipeline; the returned answer is the cell output.
query_rag("What is a RPN?")

'A fully-convolutional network that simultaneously predicts object bounds and ratios at that location.'

In [25]:
# INFO-level logging for this notebook; the listed third-party loggers are
# raised to WARNING.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
for _noisy in ("chromadb", "sentence_transformers", "langchain", "urllib3", "httpx"):
    logging.getLogger(_noisy).setLevel(logging.WARNING)

In [26]:
# Shared conversation log. query_rag_hist() appends ("user", text) and
# ("assistant", text) tuples to the history list it is given.
CHAT_HISTORY = []

In [27]:
def reset_chat_history():
    """Empty the shared CHAT_HISTORY list in place and confirm on stdout."""
    del CHAT_HISTORY[:]
    print("Chat history reset.")

In [28]:
# Prompt used by rewrite_query_with_history() to turn a follow-up question
# into a standalone one. Placeholders: {history} and {question}.
REWRITE_PROMPT = """
Given the chat history and the latest question, rewrite the question so it is standalone and can be understood without the history.

Chat history:
{history}

Latest question: {question}

Standalone question:
"""

In [29]:
def rewrite_query_with_history(query: str, history: list):
    """Rewrite a follow-up question so it can be understood on its own.

    Args:
        query: the latest user question.
        history: list of (role, message) tuples; only the last 6 entries are
            shown to the model.

    Returns:
        ``query`` unchanged when there is no history; otherwise the model's
        rewritten question with surrounding whitespace stripped. If the model
        returns only whitespace, the original ``query`` is returned so that
        retrieval never runs on an empty string.
    """
    if not history:
        return query

    model = OllamaLLM(model="llama3.1")

    history_text = "\n".join(f"{role}: {msg}" for role, msg in history[-6:])
    prompt = REWRITE_PROMPT.format(history=history_text, question=query)

    rewritten = model.invoke(prompt).strip()
    return rewritten or query

In [30]:
# Prompt template for conversational answers, filled in by query_rag_hist().
# Placeholders: {history}, {context} and {question}.
Base_PROMPT_2  = """
Chat history:
{history}

Context:
{context}

Answer the question based on the above context

Question: {question}
"""

In [44]:
def query_rag_hist(query_text: str, history: list, return_context=False, return_sources=False):
    """Answer a question using retrieval plus the running chat history.

    The question is first rewritten into a standalone query, which drives
    retrieval and re-ranking. The LLM is then prompted with the last 6
    history entries, the retrieved context and the original question.
    The new user/assistant turn is appended to ``history`` in place.

    Args:
        query_text: the user's latest message.
        history: list of (role, message) tuples; mutated by this call.
        return_context: also return the context string sent to the LLM.
        return_sources: also return the chunk IDs used as context.

    Returns:
        The response text alone, or a tuple starting with the response text
        followed by the context and/or sources (in that order) when requested.
    """
    search_query = rewrite_query_with_history(query_text, history)

    candidates = db.similarity_search_with_score(search_query, k=10)
    top_docs = rerank(search_query, candidates)

    context = "\n\n---\n\n".join(d.page_content for d in top_docs)
    recent_turns = "\n".join(f"{role}: {msg}" for role, msg in history[-6:])

    prompt = ChatPromptTemplate.from_template(Base_PROMPT_2).format(
        context=context,
        question=query_text,
        history=recent_turns,
    )
    response_text = llm.invoke(prompt)

    # The turn is recorded only after the model has answered.
    history.append(("user", query_text))
    history.append(("assistant", response_text))
    sources = [d.metadata.get('id', None) for d in top_docs]

    extras = []
    if return_context:
        extras.append(context)
    if return_sources:
        extras.append(sources)

    return (response_text, *extras) if extras else response_text

In [55]:
# First conversational turn; the exchange is appended to CHAT_HISTORY.
query_rag_hist("What is an RPN?", CHAT_HISTORY)

INFO: Load pretrained SentenceTransformer: BAAI/bge-large-en-v1.5
INFO: HTTP Request: HEAD https://huggingface.co/BAAI/bge-large-en-v1.5/resolve/main/modules.json "HTTP/1.1 307 Temporary Redirect"
INFO: HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/BAAI/bge-large-en-v1.5/d4aa6901d3a41ba39fb536a557fa166f842b0e09/modules.json "HTTP/1.1 200 OK"
INFO: HTTP Request: HEAD https://huggingface.co/BAAI/bge-large-en-v1.5/resolve/main/config_sentence_transformers.json "HTTP/1.1 307 Temporary Redirect"
INFO: HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/BAAI/bge-large-en-v1.5/d4aa6901d3a41ba39fb536a557fa166f842b0e09/config_sentence_transformers.json "HTTP/1.1 200 OK"
INFO: HTTP Request: HEAD https://huggingface.co/BAAI/bge-large-en-v1.5/resolve/main/config_sentence_transformers.json "HTTP/1.1 307 Temporary Redirect"
INFO: HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/BAAI/bge-large-en-v1.5/d4aa6901d3a41ba39fb536a557fa166f842b0e09/confi

Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

BertModel LOAD REPORT from: BAAI/bge-large-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
INFO: HTTP Request: HEAD https://huggingface.co/BAAI/bge-large-en-v1.5/resolve/main/config.json "HTTP/1.1 307 Temporary Redirect"
INFO: HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/BAAI/bge-large-en-v1.5/d4aa6901d3a41ba39fb536a557fa166f842b0e09/config.json "HTTP/1.1 200 OK"
INFO: HTTP Request: HEAD https://huggingface.co/BAAI/bge-large-en-v1.5/resolve/main/tokenizer_config.json "HTTP/1.1 307 Temporary Redirect"
INFO: HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/BAAI/bge-large-en-v1.5/d4aa6901d3a41ba39fb536a557fa166f842b0e09/tokenizer_config.json "HTTP/1.1 200 OK"
INFO: HTTP Request: GET https://huggingface.co/api/models/BAAI/bge-large-en-v1.5/tree

Standalone query: What is an RPN?

Response:  According to the abstract:

"An RPN (Region Proposal Network) is a fully-convolutional network that simultaneously predicts object bounds and objectness scores."

Sources: ['C:\\Users\\Archit\\Documents\\ML Projects\\RAG-Based-PDF-QA-System\\Data\\Faster-RCNN-Paper.pdf:3:6', 'C:\\Users\\Archit\\Documents\\ML Projects\\RAG-Based-PDF-QA-System\\Data\\Faster-RCNN-Paper.pdf:5:1', 'C:\\Users\\Archit\\Documents\\ML Projects\\RAG-Based-PDF-QA-System\\Data\\Faster-RCNN-Paper.pdf:7:2', 'C:\\Users\\Archit\\Documents\\ML Projects\\RAG-Based-PDF-QA-System\\Data\\Faster-RCNN-Paper.pdf:0:0']


'According to the abstract:\n\n"An RPN (Region Proposal Network) is a fully-convolutional network that simultaneously predicts object bounds and objectness scores."'

In [58]:
# Follow-up turn: "it" has to be resolved from the history in CHAT_HISTORY.
query_rag_hist("How does it differ from Fast R-CNN?", CHAT_HISTORY)

Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

BertModel LOAD REPORT from: BAAI/bge-large-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Standalone query: How does an RPN (Region Proposal Network) differ from Fast R-CNN?

Response:  According to the abstract and the text, an RPN (Region Proposal Network) differs from Fast R-CNN in that:

* An RPN is a fully-convolutional network that simultaneously predicts object bounds and objectness scores at each position.
* It shares full-image convolutional features with the detection network, making the region proposal step "nearly cost-free".
* In contrast, Fast R-CNN relies on pre-computed region proposals, which are then used for detection.

Overall, an RPN is a more integrated approach that combines region proposal generation and object detection into a single end-to-end process.

Sources: ['C:\\Users\\Archit\\Documents\\ML Projects\\RAG-Based-PDF-QA-System\\Data\\Faster-RCNN-Paper.pdf:0:0', 'C:\\Users\\Archit\\Documents\\ML Projects\\RAG-Based-PDF-QA-System\\Data\\Faster-RCNN-Paper.pdf:6:1', 'C:\\Users\\Archit\\Documents\\ML Projects\\RAG-Based-PDF-QA-System\\Data\\Faster-RC

'According to the abstract and the text, an RPN (Region Proposal Network) differs from Fast R-CNN in that:\n\n* An RPN is a fully-convolutional network that simultaneously predicts object bounds and objectness scores at each position.\n* It shares full-image convolutional features with the detection network, making the region proposal step "nearly cost-free".\n* In contrast, Fast R-CNN relies on pre-computed region proposals, which are then used for detection.\n\nOverall, an RPN is a more integrated approach that combines region proposal generation and object detection into a single end-to-end process.'

In [32]:
def main(argv=None):
    """Command-line entry point for the pipeline.

    Sub-commands:
        ingest      load PDFs, filter, split and add the chunks to Chroma
        query --q   answer one question and print the answer
        chat        interactive loop with chat history; type 'exit' to quit
        reset-chat  clear the in-process CHAT_HISTORY list
        info        print page and chunk counts

    Args:
        argv: optional list of argument strings. When None, argparse reads
            ``sys.argv[1:]``; passing a list lets the function be called
            from a notebook or a test.
    """
    parser = argparse.ArgumentParser(description="Run PDF -> embeddings -> Chroma pipeline or query RAG system.")
    sub = parser.add_subparsers(dest="cmd")

    sub.add_parser("ingest", help="Load PDFs, split, and add to Chroma DB")

    query_parser = sub.add_parser("query", help="Run a RAG query")
    query_parser.add_argument("--q", required=True, help="Question to ask")

    sub.add_parser("reset-chat", help="Reset the chat history")

    # Interactive: no --q argument, the user is prompted for each message.
    sub.add_parser("chat", help="Run conversational RAG with history")

    sub.add_parser("info", help="Show counts of documents and chunks")

    args = parser.parse_args(argv)

    if args.cmd == "ingest":
        logging.info(f"Loading PDFs from {DATA_DIR}")
        docs = load_docs()
        docs = filter_pages(docs)
        chunks = split_docs(docs)
        add_to_chroma(chunks)
        logging.info("Ingestion complete")

    elif args.cmd == "query":
        # Print the answer; without this the command produces no output.
        print(query_rag(args.q))

    elif args.cmd == "chat":
        print("Starting conversational RAG. Type 'exit' to quit.\n")
        while True:
            user_input = input("You: ").strip()
            if user_input.lower() == "exit":
                break
            if not user_input:
                # Ignore blank lines instead of sending an empty question.
                continue
            answer = query_rag_hist(user_input, CHAT_HISTORY)
            print(f"Assistant: {answer}\n")

    elif args.cmd == "reset-chat":
        CHAT_HISTORY.clear()
        logging.info("Chat history reset")

    elif args.cmd == "info":
        docs = load_docs()
        docs = filter_pages(docs)
        chunks = split_docs(docs)
        print(f"Pages (after filter): {len(docs)}")
        print(f"Chunks: {len(chunks)}")

    else:
        parser.print_help()

In [None]:
#main()

In [None]:
#!python Code/pipeline.py ingest

In [None]:
#!python Code/pipeline.py query --q "what is RPN"

# UI - Gradio interface

In [33]:
def format_sources(sources):
    """Render chunk IDs as a bulleted, human-readable source list.

    Each ID is expected to look like ``<path>:<page>:<chunk>``. The path is
    reduced to its file name, and a numeric page index is shown 1-based (the
    loader's ``page`` metadata starts at 0 while its ``page_label`` starts at
    1). Entries that do not match the expected shape are shown verbatim.

    Args:
        sources: iterable of chunk ID strings (entries may be None).

    Returns:
        One "• ..." line per source, joined with newlines.
    """
    formatted = []
    for s in sources:
        try:
            path, page, _chunk = s.rsplit(":", 2)
        except (AttributeError, TypeError, ValueError):
            # Not a "<path>:<page>:<chunk>" string (e.g. None): show as-is.
            formatted.append(f"• {s}")
            continue

        if page.isdigit():
            page = int(page) + 1  # 0-based index -> page number for display
        formatted.append(f"• {os.path.basename(path)} (page {page})")
    return "\n".join(formatted)

In [34]:
def chat_fn(message, history):
    """Chat callback: answer ``message`` and append a formatted source list.

    The ``history`` argument is not used; the conversation state lives in
    the module-level CHAT_HISTORY instead.
    """
    answer, cited = query_rag_hist(message, CHAT_HISTORY, return_sources=True)
    return f"{answer}\n---\n### Sources\n{format_sources(cited)}\n"

In [35]:
def upload_pdf(file):
    """Copy an uploaded PDF into DATA_DIR and index it.

    Args:
        file: the upload handed over by the UI. Either a path string or an
            object exposing the path as ``.name``; None when nothing was
            selected.

    Returns:
        A status message for the UI.
    """
    if file is None:
        return 'No file Uploaded'

    source_path = Path(getattr(file, "name", file))

    # Join on the bare file name: joining DATA_DIR with a full path would
    # discard DATA_DIR and make the copy target the source itself.
    Path(DATA_DIR).mkdir(parents=True, exist_ok=True)
    save_path = Path(DATA_DIR) / source_path.name
    shutil.copy(source_path, save_path)

    # Re-scan the whole directory; add_to_chroma skips chunks already stored.
    docs = load_docs()
    docs = filter_pages(docs)
    chunks = split_docs(docs)
    add_to_chroma(chunks)

    return f'Indexed: {source_path.name}'

In [36]:
def reset_chat_ui():
    """Empty CHAT_HISTORY and return (empty text, status message) for the UI."""
    CHAT_HISTORY[:] = []
    return '', "Chat reset successfully."

In [37]:
def list_documents():
    """Return a bulleted list of the ``.pdf`` files directly inside DATA_DIR.

    Returns:
        One "• <file name>" line per PDF in sorted order, or
        "No documents uploaded." when DATA_DIR is missing or has no PDFs.
    """
    # This runs while the UI is being built, so a missing directory must not
    # raise; treat it the same as an empty one.
    if not os.path.isdir(DATA_DIR):
        return "No documents uploaded."

    # os.listdir order is arbitrary; sort for a stable display.
    files = sorted(name for name in os.listdir(DATA_DIR) if name.endswith(".pdf"))

    if not files:
        return "No documents uploaded."

    return "\n".join(f"• {file}" for file in files)

In [1]:
# Gradio UI. This cell needs `gr` (imports cell) and the helper functions
# chat_fn, list_documents, upload_pdf and reset_chat_ui to be defined in the
# current kernel before it is run.
with gr.Blocks() as demo:
    gr.Markdown("# Local RAG PDF QA System")

    with gr.Row():
        # Wider column (scale 3): chat interface plus example questions.
        with gr.Column(scale = 3):
            chatbot = gr.ChatInterface(fn=chat_fn,
                                       title="Ask questions about your PDFs")
            
            gr.Markdown("### Example Questions")
            # The examples are wired to the chat interface's textbox.
            gr.Examples(examples = ["What was David Goggins pullup record?",
                                    "How many Hell Weeks did Goggins complete?",
                                    "What is Faster R-CNN?",
                                    "What is ROI pooling used for?",],
                                    inputs = chatbot.textbox)

        # Narrower column (scale 1): document list, upload and reset controls.
        with gr.Column(scale=1):
            gr.Markdown("## Documents")
            # Initial value is computed once, when the layout is built.
            doc_list = gr.Textbox(value=list_documents(), 
                                  label="Indexed Documents",
                                  interactive=False,
                                  lines=4)
            gr.Markdown("## Upload new PDF")
            file_upload = gr.File(file_types = ['.pdf'],label = 'Select PDF')
            upload_btn = gr.Button("Add Document")
            upload_status = gr.Textbox(label = 'Upload Status')
            
            reset_btn = gr.Button("Reset Chat", variant="secondary")
            reset_status = gr.Markdown()
            
    # reset_chat_ui returns two values: one for the chat textbox and one for
    # the reset status area.
    reset_btn.click(fn=reset_chat_ui, outputs=[chatbot.textbox, reset_status])
    # Index the uploaded file, then refresh the document list.
    upload_btn.click(fn=upload_pdf, inputs=file_upload, outputs=upload_status).then(fn=list_documents, outputs=doc_list)

# Launch with inline=False and inbrowser=True.
demo.launch(inline = False, inbrowser=True)# share=True

NameError: name 'gr' is not defined