<a href="https://colab.research.google.com/github/ArjavRD/RAG-Chatbot-for-Research-Papers/blob/main/Research_Paper_RAG_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Full RAG Chatbot with detailed comments

# Step 1: Imports
import requests
import xml.etree.ElementTree as ET
import numpy as np
import faiss
import fitz  # PyMuPDF for PDF reading
from sentence_transformers import SentenceTransformer

# Step 2: Function to search arXiv using free API
def search_arxiv(query, max_results=5):
    """Search arXiv through its public Atom API and return paper metadata.

    Args:
        query: Free-text search terms, matched against all fields (``all:``).
        max_results: Maximum number of entries to request.

    Returns:
        A list of dicts with the keys ``"title"``, ``"summary"`` and
        ``"link"`` (the entry's Atom ``id``), one per returned entry.
        Missing or empty elements become empty strings.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        requests.RequestException: On connection failure or timeout.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    atom = "{http://www.w3.org/2005/Atom}"

    # Let requests build the query string so characters such as '&', '#'
    # or '+' in the user's query are escaped instead of corrupting the URL.
    response = requests.get(
        "http://export.arxiv.org/api/query",
        params={
            "search_query": f"all:{query}",
            "start": 0,
            "max_results": max_results,
        },
        timeout=30,  # never hang forever on a stalled connection
    )
    response.raise_for_status()

    # Parse the raw bytes so the XML declaration decides the encoding.
    root = ET.fromstring(response.content)

    papers = []
    for entry in root.findall(f"{atom}entry"):
        # findtext() yields None for a missing element and "" for an empty
        # one; normalise both to "" before stripping.
        papers.append({
            "title": (entry.findtext(f"{atom}title") or "").strip(),
            "summary": (entry.findtext(f"{atom}summary") or "").strip(),
            "link": (entry.findtext(f"{atom}id") or "").strip(),
        })
    return papers

# Step 3: Function to split long text into smaller chunks for processing
def chunk_text(text, size=500, overlap=0):
    """Split ``text`` into consecutive character windows.

    Args:
        text: The string to split.
        size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters each chunk shares with the previous
            one. The default of 0 produces disjoint chunks. A small overlap
            keeps sentences that straddle a boundary retrievable.

    Returns:
        A list of chunks in document order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap`` is not in
            the range ``0 <= overlap < size``.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")
    if not 0 <= overlap < size:
        raise ValueError("overlap must satisfy 0 <= overlap < size")

    step = size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + size])
        # Once a window reaches the end of the text, any further window
        # would only repeat a suffix of this one.
        if start + size >= len(text):
            break
    return chunks

# Step 4: Function to read a PDF file and return all the text
def parse_pdf(file_path):
    """Read a PDF and return the text of all its pages as one string.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of every page, concatenated in page order.
    """
    # The context manager closes the document (and its file handle) even
    # if text extraction fails part-way through.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)

# Step 5: Function to get sentence embeddings from a model
def get_embeddings(chunks, model):
    """Encode ``chunks`` with ``model`` and return whatever it produces.

    Args:
        chunks: The text chunks handed to ``model.encode``.
        model: Any object exposing an ``encode(chunks)`` method.

    Returns:
        The result of ``model.encode(chunks)``, unchanged.
    """
    vectors = model.encode(chunks)
    return vectors

# Step 6: Build a FAISS index for fast similarity search
def build_faiss_index(embeddings):
    """Build a flat (exact) L2 FAISS index over the given vectors.

    Args:
        embeddings: A non-empty 2-D array-like with one vector per row.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is empty or not two-dimensional.
    """
    # FAISS expects a C-contiguous float32 matrix; convert up front so
    # float64 arrays or plain lists of vectors are accepted too.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D array of shape (n, dim)"
        )

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

# Step 7: Retrieve the top-k relevant chunks given a query
def retrieve_top_chunks(query, chunks, index, model, top_k=3):
    """Return the chunks whose embeddings are closest to ``query``.

    Args:
        query: The question to embed and search for.
        chunks: The text chunks, in the same order as the indexed vectors.
        index: A FAISS index exposing ``search(vectors, k)``.
        model: An embedding model exposing ``encode(list_of_texts)``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks in the order the index returned them. Fewer
        are returned when fewer chunks are available.
    """
    # Never request more neighbours than there are chunks.
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    q_vec = np.asarray(model.encode([query]), dtype=np.float32)
    _, neighbours = index.search(q_vec, k)

    # FAISS pads missing results with -1; without this filter chunks[-1]
    # would silently return the last chunk as a bogus hit.
    return [chunks[i] for i in neighbours[0] if 0 <= i < len(chunks)]

# Step 8: Create a natural language prompt using retrieved context
def create_prompt(context_chunks, user_question):
    """Assemble the LLM prompt from retrieved context and the question.

    Args:
        context_chunks: Strings to place in the context section; they are
            separated by blank lines.
        user_question: The question appended after the context.

    Returns:
        A prompt with a ``Context:`` section, a ``Question:`` line and a
        trailing ``Answer:`` cue.
    """
    context_section = "Context:\n" + "\n\n".join(context_chunks)
    question_section = f"Question: {user_question}\nAnswer:"
    return "\n\n".join([context_section, question_section])

# Step 9: Print arXiv paper links for reference
def show_links(papers):
    """Print a "Sources" list with each paper's title and link.

    Args:
        papers: An iterable of dicts providing ``"title"`` and ``"link"``.
    """
    print("\nSources:")
    for paper in papers:
        title = paper['title']
        link = paper['link']
        # Title line, indented link line, then a blank separator line.
        print(f"- {title}", f"  {link}", "", sep="\n")

# Step 10: Main function that runs the chatbot
def run_chatbot(query_topic, user_question, pdf_path=None):
    """Run the retrieval pipeline and print a prompt ready for an LLM.

    Fetches arXiv papers for ``query_topic``, optionally adds the text of a
    local PDF, embeds all chunks, retrieves those closest to
    ``user_question`` and prints the resulting prompt plus the paper links.

    Args:
        query_topic: Search terms sent to arXiv.
        user_question: The question used for retrieval and in the prompt.
        pdf_path: Optional path of a PDF whose text is added to the corpus.
    """
    print("Loading embedding model...")
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Fetch papers
    print(f"Searching arXiv for: {query_topic}")
    arxiv_papers = search_arxiv(query_topic, max_results=5)
    arxiv_text = " ".join([f"{p['title']}: {p['summary']}" for p in arxiv_papers])
    all_chunks = chunk_text(arxiv_text)

    # Add optional PDF content
    if pdf_path:
        print(f"Reading PDF: {pdf_path}")
        all_chunks = all_chunks + chunk_text(parse_pdf(pdf_path))

    # With no arXiv hits and no (or an empty) PDF there is nothing to
    # embed; indexing an empty corpus would crash, so stop with a message.
    if not all_chunks:
        print("No text found for this topic; nothing to retrieve.")
        return

    # Embedding + indexing
    print("Generating embeddings and building vector index...")
    embeddings = get_embeddings(all_chunks, model)
    index = build_faiss_index(embeddings)

    # Retrieval
    print(f"Retrieving relevant context for: {user_question}")
    top_chunks = retrieve_top_chunks(user_question, all_chunks, index, model)
    prompt = create_prompt(top_chunks, user_question)

    # Output
    print("\n=== Prompt to send to an LLM ===\n")
    print(prompt)
    print("\n=== Source Papers ===")
    show_links(arxiv_papers)

# Ask user for input
def main():
    """Prompt for a topic, a question and an optional PDF, then run the bot."""
    topic = input("Enter your topic of interest (e.g. graph neural networks): ").strip()
    question = input("Enter your question: ").strip()

    # Accept "y"/"yes" in any case and ignore stray whitespace, so an
    # answer such as "Yes " is not silently treated as "no".
    use_pdf = input("Do you want to include a PDF? (yes/no): ").strip().lower()
    pdf_path = None
    if use_pdf in ("y", "yes"):
        # An empty answer means "no PDF after all".
        pdf_path = input("Enter path to the PDF file: ").strip() or None

    run_chatbot(topic, question, pdf_path)

# Run the chatbot interactively
if __name__ == "__main__":
    # Start the interactive session only when this code is executed
    # directly, not when it is imported as a module.
    main()


Enter your topic of interest (e.g. graph neural networks): stocks
Enter your question: analysis given a dataset
Do you want to include a PDF? (yes/no): no
Loading embedding model...
Searching arXiv for: stocks
Generating embeddings and building vector index...
Retrieving relevant context for: analysis given a dataset

=== Prompt to send to an LLM ===

Context:
ks. In this paper, we simulate
data-rich and data-poor fishery and survey data scenarios for a complex of
dover sole stocks. Simulated data for individual stocks were used to compare
estimation performance for single-stock and hierarchical multi-stock versions
of a Schaefer production model. The single-stock and best performing
multi-stock models were then used in stock assessments for the real dover sole
data. Multi-stock models often had lower estimation errors than single-stock
models when as

 targets at predicting
stock prices, less effort is made for profitable stock recommendation. Besides,
in existing approaches on modeli