# Vector Database and Agentic RAG Setup

<img src="./agentic_rag_arch.png" width=600>

# Notebook architecture

<img src="./assets/notebook_Architecture.png" width=600>

# Agentic RAG Setup

In [1]:
import os
import glob
import fitz
from dotenv import load_dotenv
import chromadb
import numpy as np
from typing import List
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from evaluate import load
import matplotlib.pyplot as plt
import gradio as gr
from smolagents import OpenAIServerModel, CodeAgent, ToolCallingAgent, HfApiModel, GradioUI ,tool
from smolagents.tools import Tool
import requests
from typing import Dict, List, Union
import re

In [2]:

load_dotenv()

reasoning_model_id = os.getenv("REASONING_MODEL_ID")
tool_model_id = os.getenv("TOOL_MODEL_ID")
huggingface_api_token = os.getenv("HUGGINGFACE_API_TOKEN")
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
db_dir = r"C:\Users\ACER NITRO\OneDrive\Bureau\Project 2SCI\VectorDB_Embeddings"
data_dir = "./data"
results_dir = "./results"
os.makedirs(results_dir, exist_ok=True)


In [3]:
def get_model(model_id):
    using_huggingface = os.getenv("USE_HUGGINGFACE", "yes").lower() == "yes"
    if using_huggingface:
        return HfApiModel(model_id=model_id, token=huggingface_api_token)
    else:
        return OpenAIServerModel(
            model_id=model_id,
            api_base="http://localhost:11434/v1",
            api_key="ollama"
        )

In [4]:
def ollama_chat(prompt, model=reasoning_model_id):
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={
            "model": model,
            "prompt": prompt,
            "stream": False  
        }
    )
    response.raise_for_status()
    result = response.json()
    return result['response']


### === Initialize Models === 

In [5]:
# Initialize models
embedding_model = SentenceTransformer(EMBEDDING_MODEL)
reasoning_model = get_model(reasoning_model_id)
tool_model = get_model(tool_model_id)
reasoner = get_model(reasoning_model_id)

### === INIT ChromaDB ===

In [6]:
# Initialize ChromaDB
client = chromadb.PersistentClient(path=db_dir)
collection = client.get_or_create_collection(name='ties_collection_emb', metadata={"hnsw:space": "cosine"})

### ---- PDF Processing and Chunking ----

In [7]:

def process_scientific_text(text):
    # Remove references (e.g., [10], [15])
    text = re.sub(r'\[\d+\]', '', text)
    
    # Remove formulas (e.g., Lntp = ..., Lclass = ...)
    text = re.sub(r'L[a-zA-Z]+\s*=\s*[^=]+', '', text)
    
    # Remove table-related content (e.g., "TABLE 1. OVERVIEW OF THE DGA DATASET")
    text = re.sub(r'TABLE \d+\..*?\n', '', text, flags=re.IGNORECASE)
    
    # Remove numerical results (e.g., 97%, 0.7%, 1458863)
    text = re.sub(r'\b\d+%|\b\d+\.\d+%|\b\d{3,}\b', '', text)
    
    # Remove dataset-specific details (e.g., URLs, dataset sizes)
    text = re.sub(r'https?://[^\s]+', '', text)
    text = re.sub(r'\b\d+\s*(domains|records|samples|queries)\b', '', text, flags=re.IGNORECASE)
    
    # Remove lines with DGA dataset examples (e.g., zsvubwnqlefqv.com, xshellghost)
    text = re.sub(r'^\w+\s+\d+\s+[^\s]+\.[a-z]+$', '', text, flags=re.MULTILINE)
    
    # Remove empty lines and normalize whitespace, preserving section headings
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    text = '\n'.join(lines)  # Preserve line breaks for section headings
    
    return text

In [8]:
# PDF processing
def extract_and_chunk_pdf(file_path, chunk_size=800, chunk_overlap=400):
    """Extracts text from a PDF and splits into chunks."""
    doc = fitz.open(file_path)
    text = "\n".join([page.get_text("text") for page in doc])
    
    # Extract abstract (assuming it's the first paragraph or labeled)
    abstract = text.split("\n\n")[0] if "abstract" in text.lower() else ""
    
    # Chunk text
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_text(text)
    chunks = [process_scientific_text(chunk) for chunk in chunks]
    return chunks, abstract

def compute_embeddings(chunks):
    """ Computes embeddings for text chunks """
    return embedding_model.encode(chunks, convert_to_numpy=True)

def store_in_vector_db(chunks, file_path):
    """Stores chunks and embeddings in ChromaDB."""
    doc_id = os.path.basename(file_path)
    embeddings = compute_embeddings(chunks)
    
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        collection.add(
            ids=[f"{doc_id}_chunk_{i}"],
            documents=[chunk],
            embeddings=[embedding.tolist()],
            metadatas=[{"source": doc_id, "chunk_id": i}]
        )
    
    return len(chunks)

In [9]:
chunks, abstract = extract_and_chunk_pdf(os.path.join(data_dir, "A Framework for Fine-Tuning LLMs using Heterogeneo.pdf"))

In [12]:
print(chunks[0])

A Framework for Fine-Tuning LLMs using Heterogeneous Feedback
Ryan Aponte 1, Ryan A. Rossi 2, Shunan Guo2, Franck Dernoncourt2,
Tong Yu2, Xiang Chen2, Subrata Mitra2, Nedim Lipka 2
1Carnegie Mellon University, 2Adobe Research
Abstract
Large language models (LLMs) have been
applied to a wide range of tasks, including
text summarization, web navigation, and chat-
bots. They have benefitted from supervised
fine-tuning (SFT) and reinforcement learning
from human feedback (RLHF) following an un-
supervised pretraining. These datasets can be
difficult to collect, limited in scope, and vary in
sample quality. Additionally, datasets can vary
extensively in supervision format, from numer-
ical to binary as well as multi-dimensional with
many different values. We present a framework
for fine-tuning LLMs using heterogeneous feed-
back, which has two main components. First,
we combine the heterogeneous feedback data
into a single supervision format, compatible
with methods like SFT and RLHF. Next,

In [13]:
print(abstract)

A Framework for Fine-Tuning LLMs using Heterogeneous Feedback
Ryan Aponte 1, Ryan A. Rossi 2, Shunan Guo2, Franck Dernoncourt2,
Tong Yu2, Xiang Chen2, Subrata Mitra2, Nedim Lipka 2
1Carnegie Mellon University, 2Adobe Research
Abstract
Large language models (LLMs) have been
applied to a wide range of tasks, including
text summarization, web navigation, and chat-
bots. They have benefitted from supervised
fine-tuning (SFT) and reinforcement learning
from human feedback (RLHF) following an un-
supervised pretraining. These datasets can be
difficult to collect, limited in scope, and vary in
sample quality. Additionally, datasets can vary
extensively in supervision format, from numer-
ical to binary as well as multi-dimensional with
many different values. We present a framework
for fine-tuning LLMs using heterogeneous feed-
back, which has two main components. First,
we combine the heterogeneous feedback data
into a single supervision format, compatible
with methods like SFT and RLHF. Next,

### ---- VectorDB querying ----

### === Extract Key Sections ===

In [10]:
def extract_sections(paper_text: str) -> str:
        """
        Extracts key sections from a scientific paper using regex.

        Args:
            paper_text (str): The full text of the scientific paper.

        Returns:
            str: A string containing the extracted sections formatted as "Section: Text".
        """
        sections = {}
        section_patterns = {
            "Abstract": r"(?i)^Abstract\n([\s\S]*?)(?=\n\n(?:Introduction|Methods|Results|Discussion|Conclusion|\Z))",
            "Introduction": r"(?i)^Introduction\n([\s\S]*?)(?=\n\n(?:Methods|Results|Discussion|Conclusion|\Z))",
            "Methods": r"(?i)^Methods\n([\s\S]*?)(?=\n\n(?:Results|Discussion|Conclusion|\Z))",
            "Results": r"(?i)^Results\n([\s\S]*?)(?=\n\n(?:Discussion|Conclusion|\Z))",
            "Discussion": r"(?i)^Discussion\n([\s\S]*?)(?=\n\n(?:Conclusion|\Z))",
            "Conclusion": r"(?i)^Conclusion\n([\s\S]*?)(?=\n\n|\Z)"
        }
        
        for section, pattern in section_patterns.items():
            match = re.search(pattern, paper_text, re.MULTILINE)
            if match:
                sections[section] = match.group(1).strip()
        
        if sections:
            sections_str = "\n\n".join([f"{key}:\n{value}" for key, value in sections.items()])
        else:
            sections_str = "Full Text:\n" + paper_text
        
        return sections_str

### ===Retrieval ====

In [11]:
def retrieve_similar_chunks(embeddings: np.ndarray, sections: str, top_k: int = 5) -> List[str]:
        """Retrieve top-k similar chunks from ChromaDB by matching each chunk separately."""
    
        client = chromadb.PersistentClient(path=db_dir)
        collection = client.get_or_create_collection(name='ties_collection_emb', metadata={"hnsw:space": "cosine"})
        doc_embedding = np.mean(embeddings, axis=0).tolist()
        results = collection.query(query_embeddings=[doc_embedding], n_results=top_k)
        chunks = results["documents"][0] if results["documents"] else []
    
        # Prioritize chunks from Abstract and Conclusion
        prioritized = []
        sections_lower = sections.lower()
        for chunk in chunks:
            if any(section in chunk.lower() for section in ["abstract", "conclusion"]) or any(section in sections_lower for section in ["abstract", "conclusion"]):
                prioritized.insert(0, chunk)
            else:
                prioritized.append(chunk)

        return prioritized[:top_k]


In [15]:
doc = fitz.open(data_dir + "/A Framework for Fine-Tuning LLMs using Heterogeneo.pdf")
text = "\n".join([page.get_text("text") for page in doc])
sections = extract_sections(text)
a=retrieve_similar_chunks(compute_embeddings(chunks),sections, top_k=5)
print(a[0:2])

['llm performance with self-guided data selection for\ninstruction tuning. Preprint, arXiv:2308.12032.\nReiichiro Nakano, Jacob Hilton, Suchir Balaji, Jeff Wu,\nLong Ouyang, Christina Kim, Christopher Hesse,\nShantanu Jain, Vineet Kosaraju, William Saunders,\nXu Jiang, Karl Cobbe, Tyna Eloundou, Gretchen\nKrueger, Kevin Button, Matthew Knight, Benjamin\nChess, and John Schulman. 2021. Webgpt: Browser-\nassisted question-answering with human feedback.\nCoRR, abs/2112.09332.\nLong Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Car-\nroll L. Wainwright, Pamela Mishkin, Chong Zhang,\nSandhini Agarwal, Katarina Slama, Alex Ray, John\nSchulman, Jacob Hilton, Fraser Kelton, Luke Miller,\nMaddie Simens, Amanda Askell, Peter Welinder,\nPaul Christiano, Jan Leike, and Ryan Lowe. 2022.\nTraining language models to follow instructions with\nhuman feedback. Preprint, arXiv:2203.02155.\nNils Reimers and Iryna Gurevych. 2019. Sentence-bert:\nSentence embeddings using siamese bert-networks.\nIn Proceedings

## Models


In [12]:
def baseline_model(pdf_path):
    """Baseline: Summarizes PDF chunks without RAG."""
    pdf_chunks = extract_and_chunk_pdf(pdf_path)[0]
    context = "\n".join(pdf_chunks) 
    prompt = f"""
    Summarize the following scientific paper content in 200 words:
    {context}
    """
    response = ollama_chat(prompt, model=reasoning_model_id)
    clean_response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL).strip()
    return clean_response

In [22]:
summary = baseline_model(os.path.join(data_dir, "A Framework for Fine-Tuning LLMs using Heterogeneo.pdf"))

In [23]:
print(summary)

The setup described for LLaMA-HD-0.2-S is a comprehensive approach to training an efficient model using advanced techniques:

1. **Model Architecture**: Utilizes LoRA with 16 dimensions, decomposing parameters into a low-rank (base) and random (noise) components for efficiency.

2. **Training Strategies**:
   - **SFT (Self-Attention Training)**: A fine-tuning phase focusing on attention patterns.
   - **Reward Model**: Enhances learning through structured feedback from interactions.
   - **RLHF (Reinforcement Learning with Human Feedback)**: Integrates human interaction for policy improvement.

3. **Data Setup**:
   - Utilizes OASST datasets, providing diverse tasks for generalization without retraining each instance.
   - Tasks like "What can I do in Miami" are structured to leverage task conversion systems (numerical → binary).

4. **Task Conversion**: Converts numerical data into formats suited for evaluation, simplifying tasks and improving consistency.

5. **Quantitative Metrics**

In [16]:
def classical_rag(pdf_path, user_query):
    """Classical RAG: Retrieves chunks based on query and summarizes."""
    pdf_chunks = extract_and_chunk_pdf(pdf_path)[0]
    sections = extract_sections("\n".join(pdf_chunks))
    results = retrieve_similar_chunks(compute_embeddings(pdf_chunks),sections, top_k=5)
    pdf_chunks = "\n".join(pdf_chunks[1::9])
    context = "\n".join(results) 
    prompt = f"""
You are a helpful and intelligent AI assistant specialized in summarizing scientific papers. Your task is to generate a comprehensive summary based on the provided PDF chunks, while enriching your response with relevant insights drawn from the retrieved contextual information.

Use the **PDF Context** as your primary source for the paper's content, and consult the **Retrieved Context** to enhance understanding, clarify technical terms, and provide broader perspective when necessary.

User Query:
{user_query}

PDF Context (extracted from the paper):
{pdf_chunks}

Retrieved Context (external relevant information):
{context}

Please ensure your summary directly addresses the user's query, remains grounded in the content of the PDF, and is enhanced—but not contradicted—by the retrieved context.
"""
    response = ollama_chat(prompt, model=reasoning_model_id)
    clean_response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL).strip()
    return clean_response


In [31]:
rag_summary = classical_rag(os.path.join(data_dir, "A Framework for Fine-Tuning LLMs using Heterogeneo.pdf"), "Summarize the paper in 200 words.")

In [32]:
print(rag_summary)

The framework described in the text leverages heterogeneous supervision by combining primary and secondary datasets to enhance fine-tuning of LLMs. Here's how it works:

1. **Primary Fine-Tuning Dataset**: This dataset consists of prompts with exactly two specific answers, represented as binary preference pairs. These are used for training a model that focuses on specific preferences.

2. **Secondary Fine-Tuning Dataset**: This dataset contains user-specific prompts with multiple responses, allowing the model to learn from different user contexts and priorities.

3. **Combining Datasets**: Both datasets are concatenated into a single feedback pool, enabling the model to access diverse perspectives of the data.

4. **Quality Selection**: The highest-quality pairs from each prompt are selected based on preference difference, ensuring that responses contribute uniquely to the model's learning.

5. **Diversity Selection**: Prompts with significant differences in preferences are prioritized

### === Agentic RAG ===

<img src="./assets/AgenticRAG.png" width=600>

In [17]:
@tool
def rag_with_reasoner(user_query: str, pdf_path: str) -> str:
    """
    Agentic RAG: Uses PDF chunks and retrieved context with reasoning.

    Args:
        user_query: The user query string.
        pdf_path: A list of text chunks from the PDF.
        pdf_embeddings: A numpy array containing the embeddings for the PDF chunks.

    Returns:
        A summary or answer generated by the reasoning agent.
    """
    # Create the reasoner for better RAG
    reasoning_model = get_model(reasoning_model_id)
    reasoner = CodeAgent(tools=[], model=reasoning_model, add_base_tools=False, max_steps=2)
    pdf_chunks= extract_and_chunk_pdf(pdf_path)[0]
    pdf_embeddings = compute_embeddings(pdf_chunks)
    sections = extract_sections("\n".join(pdf_chunks))
    retrieved_chunks = retrieve_similar_chunks(pdf_embeddings , sections , top_k=5)
    pdf_context = "\n".join(pdf_chunks[1::9]) # to avoid too long context
 
    prompt = f"""
    You are a scientific paper summarizer. Be concise and specific to help scientific community.
    Prioritize the uploaded document's content and use additional context only for enrichment.

    Uploaded Document:
    {pdf_context}

    Additional Context:
     f"Key Sections:\n{sections}\n\nPrioritized Chunks:\n{'\n'.join(retrieved_chunks)} 

    Query: {user_query}

     Answer:
     """
    response = reasoner.run(prompt, reset=False).split("</think>")[-1].strip()
    clean_response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL).strip()

    return clean_response

In [37]:
# test rag_with_reasoner
user_query = "Summarize the paper in 200 words."
pdf_path = os.path.join(data_dir, "A Framework for Fine-Tuning LLMs using Heterogeneo.pdf")
agentic_summary = rag_with_reasoner(user_query, pdf_path)

In [38]:
print(agentic_summary)

The paper explores the use of multiple datasets in fine-tuning language models, demonstrating flexibility and applicability across various tasks. Here's a structured summary:

### Theoretical Contributions:
1. **Multi-Task Learning**: Fine-tuning is achieved by leveraging diverse datasets, each providing different forms of feedback for varied tasks.
2. **Extracting Value from Feedback**: Datasets offer paired responses (primary) or single scores (secondary), allowing multi-priority fine-tuning.
3. **Leveraging Generic Patterns**: Through multiple datasets, the model can generalize knowledge without requiring precise annotations globally.

### Practical Applications:
1. **Toxicity Prediction**: A toy example illustrates learning toxicity prediction from diverse forms of feedback, suggesting generality in applicability beyond toxicology.
2. **Multiple Tasks**: Each prompt point offers multiple fine-grained parameters to learn contextually dependent tasks, enhancing overall model performa

# Evaluation

In [18]:
def evaluate_summary(generated_summary, reference_summary):
    """Evaluates a summary using ROUGE, BLEU, and BERTScore."""
    rouge = load("rouge")
    bleu = load("bleu")
    bertscore = load("bertscore")
    
    rouge_scores = rouge.compute(predictions=[generated_summary], references=[reference_summary])
    bleu_score = bleu.compute(predictions=[generated_summary], references=[reference_summary])
    bertscore_result = bertscore.compute(predictions=[generated_summary], references=[reference_summary], lang="en")
    
    return {
        "rouge1": rouge_scores["rouge1"],
        "rougeL": rouge_scores["rougeL"],
        "bleu": bleu_score["bleu"],
        "bertscore": bertscore_result["f1"][0]
    }

# Main pipeline
def process_and_evaluate(pdf_path, user_query="Summarize this paper in 200 words"):
    """Processes a PDF and evaluates Baseline, Classical-RAG, and AgenticRAG."""
    # Process PDF
    pdf_chunks, reference_summary = extract_and_chunk_pdf(pdf_path)
    
    # Run models
    baseline_summary = baseline_model(pdf_path)
    classical_summary = classical_rag(pdf_path, user_query)
    agentic_summary = rag_with_reasoner(user_query, pdf_path)
    # Evaluate
    if not reference_summary:
        reference_summary = pdf_chunks[0]  # Fallback to first chunk if no abstract
    
    metrics = {
        "Baseline": evaluate_summary(baseline_summary, reference_summary),
        "Classical-RAG": evaluate_summary(classical_summary, reference_summary),
        "AgenticRAG": evaluate_summary(agentic_summary, reference_summary)
    }
    
    return metrics, baseline_summary, classical_summary, agentic_summary

### === Batch Evaluation and Plot ===

In [19]:
def evaluate_all_papers():
    """Evaluates all PDFs in ./data and plots metrics as a bar chart."""
    pdf_files = glob.glob(os.path.join(data_dir, "*.pdf"))
    all_metrics = {}
    
    for pdf_path in pdf_files:
        doc_id = os.path.basename(pdf_path)
        metrics, _, _, _ = process_and_evaluate(pdf_path)
        all_metrics[doc_id] = metrics
    
    # Calculate average scores for each metric and model
    metrics_to_plot = ["rouge1", "rougeL", "bleu", "bertscore"]
    models = ["Baseline", "Classical-RAG", "AgenticRAG"]
    avg_scores = {model: [] for model in models}
    
    for metric in metrics_to_plot:
        for model in models:
            scores = [all_metrics[doc_id][model][metric] for doc_id in all_metrics]
            avg_score = np.mean(scores)
            avg_scores[model].append(avg_score)
    
    plt.figure(figsize=(10, 6))
    x = np.arange(len(metrics_to_plot))  # the label locations
    width = 0.25  # the width of the bars
    
    plt.bar(x - width, avg_scores["Baseline"], width, label="Baseline", color="lightgray")
    plt.bar(x, avg_scores["Classical-RAG"], width, label="Classical-RAG", color="lightblue", hatch='/')
    plt.bar(x + width, avg_scores["AgenticRAG"], width, label="AgenticRAG", color="skyblue")
    
    plt.xlabel("Metrics")
    plt.ylabel("Average Score (%)")
    plt.title("Average Metric Scores Across Papers")
    plt.xticks(x, [metric.upper() for metric in metrics_to_plot])
    plt.legend()
    plt.grid(True, axis='y', linestyle='--', alpha=0.7)
    
    plt.savefig(os.path.join(results_dir, "metrics_comparison_bar.png"))
    plt.close()
    
    return all_metrics

### === Gradio Interface ===

In [20]:
def gradio_interface(pdf_upload, user_query):
    """Gradio interface for testing."""
    if pdf_upload:
        chunks, _ = extract_and_chunk_pdf(pdf_upload)
        embeddings = compute_embeddings(chunks)
        store_in_vector_db(chunks, pdf_upload)
        baseline_summary = baseline_model(pdf_path)
        classical_summary = classical_rag(pdf_path, user_query)
        agentic_summary = rag_with_reasoner(user_query, pdf_path)
        
        return baseline_summary, classical_summary, agentic_summary
    return "Please upload a PDF.", "", ""

with gr.Blocks(theme=gr.themes.Soft(), title="📖 Paperly") as interface:
    gr.Markdown("# 📚 Agentic RAG for Scientific Papers")
    gr.Markdown("Upload a PDF and ask questions to retrieve key insights.")
    
    with gr.Row():
        with gr.Column():
            pdf_upload = gr.File(label="📄 Upload PDF", type="filepath")
            user_input = gr.Textbox(label="🔍 Ask a Question", placeholder="Summarize this paper in 200 words...")
            submit_btn = gr.Button("🔎 Retrieve & Summarize")
        
        with gr.Column():
            baseline_output = gr.Textbox(label="Baseline Summary", interactive=False)
            classical_output = gr.Textbox(label="Classical RAG Summary", interactive=False)
            agentic_output = gr.Textbox(label="Agentic RAG Summary", interactive=False)
    
    submit_btn.click(
        fn=gradio_interface,
        inputs=[pdf_upload, user_input],
        outputs=[baseline_output, classical_output, agentic_output]
    )

In [21]:
if __name__ == "__main__":
    # Run batch evaluation
    metrics = evaluate_all_papers()
    print("Evaluation completed. Plots saved in", results_dir)
    
    # Launch Gradio interface
    interface.launch(
        share=True,
        inbrowser=True,
        server_name="127.0.0.1",
        favicon_path="assets/paperly.png"
    )

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation completed. Plots saved in ./results
* Running on local URL:  http://127.0.0.1:7860

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


In [57]:
evaluate_all_papers(metrics)

{'A Framework for Fine-Tuning LLMs using Heterogeneo.pdf': {'Baseline': {'rouge1': 0.2621,
   'rougeL': 0.1069,
   'bertscore': 0.7802},
  'Classical-RAG': {'rouge1': 0.2446, 'rougeL': 0.1138, 'bertscore': 0.8039},
  'AgenticRAG': {'rouge1': 0.2692, 'rougeL': 0.1174, 'bertscore': 0.8123},
  'GraphRAG': {'rouge1': 0.3917, 'rougeL': 0.1294, 'bertscore': 0.8498}},
 'Evaluate Fine-tuning Strategies for Fetal Head Ult.pdf': {'Baseline': {'rouge1': 0.3108,
   'rougeL': 0.1239,
   'bertscore': 0.8129},
  'Classical-RAG': {'rouge1': 0.1642, 'rougeL': 0.085, 'bertscore': 0.8286},
  'AgenticRAG': {'rouge1': 0.2865, 'rougeL': 0.1302, 'bertscore': 0.8305},
  'GraphRAG': {'rouge1': 0.3432, 'rougeL': 0.1431, 'bertscore': 0.8377}},
 'Fine-tuning and aligning question answering models.pdf': {'Baseline': {'rouge1': 0.2386,
   'rougeL': 0.0981,
   'bertscore': 0.7985},
  'Classical-RAG': {'rouge1': 0.0943, 'rougeL': 0.058, 'bertscore': 0.7508},
  'AgenticRAG': {'rouge1': 0.2124, 'rougeL': 0.0941, 'berts

# REPLACEMENT CODE

### ===IN CASE OF RECALCULATION OF VECTOR DB WITH ANOTHER MODEL ===

In [None]:
# Recompute embeddings for all papers
for pdf_path in glob.glob("./data/*.pdf"):
    chunks, _ = extract_and_chunk_pdf(pdf_path)
    embeddings = embedding_model.encode(chunks)  # Using allenai/specter
    store_in_vector_db(chunks, pdf_path)

collection = client.get_or_create_collection(name='specter_collection_emb', metadata={"hnsw:space": "cosine"})


# Static Workflow Utilities

In [28]:
class WorkflowUtils:
    @staticmethod
    def extract_sections(paper_text: str) -> str:
        """
        Extracts key sections from a scientific paper using regex.

        Args:
            paper_text (str): The full text of the scientific paper.

        Returns:
            str: A string containing the extracted sections formatted as "Section: Text".
        """
        sections = {}
        section_patterns = {
            "Abstract": r"(?i)^Abstract\n([\s\S]*?)(?=\n\n(?:Introduction|Methods|Results|Discussion|Conclusion|\Z))",
            "Introduction": r"(?i)^Introduction\n([\s\S]*?)(?=\n\n(?:Methods|Results|Discussion|Conclusion|\Z))",
            "Methods": r"(?i)^Methods\n([\s\S]*?)(?=\n\n(?:Results|Discussion|Conclusion|\Z))",
            "Results": r"(?i)^Results\n([\s\S]*?)(?=\n\n(?:Discussion|Conclusion|\Z))",
            "Discussion": r"(?i)^Discussion\n([\s\S]*?)(?=\n\n(?:Conclusion|\Z))",
            "Conclusion": r"(?i)^Conclusion\n([\s\S]*?)(?=\n\n|\Z)"
        }
        
        for section, pattern in section_patterns.items():
            match = re.search(pattern, paper_text, re.MULTILINE)
            if match:
                sections[section] = match.group(1).strip()
        
        if sections:
            sections_str = "\n\n".join([f"{key}:\n{value}" for key, value in sections.items()])
        else:
            sections_str = "Full Text:\n" + paper_text
        
        return sections_str

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 800, chunk_overlap: int = 400) -> List[str]:
        """
        Chunks text using RecursiveCharacterTextSplitter.

        Args:
            text (str): The text to be split into chunks.
            chunk_size (int, optional): The maximum size of each chunk. Defaults to 800.
            chunk_overlap (int, optional): The overlap between chunks. Defaults to 400.

        Returns:
            List[str]: A list of text chunks.
        """
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        return text_splitter.split_text(text)

    @staticmethod
    def compute_embeddings(chunks: List[str]) -> np.ndarray:
        """Computes embeddings for text chunks."""
        return embedding_model.encode(chunks, convert_to_numpy=True)

    @staticmethod
    def store_in_vector_db(chunks: List[str], embeddings: np.ndarray, file_path: str) -> None:
        """Stores chunks and embeddings in ChromaDB."""
        client = chromadb.PersistentClient(path=db_dir)
        collection = client.get_or_create_collection(name='ties_collection_emb', metadata={"hnsw:space": "cosine"})
        doc_id = os.path.basename(file_path)
        
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            collection.add(
                ids=[f"{doc_id}_chunk_{i}"],
                documents=[chunk],
                embeddings=[embedding.tolist()],
                metadatas=[{"source": doc_id, "chunk_id": i}]
            )

    @staticmethod
    def retrieve_and_prioritize_chunks(embeddings: np.ndarray, sections: str, top_k: int = 5) -> List[str]:
        """Retrieves top-k similar chunks from ChromaDB and prioritizes chunks from Abstract and Conclusion."""
    
    
        # Retrieve similar chunks from ChromaDB
        client = chromadb.PersistentClient(path=db_dir)
        collection = client.get_or_create_collection(name='ties_collection_emb', metadata={"hnsw:space": "cosine"})
        doc_embedding = np.mean(embeddings, axis=0).tolist()
        results = collection.query(query_embeddings=[doc_embedding], n_results=top_k)
        chunks = results["documents"][0] if results["documents"] else []
    
        # Prioritize chunks from Abstract and Conclusion
        prioritized = []
        sections_lower = sections.lower()
        for chunk in chunks:
            if any(section in chunk.lower() for section in ["abstract", "conclusion"]) or any(section in sections_lower for section in ["abstract", "conclusion"]):
                prioritized.insert(0, chunk)
            else:
                prioritized.append(chunk)

        return prioritized[:top_k]
    @staticmethod
    def generate_summary(context: str, user_query: str, reasoning_agent: CodeAgent) -> str:
        """
        Generates a summary based on context and query using the reasoning agent.

        Args:
            context (str): The context from sections and chunks.
            user_query (str): The user's query (e.g., "Summarize in 200 words").
            reasoning_agent (CodeAgent): The reasoning agent for summary generation.

        Returns:
            str: The generated summary.
        """
        prompt = f"""
        You are a scientific paper summarizer. Generate a concise summary (200 words) based on the provided context.
        Focus on key findings, contributions, and conclusions. Avoid introducing external information.
        
        Context:
        {context}
        
        Query: {user_query}
        
        Summary:
        """
        result = reasoning_agent.run(prompt, reset=False)
        if hasattr(result, "content"):
            result = result.content
        return result.split("</think>")[-1].strip()

    @staticmethod
    def fact_check(summary: str, original_text: str) -> Dict[str, Union[str, set]]:
        """
        Validates summary against original text by comparing terms.

        Args:
            summary (str): The summary to validate.
            original_text (str): The original paper text.

        Returns:
            Dict[str, Union[str, set]]: Dictionary with the revised summary and any missing terms.
        """
        summary_terms = set(summary.lower().split())
        original_terms = set(original_text.lower().split())
        missing_terms = summary_terms - original_terms
        if missing_terms:
            return {
                "summary": f"Warning: Terms {missing_terms} not found in original text. Revised summary:\n{summary}",
                "missing_terms": missing_terms
            }
        return {"summary": summary, "missing_terms": set()}