In [13]:

from smolagents import OpenAIServerModel, CodeAgent, ToolCallingAgent, HfApiModel, tool, GradioUI
import gradio as gr
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
import fitz  
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
import os
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import time
import glob
import matplotlib.pyplot as plt
import numpy as np
from typing import Any, Dict, List

In [14]:
import logging


# Load environment variables
load_dotenv()

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)




# Environment variables
reasoning_model_id = os.getenv("REASONING_MODEL_ID")
tool_model_id = os.getenv("TOOL_MODEL_ID")
huggingface_api_token = os.getenv("HUGGINGFACE_API_TOKEN")
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
db_dir = r"/Users/aya/Desktop/2SCProject/VectorDB_embedding"

# Validate EMBEDDING_MODEL
if not EMBEDDING_MODEL:
    logger.error("EMBEDDING_MODEL environment variable is not set.")
    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Fallback model
    logger.warning(f"Using fallback embedding model: {EMBEDDING_MODEL}")

# Initialize the embedding model
try:
    embedding_model = SentenceTransformer(EMBEDDING_MODEL)
    logger.info(f"Embedding model '{EMBEDDING_MODEL}' loaded successfully.")
except Exception as e:
    logger.error(f"Error loading embedding model: {e}", exc_info=True)
    raise RuntimeError(f"Failed to load embedding model: {e}")



ERROR:__main__:EMBEDDING_MODEL environment variable is not set.
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:__main__:Embedding model 'sentence-transformers/all-MiniLM-L6-v2' loaded successfully.


In [15]:

def get_model(model_id: str) -> Any:
    """Retrieve the model based on the environment configuration."""
    using_huggingface = os.getenv("USE_HUGGINGFACE", "no").lower() == "yes"
    if using_huggingface:
        return HfApiModel(model_id=model_id, token=huggingface_api_token)
    else:
        if not model_id:
            raise ValueError("Model ID cannot be None or empty")
        return OpenAIServerModel(
            model_id=model_id,  
            api_base="http://localhost:11434/v1",
            api_key="ollama"
        )



In [16]:
# Après avoir chargé les variables d'environnement avec load_dotenv()
reasoning_model_id = os.getenv("REASONING_MODEL_ID")
tool_model_id = os.getenv("TOOL_MODEL_ID")

# Ajoutez ces vérifications avant d'initialiser les modèles
if not reasoning_model_id:
    logger.error("REASONING_MODEL_ID environment variable is not set.")
    reasoning_model_id = "mistral"  # ou "llama2" selon le modèle que vous avez pullé
    logger.warning(f"Using fallback reasoning model: {reasoning_model_id}")

if not tool_model_id:
    logger.error("TOOL_MODEL_ID environment variable is not set.")  
    tool_model_id = "mistral"  # ou "llama2" selon le modèle que vous avez pullé
    logger.warning(f"Using fallback tool model: {tool_model_id}")

# Ensuite, initialisez vos modèles
reasoning_model = get_model(reasoning_model_id)
reasoner = CodeAgent(tools=[], model=reasoning_model, add_base_tools=False, max_steps=2)

ERROR:__main__:REASONING_MODEL_ID environment variable is not set.
ERROR:__main__:TOOL_MODEL_ID environment variable is not set.


In [17]:
if not reasoning_model_id:
    reasoning_model_id = input("Enter the reasoning model ID (default: mistral): ") or "mistral"
    logger.info(f"Using reasoning model: {reasoning_model_id}")

In [18]:
# Initialize vector store and embeddings
embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,  # Use the same model as SentenceTransformer
    model_kwargs={'device': 'cpu'}
)
client = chromadb.PersistentClient(path=db_dir)
collection = client.get_or_create_collection(name='ties_collection_emb', metadata={"hnsw:space": "cosine"})

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [19]:
try:
    embedding_model = SentenceTransformer(EMBEDDING_MODEL)
    logger.info(f"Embedding model '{EMBEDDING_MODEL}' loaded successfully.")
except Exception as e:
    logger.error(f"Error loading embedding model: {e}", exc_info=True)
    logger.warning("Using a fallback embedding model.")
    embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # Fallback model

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:__main__:Embedding model 'sentence-transformers/all-MiniLM-L6-v2' loaded successfully.


In [20]:
def rag_query(query):
    """Retrieves the most relevant document chunks from the vector database."""
    results = collection.query(query_texts=[query], n_results=3)

    if "documents" not in results or not results["documents"]:
        return []

    retrieved_docs = results["documents"][0]  # List of relevant chunk texts

    # Convert to document-like objects
    class Document:
        def __init__(self, content):
            self.page_content = content

    return [Document(doc) for doc in retrieved_docs]

### ---- PDF Processing and Chunking ----

In [26]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def process_pdf(file_path: str) -> str:
    try:
        print(f"Attempting to open PDF file: {file_path}")
        doc = fitz.open(file_path)  # Open the PDF file
        print("PDF file opened successfully.")

        text = ""
        for page_num, page in enumerate(doc):
            print(f"Processing page {page_num + 1}...")
            page_text = page.get_text("text")
            if not page_text:
                print(f"No text found on page {page_num + 1}.")
            text += page_text + "\n"
        print("Text extraction completed.")

        print("Splitting text into chunks...")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=800,
            chunk_overlap=400,
            length_function=len,
        )
        chunks = text_splitter.split_text(text)
        print(f"Text split into {len(chunks)} chunks.")

        doc_id = os.path.basename(file_path)
        print(f"Storing chunks in the vector database for document: {doc_id}...")
        for i, chunk in enumerate(chunks):
            embedding = embeddings.embed_query(chunk)  # ✅ fixed here
            collection.add(
                ids=[f"{doc_id}_chunk_{i}"],
                documents=[chunk],
                embeddings=[embedding],
                metadatas=[{"source": doc_id, "chunk_id": i}]
            )
        print(f"Chunks stored successfully for document: {doc_id}.")

        return f"Processed {len(chunks)} chunks from {doc_id}."
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return "An error occurred while processing the PDF."


In [27]:
@tool
def rag_with_reasoner(user_query: str) -> str:
    """
    Queries the vector database and uses a reasoning LLM to summarize retrieved content.

    Args:
        user_query (str): The user's question for retrieving relevant document chunks.

    Returns:
        str: A concise summary based on the retrieved chunks.
    """
    docs = rag_query(user_query)

    if not docs:
        return "No relevant context found. Please refine your query."

    # Convert retrieved docs to formatted text
    context = "\n\n".join(doc.page_content for doc in docs)

    prompt = f"""
    You are a scientific paper summarizer. Be concise and specific.
    If there isn't sufficient information, suggest a better query.

    Context:
    {context}

    Question: {user_query}

    Answer:
    """

    # Get response from reasoning model
    response = reasoner.run(prompt, reset=False).split("</think>")[-1].strip()
    return response

In [28]:
def first_stage_retrieval(query: str, n_results: int = 5) -> List[Any]:
    """First stage retrieval: Get the most relevant document chunks from the vector database."""
    try:
        results = collection.query(query_texts=[query], n_results=n_results)
        if "documents" not in results or not results["documents"]:
            return []
        retrieved_docs = results["documents"][0]
        doc_ids = results["ids"][0]
        docs = []
        for i, doc_text in enumerate(retrieved_docs):
            doc = type('Document', (), {})()
            doc.page_content = doc_text
            doc.metadata = {"id": doc_ids[i]}
            docs.append(doc)
        return docs
    except Exception as e:
        logger.error(f"Error in first stage retrieval: {e}")
        return []

In [29]:
@tool
def basic_rag(user_query: str) -> str:
    """
    Simple RAG implementation using only first-stage retrieval.

    Args:
        user_query (str): The user's question for retrieving relevant document chunks.

    Returns:
        str: A concise summary based on the retrieved chunks.
    """
    try:
        docs = first_stage_retrieval(user_query, n_results=3)
        if not docs:
            return "No relevant context found. Please refine your query."

        context = "\n\n".join(doc.page_content for doc in docs)
        prompt = f"""
        You are a scientific paper summarizer. Be concise and specific.
        If there isn't sufficient information, suggest a better query.

        Context:
        {context}

        Question: {user_query}

        Answer:
        """

        # Get response from reasoning model
        response = reasoner.run(prompt, reset=False).split("</think>")[-1].strip()
        return response
    except Exception as e:
        logger.error(f"Error generating response: {e}", exc_info=True)
        return "An error occurred while generating the response."


In [30]:
tool_model = get_model(tool_model_id)
primary_agent = ToolCallingAgent(tools=[rag_with_reasoner], model=tool_model, add_base_tools=False, max_steps=5)

@tool
def basic_rag(user_query: str) -> str:
    """
    Simple RAG implementation using only first-stage retrieval.

    Args:
        user_query (str): The user's question for retrieving relevant document chunks.

    Returns:
        str: A concise summary based on the retrieved chunks.
    """
    try:
        docs = first_stage_retrieval(user_query, n_results=3)
        if not docs:
            return "No relevant context found. Please refine your query."

        context = "\n\n".join(doc.page_content for doc in docs)
        prompt = f"""
        You are a scientific paper summarizer. Be concise and specific.
        If there isn't sufficient information, suggest a better query.

        Context:
        {context}

        Question: {user_query}

        Answer:
        """

        # Get response from reasoning model
        response = reasoner.run(prompt, reset=False).split("</think>")[-1].strip()
        return response
    except Exception as e:
        logger.error(f"Error generating response: {e}", exc_info=True)
        return "An error occurred while generating the response."


def main():
    with gr.Blocks(theme=gr.themes.Soft(), title="📖 Paperly") as interface:
        gr.HTML("<link rel='icon' href='assets/paperly.png' type='image/png'>")
        gr.Markdown("# 📚 Agentic RAG for Scientific Papers")
        gr.Markdown("Upload a PDF and ask questions to retrieve key insights.")
        
        with gr.Row():
            with gr.Column():
                pdf_upload = gr.File(label="📄 Upload PDF", type="filepath")
                process_btn = gr.Button("📥 Process PDF")
                process_output = gr.Textbox(label="Processing Status", interactive=False)
                
                # Add RAG method selector
                rag_method = gr.Radio(
                    ["Basic RAG", "Agentic RAG"], 
                    label="🔄 RAG Method", 
                    value="Agentic RAG",
                    info="Choose between simple or agentic retrieval"
                )
                
            with gr.Column():
                user_input = gr.Textbox(label="🔍 Ask a Question", placeholder="Summarize this paper in 200 words...")
                submit_btn = gr.Button("🔎 Retrieve & Summarize")
                output_box = gr.Textbox(label="📜 Summary", interactive=False)
        
        process_btn.click(fn=process_pdf, inputs=pdf_upload, outputs=process_output)
        
        # Function to handle query based on selected method
        def handle_query(query, method):
            if method == "Basic RAG":
                return basic_rag(query)
            else:  # Agentic RAG
                return rag_with_reasoner(query)
                
        submit_btn.click(
            fn=handle_query, 
            inputs=[user_input, rag_method], 
            outputs=output_box
        )
        
        interface.launch(
            share=True,
            inbrowser=True,
            server_name="127.0.0.1",
            favicon_path="assets/paperly.png",
        )

if __name__ == "__main__":
    main()

INFO:httpx:HTTP Request: GET http://127.0.0.1:7861/gradio_api/startup-events "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD http://127.0.0.1:7861/ "HTTP/1.1 200 OK"


* Running on local URL:  http://127.0.0.1:7861


INFO:httpx:HTTP Request: GET https://api.gradio.app/v3/tunnel-request "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


* Running on public URL: https://db21d864f3bfbd2ba9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
INFO:httpx:HTTP Request: HEAD https://db21d864f3bfbd2ba9.gradio.live "HTTP/1.1 200 OK"


ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/Users/aya/Desktop/2SCProject/penv/lib/python3.13/site-packages/starlette/responses.py", line 341, in __call__
    stat_result = await anyio.to_thread.run_sync(os.stat, self.path)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/aya/Desktop/2SCProject/penv/lib/python3.13/site-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        func, args, abandon_on_cancel=abandon_on_cancel, limiter=limiter
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "/Users/aya/Desktop/2SCProject/penv/lib/python3.13/site-packages/anyio/_backends/_asyncio.py", line 2461, in run_sync_in_worker_thread
    return await future
           ^^^^^^^^^^^^
  File "/Users/aya/Desktop/2SCProject/penv/lib/python3.13/site-packages/anyio/

--- MISTRAL and LLAMA2 ---


In [31]:
import os
import sys
import logging
from dotenv import load_dotenv
import chromadb
import PyPDF2
import ollama
from sentence_transformers import SentenceTransformer
from chromadb.utils.embedding_functions import EmbeddingFunction
from typing import List, Any, Union, Dict
import nltk
import re
import time
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu
from bert_score import score as bert_score

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize NLTK
nltk.download("punkt", quiet=True)

class ChromaCompatibleEmbeddingFunction(EmbeddingFunction):
    """Embedding function that strictly follows ChromaDB 0.4.16+ interface"""
    def __init__(self, embedding_model: Any):
        self.embedding_model = embedding_model
        
    def __call__(self, input: Union[List[str], str]) -> List[List[float]]:
        if isinstance(input, str):
            input = [input]
        
        if isinstance(self.embedding_model, SentenceTransformer):
            return self.embedding_model.encode(input).tolist()
        else:  # Ollama model
            embeddings = []
            for text in input:
                response = ollama.embeddings(model=self.embedding_model, prompt=text)
                embeddings.append(response["embedding"])
            return embeddings

class FastDocumentProcessor:
    def __init__(self):
        load_dotenv()
        self.config = self._load_config()
        self.embedding_model = self._init_embedding_model()
        self.client = chromadb.PersistentClient(path=self.config["db_dir"])
        self.collection_name = "ties_collection_fast"
        self.rouge = Rouge()
        
    def _load_config(self) -> dict:
        return {
            "reasoning_model": os.getenv("REASONING_MODEL_ID", "mistral"),
            "embedding_model": os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2"),
            "db_dir": os.getenv("DB_DIR", "/Users/aya/Desktop/2SCProject/VectorDB_embedding")
        }
    
    def _init_embedding_model(self) -> Any:
        model_name = self.config["embedding_model"]
        try:
            if "sentence-transformers" in model_name:
                # Load the model once and cache it
                model = SentenceTransformer(model_name)
                logger.info(f"Loaded SentenceTransformer: {model_name}")
                return model
            else:
                # Just check if Ollama model exists
                ollama.show(model_name)
                logger.info(f"Using Ollama model: {model_name}")
                return model_name
        except Exception as e:
            logger.error(f"Failed to load model {model_name}: {e}")
            raise
    
    def get_embedding_function(self):
        return ChromaCompatibleEmbeddingFunction(self.embedding_model)
    
    def get_embedding_dimension(self) -> int:
        if isinstance(self.embedding_model, SentenceTransformer):
            return self.embedding_model.get_sentence_embedding_dimension()
        else:
            return {
                "nomic-embed-text": 768,
                "llama2": 4096,
                "mistral": 4096
            }.get(self.embedding_model, 768)
    
    def get_collection(self, recreate=False):
        """Get or create collection with option to recreate"""
        if recreate:
            try:
                self.client.delete_collection(self.collection_name)
                logger.info("Deleted existing collection")
            except:
                pass
        
        try:
            # Try to get existing collection first
            return self.client.get_collection(name=self.collection_name)
        except:
            # Create new collection if it doesn't exist
            return self.client.create_collection(
                name=self.collection_name,
                embedding_function=self.get_embedding_function(),
                metadata={
                    "hnsw:space": "cosine",
                    "dimension": str(self.get_embedding_dimension())
                }
            )
    
    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract and quickly clean text from PDF"""
        logger.info(f"Opening PDF file: {pdf_path}")
        try:
            with open(pdf_path, "rb") as file:
                reader = PyPDF2.PdfReader(file)
                text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
                # Basic cleaning only - remove excess whitespace 
                return re.sub(r'\s+', ' ', text).strip()
        except Exception as e:
            logger.error(f"Error extracting text from PDF: {e}")
            raise
    
    def store_documents(self, text: str, recreate=True):
        """Store document with larger batch size for speed"""
        collection = self.get_collection(recreate=recreate)
        
        # Split text into paragraphs instead of sentences for faster processing
        paragraphs = [p for p in text.split('\n\n') if p.strip()]
        
        # Use larger batch size for faster insertion
        batch_size = 50
        for i in range(0, len(paragraphs), batch_size):
            batch = paragraphs[i:i + batch_size]
            try:
                collection.add(
                    documents=batch,
                    ids=[f"doc_{i+j}" for j in range(len(batch))]
                )
            except Exception as e:
                logger.warning(f"Batch failed: {e}")
                # Quick retry with smaller batch
                half_batch = batch[:len(batch)//2]
                try:
                    collection.add(
                        documents=half_batch,
                        ids=[f"doc_{i+j}" for j in range(len(half_batch))]
                    )
                except:
                    pass
        
        logger.info(f"Stored {len(paragraphs)} paragraphs")
    
    def query_documents(self, query: str, n_results: int = 5) -> List[str]:
        """Retrieve relevant documents efficiently"""
        collection = self.get_collection()
        results = collection.query(
            query_texts=[query],
            n_results=n_results
        )
        return results["documents"][0]
    
    def generate_summary(self, query: str, context: str) -> str:
        """Generate summary with optimized parameters for speed"""
        # More focused prompt for better summaries
        prompt = f"""Summarize this research paper extract in 3-5 sentences. Focus on:
        - Main research question
        - Key methods
        - Primary findings
        - Main conclusions
        
        PAPER EXTRACT:
        {context[:2000]}
        
        CONCISE SUMMARY:"""
        
        # Optimize for speed with lower num_predict and limited context
        response = ollama.generate(
            model=self.config["reasoning_model"],
            prompt=prompt,
            options={
                'temperature': 0.1,
                'num_ctx': 2048,
                'num_predict': 150,
                'top_p': 0.9,
                'top_k': 40
            }
        )
        return response['response']
    
    def evaluate_summary(self, reference: str, generated: str) -> Dict[str, float]:
        """Quick evaluation of summary quality"""
        try:
            # Calculate ROUGE scores
            rouge_scores = self.rouge.get_scores(generated, reference)[0]
            
            # Calculate BLEU score
            reference_tokens = [reference.split()]
            generated_tokens = generated.split()
            bleu_score = sentence_bleu(reference_tokens, generated_tokens)
            
            return {
                "bleu": bleu_score,
                "rouge-1": rouge_scores["rouge-1"]["f"],
                "rouge-2": rouge_scores["rouge-2"]["f"],
                "rouge-l": rouge_scores["rouge-l"]["f"]
            }
        except Exception as e:
            logger.error(f"Evaluation error: {e}")
            return {"error": str(e)}
    
    def run_pipeline(self, pdf_path: str, query: str, model: str = None) -> Dict:
        """Run the complete pipeline with timing"""
        start_time = time.time()
        
        # Set model if provided
        if model:
            self.config["reasoning_model"] = model
        
        # Extract and store
        pdf_text = self.extract_text_from_pdf(pdf_path)
        self.store_documents(pdf_text)
        
        # Query and generate
        context = " ".join(self.query_documents(query))
        summary = self.generate_summary(query, context)
        
        end_time = time.time()
        execution_time = end_time - start_time
        
        return {
            "summary": summary,
            "execution_time": execution_time,
            "model": self.config["reasoning_model"]
        }
        
    def evaluate_models(self, summaries: Dict[str, str]) -> Dict[str, Dict[str, float]]:
        """
        Evaluate the similarity between model summaries by comparing each summary 
        with every other summary and averaging the results.
        """
        models = list(summaries.keys())
        results = {}
        
        # For each model, compare its summary to the average of all others
        for model in models:
            total_metrics = {"bleu": 0, "rouge-1": 0, "rouge-2": 0, "rouge-l": 0}
            count = 0
            
            # Compare with each other model
            for other_model in models:
                if model != other_model:
                    metrics = self.evaluate_summary(summaries[other_model], summaries[model])
                    
                    # Add to totals
                    for key in total_metrics.keys():
                        if key in metrics:
                            total_metrics[key] += metrics[key]
                    count += 1
            
            # Calculate averages
            if count > 0:
                for key in total_metrics.keys():
                    total_metrics[key] /= count
                
                results[model] = total_metrics
                
        return results

def main():
    try:
        # Create processor
        processor = FastDocumentProcessor()
        
        # Default PDF path
        default_pdf_path = "/Users/aya/Desktop/2SCProject/PDFs_papers/biorxiv_19.pdf"
        
        # In Jupyter, sys.argv might contain notebook-specific arguments
        # so avoid using it directly and instead use a default path or allow
        # the path to be specified as a function parameter
        pdf_path = default_pdf_path
        
        # For demonstration purposes, you can replace the default path here
        # pdf_path = "/path/to/your/specific/paper.pdf"
        
        # Query
        query = "What are the key contributions and methods used?"
        
        # Run for multiple models
        results = {}
        summaries = {}
        
        for model in ["mistral", "llama2"]:
            logger.info(f"Running pipeline with {model}...")
            result = processor.run_pipeline(pdf_path, query, model)
            results[model] = result
            summaries[model] = result["summary"]
            
            print(f"\n=== Summary using {model.upper()} ===")
            print(f"Execution time: {result['execution_time']:.2f} seconds")
            print(result["summary"])
        
        # Compare model outputs
        print("\n=== Performance Evaluation (Model Cross-Comparison) ===")
        eval_results = processor.evaluate_models(summaries)
        
        for model, metrics in eval_results.items():
            print(f"🔷 {model.upper()} Model Performance:")
            print(f"🟢 BLEU: {metrics.get('bleu', 'N/A'):.4f}")
            print(f"🟢 ROUGE-1: {metrics.get('rouge-1', 'N/A'):.4f}")
            print(f"🟢 ROUGE-2: {metrics.get('rouge-2', 'N/A'):.4f}")
            print(f"🟢 ROUGE-L: {metrics.get('rouge-l', 'N/A'):.4f}")

    except Exception as e:
        logger.error(f"Pipeline failed: {e}", exc_info=True)

# Function to run with custom PDF path (for use in Jupyter)
def process_pdf(pdf_path, query="What are the key contributions and methods used?"):
    try:
        processor = FastDocumentProcessor()
        results = {}
        summaries = {}
        
        for model in ["mistral", "llama2"]:
            logger.info(f"Running pipeline with {model}...")
            result = processor.run_pipeline(pdf_path, query, model)
            results[model] = result
            summaries[model] = result["summary"]
            
            print(f"\n=== Summary using {model.upper()} ===")
            print(f"Execution time: {result['execution_time']:.2f} seconds")
            print(result["summary"])
            
        # Compare model outputs
        print("\n=== Performance Evaluation (Model Cross-Comparison) ===")
        eval_results = processor.evaluate_models(summaries)
        
        for model, metrics in eval_results.items():
            print(f"🔷 {model.upper()} Model Performance:")
            print(f"🟢 BLEU: {metrics.get('bleu', 'N/A'):.4f}")
            print(f"🟢 ROUGE-1: {metrics.get('rouge-1', 'N/A'):.4f}")
            print(f"🟢 ROUGE-2: {metrics.get('rouge-2', 'N/A'):.4f}")
            print(f"🟢 ROUGE-L: {metrics.get('rouge-l', 'N/A'):.4f}")
            
        return results
    except Exception as e:
        logger.error(f"Pipeline failed: {e}", exc_info=True)
        return None

if __name__ == "__main__":
    main()

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:__main__:Loaded SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:__main__:Running pipeline with mistral...
INFO:__main__:Opening PDF file: /Users/aya/Desktop/2SCProject/PDFs_papers/biorxiv_19.pdf
INFO:__main__:Deleted existing collection
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.67it/s]
INFO:__main__:Stored 1 paragraphs
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:__main__:Running pipeline with llama2...
INFO:__main__:Opening PDF file: /Users/aya/Desktop/2SCProject/PDFs_papers/biorxiv_19.pdf



=== Summary using MISTRAL ===
Execution time: 22.62 seconds
 The research paper focuses on the development of aiCRISPRL, an artificial intelligence platform for stem cell and organoid simulations with extensive gene editing capabilities. The main goal is to complement traditional wet-lab gene editing technologies by using AI to introduce specific mutations into genes, such as MutS homolog 2 (MSH2), for DNA Mismatch Repair (MMR) assessment. The study compares the CRISPR-Cas9 gene editing approach with the aiCRISPRL capabilities using artificial simulated HeLa cells (aiHeLa). The findings suggest that aiCRISPRL is an advanced AI platform technology capable of rapid prototyping and multiple scenario simulation in genomic research. This technology could


INFO:__main__:Deleted existing collection
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.55it/s]
INFO:__main__:Stored 1 paragraphs


Attempting to open PDF file: /private/var/folders/n7/rfrkxg1d1zndw_9rd6yvv96h0000gn/T/gradio/c3b0dca89a646a192f508ddb5d0f250ee374a6ae90518dfe62fa5743157b2b69/biorxiv_19.pdf
PDF file opened successfully.
Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...
Processing page 6...
Processing page 7...
Processing page 8...
Processing page 9...
Processing page 10...
Processing page 11...
Processing page 12...
Processing page 13...
Processing page 14...
Processing page 15...
Processing page 16...
Processing page 17...
Processing page 18...
Text extraction completed.
Splitting text into chunks...
Text split into 88 chunks.
Storing chunks in the vector database for document: biorxiv_19.pdf...
Chunks stored successfully for document: biorxiv_19.pdf.


INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"



=== Summary using LLAMA2 ===
Execution time: 33.04 seconds

The main research question in this study is to compare the gene editing capabilities of CRISPR-Cas9 with those of an artificial intelligence platform called aiCRISPRL. The study evaluates the ability of aiCRISPRL to introduce specific mutations into the MSH2 gene, which is involved in DNA mismatch repair, and compares it with the CRISPR-Cas9 approach. The results suggest that aiCRISPRL is a powerful AI platform technology that can be used for rapid prototyping and multiple scenario simulation in genomic research to complement wet-lab based gene editing technologies.

The key methods used in this study include the use of artificial simulated HeLa cells (ai

=== Performance Evaluation (Model Cross-Comparison) ===
🔷 MISTRAL Model Performance:
🟢 BLEU: 0.2276
🟢 ROUGE-1: 0.6069
🟢 ROUGE-2: 0.3299
🟢 ROUGE-L: 0.5931
🔷 LLAMA2 Model Performance:
🟢 BLEU: 0.2276
🟢 ROUGE-1: 0.6069
🟢 ROUGE-2: 0.3299
🟢 ROUGE-L: 0.6069


In [33]:

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from bert_score import score as bert_score
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize

# Folder path containing PDFs
pdf_folder = r"/Users/aya/Desktop/2SCProject/PDFs_papers"

# Initialize document processor from your baseline code
processor = FastDocumentProcessor()

# Load your models
def agentic_rag_summary(query):
    return rag_with_reasoner(query)  # Uses your advanced RAG agent

def basic_rag_summary(query):
    return basic_rag(query)  # Uses your simple RAG implementation

def mistral_summary(query):
    # Use the FastDocumentProcessor's generate_summary with Mistral
    original_model = processor.config["reasoning_model"]
    processor.config["reasoning_model"] = "mistral"
    context = " ".join(processor.query_documents(query))
    summary = processor.generate_summary(query, context)
    processor.config["reasoning_model"] = original_model
    return summary

def llama2_summary(query):
    # Use the FastDocumentProcessor's generate_summary with Llama2
    original_model = processor.config["reasoning_model"]
    processor.config["reasoning_model"] = "llama2"
    context = " ".join(processor.query_documents(query))
    summary = processor.generate_summary(query, context)
    processor.config["reasoning_model"] = original_model
    return summary

# Extract text from the first 1000 characters (approx. abstract)
def extract_abstract(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text("text") for page in doc])
    return text[:1000]  # First 1000 chars as ground truth summary

# Compute BLEU score
def compute_bleu(reference, candidate):
    reference_tokens = [reference.split()]
    candidate_tokens = candidate.split()
    return sentence_bleu(reference_tokens, candidate_tokens)

# Compute ROUGE scores
def compute_rouge(reference, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    return scorer.score(reference, candidate)

# Compute METEOR score
def compute_meteor(reference, candidate):
    ref_tokens = word_tokenize(reference.lower())
    cand_tokens = word_tokenize(candidate.lower())
    return meteor_score([ref_tokens], cand_tokens)

# Compute BERTScore
def compute_bertscore(reference, candidate):
    P, R, F1 = bert_score([candidate], [reference], lang='en', verbose=False)
    return {
        'precision': P.mean().item(),
        'recall': R.mean().item(),
        'f1': F1.mean().item()
    }

# Find all PDFs in the folder
pdf_files = glob.glob(os.path.join(pdf_folder, "*.pdf"))

# Evaluate on all PDFs
# Store results for all metrics
bleu_scores = {
    "Agentic_RAG": [], 
    "Basic_RAG": [],
    "Mistral": [],
    "Llama2": []
}
rouge_scores = {
    "Agentic_RAG": {"rouge1": [], "rouge2": [], "rougeL": []}, 
    "Basic_RAG": {"rouge1": [], "rouge2": [], "rougeL": []},
    "Mistral": {"rouge1": [], "rouge2": [], "rougeL": []},
    "Llama2": {"rouge1": [], "rouge2": [], "rougeL": []}
}
meteor_scores = {
    "Agentic_RAG": [],
    "Basic_RAG": [],
    "Mistral": [],
    "Llama2": []
}
bert_scores = {
    "Agentic_RAG": {"precision": [], "recall": [], "f1": []},
    "Basic_RAG": {"precision": [], "recall": [], "f1": []},
    "Mistral": {"precision": [], "recall": [], "f1": []},
    "Llama2": {"precision": [], "recall": [], "f1": []}
}

# Evaluation loop
for pdf in pdf_files:
    print(f"Processing {pdf}...")
    
    # First process the PDF with the document processor
    pdf_text = processor.extract_text_from_pdf(pdf)
    processor.store_documents(pdf_text, recreate=True)

    # Extract reference summary
    reference_summary = extract_abstract(pdf)

    # summary generation
    query = "Summarize this paper in 200 words."
    
    # Get summaries from all four approaches
    agentic_summary = agentic_rag_summary(query)
    basic_rag_summary_text = basic_rag_summary(query)
    mistral_summary_text = mistral_summary(query)
    llama2_summary_text = llama2_summary(query)

    # Compute all metrics for each model
    for model_name, summary in [
        ("Agentic_RAG", agentic_summary),
        ("Basic_RAG", basic_rag_summary_text),
        ("Mistral", mistral_summary_text),
        ("Llama2", llama2_summary_text)
    ]:
        # BLEU
        bleu_score = compute_bleu(reference_summary, summary)
        bleu_scores[model_name].append(bleu_score)
        
        # ROUGE
        rouge_result = compute_rouge(reference_summary, summary)
        for key in ["rouge1", "rouge2", "rougeL"]:
            rouge_scores[model_name][key].append(rouge_result[key].fmeasure)
        
        # METEOR
        meteor_result = compute_meteor(reference_summary, summary)
        meteor_scores[model_name].append(meteor_result)
        
        # BERTScore
        bert_result = compute_bertscore(reference_summary, summary)
        bert_scores[model_name]["precision"].append(bert_result["precision"])
        bert_scores[model_name]["recall"].append(bert_result["recall"])
        bert_scores[model_name]["f1"].append(bert_result["f1"])

    time.sleep(2)  # Rate limiting

# Calculate average scores
results = {
    "Agentic_RAG": {
        "BLEU": bleu_scores["Agentic_RAG"],
        "ROUGE": rouge_scores["Agentic_RAG"],
        "METEOR": meteor_scores["Agentic_RAG"],
        "BERTScore": bert_scores["Agentic_RAG"]
    },
    "Basic_RAG": {
        "BLEU": bleu_scores["Basic_RAG"],
        "ROUGE": rouge_scores["Basic_RAG"],
        "METEOR": meteor_scores["Basic_RAG"],
        "BERTScore": bert_scores["Basic_RAG"]
    },
    "Mistral": {
        "BLEU": bleu_scores["Mistral"],
        "ROUGE": rouge_scores["Mistral"],
        "METEOR": meteor_scores["Mistral"],
        "BERTScore": bert_scores["Mistral"]
    },
    "Llama2": {
        "BLEU": bleu_scores["Llama2"],
        "ROUGE": rouge_scores["Llama2"],
        "METEOR": meteor_scores["Llama2"],
        "BERTScore": bert_scores["Llama2"]
    }
}

# Print final evaluation results
for model, scores in results.items():
    avg_bleu = sum(scores["BLEU"]) / len(scores["BLEU"]) if scores["BLEU"] else 0
    avg_rouge1 = sum(scores["ROUGE"]["rouge1"]) / len(scores["ROUGE"]["rouge1"]) if scores["ROUGE"]["rouge1"] else 0
    avg_rouge2 = sum(scores["ROUGE"]["rouge2"]) / len(scores["ROUGE"]["rouge2"]) if scores["ROUGE"]["rouge2"] else 0
    avg_rougeL = sum(scores["ROUGE"]["rougeL"]) / len(scores["ROUGE"]["rougeL"]) if scores["ROUGE"]["rougeL"] else 0
    avg_meteor = sum(scores["METEOR"]) / len(scores["METEOR"]) if scores["METEOR"] else 0
    avg_bert_f1 = sum(scores["BERTScore"]["f1"]) / len(scores["BERTScore"]["f1"]) if scores["BERTScore"]["f1"] else 0

    print(f"\n🔷 {model} Performance:")
    print(f"   🟢 Avg BLEU: {avg_bleu:.4f}")
    print(f"   🟢 Avg ROUGE-1: {avg_rouge1:.4f}")
    print(f"   🟢 Avg ROUGE-2: {avg_rouge2:.4f}")
    print(f"   🟢 Avg ROUGE-L: {avg_rougeL:.4f}")
    print(f"   🟢 Avg METEOR: {avg_meteor:.4f}")
    print(f"   🟢 Avg BERT-F1: {avg_bert_f1:.4f}")

# Visualization
metrics = ["BLEU", "ROUGE-1", "ROUGE-2", "ROUGE-L", "METEOR", "BERT-F1"]
x = np.arange(len(metrics))
width = 0.2  # Adjusted for 4 models

fig, ax = plt.subplots(figsize=(16, 7))
colors = ['royalblue', 'forestgreen', 'darkorange', 'purple']

for i, (model, color) in enumerate(zip(results.keys(), colors)):
    values = [
        sum(results[model]["BLEU"])/len(results[model]["BLEU"]),
        sum(results[model]["ROUGE"]["rouge1"])/len(results[model]["ROUGE"]["rouge1"]),
        sum(results[model]["ROUGE"]["rouge2"])/len(results[model]["ROUGE"]["rouge2"]),
        sum(results[model]["ROUGE"]["rougeL"])/len(results[model]["ROUGE"]["rougeL"]),
        sum(results[model]["METEOR"])/len(results[model]["METEOR"]),
        sum(results[model]["BERTScore"]["f1"])/len(results[model]["BERTScore"]["f1"])
    ]
    ax.bar(x + (i-1.5)*width, values, width, label=model, color=color)

ax.set_xlabel('Metrics')
ax.set_ylabel('Scores')
ax.set_title('Model Comparison Across All Evaluation Metrics')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.ylim(0, 1)  # Most metrics are between 0-1
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

[nltk_data] Downloading package wordnet to /Users/aya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/aya/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:__main__:Loaded SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:__main__:Opening PDF file: /Users/aya/Desktop/2SCProject/PDFs_papers/biorxiv_19.pdf


Processing /Users/aya/Desktop/2SCProject/PDFs_papers/biorxiv_19.pdf...


INFO:__main__:Deleted existing collection
Batches: 100%|██████████| 1/1 [00:00<00:00, 11.00it/s]
INFO:__main__:Stored 1 paragraphs


INFO:httpx:HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 200 OK"


INFO:httpx:HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 200 OK"


INFO:httpx:HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 200 OK"


INFO:httpx:HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 200 OK"


INFO:httpx:HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 200 OK"


INFO:httpx:HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 200 OK"


INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
INFO:absl:Using default tokenizer.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:absl:Using default tokenizer.
Some weights of RobertaModel were no


🔷 Agentic_RAG Performance:
   🟢 Avg BLEU: 0.0000
   🟢 Avg ROUGE-1: 0.3013
   🟢 Avg ROUGE-2: 0.0387
   🟢 Avg ROUGE-L: 0.1090
   🟢 Avg METEOR: 0.1922
   🟢 Avg BERT-F1: 0.8152

🔷 Basic_RAG Performance:
   🟢 Avg BLEU: 0.0000
   🟢 Avg ROUGE-1: 0.2966
   🟢 Avg ROUGE-2: 0.0383
   🟢 Avg ROUGE-L: 0.1369
   🟢 Avg METEOR: 0.1714
   🟢 Avg BERT-F1: 0.8172

🔷 Mistral Performance:
   🟢 Avg BLEU: 0.1113
   🟢 Avg ROUGE-1: 0.5000
   🟢 Avg ROUGE-2: 0.3058
   🟢 Avg ROUGE-L: 0.3525
   🟢 Avg METEOR: 0.3522
   🟢 Avg BERT-F1: 0.8516

🔷 Llama2 Performance:
   🟢 Avg BLEU: 0.0361
   🟢 Avg ROUGE-1: 0.4766
   🟢 Avg ROUGE-2: 0.1717
   🟢 Avg ROUGE-L: 0.1872
   🟢 Avg METEOR: 0.2557
   🟢 Avg BERT-F1: 0.8435


<Figure size 800x500 with 1 Axes>

<Figure size 1600x700 with 1 Axes>

In [None]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import time
import fitz  # PyMuPDF
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
from bert_score import score as bert_score

# Download required NLTK data
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)  # Required for word_tokenize

# Initialize document processor
processor = FastDocumentProcessor()  # Assuming this class is already defined

# Define model summary generation functions
def agentic_rag_summary(query, processor):
    return rag_with_reasoner(query)  # Uses advanced RAG agent

def basic_rag_summary(query, processor):
    return basic_rag(query)  # Uses simple RAG implementation

def generate_model_summary(query, processor, model_name):
    """Generate a summary using specified model"""
    original_model = processor.config.get("reasoning_model")
    processor.config["reasoning_model"] = model_name.lower()
    context = " ".join(processor.query_documents(query))
    summary = processor.generate_summary(query, context)
    processor.config["reasoning_model"] = original_model
    return summary

# Extract text from PDF abstract (first 1000 characters)
def extract_abstract(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = "\n".join([page.get_text("text") for page in doc])
        return text[:1000].strip()  # First 1000 chars as ground truth summary
    except Exception as e:
        print(f"Error extracting abstract from {pdf_path}: {e}")
        return ""

# Evaluation metrics functions
def compute_bleu(reference, candidate):
    reference_tokens = [reference.split()]
    candidate_tokens = candidate.split()
    return sentence_bleu(reference_tokens, candidate_tokens)

def compute_rouge(reference, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    return scorer.score(reference, candidate)

def compute_meteor(reference, candidate):
    ref_tokens = word_tokenize(reference.lower())
    cand_tokens = word_tokenize(candidate.lower())
    return meteor_score([ref_tokens], cand_tokens)

def compute_bertscore(reference, candidate):
    P, R, F1 = bert_score([candidate], [reference], lang='en', verbose=False)
    return {
        'precision': P.mean().item(),
        'recall': R.mean().item(),
        'f1': F1.mean().item()
    }

def evaluate_models(pdf_folder, query="Summarize this paper in 200 words."):
    """Main evaluation function for all models on a folder of PDFs"""
    # Find all PDFs in the folder
    pdf_files = glob.glob(os.path.join(pdf_folder, "*.pdf"))
    
    if not pdf_files:
        print(f"No PDF files found in {pdf_folder}")
        return None
    
    print(f"Found {len(pdf_files)} PDF files for evaluation")
    
    # Define models to evaluate
    models = ["Agentic_RAG", "Basic_RAG", "Mistral", "Llama2"]
    
    # Initialize results storage
    results = {model: {
        "BLEU": [],
        "ROUGE": {"rouge1": [], "rouge2": [], "rougeL": []},
        "METEOR": [],
        "BERTScore": {"precision": [], "recall": [], "f1": []}
    } for model in models}
    
    # Process each PDF
    for i, pdf in enumerate(pdf_files):
        print(f"Processing {i+1}/{len(pdf_files)}: {os.path.basename(pdf)}...")
        
        try:
            # Process PDF with document processor
            pdf_text = processor.extract_text_from_pdf(pdf)
            processor.store_documents(pdf_text, recreate=True)
            
            # Extract reference summary (ground truth)
            reference_summary = extract_abstract(pdf)
            
            if not reference_summary:
                print(f"Warning: Could not extract reference summary from {pdf}")
                continue
                
            # Generate summaries for each model
            summaries = {
                "Agentic_RAG": agentic_rag_summary(query, processor),
                "Basic_RAG": basic_rag_summary(query, processor),
                "Mistral": generate_model_summary(query, processor, "mistral"),
                "Llama2": generate_model_summary(query, processor, "llama2")
            }
            
            # Evaluate each model's summary
            for model_name, summary in summaries.items():
                if not summary:
                    print(f"Warning: Empty summary for {model_name} on {pdf}")
                    continue
                    
                # BLEU
                bleu_score = compute_bleu(reference_summary, summary)
                results[model_name]["BLEU"].append(bleu_score)
                
                # ROUGE
                rouge_result = compute_rouge(reference_summary, summary)
                for key in ["rouge1", "rouge2", "rougeL"]:
                    results[model_name]["ROUGE"][key].append(rouge_result[key].fmeasure)
                
                # METEOR
                meteor_result = compute_meteor(reference_summary, summary)
                results[model_name]["METEOR"].append(meteor_result)
                
                # BERTScore
                bert_result = compute_bertscore(reference_summary, summary)
                for key in ["precision", "recall", "f1"]:
                    results[model_name]["BERTScore"][key].append(bert_result[key])
                    
            # Rate limiting to prevent API throttling
            time.sleep(1)
            
        except Exception as e:
            print(f"Error processing {pdf}: {e}")
            continue
    
    # Return results for further analysis
    return results

def print_evaluation_results(results):
    """Print formatted evaluation results"""
    if not results:
        print("No results to display")
        return
        
    for model, scores in results.items():
        avg_bleu = np.mean(scores["BLEU"]) if scores["BLEU"] else 0
        avg_rouge1 = np.mean(scores["ROUGE"]["rouge1"]) if scores["ROUGE"]["rouge1"] else 0
        avg_rouge2 = np.mean(scores["ROUGE"]["rouge2"]) if scores["ROUGE"]["rouge2"] else 0
        avg_rougeL = np.mean(scores["ROUGE"]["rougeL"]) if scores["ROUGE"]["rougeL"] else 0
        avg_meteor = np.mean(scores["METEOR"]) if scores["METEOR"] else 0
        avg_bert_f1 = np.mean(scores["BERTScore"]["f1"]) if scores["BERTScore"]["f1"] else 0

        print(f"\n🔷 {model} Performance:")
        print(f"   🟢 Avg BLEU: {avg_bleu:.4f}")
        print(f"   🟢 Avg ROUGE-1: {avg_rouge1:.4f}")
        print(f"   🟢 Avg ROUGE-2: {avg_rouge2:.4f}")
        print(f"   🟢 Avg ROUGE-L: {avg_rougeL:.4f}")
        print(f"   🟢 Avg METEOR: {avg_meteor:.4f}")
        print(f"   🟢 Avg BERT-F1: {avg_bert_f1:.4f}")

def plot_comparison(results):
    """Create comparison plot of all models"""
    if not results:
        print("No results to plot")
        return
        
    metrics = ["BLEU", "ROUGE-1", "ROUGE-2", "ROUGE-L", "METEOR", "BERT-F1"]
    x = np.arange(len(metrics))
    width = 0.2  # Width for 4 models
    
    fig, ax = plt.subplots(figsize=(14, 8))
    colors = ['royalblue', 'darkorange', 'forestgreen', 'purple']
    
    all_bars = []
    for i, (model, color) in enumerate(zip(results.keys(), colors)):
        # Calculate average scores
        values = [
            np.mean(results[model]["BLEU"]),
            np.mean(results[model]["ROUGE"]["rouge1"]),
            np.mean(results[model]["ROUGE"]["rouge2"]),
            np.mean(results[model]["ROUGE"]["rougeL"]),
            np.mean(results[model]["METEOR"]),
            np.mean(results[model]["BERTScore"]["f1"])
        ]
        
        # Plot bars
        bars = ax.bar(x + (i-1.5)*width, values, width, label=model, color=color)
        all_bars.append(bars)
        
        # Add value labels
        for bar in bars:
            height = bar.get_height()
            ax.annotate(f"{height:.3f}",
                        xy=(bar.get_x() + bar.get_width() / 2, height),
                        xytext=(0, 3),
                        textcoords="offset points",
                        ha='center', va='bottom', fontsize=9,
                        rotation=90 if height < 0.1 else 0)
    
    # Set labels and styling
    ax.set_xlabel('Metrics', fontsize=12)
    ax.set_ylabel('Scores', fontsize=12)
    ax.set_title('Model Comparison Across All Evaluation Metrics', fontsize=14, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(metrics, fontsize=11)
    ax.legend(fontsize=10)
    
    # Set y-axis limit
    plt.ylim(0, max([bar.get_height() for bars in all_bars for bar in bars]) + 0.1)
    plt.grid(axis='y', linestyle='--', alpha=0.6)
    plt.tight_layout()
    
    return fig

# Main execution
if __name__ == "__main__":
    pdf_folder = r"/Users/aya/Desktop/2SCProject/PDFs_papers"
    results = evaluate_models(pdf_folder)
    
    if results:
        print_evaluation_results(results)
        fig = plot_comparison(results)
        plt.savefig("rag_model_comparison.png", dpi=300, bbox_inches='tight')
        plt.show()
    else:
        print("Evaluation failed or produced no results")

In [34]:
# Extract just two models for comparison
def plot_two_models_comparison(results, model1="Agentic_RAG", model2="Basic_RAG"):
    """Create comparison plot of two selected models"""
    metrics = ["BLEU", "ROUGE-1", "ROUGE-2", "ROUGE-L", "METEOR", "BERT-F1"]
    
    # Calculate average scores for model1
    model1_scores = [
        np.mean(results[model1]["BLEU"]),
        np.mean(results[model1]["ROUGE"]["rouge1"]),
        np.mean(results[model1]["ROUGE"]["rouge2"]),
        np.mean(results[model1]["ROUGE"]["rougeL"]),
        np.mean(results[model1]["METEOR"]),
        np.mean(results[model1]["BERTScore"]["f1"])
    ]
    
    # Calculate average scores for model2
    model2_scores = [
        np.mean(results[model2]["BLEU"]),
        np.mean(results[model2]["ROUGE"]["rouge1"]),
        np.mean(results[model2]["ROUGE"]["rouge2"]),
        np.mean(results[model2]["ROUGE"]["rougeL"]),
        np.mean(results[model2]["METEOR"]),
        np.mean(results[model2]["BERTScore"]["f1"])
    ]
    
    # X-axis positions
    x = np.arange(len(metrics))
    width = 0.35  # Width of bars
    
    # Create the plot
    fig, ax = plt.subplots(figsize=(12, 7))
    bars1 = ax.bar(x - width/2, model1_scores, width, label=model1, color="royalblue")
    bars2 = ax.bar(x + width/2, model2_scores, width, label=model2, color="darkorange")
    
    # Labels and title
    ax.set_xlabel("Metrics", fontsize=12)
    ax.set_ylabel("Scores", fontsize=12)
    ax.set_title(f"Comparison: {model1} vs. {model2}", fontsize=14, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(metrics, fontsize=11)
    ax.legend(fontsize=10)
    
    # Display values on bars
    for bars in [bars1, bars2]:
        for bar in bars:
            height = bar.get_height()
            ax.annotate(f"{height:.4f}",
                        xy=(bar.get_x() + bar.get_width() / 2, height),
                        xytext=(0, 3),  # Offset text slightly above the bar
                        textcoords="offset points",
                        ha='center', va='bottom', fontsize=10)
    
    # Set y-axis limit
    plt.ylim(0, max(max(model1_scores), max(model2_scores)) + 0.05)
    plt.grid(axis="y", linestyle="--", alpha=0.6)
    plt.tight_layout()
    
    return fig

# Example usage
fig = plot_two_models_comparison(results, "Agentic_RAG", "Basic_RAG")
plt.savefig("two_rag_models_comparison.png", dpi=300, bbox_inches='tight')
plt.show()

<Figure size 1200x700 with 1 Axes>

In [35]:
def plot_two_models_comparison(results, model1="Agentic_RAG", model2="Basic_RAG", display=True):
 
    """Create comparison plot of two selected models"""
    metrics = ["BLEU", "ROUGE-1", "ROUGE-2", "ROUGE-L", "METEOR", "BERT-F1"]
    
    # Calculate average scores for model1
    model1_scores = [
        np.mean(results[model1]["BLEU"]),
        np.mean(results[model1]["ROUGE"]["rouge1"]),
        np.mean(results[model1]["ROUGE"]["rouge2"]),
        np.mean(results[model1]["ROUGE"]["rougeL"]),
        np.mean(results[model1]["METEOR"]),
        np.mean(results[model1]["BERTScore"]["f1"])
    ]
    
    # Calculate average scores for model2
    model2_scores = [
        np.mean(results[model2]["BLEU"]),
        np.mean(results[model2]["ROUGE"]["rouge1"]),
        np.mean(results[model2]["ROUGE"]["rouge2"]),
        np.mean(results[model2]["ROUGE"]["rougeL"]),
        np.mean(results[model2]["METEOR"]),
        np.mean(results[model2]["BERTScore"]["f1"])
    ]
    
    # X-axis positions
    x = np.arange(len(metrics))
    width = 0.35  # Width of bars
    
    # Create the plot
    fig, ax = plt.subplots(figsize=(12, 7))
    bars1 = ax.bar(x - width/2, model1_scores, width, label=model1, color="royalblue")
    bars2 = ax.bar(x + width/2, model2_scores, width, label=model2, color="darkorange")
    
    # Labels and title
    ax.set_xlabel("Metrics", fontsize=12)
    ax.set_ylabel("Scores", fontsize=12)
    ax.set_title(f"Comparison: {model1} vs. {model2}", fontsize=14, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(metrics, fontsize=11)
    ax.legend(fontsize=10)
    
    # Display values on bars
    for bars in [bars1, bars2]:
        for bar in bars:
            height = bar.get_height()
            ax.annotate(f"{height:.4f}",
                        xy=(bar.get_x() + bar.get_width() / 2, height),
                        xytext=(0, 3),  # Offset text slightly above the bar
                        textcoords="offset points",
                        ha='center', va='bottom', fontsize=10)
    
    # Set y-axis limit
    plt.ylim(0, max(max(model1_scores), max(model2_scores)) + 0.05)
    plt.grid(axis="y", linestyle="--", alpha=0.6)
    plt.tight_layout()
    
 


    
    if display:
        plt.show()
    
    return fig

In [42]:
from IPython.display import display
fig = plot_two_models_comparison(results, "Agentic_RAG", "Basic_RAG")
display(fig)

<Figure size 1200x700 with 1 Axes>

<Figure size 1200x700 with 1 Axes>