# Qualitative Evaluation: RAG vs. Non-RAG Assistant

**Objective:** To perform a qualitative comparison between answers generated by a standard Large Language Model (Gemini 2.5 Pro) and the RAG-augmented assistant (`ask.py`).

**Methodology:**
This notebook will execute a series of predefined cybersecurity and CTF-related questions against two distinct pipelines:
1.  **Non-RAG:** The question is sent directly to the Gemini 2.5 Pro model without any external context.
2.  **RAG:** The question is processed by the `ask.py` pipeline, which first retrieves relevant context (a nearest-neighbor lookup against the Vertex AI vector index, followed by a fetch of the matching write-ups from MongoDB) and then injects that context into the prompt sent to the Gemini model.

The goal is not to assign a quantitative score, but to qualitatively observe the differences in the generated answers and to verify that the RAG pipeline is successfully retrieving relevant source documents. The "Sources Retrieved by RAG" section of each saved report provides evidence that the retrieval mechanism is functioning as designed.


In [10]:
import os
import argparse
from dotenv import load_dotenv
import vertexai
from vertexai.language_models import TextEmbeddingModel
from google.cloud import aiplatform
from pymongo import MongoClient
import vertexai.generative_models as generative_models
import re
from datetime import datetime

# --- Configuration ---
# Settings shared by every cell below.
# Ensure the .env file is in the root of the project directory
load_dotenv()
# GCP project passed to vertexai.init and to the index-endpoint lookup.
# NOTE(review): this is None when GCP_PROJECT_ID is unset; nothing validates it here.
PROJECT_ID = os.getenv("GCP_PROJECT_ID")
# Region passed to vertexai.init and to the index-endpoint lookup.
LOCATION = "us-central1"
# Model name handed to TextEmbeddingModel.from_pretrained.
EMBEDDING_MODEL_NAME = "text-embedding-005"
# Display name used to locate the vector-search index endpoint (overridable via env).
DETAILED_ENDPOINT_NAME = os.getenv("DETAILED_ENDPOINT_NAME", "ctf-detailed-endpoint")
# MongoDB connection string; falls back to a local instance when MONGO_URI is unset.
MONGO_URI = os.getenv("MONGO_URI", "mongodb://localhost:27017/")
# Database and collection queried by fetch_documents_from_mongodb.
DB_NAME = "ctf_writeups_db"
COLLECTION_NAME = "writeups"
# The same generative model serves both the RAG and the non-RAG pipeline.
GEMINI_MODEL_NAME = "gemini-2.5-pro" # Using the Pro model as decided

print("Configuration loaded.")

# --- Client Initialization ---
# Initialize clients once to avoid re-creation in loops
# NOTE: a failure here is only printed, not re-raised, so any name that had not
# been assigned before the failure stays undefined and later cells using it will fail.
try:
    vertexai.init(project=PROJECT_ID, location=LOCATION)
    mongo_client = MongoClient(MONGO_URI)
    embedding_model = TextEmbeddingModel.from_pretrained(EMBEDDING_MODEL_NAME)
    generative_model = generative_models.GenerativeModel(GEMINI_MODEL_NAME)
    print("Clients initialized successfully.")
except Exception as e:
    print(f"An error occurred during client initialization: {e}")


# --- Function Definitions (Adapted from ask.py) ---

def get_embedding(text: str) -> list[float]:
    """Embed ``text`` with the shared embedding model and return its vector.

    The text is sent as a one-element batch; the values of the first (only)
    embedding in the response are returned.
    """
    return embedding_model.get_embeddings([text])[0].values

def get_vector_search_neighbors(embedding: list[float], num_neighbors: int = 1) -> list[str]:
    """
    Looks up the chunk IDs closest to ``embedding`` in the vector index.

    The index endpoint is located by its display name (DETAILED_ENDPOINT_NAME)
    and the first deployed index on that endpoint is queried.

    Args:
        embedding: Query vector.
        num_neighbors: How many neighbors to request. The default of 1 is the
            "single-context" setup this evaluation is testing.

    Returns:
        The neighbor IDs for the query, or an empty list when nothing is
        returned or when any step fails (the error is printed, not raised).
    """
    try:
        matching_endpoints = aiplatform.MatchingEngineIndexEndpoint.list(
            filter=f'display_name="{DETAILED_ENDPOINT_NAME}"',
            project=PROJECT_ID,
            location=LOCATION,
        )
        if not matching_endpoints:
            raise RuntimeError(f"Endpoint with display name '{DETAILED_ENDPOINT_NAME}' not found.")

        # Re-open the endpoint by resource name and target its first deployed index.
        index_endpoint = aiplatform.MatchingEngineIndexEndpoint(
            index_endpoint_name=matching_endpoints[0].resource_name
        )
        target_index_id = index_endpoint.deployed_indexes[0].id

        results = index_endpoint.find_neighbors(
            deployed_index_id=target_index_id,
            queries=[embedding],
            num_neighbors=num_neighbors,
        )

        # One query was sent, so only the first result list is relevant.
        matches = results[0] if results else None
        if not matches:
            return []
        return [match.id for match in matches]
    except Exception as exc:
        print(f"Error in vector search: {exc}")
        return []


def parse_and_deduplicate_ids(neighbor_ids: list[str]) -> list[str]:
    """Extract the document ID prefix from each chunk ID and drop duplicates.

    The document ID is the part of the chunk ID before the first underscore
    (the whole ID when there is no underscore).

    Args:
        neighbor_ids: Chunk IDs as returned by the vector search.

    Returns:
        The distinct document IDs in order of first appearance, so the ranking
        of the input is preserved and the result is the same on every run.
    """
    # dict.fromkeys de-duplicates while keeping insertion order; a set would not.
    return list(dict.fromkeys(chunk_id.split('_', 1)[0] for chunk_id in neighbor_ids))

def fetch_documents_from_mongodb(doc_ids: list[str]) -> tuple[str, list[dict]]:
    """
    Fetches the write-ups whose ``ctftime_id`` is in ``doc_ids`` from MongoDB.

    Args:
        doc_ids: ``ctftime_id`` values to look up, in the order the caller
            wants the results presented.

    Returns:
        A ``(context, sources)`` tuple. ``context`` is the ``rewritten_full_text``
        of every matching document that has one, joined by a ``---`` separator
        surrounded by blank lines. ``sources`` holds one ``{"id", "title", "url"}``
        dict per matching document, for citation. Both follow the order of
        ``doc_ids``. ``("", [])`` is returned when ``doc_ids`` is empty or when
        the lookup fails (the error is printed, not raised).
    """
    if not doc_ids:
        return "", []
    try:
        collection = mongo_client[DB_NAME][COLLECTION_NAME]
        documents = list(collection.find({"ctftime_id": {"$in": doc_ids}}))

        # The $in query does not return documents in the order of the ID list,
        # so restore the caller's order (stable sort; unknown IDs go last).
        rank = {}
        for position, doc_id in enumerate(doc_ids):
            rank.setdefault(doc_id, position)
        documents.sort(key=lambda doc: rank.get(doc.get('ctftime_id'), len(rank)))

        context_parts = []
        sources = []

        for doc in documents:
            # Only documents that carry a rewritten full text contribute to the context
            if doc.get('rewritten_full_text'):
                context_parts.append(doc['rewritten_full_text'])

            # Every matching document is cited, with placeholders for missing fields
            sources.append({
                "id": doc.get('ctftime_id'),
                "title": doc.get('title', 'N/A'),
                "url": doc.get('url', 'N/A')
            })

        # Real newlines (not the two-character sequence backslash + n) so the
        # write-ups are visibly separated inside the prompt
        context = "\n\n---\n\n".join(context_parts)
        return context, sources
    except Exception as e:
        # Best-effort: the caller treats an empty result as "no context found"
        print(f"Error fetching from MongoDB: {e}")
        return "", []

def get_rag_answer(context: str, question: str) -> str:
    """Ask Gemini the question with the retrieved context embedded in the prompt.

    Returns the model's text, or an ``Error generating RAG answer: ...`` string
    if the call (or reading the response text) raises.
    """
    rag_prompt = (
        "\n"
        "Based on the following context from cybersecurity write-ups, provide a concise answer to the user's question.\n"
        "\n"
        "Context:\n"
        "---\n"
        f"{context}\n"
        "---\n"
        "\n"
        f"Question: {question}\n"
    )
    try:
        return generative_model.generate_content(rag_prompt).text
    except Exception as exc:
        return f"Error generating RAG answer: {exc}"

def get_no_rag_answer(question: str) -> str:
    """Baseline pipeline: send the bare question to Gemini, with no added context.

    Returns the model's text, or an ``Error generating non-RAG answer: ...``
    string if the call (or reading the response text) raises.
    """
    try:
        return generative_model.generate_content(question).text
    except Exception as exc:
        return f"Error generating non-RAG answer: {exc}"


Configuration loaded.
Clients initialized successfully.


### Evaluation Queries

Here we define a list of questions to test the two systems. These questions are designed to be highly specific to the content within the RAG database, in order to clearly demonstrate the advantage of the context-augmented approach over a generic LLM.


In [11]:
# Questions sent to both pipelines by the comparison loop, in this order.
# Each group comment names the write-up the questions were drawn from; the
# number is presumably that write-up's CTFtime ID (confirm against the database).
evaluation_queries = [
    # Write-up 40147 (debug-2)
    "In the 'debug-2' pwn challenge, what is the purpose of the `case_swap` function and how does it corrupt the initial ROP payload before the stack pivot to the `.bss` section?",
    "For the 'debug-2' exploit, after leaking the libc base address using `puts`, what specific `one_gadget` offset was used to achieve a shell, and what were its constraints?",
    
    # Write-up 40255 (Broken Trust)
    "Explain the use-after-free vulnerability in the OP-TEE kernel's `crypto_hash_ctx` object in the 'Broken Trust' challenge. How was the `tee_ta_close_session` call used to trigger the UAF?",
    "In the 'Broken Trust' exploit, which virtual function table pointer was hijacked, and what specific `SMC` (Secure Monitor Call) handler was invoked to gain privileged execution?",

    # Write-up 40075 (G0tchaberg)
    "Describe the race condition in the Gotenberg PDF conversion service in the 'G0tchaberg' challenge. How were concurrent requests with different `waitDelay` parameters used to leak the temporary file path of the PDF?",
    
    # Write-up 40309 (dont_whisper)
    "In the 'dont_whisper' challenge, what specific adversarial technique, like using non-ASCII characters or homoglyphs, was used to bypass the initial filter and achieve command injection in the Whisper model's transcription output?",

    # Write-up 40024 (McFlagChecker)
    "In the 'McFlagChecker' Minecraft datapack challenge, what were the four distinct mathematical transformations applied to the player's input score, and in what order were they reversed to find the flag?",
    "What was the purpose of the Linear Congruential Generator (LCG) with its specific multiplier and increment values in the 'McFlagChecker' challenge, and how was its state reversed?"
]


### Execute Comparison

The following loop will now process each question. For each one, it will:
1.  Run the RAG pipeline to generate an answer and identify the source documents.
2.  Run the Non-RAG query to generate a baseline answer.
3.  Save the comparison to a unique Markdown file in a timestamped output directory for later analysis.


In [13]:
# Create a timestamped directory for this evaluation run, so repeated runs
# do not write into each other's output
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_dir = f"evaluation_run_{run_timestamp}"
os.makedirs(output_dir, exist_ok=True)
print(f"Saving evaluation output to: {output_dir}")

for i, query in enumerate(evaluation_queries):
    # --- RAG Pipeline: embed the question, find the nearest chunks, map them to documents ---
    question_embedding = get_embedding(query)
    neighbor_ids = get_vector_search_neighbors(question_embedding)
    unique_doc_ids = parse_and_deduplicate_ids(neighbor_ids)

    context, sources = "", []
    if unique_doc_ids:
        context, sources = fetch_documents_from_mongodb(unique_doc_ids)

    # --- Non-RAG Pipeline ---
    no_rag_answer = get_no_rag_answer(query)

    # --- Generate Formatted Output ---
    # The report is collected as a list of fragments and joined once at the end.
    report_parts = [f"""
### Question: {query}
---
<br>

**RAG-Augmented Answer:**
"""]

    if context:
        rag_answer = get_rag_answer(context, query)
        report_parts.append(f"\n{rag_answer}\n\n")
        report_parts.append("**Sources Retrieved by RAG:**\n")
        for source in sources:
            report_parts.append(f"*   Document ID: `{source['id']}`\n")

        # Real newlines here: an escaped "\\n" would show up as literal text in the report
        report_parts.append("\n**For more details, you can read the full write-ups:**\n")
        for source in sources:
            doc_id = source.get('id')
            title = source.get('title')
            url = source.get('url')

            # If the title is missing, create a sensible default.
            if not title or title == 'N/A':
                title = f"Write-up for CTFtime ID {doc_id}"

            # If the URL is missing, construct it from the document ID.
            if not url or url == 'N/A':
                url = f"http://ctftime.org/writeup/{doc_id}"

            report_parts.append(f"*   [{title}]({url})\n")
    else:
        # No neighbors, a failed lookup, or documents without text: do not label
        # an answer generated from an empty context as RAG-augmented.
        report_parts.append("\n*No relevant context was found in the database for this query.*\n")

    report_parts.append(f"""
<br>

---
**Standard LLM Answer (No RAG):**

{no_rag_answer}

---
""")
    output_md = "".join(report_parts)

    # --- Save to File ---
    # Create a filesystem-safe slug from the query; the numeric prefix keeps
    # file names unique and sorted in query order
    slug = re.sub(r'[^\w-]', '', query.lower().replace(' ', '_'))[:50]
    filename = f"{i+1:02d}_{slug}.md"
    filepath = os.path.join(output_dir, filename)

    try:
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(output_md)
        print(f"  - Saved: {filepath}")
    except Exception as e:
        # Best-effort: one unwritable report must not abort the remaining queries
        print(f"  - Failed to save {filepath}: {e}")

print("\nEvaluation complete.")

Saving evaluation output to: evaluation_run_2025-07-14_15-05-28
  - Saved: evaluation_run_2025-07-14_15-05-28/01_in_the_debug-2_pwn_challenge_what_is_the_purpose_o.md
  - Saved: evaluation_run_2025-07-14_15-05-28/02_for_the_debug-2_exploit_after_leaking_the_libc_bas.md
  - Saved: evaluation_run_2025-07-14_15-05-28/03_explain_the_use-after-free_vulnerability_in_the_op.md
  - Saved: evaluation_run_2025-07-14_15-05-28/04_in_the_broken_trust_exploit_which_virtual_function.md
  - Saved: evaluation_run_2025-07-14_15-05-28/05_describe_the_race_condition_in_the_gotenberg_pdf_c.md
  - Saved: evaluation_run_2025-07-14_15-05-28/06_in_the_dont_whisper_challenge_what_specific_advers.md
  - Saved: evaluation_run_2025-07-14_15-05-28/07_in_the_mcflagchecker_minecraft_datapack_challenge_.md
  - Saved: evaluation_run_2025-07-14_15-05-28/08_what_was_the_purpose_of_the_linear_congruential_ge.md

Evaluation complete.
