Part A: Dataset Preparation & FAISS Index Construction

In [1]:
!pip install -q datasets sentence-transformers faiss-cpu


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m90.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m96.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
import torch
import faiss
import numpy as np
import pickle
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

Load datasets

In [21]:
from datasets import load_dataset


# --- CVE corpus: keep only the last 200 rows of the train split ---
cve_dataset = load_dataset("stasvinokur/cve-and-cwe-dataset-1999-2025", split="train")
total_cves = len(cve_dataset)
cve_subset = cve_dataset.select(range(total_cves - 200, total_cves))
cve_data = cve_subset.to_list()

# --- Persona corpus: keep only the first 100 rows of the train split ---
persona_dataset = load_dataset("nvidia/Nemotron-Personas-USA", split="train")
persona_head = persona_dataset.select(range(100))
persona_data = persona_head.to_list()

print("CVE:", len(cve_data), "Persona:", len(persona_data))


CVE: 200 Persona: 100


Build documents + metadata

In [45]:
# Corpus texts and their per-document metadata; the two lists stay index-aligned
# because every append to one is paired with an append to the other.
documents = []
metadatas = []

# One labelled text block per CVE record.
for cve in cve_data:
    cve_doc = "".join([
        f"TYPE: CVE\n",
        f"CVE_ID: {cve['CVE-ID']}\n",
        f"CWE_ID: {cve['CWE-ID']}\n",
        f"SEVERITY: {cve['SEVERITY']}\n",
        f"DESCRIPTION: {cve['DESCRIPTION']}",
    ])
    documents.append(cve_doc)
    metadatas.append({"type": "CVE", "cve_id": cve["CVE-ID"]})

# One text block per persona: prefer the "persona" field, then "text", and
# finally fall back to the string form of the whole record.
for person in persona_data:
    persona_text = person.get("persona") or person.get("text") or str(person)
    persona_doc = f"TYPE: PERSONA\n" + f"PERSONA_DESCRIPTION: {persona_text}"
    documents.append(persona_doc)
    metadatas.append({"type": "PERSONA"})

print("Total docs:", len(documents))
print("\nSample PERSONA doc:\n", documents[-1])


Total docs: 300

Sample PERSONA doc:
 TYPE: PERSONA
PERSONA_DESCRIPTION: Julia Deleon blends a meticulous engineering mind with a quiet artistic soul, juggling eco‑focused career ambitions, a habit of collecting vintage drafting pens, and a love for solitary hikes that reset their creative circuits, and they often find themselves lost in detailed sketches late into the night.


Persona Documents

In [None]:
# Alternative, field-by-field formatting of each persona record.
# NOTE(review): this cell has no execution count (In [None]) and the recorded
# totals elsewhere in the notebook (300 docs / 300 FAISS vectors) equal
# 200 CVEs + 100 personas, so it does not appear to have been run.
# NOTE(review): persona_data is already appended to documents/metadatas by the
# "Build documents + metadata" cell; running this cell as well would add a
# second entry for every persona.
# NOTE(review): the keys below are indexed directly, so a missing key raises
# KeyError. Whether 'name', 'age', 'occupation', 'location' and 'bio' exist in
# the persona records is not shown here — TODO confirm against the dataset.
for item in persona_data:
    doc = (
        f"TYPE: PERSONA\n"
        f"NAME: {item['name']}\n"
        f"AGE: {item['age']}\n"
        f"OCCUPATION: {item['occupation']}\n"
        f"LOCATION: {item['location']}\n"
        f"BIO: {item['bio']}"
    )
    # Keep the text list and the metadata list index-aligned.
    documents.append(doc)
    metadatas.append({"type": "PERSONA"})


In [46]:
from sentence_transformers import SentenceTransformer

# Encode the whole corpus in one call; the result is a NumPy array with one
# row per entry of `documents`, in the same order.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
encode_options = {"convert_to_numpy": True, "show_progress_bar": True}
embeddings = embedder.encode(documents, **encode_options)


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

In [47]:
import faiss

# Flat L2 index sized to the embedding width; vectors are added in document
# order, so a FAISS id is also an index into `documents`.
n_vectors, dim = embeddings.shape
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

print("FAISS vectors:", index.ntotal)


FAISS vectors: 300


In [48]:
# Sanity check: show the first corpus entry exactly as it was stored.
sample_document = documents[0]
print("Sample stored document:\n", sample_document, sep="\n")


Sample stored document:

TYPE: CVE
CVE_ID: CVE-2025-5006
CWE_ID: CWE-74
SEVERITY: MEDIUM
DESCRIPTION: A vulnerability was found in Campcodes Online Shopping Portal 1.0. It has been classified as critical. Affected is an unknown function of the file /admin/category.php. The manipulation of the argument Category leads to sql injection. It is possible to launch the attack remotely. The exploit has been disclosed to the public and may be used.


In [49]:
import pickle


# Persist everything Part B needs to rebuild the retriever.

# FAISS ids are used as list indices at query time, so refuse to save
# artifacts that are out of step with each other.
assert len(documents) == len(metadatas) == index.ntotal, (
    "documents, metadatas and the FAISS index must have the same length"
)

# Vector index, in FAISS's own on-disk format.
faiss.write_index(index, "rag_index.faiss")

# Document texts, in the same order as the vectors in the index.
with open("rag_documents.pkl", "wb") as f:
    pickle.dump(documents, f)

# Per-document metadata. The list built in Part A is named `metadatas`;
# `metadata` is only defined after Part B's load cell has run, so dumping
# that name fails with NameError on a fresh kernel.
with open("rag_metadata.pkl", "wb") as f:
    pickle.dump(metadatas, f)

# Record which embedding model produced the vectors.
with open("rag_config.txt", "w") as f:
    f.write("embedding_model=all-MiniLM-L6-v2\n")

print(" ALL FILES SAVED SUCCESSFULLY")


 ALL FILES SAVED SUCCESSFULLY


Part B: RAG + LLM INTEGRATION & BENCHMARKING

In [28]:
!pip install -q faiss-cpu sentence-transformers transformers accelerate


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [29]:
import faiss
import pickle
from sentence_transformers import SentenceTransformer
import torch


In [50]:
# Reload the artifacts written by Part A. The paths are relative, matching the
# save cell, so the notebook also works when the working directory is not
# /kaggle/working.
index = faiss.read_index("rag_index.faiss")

# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook, never pickles from an untrusted source.
with open("rag_documents.pkl", "rb") as f:
    documents = pickle.load(f)

with open("rag_metadata.pkl", "rb") as f:
    metadata = pickle.load(f)

print("FAISS index loaded")
print("Documents loaded:", len(documents))


FAISS index loaded
Documents loaded: 300


In [51]:
# Queries are encoded with the same model name that the index-building cell
# used and that the save cell records in rag_config.txt.
embedding_model_name = "all-MiniLM-L6-v2"
embedder = SentenceTransformer(embedding_model_name)

LOAD LLM

In [53]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "google/flan-t5-base"

# Choose the device up front: GPU when torch reports one, otherwise CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer and seq2seq model come from the same checkpoint; the model is
# loaded in float32 and moved to the chosen device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
llm = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float32)
llm = llm.to(device)

print("LLM loaded on:", device)


LLM loaded on: cuda


In [54]:
# Chat history shared across rag_query calls: one "Q: ...\nA: ..." string is
# appended per answered query.
MAX_MEMORY = 3  # presumably the number of past turns to keep — confirm against rag_query
conversation_memory = []

RETRIEVAL FUNCTION

In [55]:
def retrieve_context(query, top_k=5):
    """Return the stored documents nearest to ``query`` and their metadata.

    The query is embedded with the module-level ``embedder`` and searched
    against the module-level FAISS ``index``; the ids that come back are used
    as positions into ``documents`` and ``metadata`` (the lists loaded from
    the Part B artifact files).

    Args:
        query: Free-text question to search for.
        top_k: Maximum number of neighbours to return.

    Returns:
        A ``(retrieved_docs, retrieved_meta)`` pair of equally long lists,
        ordered as returned by the index. Both may hold fewer than ``top_k``
        entries when the index contains fewer vectors.
    """
    query_emb = embedder.encode([query])
    _, neighbour_ids = index.search(query_emb, top_k)

    retrieved_docs = []
    retrieved_meta = []

    for idx in neighbour_ids[0]:
        idx = int(idx)
        if idx < 0:
            # FAISS pads the result with -1 when it has fewer than top_k
            # vectors; indexing with -1 would silently return the last doc.
            continue
        retrieved_docs.append(documents[idx])
        retrieved_meta.append(metadata[idx])

    return retrieved_docs, retrieved_meta


RAG + LLM GENERATION

In [62]:
def rag_query(query):
    """Answer ``query`` with the LLM, using retrieved documents as context.

    Steps: retrieve documents via ``retrieve_context``, build a prompt from
    the recent conversation history, the retrieved context and the question,
    generate with ``llm``, then record the turn in ``conversation_memory``.

    Args:
        query: The user's question.

    Returns:
        The decoded model output as a string.

    Side effects:
        Appends ``"Q: ...\\nA: ..."`` to ``conversation_memory`` and trims the
        list to the last ``MAX_MEMORY`` turns.
    """
    retrieved_docs, _ = retrieve_context(query)
    context = "\n\n".join(retrieved_docs)

    # Only the last MAX_MEMORY turns go into the prompt. The explicit guard is
    # needed because a slice of [-0:] would return the whole list.
    recent_turns = conversation_memory[-MAX_MEMORY:] if MAX_MEMORY > 0 else []
    history = "\n".join(recent_turns)

    prompt = f"""
You are a cybersecurity assistant.

RULES:
- Use ONLY the provided context
- Do NOT add new facts
- Rephrase clearly and professionally
- If the query is about CVE, answer using CVE information only
- If the query is about PERSONA, answer using persona information only

Conversation History:
{history}

Context:
{context}

Question:
{query}

Answer:
"""

    # NOTE(review): the question sits at the end of the prompt, so if the
    # tokenizer truncates from the right a prompt longer than max_length
    # would lose it — confirm prompt lengths stay below the limit.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = llm.generate(
            **inputs,
            max_new_tokens=200
        )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    conversation_memory.append(f"Q: {query}\nA: {answer}")
    # Trim in place so the shared list does not grow without bound.
    if MAX_MEMORY > 0:
        del conversation_memory[:-MAX_MEMORY]
    else:
        conversation_memory.clear()

    return answer


TEST QUERIES

In [63]:
# Smoke test: ask about a specific CVE id.
# NOTE(review): the recorded output for this cell describes CVE-2025-5099,
# not the requested CVE-2025-5006 — the answer should be checked against the
# id in the question.
cve_answer = rag_query("Explain CVE-2025-5006")
print(cve_answer)

TYPE: CVE CVE_ID: CVE-2025-5099 CWE_ID: CWE-119 SEVERITY: CRITICAL DESCRIPTION: An Out of Bounds Write occurs when the native library attempts PDF rendering, which can be exploited to achieve memory corruption and potentially arbitrary code execution.


In [64]:
# Smoke test: a persona-style question.
persona_answer = rag_query("Tell me about a creative engineer persona")
print(persona_answer)

Julia Deleon blends a meticulous engineering mind with a quiet artistic soul, juggling ecofocused career ambitions, a habit of collecting vintage drafting pens, and a love for solitary hikes that reset their creative circuits, and they often find themselves lost in detailed sketches late into the night.


In [65]:
# Smoke test: a topical CVE question rather than a lookup by id.
sqli_answer = rag_query("Is there any CVE related to SQL injection?")
print(sqli_answer)

TYPE: CVE CVE_ID: CVE-2025-5287 CWE_ID: CWE-89 SEVERITY: HIGH DESCRIPTION: The Likes and Dislikes Plugin plugin for WordPress is vulnerable to SQL Injection via the 'post' parameter in all versions up to, and including, 1.0.0 due to insufficient escaping on the user supplied parameter and lack of sufficient preparation on the existing SQL query. This makes it possible for unauthenticated attackers to append additional SQL queries into already existing queries that can be used to extract sensitive information from the database.


In [71]:
!pip install requests

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




BENCHMARK

In [89]:
import requests
import json

# Benchmark service configuration.
# BASE_URL is the host that obtain_benchmark() queries.
# NOTE(review): the recorded output shows this host answering
# /obtain_benchmark with an HTML page rather than JSON — confirm the URL/path.
BASE_URL = "https://infosec.simpan.cv"
# NOTE(review): USERNAME and NAME are not referenced by any visible cell;
# presumably they are meant for a later submission step — TODO confirm.
USERNAME = "AfifaMaryam"
NAME = "Afifa Maryam"

def obtain_benchmark():
    """Fetch the benchmark payload from ``{BASE_URL}/obtain_benchmark``.

    Prints the HTTP status, the Content-Type header and the first 500
    characters of the body as a debugging aid.

    Returns:
        The decoded JSON payload, or ``None`` when the request itself fails,
        the server answers with HTML or an error status, or the body cannot
        be decoded as JSON.
    """
    url = f"{BASE_URL}/obtain_benchmark"

    # Connection errors and timeouts are reported like every other failure
    # of this best-effort helper instead of aborting the notebook.
    try:
        resp = requests.get(url, timeout=30)
    except requests.RequestException as e:
        print("Request failed:", str(e))
        return None

    content_type = resp.headers.get("Content-Type")
    print("Status:", resp.status_code)
    print("Response Content-Type:", content_type)
    print("Response Preview:")
    print(resp.text[:500])

    if "text/html" in (content_type or ""):
        print("\nBenchmark endpoint returned HTML instead of JSON.")
        print("This indicates restricted or browser-only access.")
        return None

    # An error body is not benchmark data, even if it happens to be JSON.
    if resp.status_code >= 400:
        print("\nBenchmark endpoint returned an error status.")
        return None

    try:
        data = resp.json()
        print("Benchmark prompt pairs received:", len(data))
        return data
    except (ValueError, TypeError) as e:
        # ValueError covers JSON decoding errors; TypeError covers a payload
        # that has no len().
        print("JSON parsing failed:", str(e))
        return None

# benchmark_data is None when the fetch did not yield JSON (see the printed
# diagnostics); check for None before using it.
benchmark_data = obtain_benchmark()


Status: 200
Response Content-Type: text/html; charset=utf-8
Response Preview:
<!doctype html>
<html lang="en">
	<head>
		<meta charset="utf-8" />
		<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" />
		<title>SimpanCV</title>
		<meta name="description" content="HR recruitment platform for managing candidate applications, assessments, and hiring workflows" />
		
		<link rel="modulepreload" href="/_app/immutable/entry/start.DkmDQbOX.js">
		<link rel="modulepreload" href="/_app/immutable/chunks/C92q9YN2.js">
		<link r

Benchmark endpoint returned HTML instead of JSON.
This indicates restricted or browser-only access.
