# RAG Pipeline

In [1]:
import os 
# Step up one directory from wherever the notebook was launched; presumably so
# that the relative "data/..." paths used in later cells resolve — TODO confirm.
# NOTE(review): the path is relative, so re-running this cell moves up one more
# level each time. Restart the kernel before re-running it.
os.chdir("../")
# Echo the resulting working directory as a sanity check.
%pwd

'c:\\Users\\amman\\Documents\\Generative AI\\End-to-End-AI-Cyber-Security-Assistant'

Load the Data

In [8]:
# Load MITRE ATT&CK Data
def load_mitre_data(file_paths=None, encoding="ISO-8859-1"):
    """Load the non-empty, whitespace-stripped lines of the MITRE ATT&CK text files.

    Args:
        file_paths: Iterable of text-file paths to read, in order. When omitted,
            the tactics, techniques and mitigations files under ``data/raw``
            are used.
        encoding: Codec used to decode every file. Defaults to ``ISO-8859-1``.
            NOTE(review): ISO-8859-1 never raises on decode, so if the raw
            files are actually UTF-8 any non-ASCII character is silently
            garbled — confirm the real encoding of the data files.

    Returns:
        list[str]: One entry per non-blank line, stripped of surrounding
        whitespace, in file order. Paths that do not exist are skipped after
        printing a warning, so the result may be empty.
    """
    if file_paths is None:
        file_paths = [
            "data/raw/mitre_tactics.txt",
            "data/raw/mitre_techniques.txt",
            "data/raw/mitre_mitigations.txt",
        ]

    texts = []
    for file_path in file_paths:
        if not os.path.exists(file_path):
            # Best-effort loading: report the gap but keep reading the rest.
            print(f"Warning: {file_path} not found!")
            continue

        with open(file_path, "r", encoding=encoding) as f:
            # Stream the file line by line and drop blank lines as we go,
            # instead of materialising every raw line first.
            for line in f:
                stripped = line.strip()
                if stripped:
                    texts.append(stripped)

    return texts

documents = load_mitre_data()
# Show the corpus size and a short preview only; echoing every document
# floods the cell output and bloats the saved notebook.
print(f"Loaded {len(documents)} documents")
documents[:3]

['Reconnaissance: The adversary is trying to gather information they can use to plan future operations.Reconnaissance consists of techniques that involve adversaries actively or passively gathering information that can be used to support targeting. Such information may include details of the victim organization, infrastructure, or staff/personnel. This information can be leveraged by the adversary to aid in other phases of the adversary lifecycle, such as using gathered information to plan and execute Initial Access, to scope and prioritize post-compromise objectives, or to drive and lead further Reconnaissance efforts.',
 'Resource Development: The adversary is trying to establish resources they can use to support operations.Resource Development consists of techniques that involve adversaries creating, purchasing, or compromising/stealing resources that can be used to support targeting. Such resources include infrastructure, accounts, or capabilities. These resources can be leveraged 

Load the Embedding Model and Convert Text to Embeddings

In [11]:
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import faiss
import numpy as np

# Embedding model used to turn each document string into a vector.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# Embed every string in `documents` and build a FAISS vector store from them.
vector_store = FAISS.from_texts(documents, embedding=embedding_model)
# Persist the store to disk; the retriever cell reloads it from this same path.
vector_store.save_local("data/processed/faiss_index")
print("FAISS vector store saved successfully!")

FAISS vector store saved successfully!


### Testing the Retriever

In [14]:
# Load FAISS Vector Store
# Re-create the embedding model with the same model name that was used when the
# index was built, then reload the index that was saved to disk.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# serialized (presumably pickled) data from the index folder. Only load index
# files this project produced itself; never point this at untrusted files.
vector_store = FAISS.load_local("data/processed/faiss_index", embedding_model, allow_dangerous_deserialization=True)

def retrieve_relevant_docs(query, k=3):
    """Return the text of the documents most similar to ``query``.

    Args:
        query: Natural-language question or phrase to search for.
        k: Number of results to request from the vector store. Defaults to 3,
            the value that was previously hard-coded.

    Returns:
        list[str]: The ``page_content`` of each hit, in the order returned by
        ``vector_store.similarity_search``.
    """
    # `vector_store` is the module-level FAISS store loaded earlier in this cell.
    docs = vector_store.similarity_search(query, k=k)
    return [doc.page_content for doc in docs]

# Smoke-test the retriever with a sample question; the cell displays the
# list of passages that the function returns.
retrieve_relevant_docs("How can adversaries use token manipulation")

['Adversaries can steal application access tokens as a means of acquiring credentials to access remote systems and resources.',
 'Access Token Manipulation',
 'Adversaries can steal user application access tokens as a means of acquiring credentials to access remote systems and resources. This can occur through social engineering or URI hijacking and typically requires user action to grant access, such as through a system "Open With" dialogue.']