# RAG Pipeline

In [1]:
import os 
os.chdir("../")
%pwd

'c:\\Users\\amman\\Documents\\Generative AI\\End-to-End-AI-Cyber-Security-Assistant'

Load the Data and Chunk

In [2]:
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
def load_mitre_data():
    loader = DirectoryLoader("data/raw", glob="*.txt", loader_cls=TextLoader, loader_kwargs={"encoding": "ISO-8859-1"})
    docs = loader.load()
    docs_text = [doc.page_content for doc in docs]
    return docs_text

In [4]:
docs = load_mitre_data()

In [5]:
def document_chunks(docs, chunk_size=500, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.create_documents(docs)


In [6]:
chunks = document_chunks(docs)
chunks

[Document(metadata={}, page_content='Account Use Policies: Configure features related to account use like login attempt lockouts, specific login times, etc.\nActive Directory Configuration: Implement robust Active Directory configurations using group policies to control access and reduce the attack surface. Specific examples include:\nAntivirus/Antimalware: Use signatures or heuristics to detect malicious software.'),
 Document(metadata={}, page_content='Application Developer Guidance: This mitigation describes any guidance or training given to developers of applications to avoid introducing security weaknesses that an adversary may be able to take advantage of.\nApplication Isolation and Sandboxing: Restrict execution of code to a virtual environment on or in transit to an endpoint system.\nAudit: Perform audits or scans of systems, permissions, insecure software, insecure configurations, etc. to identify potential weaknesses.'),
 Document(metadata={}, page_content='Behavior Preventio

In [7]:
len(chunks)

245

Load the Embedding Model and Convert Text to Embeddings

In [8]:
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import faiss
import numpy as np

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = FAISS.from_documents(chunks, embedding=embedding_model)
vector_store.save_local("data/processed/faiss_index")
print("FAISS vector store saved successfully!")

FAISS vector store saved successfully!


### Testing Retriever

In [9]:
# Load FAISS Vector Store
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = FAISS.load_local("data/processed/faiss_index", embedding_model, allow_dangerous_deserialization=True)

def retrieve_relevant_docs(query):
    docs = vector_store.similarity_search(query, k=3)
    return [doc.page_content for doc in docs]

retrieve_relevant_docs("Drive-by compromise")

["Drive-by Compromise: Adversaries may gain access to a system during a drive-by compromise, when a user visits a website as part of a regular browsing session. With this technique, the user's web browser is targeted and exploited simply by visiting the compromised website.",
 'Supply Chain Compromise: Adversaries may perform supply chain compromise to gain control systems environment access by means of infected products, software, and workflows. Supply chain compromise is the manipulation of products, such as devices or software, or their delivery mechanisms before receipt by the end consumer. Adversary compromise of these products and mechanisms is done for the goal of data or system compromise, once infected products are introduced to the target environment.',
 'Wireless Compromise: Adversaries may perform wireless compromise as a method of gaining communications and unauthorized access to a wireless network. Access to a wireless network may be gained through the compromise of a wir

In [19]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
import os

In [21]:
load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

# Load FAISS Vector Store
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = FAISS.load_local("data/processed/faiss_index", embedding_model, allow_dangerous_deserialization=True)

llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=0)

# Define prompt template
prompt_template = PromptTemplate(
    input_variables=["context", "query"],
    template="You are a cybersecurity expert. Given the context below, answer the user's question:\n\nContext:\n{context}\n\nQuery:\n{query}\n\nResponse:"
)

llm_chain = prompt_template | llm

def retrieve_and_generate_response(query):
    docs = vector_store.similarity_search(query, k=5)
    context = "\n".join([doc.page_content for doc in docs])
    
    # Generate response using LLM
    response = llm_chain.invoke({"context": context, "query": query})
    print(response)
    
    return response

In [22]:
retrieve_and_generate_response("What is drive-by compromise?")

content="Drive-by compromise is a technique used by adversaries to gain access to a system when a user visits a website as part of their regular browsing session. In this method, the user's web browser is targeted and exploited simply by visiting a compromised website. This can lead to the unauthorized access of the user's system and potentially result in data or system compromise." additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 71, 'prompt_tokens': 290, 'total_tokens': 361, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-3ed21754-1113-4e7e-8aed-69faf0f02067-0' usage_metadata={'input_tokens': 290, 'output_tokens': 71, 'total_tokens': 361, 'input_token_details': {'audio': 0,

AIMessage(content="Drive-by compromise is a technique used by adversaries to gain access to a system when a user visits a website as part of their regular browsing session. In this method, the user's web browser is targeted and exploited simply by visiting a compromised website. This can lead to the unauthorized access of the user's system and potentially result in data or system compromise.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 71, 'prompt_tokens': 290, 'total_tokens': 361, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-3ed21754-1113-4e7e-8aed-69faf0f02067-0', usage_metadata={'input_tokens': 290, 'output_tokens': 71, 'total_tokens': 361, 'input_token_details'