# Create ATT&CK Groups Vector Database
---
* Collaborators:
    * Roberto Rodriguez (@Cyb3rWard0g)
* References:
    * https://python.langchain.com/en/latest/modules/indexes/getting_started.html
    * https://www.youtube.com/watch?v=eqOfr4AGLk8

## Define Variables

In [None]:
import os

# Locate the folder that holds the group markdown documents.
# NOTE: "__file__" is a quoted string literal here, not the module attribute,
# so the directory component is always "" and `documents_directory` ends up
# as the relative path "documents" (resolved against the working directory).
current_directory = os.path.split("__file__")[0]
documents_directory = os.path.join(current_directory, "documents")

## Load Documents

In [None]:
import glob
from langchain.document_loaders import UnstructuredMarkdownLoader

# Collect every markdown file in the documents folder. glob returns matches
# in arbitrary order, so sort them to keep the loading order (and therefore
# the order of the resulting documents) reproducible between runs.
group_files = sorted(glob.glob(os.path.join(documents_directory, "*.md")))
if not group_files:
    # An empty result usually means the working directory is not the one that
    # contains the documents folder; say so instead of silently loading nothing.
    print(f"[!] No markdown files found in {documents_directory!r}")

# Loading Markdown files: each file is loaded on its own and the resulting
# documents are accumulated into a single flat list.
md_docs = []
print("[+] Loading Group markdown files..")
for group in group_files:
    print(f' [*] Loading {os.path.basename(group)}')
    loader = UnstructuredMarkdownLoader(group)
    md_docs.extend(loader.load())

## Split Documents

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded documents into overlapping chunks.
# No length function is passed, so chunk_size and chunk_overlap are measured
# with the splitter's default (character count per the LangChain docs,
# not tokens).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],  # candidate split points handed to the splitter
    chunk_overlap=50,  # overlap shared by neighbouring chunks
    chunk_size=500,  # upper bound on the size of a single chunk
)

chunks = text_splitter.split_documents(md_docs)

## Add Chunks to Vector Database

In [None]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(
    model_name="all-mpnet-base-v2"
)

# embed the chunks and load them into a Chroma collection backed by ./chroma_db
vectorstore = Chroma.from_documents(
    chunks,
    embedding_function,
    collection_name="groups_collection",
    persist_directory="./chroma_db"
)

# Flush the collection to disk explicitly. Per the LangChain Chroma docs,
# older Chroma releases only write to persist_directory when persist() is
# called (or when the client object is destroyed, which may not happen while
# a notebook kernel stays alive). Newer releases persist automatically, so the
# call is guarded in case the method is no longer available.
if hasattr(vectorstore, "persist"):
    vectorstore.persist()

## Test Similarity Search

In [None]:
# Sanity-check the collection with a natural-language question
query = "What threat actors send text messages to their targets?"
relevant_docs = vectorstore.similarity_search(query=query)

# show the text of the first document returned by the search
top_match = relevant_docs[0]
print(top_match.page_content)

## Initialize Retriever

In [None]:
# Wrap the vector store in a retriever object; no arguments are passed, so
# the library's default retriever settings apply.
retriever = vectorstore.as_retriever()