In [1]:
import warnings
warnings.filterwarnings('ignore')


In [7]:
import os

import openai
from utils import get_openai_api_key

# Run the project helper before touching the environment; its return value is
# not used here.
# NOTE(review): presumably it loads OPENAI_API_KEY into os.environ (e.g. from a
# .env file) -- confirm against utils.py.
get_openai_api_key()

# Take the key from the environment instead of hardcoding it in the notebook.
# A missing variable raises KeyError at this line.
openai.api_key = os.environ["OPENAI_API_KEY"]


In [8]:
from llama_index import SimpleDirectoryReader

# Read the source PDF into a list of llama_index documents.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the paper.
pdf_reader = SimpleDirectoryReader(
    input_files=[
        "/Users/abhinay/Desktop/Real-Time_Detection_of_DNS_Exfiltration_and_Tunneling_from_Enterprise_Networks-1.pdf",
    ],
)
documents = pdf_reader.load_data()

In [9]:
from llama_index import Document

# Collapse all loaded documents into one Document so the hierarchical parser
# sees the paper as a single continuous text. The pieces are separated by a
# blank line ("\n\n"); the previous "/n/n" separator inserted the literal
# characters "/n/n" into the text instead of newlines.
document = Document(text="\n\n".join(doc.text for doc in documents))

In [10]:
from llama_index.node_parser import HierarchicalNodeParser

# Three-level hierarchy: each entry is the chunk size for one level, listed
# from the largest chunks down to the smallest.
node_parser = HierarchicalNodeParser.from_defaults(
    chunk_sizes=[2048, 512, 128],
)


In [11]:
# Parse the merged document into nodes; the parser expects a sequence of
# documents, hence the single-element list.
nodes = node_parser.get_nodes_from_documents(
    [document],
)


In [13]:
from llama_index.node_parser import get_leaf_nodes

# Keep only the leaf-level nodes out of the full node list.
leaf_nodes = get_leaf_nodes(nodes)

# Spot-check one arbitrary leaf (index 30) to see what a leaf chunk looks like.
sample_leaf = leaf_nodes[30]
print(sample_leaf.text)

We note that some malicious
domains may appear among top KAlexa domains due to
a burst of requests from a high number of infected clients
querying them. For the benign training instances, we only
use top 10,000 primary domains. We also include FQDNs
for “ sophosxl.net ” domain (related to a benign anti-virus
application) which is not among the top 10K Majestic dataset.


In [14]:
# Index every node (all hierarchy levels) by its id for direct lookup.
nodes_by_id = dict((node.node_id, node) for node in nodes)

# Follow the parent reference of the leaf inspected earlier (index 30) and
# fetch the full parent node from the lookup table.
parent_id = leaf_nodes[30].parent_node.node_id
parent_node = nodes_by_id[parent_id]

In [15]:
# Show the parent chunk's text for comparison with the leaf text printed
# earlier; the output below begins with the same sentences as that leaf.
print(parent_node.text)

We note that some malicious
domains may appear among top KAlexa domains due to
a burst of requests from a high number of infected clients
querying them. For the benign training instances, we only
use top 10,000 primary domains. We also include FQDNs
for “ sophosxl.net ” domain (related to a benign anti-virus
application) which is not among the top 10K Majestic dataset.
C. Algorithms and Tuning Parameters
The objective is to maximize detection of anomalous queries
while reducing the rate of false alarms ( i.e.,incorrectly detect-
ing a normal query as anomalous, or vice versa). Many of su-
pervised machine-learning algorithms for detecting anomalies
such as one-class SVM and Replicator Neural Network suffer
from high false alarms since they are optimized for proﬁling
the inlier behavior rather than detecting anomalies. We employ
“Isolation Forest ( iForest )” [23] which is an effective algorithm
in detecting anomalous instances in high-dimensional datasets
with minimal memory and time c

In [16]:
from llama_index.llms import OpenAI

# LLM handed to the service context below; low temperature (0.1) is used.
llm = OpenAI(
    temperature=0.1,
    model="gpt-3.5-turbo",
)

In [17]:
from llama_index import ServiceContext

# Bundle the pieces the index needs: the hierarchical node parser, the
# embedding model (given as a "local:" model string), and the LLM.
auto_merging_context = ServiceContext.from_defaults(
    node_parser=node_parser,
    embed_model="local:BAAI/bge-small-en-v1.5",
    llm=llm,
)

In [19]:
from llama_index import StorageContext, VectorStoreIndex

# The docstore receives ALL nodes (every hierarchy level), not just the leaves.
storage_context = StorageContext.from_defaults()
storage_context.docstore.add_documents(nodes)

# The vector index itself is built from the leaf nodes only, sharing the
# storage context populated above.
automerging_index = VectorStoreIndex(
    leaf_nodes,
    service_context=auto_merging_context,
    storage_context=storage_context,
)

# Write the index's storage context to disk under ./merging_index.
automerging_index.storage_context.persist(persist_dir="./merging_index")

In [22]:
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index.retrievers import AutoMergingRetriever
from llama_index.query_engine import RetrieverQueryEngine
from sympy import true

# Base retriever: pulls the 12 most similar nodes from the leaf-level index.
automerging_retriever = automerging_index.as_retriever(similarity_top_k=12)

# Auto-merging wrapper around the base retriever; it is given the index's
# storage context so it can look up nodes beyond the retrieved leaves.
# `verbose` takes the builtin True here -- the previous value was sympy's
# symbolic `true`, which is not a Python bool.
retriever = AutoMergingRetriever(
    automerging_retriever,
    automerging_index.storage_context,
    verbose=True,
)

# Re-rank the retrieved nodes and keep the best 6.
rerank = SentenceTransformerRerank(
    top_n=6,
    model="BAAI/bge-reranker-base",
)

# Build the query engine on the auto-merging `retriever`. Previously the base
# `automerging_retriever` was passed, which left the AutoMergingRetriever
# unused and skipped the merging step entirely.
auto_merging_engine = RetrieverQueryEngine.from_args(
    retriever,
    node_postprocessors=[rerank],
)