In [5]:
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer, InputExample, losses, models, util
from torch.utils.data import DataLoader
from torch import nn
from tqdm.auto import tqdm
from DLAIUtils import Utils
import torch
import time
import os
from dotenv import load_dotenv, find_dotenv
import openai
# Locate a .env file with find_dotenv() and load it into the environment;
# the return value is deliberately discarded by binding it to `_`.
_ = load_dotenv(find_dotenv())

# Raises KeyError if OPENAI_API_KEY is not set. The key is reused in the next
# cell to derive the Pinecone index name.
openai.api_key = os.environ['OPENAI_API_KEY']

In [6]:
# The index name is 'dl-ai-' followed by the last 36 characters of the OpenAI
# key, lower-cased and with underscores turned into dashes.
# NOTE(review): this puts most of the secret into the index name, which is
# visible wherever index names are listed — consider a hash of the key instead.
key_suffix = openai.api_key[-36:].lower().replace("_", "-")
INDEX_NAME = f'dl-ai-{key_suffix}'

pinecone = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

# Start from a clean slate: drop any index left over from a previous run
# before creating it again.
existing_index_names = {existing.name for existing in pinecone.list_indexes()}
if INDEX_NAME in existing_index_names:
    pinecone.delete_index(INDEX_NAME)

# dimension=256 matches the out_features of the Dense layer that produces the
# sentence embeddings in the model cell.
pinecone.create_index(
    name=INDEX_NAME,
    dimension=256,
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-west-2'),
)
index = pinecone.Index(INDEX_NAME)

In [12]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Token-level encoder. bert-base-uncased only has 512 position embeddings, so
# the sequence limit must not exceed 512 (768 is the hidden size, not the
# maximum length); longer inputs are truncated to this many tokens.
word_embedding_model = models.Transformer("bert-base-uncased", max_seq_length=512)

# Pool the token embeddings into one fixed-size sentence vector.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

# Project the pooled vector down to 256 dimensions (the dimension the Pinecone
# index is created with) through a Tanh activation.
dense_model = models.Dense(
    in_features=pooling_model.get_sentence_embedding_dimension(),
    out_features=256,
    activation_function=nn.Tanh(),
)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model], device=device)
device

'cpu'

In [13]:
# Every non-blank line of the training file has three '^'-separated fields:
# two texts and a numeric label. Each line becomes one InputExample pair.
train_examples = []
with open("./data/training.txt", 'r') as training_file:
    for raw_line in training_file:
        record = raw_line.strip()
        if not record:
            continue  # ignore blank lines
        text_a, text_b, score = record.split("^")
        train_examples.append(InputExample(texts=[text_a, text_b], label=float(score)))

warmup_steps = 100
# Shuffled mini-batches of 16 pairs, trained against a cosine-similarity loss.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

In [16]:
import pickle

# Fine-tune the model on the labelled pairs. The warm-up length comes from the
# `warmup_steps` variable set in the data-preparation cell instead of repeating
# the literal here.
# NOTE(review): the progress output shows 16 epochs x 2 iterations = 32 steps,
# fewer than the 100 warm-up steps, so the schedule presumably never leaves
# warm-up — confirm this is intended.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=16,
    warmup_steps=warmup_steps,
)

# Collect the sample log lines, stripped of surrounding whitespace and with
# blank lines dropped.
samples = []
with open('data/sample.log', 'r') as log_file:
    stripped_lines = (raw_line.strip() for raw_line in log_file)
    samples.extend(entry for entry in stripped_lines if entry)

Epoch:   0%|          | 0/16 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
# Embed every sample log line with the trained model. Later cells index `emb`
# in parallel with `samples` (emb[i] is used together with samples[i]).
emb = model.encode(samples)

In [19]:
# One record per sample: the positional index as the string id, the embedding
# as a plain Python list, and the raw log line kept as metadata so query
# results can be printed without a second lookup.
prepped = [
    {'id': f'{i}', 'values': emb[i].tolist(), 'metadata': {'log': log_line}}
    for i, log_line in enumerate(tqdm(samples))
]
index.upsert(prepped)

  0%|          | 0/90 [00:00<?, ?it/s]

{'upserted_count': 90}

In [23]:
# Use the first sample log line as the reference; its embedding (emb[0]) is the
# query vector for the similarity search in the next cell.
good_log_line = samples[0]
print(good_log_line)

Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 by access-group "acl_dmz" [0xe3aab522, 0x0]


In [24]:
# After the upserts it might take a few seconds for the index to be ready for
# queries, so poll every two seconds until matches come back. The number of
# attempts is capped so that an index that stays empty raises an error instead
# of looping forever.
MAX_QUERY_ATTEMPTS = 30
results = []
for _attempt in range(MAX_QUERY_ATTEMPTS):
    time.sleep(2)
    queried = index.query(
        vector=emb[0].tolist(),  # embedding of the reference log line
        include_metadata=True,   # needed to print the log text of each match
        top_k=100
    )
    results = queried['matches']
    print(".:. ", end="")
    if results:
        break
else:
    raise TimeoutError(
        f"index returned no matches after {MAX_QUERY_ATTEMPTS} attempts"
    )

.:. 

In [25]:
# Print the score and log text of the first (at most) ten matches. Slicing
# avoids an IndexError when the query returned fewer than ten results.
for match in results[:10]:
    print(f"{round(match['score'], 4)}\t{match['metadata']['log']}")

1.0	Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 by access-group "acl_dmz" [0xe3aab522, 0x0]
0.9854	Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 type 3, code 0, by access-group "acl_dmz" [0xe3aab522, 0x0]
0.9736	Apr 30 2013 09:23:41: %ASA-4-106023: Deny tcp src outside:192.0.2.126/53638 dst inside:10.0.0.132/8111 by access-group "acl_out" [0x71761f18, 0x0]
0.971	Apr 30 2013 09:23:40: %ASA-4-106023: Deny tcp src outside:192.0.2.126/53638 dst inside:10.0.0.132/8111 by access-group "acl_out" [0x71761f18, 0x0]
0.9592	Sep 12 2014 06:53:01 GIFRCHN01 : %ASA-4-106023: Deny tcp src outside:192.0.2.95/24069 dst inside:10.32.112.125/25 by access-group "PERMIT_IN" [0x0, 0x0]"
0.9241	Dec 11 2018 08:01:39 <IP>: %ASA-4-106023: Deny udp src dmz:192.168.1.34/5679 dst outside:192.0.0.12/5000 by access-group "dmz" [0x123a465e, 0x8c20f21]
0.9116	Dec 11 2018 08:01:24 <IP>: %ASA-4-106023: Deny udp src dmz:192

In [27]:
# Show the final entry of the match list. The printed scores above are in
# descending order, so this is presumably the least similar log line.
last_element = len(results) - 1
least_similar = results[last_element]
print(f"{round(least_similar['score'], 4)}\t{least_similar['metadata']['log']}")

0.2814	dec 31, 2021 09:18:59: %ASA-4-434005: seg fault detected in the matrix
