Import the Needed Packages

In [None]:
import warnings
# Suppress every Python warning for the rest of the session so library
# deprecation notices do not clutter cell outputs. NOTE(review): this also
# hides warnings that may be relevant while debugging.
warnings.filterwarnings('ignore')

In [None]:
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer, InputExample, losses, models, util
from torch.utils.data import DataLoader
from torch import nn
from tqdm.auto import tqdm
from DLAIUtils import Utils
import torch
import time
import torch
import os

Set Up Pinecone

In [None]:
# Helper object from the course utilities module (DLAIUtils).
utils = Utils()
# Obtain the Pinecone API key through the helper rather than hardcoding it in
# the notebook. NOTE(review): where the helper reads the key from is not
# visible here - confirm before running outside the course environment.
PINECONE_API_KEY = utils.get_pinecone_api_key()

In [None]:
# Name of the Pinecone index used throughout this notebook, derived by the
# course helper from the 'dl-ai' prefix.
INDEX_NAME = utils.create_dlai_index_name('dl-ai')

# Client handle for all index management calls below.
pinecone = Pinecone(api_key=PINECONE_API_KEY)

# Start from a clean slate: drop any index that already exists under this name.
existing_index_names = [existing.name for existing in pinecone.list_indexes()]
if INDEX_NAME in existing_index_names:
    pinecone.delete_index(INDEX_NAME)

# Create a serverless index holding 256-dimensional vectors compared by cosine
# similarity, then open a handle to it for upserts and queries.
serverless_spec = ServerlessSpec(cloud='aws', region='us-west-2')
pinecone.create_index(
    name=INDEX_NAME,
    dimension=256,
    metric='cosine',
    spec=serverless_spec,
)
index = pinecone.Index(INDEX_NAME)

Load the Dataset

In [None]:
# Preview the first five lines of sample.log (shell command via IPython `!`).
!head -5 sample.log

In [None]:
# Preview the first five lines of training.txt (shell command via IPython `!`).
!head -5 training.txt

Check CUDA and Set Up the Model

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Token-level encoder. max_seq_length is the number of tokens kept per input
# before truncation. bert-base-uncased only has 512 position embeddings
# (768 is its hidden size, not its sequence limit), so a larger value would let
# over-long inputs through untruncated and crash inside the model.
word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=512)

# Pooling layer: collapses the per-token embeddings into one fixed-size
# sentence embedding. Called with only the embedding dimension, it uses the
# library's default pooling mode (mean pooling over tokens).
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

# Dense (fully connected) projection with a tanh activation down to 256
# features. 256 must match the `dimension` the Pinecone index was created with.
dense_model = models.Dense(
    in_features=pooling_model.get_sentence_embedding_dimension(),
    out_features=256,
    activation_function=nn.Tanh(),
)

# Chain encoder -> pooling -> dense into a single SentenceTransformer placed
# on the selected device.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model], device=device)
device

Train the Model

In [None]:
# Build the training set from training.txt. Every non-blank line holds three
# '^'-separated fields: first text, second text, and a numeric label for the pair.
train_examples = []
with open('./training.txt', 'r') as training_file:
    for raw_line in training_file:
        record = raw_line.strip()
        if not record:
            continue
        first_text, second_text, score = record.split('^')
        train_examples.append(
            InputExample(texts=[first_text, second_text], label=float(score))
        )

# Dataset is ready; define the warm-up length, the dataloader and the loss.

# Number of warm-up steps over which the learning rate is ramped up.
warmup_steps = 100

# Shuffled mini-batches of 16 training pairs.
train_dataloader = DataLoader(train_examples, batch_size=16, shuffle=True)

# Loss built on the cosine similarity between the embeddings of the two texts
# in each InputExample.
train_loss = losses.CosineSimilarityLoss(model)

In [None]:
import pickle

# Toggle: True loads a previously trained model from disk, False trains the
# freshly assembled `model` from scratch (slow).
load_pretrained_model = True
if load_pretrained_model:
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load './data/pretrained_model' when it comes from a trusted source.
    # The `with` block closes the file even if unpickling raises.
    with open('./data/pretrained_model', 'rb') as trained_model_file:
        db = pickle.load(trained_model_file)
    # Fail loudly if the file does not contain a usable model instead of
    # silently continuing with the untrained one.
    if not isinstance(db, SentenceTransformer):
        raise TypeError(
            f"'./data/pretrained_model' contains {type(db).__name__}, "
            "expected a SentenceTransformer"
        )
    # Actually use the loaded model: the cells below encode with `model`, so
    # leaving the unpickled object in `db` would mean embedding with untrained
    # weights.
    model = db
else:
    # Train a new model, reusing the warm-up step count defined with the
    # dataloader rather than repeating the literal.
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=16,
        warmup_steps=warmup_steps,
    )

# Collect every non-blank line of sample.log, stripped of surrounding
# whitespace. Only the raw text is kept here; embeddings are computed later.
with open('sample.log', 'r') as log_file:
    stripped_lines = (raw_line.strip() for raw_line in log_file)
    samples = [entry for entry in stripped_lines if entry]

Create Embeddings and Upsert to Pinecone

In [None]:
# Encode all log lines in one call; emb[i] is the embedding of samples[i].
# NOTE(review): the embedding length must equal the Pinecone index dimension
# (256) - confirm for whichever model is active at this point.
emb = model.encode(samples)

In [None]:
# One record per log line: the id is the line's position in `samples`, the
# values are its embedding, and the raw log text travels along as metadata.
prepped = [
    {'id': str(i), 'values': emb[i].tolist(), 'metadata': {'log': samples[i]}}
    for i in range(len(samples))
]

# Upsert in fixed-size batches rather than one giant request: Pinecone limits
# the size of a single upsert call, so sending every vector at once fails for
# larger log files. The progress bar now tracks the network calls.
upsert_batch_size = 100
for batch_start in tqdm(range(0, len(prepped), upsert_batch_size)):
    index.upsert(prepped[batch_start:batch_start + upsert_batch_size])

Find the Anomaly

In [None]:
# Use the first log line as the reference "normal" entry and look for the
# entries most (and least) similar to it.
good_log_line = samples[0]
print(good_log_line)

# After the upserts it might take a few seconds for the index to be ready for
# queries, so poll every two seconds. The number of attempts is bounded so the
# cell cannot hang forever if the index never returns anything.
max_query_attempts = 30
results = []
for _ in range(max_query_attempts):
    time.sleep(2)
    queried = index.query(
        vector=emb[0].tolist(),
        include_metadata=True,
        top_k=100
    )
    results = queried['matches']
    print(".:. ", end="")
    if results:
        break
print()  # end the progress line so the first result starts on its own line

if not results:
    raise RuntimeError(
        f"Pinecone returned no matches after {max_query_attempts} attempts; "
        "check that the upsert succeeded."
    )

# Print the similarity score and log line of the top results (at most 10;
# slicing avoids an IndexError when fewer matches come back).
for match in results[:10]:
    print(f"{round(match['score'], 4)}\t{match['metadata']['log']}")

# Also print the last (least similar) returned match with its log line.
last_element = len(results) - 1
print(f"{round(results[last_element]['score'], 4)}\t{results[last_element]['metadata']['log']}")