In [2]:
from transformers import DistilBertTokenizer, DistilBertForMaskedLM
import torch


tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [3]:
sentences = [
    "The sky turned a fiery orange as the sun dipped below the horizon, casting long shadows across the tranquil meadow.",
    "Lost in thought, she wandered through the labyrinth of bustling streets, each alleyway revealing a new adventure waiting to unfold.",
    "With a flicker of hesitation, he reached out and grasped the ancient artifact, feeling its power surge through his veins.",
    "Laughter echoed through the halls as friends gathered around the crackling fireplace, swapping stories late into the night.",
    "Time seemed to stand still as they danced beneath the twinkling stars, their hearts beating in perfect harmony.",
    "The aroma of freshly brewed coffee filled the air, awakening her senses with each comforting sip.",
    "Thunder rumbled ominously in the distance, signaling an approaching storm that would soon unleash its fury upon the unsuspecting town.",
    "In the quiet solitude of the forest, she found solace among the towering trees, their gentle whispers soothing her troubled mind.",
    "With a flourish of his pen, he signed his name at the bottom of the contract, sealing the deal with a sense of satisfaction.",
    "As dawn broke over the horizon, painting the sky in hues of pink and gold, she knew that today held endless possibilities waiting to be discovered.",
    "This is anomaly :)"
]


# No FineTune, Stock BERT MLM


In [4]:
from transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [5]:
tokenized_sentences = [tokenizer(sentence, add_special_tokens=True, padding='max_length', truncation=True, max_length=32, return_tensors='pt') for sentence in sentences]


In [6]:
dataloader = torch.utils.data.DataLoader(tokenized_sentences, batch_size=1)

In [324]:

len(dataloader)

11

In [7]:
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
import numpy as np

KEY = []
DICT = {}

def abn_loss(loss : list, k):
    scores = torch.topk(torch.Tensor(loss), k)
    return sum(scores.values.tolist())

def abn_prob(probs, k):
    scores = torch.topk(torch.Tensor(probs), k, largest=False)
    return sum(scores.values.tolist())

In [9]:
from datasets import Dataset

for n in np.arange(0, len(tokenized_sentences)):
    data = tokenized_sentences[n]
    phrase = data.input_ids
    data['labels'] = phrase.clone()
    if phrase not in DICT:
        score_loss, score_prob = [], []
        with torch.no_grad():        
            for i in np.arange(1, len(phrase[0])):
                temp = phrase[0][i].item()
                phrase[0][i] = 103

                outputs = model(**data)

                outputs_prob = max(torch.nn.functional.softmax(outputs.logits[0][i], dim=0))

                score_loss.append(outputs.loss)
                score_prob.append(outputs_prob)

                #print(outputs.loss, outputs_prob)

                
                phrase[0][i] = temp
        
        agg_loss = abn_loss(score_loss, 3)
        agg_prob = abn_prob(score_prob, 3)
        #agg_prob = abn_prob(outputs.loss, 5)
        KEY.append(phrase)
        DICT.update({phrase : (agg_loss, agg_prob)})

        print(agg_loss, agg_prob)
    
    else:
        
        print('From Database', DICT[phrase])

17.436710834503174 0.4099971130490303
11.005641460418701 0.31468571349978447
17.00260639190674 0.3297392353415489
18.038761615753174 0.21984421834349632
19.622427463531494 0.29583748430013657
25.685500144958496 0.39338643103837967
8.24725866317749 0.38908626511693
13.110359191894531 0.22921616584062576
8.224310636520386 0.36272698640823364
3.711495280265808 0.501497071236372
50.586917877197266 0.5157705321907997


In [10]:
scores = DICT.values()

In [15]:
for i, score in enumerate(scores):
    anomaly_score = score[0] * 1/score[1]
    print('Loss', score[0], 'Prob of Anomaly', score[1], 'Anomaly Score', anomaly_score)

Loss 17.436710834503174 Prob of Anomaly 0.4099971130490303 Anomaly Score 42.52886247132665
Loss 11.005641460418701 Prob of Anomaly 0.31468571349978447 Anomaly Score 34.973438539739234
Loss 17.00260639190674 Prob of Anomaly 0.3297392353415489 Anomaly Score 51.56379517376869
Loss 18.038761615753174 Prob of Anomaly 0.21984421834349632 Anomaly Score 82.05247220815447
Loss 19.622427463531494 Prob of Anomaly 0.29583748430013657 Anomaly Score 66.32840158829879
Loss 25.685500144958496 Prob of Anomaly 0.39338643103837967 Anomaly Score 65.29330479742083
Loss 8.24725866317749 Prob of Anomaly 0.38908626511693 Anomaly Score 21.19647852565288
Loss 13.110359191894531 Prob of Anomaly 0.22921616584062576 Anomaly Score 57.1964858753033
Loss 8.224310636520386 Prob of Anomaly 0.36272698640823364 Anomaly Score 22.673555993058308
Loss 3.711495280265808 Prob of Anomaly 0.501497071236372 Anomaly Score 7.4008314168527995
Loss 50.586917877197266 Prob of Anomaly 0.5157705321907997 Anomaly Score 98.08027934888607

In [233]:
test = tokenized_sentences[0]
test['labels'] = test.input_ids.detach().clone()
test.input_ids[0, 10] = 103
test

{'input_ids': tensor([[  101,  1996,  3712,  2357,  1037, 15443,  4589,  2004,  1996,  3103,
           103,  2917,  1996,  9154,  1010,  9179,  2146,  6281,  2408,  1996,
         25283, 26147, 13244,  1012,   102,     0,     0,     0,     0,     0,
             0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[  101,  1996,  3712,  2357,  1037, 15443,  4589,  2004,  1996,  3103,
         13537,  2917,  1996,  9154,  1010,  9179,  2146,  6281,  2408,  1996,
         25283, 26147, 13244,  1012,   102,     0,     0,     0,     0,     0,
             0,     0]])}

In [269]:
outputs = model(**test)
#print(outputs.loss)
#print(outputs.logits.shape)
#print(outputs.logits[0][10])
proba = torch.nn.functional.softmax(outputs.logits[0][10], dim=0)
print(max(proba))

tensor(0.6932, grad_fn=<UnbindBackward0>)


In [245]:
max(torch.nn.functional.softmax(outputs.logits[0][10], dim = 0))

tensor(0.9999, grad_fn=<AddBackward0>)