In [2]:
from transformers import DistilBertTokenizer, DistilBertForMaskedLM
import torch


tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')

In [22]:
sentences = [
    "The sky turned a fiery orange as the sun dipped below the horizon, casting long shadows across the tranquil meadow.",
    "Lost in thought, she wandered through the labyrinth of bustling streets, each alleyway revealing a new adventure waiting to unfold.",
    "With a flicker of hesitation, he reached out and grasped the ancient artifact, feeling its power surge through his veins.",
    "Laughter echoed through the halls as friends gathered around the crackling fireplace, swapping stories late into the night.",
    "Time seemed to stand still as they danced beneath the twinkling stars, their hearts beating in perfect harmony.",
    "The aroma of freshly brewed coffee filled the air, awakening her senses with each comforting sip.",
    "Thunder rumbled ominously in the distance, signaling an approaching storm that would soon unleash its fury upon the unsuspecting town.",
    "In the quiet solitude of the forest, she found solace among the towering trees, their gentle whispers soothing her troubled mind.",
    "With a flourish of his pen, he signed his name at the bottom of the contract, sealing the deal with a sense of satisfaction.",
    "As dawn broke over the horizon, painting the sky in hues of pink and gold, she knew that today held endless possibilities waiting to be discovered.",
    "This is an anomaly, This is an anomaly. This. Is definitely an Anomaly. Anomaly is what this is"
]


# No FineTune, Stock BERT MLM


In [23]:
tokenized_sentences = [tokenizer(sentence, add_special_tokens=True, padding='max_length', truncation=True, max_length=32, return_tensors='pt') for sentence in sentences]


In [24]:
dataloader = torch.utils.data.DataLoader(tokenized_sentences, batch_size=1)

In [324]:

len(dataloader)

11

In [7]:
model = DistilBertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [25]:
import numpy as np

KEY = []
DICT = {}

def abn_loss(loss : list, k):
    scores = torch.topk(torch.Tensor(loss), k)
    return (sum(scores.values.tolist()))/k

def abn_prob(probs, k):
    scores = torch.topk(torch.Tensor(probs), k, largest=False)
    return (sum(scores.values.tolist()))/k

In [26]:
from datasets import Dataset

for n in np.arange(0, len(tokenized_sentences)):
    data = tokenized_sentences[n]
    phrase = data.input_ids
    data['labels'] = phrase.clone()
    if phrase not in DICT:
        score_loss, score_prob = [], []
        with torch.no_grad():        
            for i in np.arange(1, len(phrase[0])):
                temp = phrase[0][i].item()
                phrase[0][i] = 103

                outputs = model(**data)

                outputs_prob = max(torch.nn.functional.softmax(outputs.logits[0][i], dim=0))

                score_loss.append(outputs.loss)
                score_prob.append(outputs_prob)

                #print(outputs.loss, outputs_prob)

                
                phrase[0][i] = temp
        
        agg_loss = abn_loss(score_loss, 3)
        agg_prob = abn_prob(score_prob, 3)
        #agg_prob = abn_prob(outputs.loss, 5)
        KEY.append(phrase)
        DICT.update({phrase : (agg_loss, agg_prob)})

        print(agg_loss, agg_prob)
    
    else:
        
        print('From Database', DICT[phrase])

5.812236944834392 0.13666570434967676
3.6685471534729004 0.10489523783326149
5.667535463968913 0.10991307844718297
6.012920538584392 0.07328140611449878
6.540809154510498 0.09861249476671219
8.561833381652832 0.13112881034612656
2.7490862210591636 0.12969542170564333
4.370119730631511 0.07640538861354192
2.7414368788401284 0.12090899546941121
1.237165093421936 0.167165690412124
7.45561949412028 0.3833014965057373


In [27]:
scores = DICT.values()

In [28]:
for i, score in enumerate(scores):
    anomaly_score = score[0] * 1/(1-score[1])
    print('Anomaly Score', anomaly_score)

Anomaly Score 6.732313281330046
Anomaly Score 4.098455631710213
Anomaly Score 6.367395505690066
Anomaly Score 6.488399583495683
Anomaly Score 7.256378767772772
Anomaly Score 9.8539731592016
Anomaly Score 3.1587633681611638
Anomaly Score 4.731642732379401
Anomaly Score 3.1184904233026285
Anomaly Score 1.4854876644480957
Anomaly Score 12.089569622556482


In [233]:
test = tokenized_sentences[0]
test['labels'] = test.input_ids.detach().clone()
test.input_ids[0, 10] = 103
test

{'input_ids': tensor([[  101,  1996,  3712,  2357,  1037, 15443,  4589,  2004,  1996,  3103,
           103,  2917,  1996,  9154,  1010,  9179,  2146,  6281,  2408,  1996,
         25283, 26147, 13244,  1012,   102,     0,     0,     0,     0,     0,
             0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[  101,  1996,  3712,  2357,  1037, 15443,  4589,  2004,  1996,  3103,
         13537,  2917,  1996,  9154,  1010,  9179,  2146,  6281,  2408,  1996,
         25283, 26147, 13244,  1012,   102,     0,     0,     0,     0,     0,
             0,     0]])}

In [269]:
outputs = model(**test)
#print(outputs.loss)
#print(outputs.logits.shape)
#print(outputs.logits[0][10])
proba = torch.nn.functional.softmax(outputs.logits[0][10], dim=0)
print(max(proba))

tensor(0.6932, grad_fn=<UnbindBackward0>)


In [245]:
max(torch.nn.functional.softmax(outputs.logits[0][10], dim = 0))

tensor(0.9999, grad_fn=<AddBackward0>)