In [2]:
import pandas as pd
import numpy as np
import gc
import re
import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import GPT2Model
from utils import *

NameError: name '_C' is not defined

In [None]:
# One checkpoint id is shared by the causal-LM wrapper, its tokenizer and the
# plain GPT2Model loaded from the same weights.
_NTDB_CHECKPOINT = 'dracoglacius/NTDB-GPT2'

NTDBGPT2_lm = AutoModelForCausalLM.from_pretrained(_NTDB_CHECKPOINT)
NTDBGPT2_tokenizer = AutoTokenizer.from_pretrained(_NTDB_CHECKPOINT)
NTDBGPT2_embed = GPT2Model.from_pretrained(_NTDB_CHECKPOINT)

## ECodes

* E812.0 = Other motor vehicle traffic accident involving collision with motor vehicle injuring driver of motor vehicle other than motorcycle.
* E885.9 = Accidental fall from other slipping, tripping, or stumbling
* E966.0 = Assault by cutting and piercing instrument

In [None]:
#e8120_top_seq = []
#e8120_top_dsc = []
#with open("./data/E8120_top_50_gen.txt") as f:
#    while True:
#        line1 = f.readline()
#        line2 = f.readline()
#        if not line2: break 
#        e8120_top_seq.append(eval(line1))
#        e8120_top_dsc.append(eval(line2))
#    
#e8120_bot_seq = []
#e8120_bot_dsc = []
#with open("./data/E8120_bot_50_gen.txt") as f:
#    while True:
#        line1 = f.readline()
#        line2 = f.readline()
#        if not line2: break 
#        e8120_bot_seq.append(eval(line1))
#        e8120_bot_dsc.append(eval(line2))
#    
#e8859_top_seq = []
#e8859_top_dsc = []
#with open("./data/E8859_top_50_gen.txt") as f:
#    while True:
#        line1 = f.readline()
#        line2 = f.readline()
#        if not line2: break 
#        e8859_top_seq.append(eval(line1))
#        e8859_top_dsc.append(eval(line2))
#    
#e8859_bot_seq = []
#e8859_bot_dsc = []
#with open("./data/E8859_bot_50_gen.txt") as f:
#    while True:
#        line1 = f.readline()
#        line2 = f.readline()
#        if not line2: break 
#        e8859_bot_seq.append(eval(line1))
#        e8859_bot_dsc.append(eval(line2))
#    
#e9660_top_seq = []
#e9660_top_dsc = []
#with open("./data/E9660_top_50_gen.txt") as f:
#    while True:
#        line1 = f.readline()
#        line2 = f.readline()
#        if not line2: break 
#        e9660_top_seq.append(eval(line1))
#        e9660_top_dsc.append(eval(line2))
#    
#e9660_bot_seq = []
#e9660_bot_dsc = []
#with open("./data/E9660_bot_50_gen.txt") as f:
#   while True:
#       line1 = f.readline()
#       line2 = f.readline()
#       if not line2: break 
#       e9660_bot_seq.append(eval(line1))
#       e9660_bot_dsc.append(eval(line2))

In [None]:
# Load the training sequences and the model-generated sequences, in that order.
trn_seq, gen_seq = map(
    np.load,
    (
        "./data/25k_train_seqs_3_22_E8859_E8120_E9660_E9654_E9240.npy",
        "./data/25k_gen_seqs_3_22_E8859_E8120_E9660_E9654_E9240.npy",
    ),
)

## Separate Data

#### Training Data is In Domain Data

In [None]:
# Split the training sequences by the E-code each one contains.
# The original notebook annotates every resulting list as holding 5000 items.
(
    e8120_trn_seq,
    e8859_trn_seq,
    e9660_trn_seq,
    e9654_trn_seq,
    e9240_trn_seq,
) = (
    [seq for seq in trn_seq if ecode in seq]
    for ecode in ('E812.0', 'E885.9', 'E966.0', 'E965.4', 'E924.0')
)

#### Generated Data is Out of Domain (OOD) Data

In [None]:
# Split the generated sequences by the E-code each one contains.
# The original notebook annotates every resulting list as holding 5000 items.
(
    e8120_gen_seq,
    e8859_gen_seq,
    e9660_gen_seq,
    e9654_gen_seq,
    e9240_gen_seq,
) = (
    [seq for seq in gen_seq if ecode in seq]
    for ecode in ('E812.0', 'E885.9', 'E966.0', 'E965.4', 'E924.0')
)

Our intuition is that, for a distribution described by a hyper-ellipsoid with center $c$ and shape $\Sigma$, $c$ and $\Sigma$ should not deviate from the empirical mean ($\hat{c}$) and the covariance estimate ($\hat{\Sigma}$) computed from the training data.

To obtain these estimates we need to:

1. Feed the training data ($n$ sequences) to the NTDB model and, from the last token, take the features of each layer ($n \times 13 \times 768$)
1. Calculate the sample mean ($\hat{c}$) and the covariance estimate ($\hat{\Sigma}$), while also getting the estimated pseudo-inverse (called `.precision_` in sklearn)

To obtain the OOD estimation we need to:

1. Calculate the Mahalanobis Distance Feature (MDF) using the generated data (should be a vector whose length equals the number of layers)
1. Calculate the Anomaly Score
   1. This is a one-class SVM with a linear kernel that uses the MDF as features

In [None]:
def get_hidden_embeddings(hidden_states, is_train=True, use_last=True):
    """Arrange per-step, per-layer hidden states as per-token layer features.

    Parameters
    ----------
    hidden_states : sequence of sequences of torch.Tensor
        One entry per generation step, each entry holding one tensor per
        layer. The first entry contains the whole input sequence; every later
        entry is reshaped to a single token's (n_layers, hidden) features.
        Assumes a batch size of 1 (the batch axis is squeezed away) -- TODO
        confirm against the caller.
    is_train : bool
        When True only the first entry is used. When False the features of
        the later steps are appended after those of the first entry.
    use_last : bool
        When True return only the features of the final token.

    Returns
    -------
    torch.Tensor
        Shape (n_layers, hidden) when ``use_last`` is True, otherwise
        (n_tokens, n_layers, hidden).
    """
    # (layers, batch, seq, hidden) -> (seq, batch, layers, hidden) -> drop batch.
    _start = torch.squeeze(torch.stack(hidden_states[0]).transpose(0, 2), dim=1)
    _em = _start

    # Only append later steps when there are any: torch.stack([]) would raise.
    if not is_train and len(hidden_states) > 1:
        # Derive (n_layers, hidden) from the first step instead of hard-coding
        # 13 x 768, so models of other sizes work too.
        per_token_shape = tuple(_start.shape[1:])
        _hs = torch.stack(
            [torch.reshape(torch.stack(x), per_token_shape) for x in hidden_states[1:]]
        )
        _em = torch.cat([_start, _hs])

    if use_last:
        return _em[-1, :, :]
    return _em

In [None]:
def get_embeddings(sequences, is_train=True, use_last=True, max_tokens=19):
    """Run each sequence through the NTDB GPT-2 model and collect layer features.

    Parameters
    ----------
    sequences : iterable of str
        Text sequences to encode with ``NTDBGPT2_tokenizer``.
    is_train : bool
        Forwarded to ``get_hidden_embeddings``.
    use_last : bool
        Forwarded to ``get_hidden_embeddings``; also selects the return type.
    max_tokens : int or None
        Sequences whose encoded length exceeds this value are skipped.
        Defaults to 19, the limit previously hard-coded here; ``None``
        disables the filter.

    Returns
    -------
    torch.Tensor or list of torch.Tensor
        With ``use_last`` the per-sequence results are stacked into a single
        tensor; otherwise they are returned as a list with one entry per kept
        sequence.
    """
    token_layer_embeddings = []
    for seq in tqdm.tqdm(sequences):
        seq_ids = NTDBGPT2_tokenizer.encode(seq, return_tensors='pt')
        # Skip over-long inputs; skipped sequences leave no placeholder, so
        # the output can be shorter than `sequences`.
        if max_tokens is not None and len(seq_ids[0]) > max_tokens:
            continue
        out = NTDBGPT2_lm.generate(
            seq_ids,
            do_sample=True,
            # top_k=0 is understood to disable top-k filtering -- see the
            # transformers generation documentation.
            top_k=0,
            return_dict_in_generate=True,
            forced_eos_token_id=NTDBGPT2_tokenizer.eos_token_id,
            output_hidden_states=True
        )
        token_layer_embeddings.append(
            get_hidden_embeddings(out.hidden_states, is_train, use_last)
        )
    if use_last:
        return torch.stack(token_layer_embeddings)
    return token_layer_embeddings

#### Get Sequence Embeddings of All Layers

In [None]:
def clean_seq(seq):
    """Collapse every run of whitespace in `seq` to one space and trim both ends."""
    # str.split() with no argument never yields empty strings, so the tokens
    # can be joined directly.
    tokens = seq.split()
    return ' '.join(tokens)

In [None]:
# E812.0: embeddings for every token of every kept sequence (use_last=False
# makes get_embeddings return a list with one tensor per sequence).
_e8120_trn_all = get_embeddings(e8120_trn_seq, use_last=False)
_e8120_gen_all = get_embeddings(e8120_gen_seq, use_last=False)
# NOTE(review): these are Python lists of tensors; if sequence lengths differ,
# np.save has to build a ragged array -- confirm this works with the installed NumPy.
np.save("./outputs/e8120_trn_all_em.npy", _e8120_trn_all)
np.save("./outputs/e8120_gen_all_em.npy", _e8120_gen_all)

# Keep only the final token's layer features of each sequence and save them.
_e8120_trn_end = torch.stack([emb[-1, :, :] for emb in _e8120_trn_all])
np.save("./outputs/e8120_trn_end_em.npy", _e8120_trn_end)
_e8120_gen_end = torch.stack([emb[-1, :, :] for emb in _e8120_gen_all])
np.save("./outputs/e8120_gen_end_em.npy", _e8120_gen_end)

# Drop the large intermediates before the next E-code is processed.
del _e8120_trn_all, _e8120_gen_all, _e8120_trn_end, _e8120_gen_end
gc.collect()

In [None]:
# E885.9: embeddings for every token of every kept sequence (use_last=False
# makes get_embeddings return a list with one tensor per sequence).
_e8859_trn_all = get_embeddings(e8859_trn_seq, use_last=False)
_e8859_gen_all = get_embeddings(e8859_gen_seq, use_last=False)
# NOTE(review): these are Python lists of tensors; if sequence lengths differ,
# np.save has to build a ragged array -- confirm this works with the installed NumPy.
np.save("./outputs/e8859_trn_all_em.npy", _e8859_trn_all)
np.save("./outputs/e8859_gen_all_em.npy", _e8859_gen_all)

# Keep only the final token's layer features of each sequence and save them.
_e8859_trn_end = torch.stack([emb[-1, :, :] for emb in _e8859_trn_all])
np.save("./outputs/e8859_trn_end_em.npy", _e8859_trn_end)
_e8859_gen_end = torch.stack([emb[-1, :, :] for emb in _e8859_gen_all])
np.save("./outputs/e8859_gen_end_em.npy", _e8859_gen_end)

# Drop the large intermediates before the next E-code is processed.
del _e8859_trn_all, _e8859_gen_all, _e8859_trn_end, _e8859_gen_end
gc.collect()

In [None]:
# E966.0: embeddings for every token of every kept sequence (use_last=False
# makes get_embeddings return a list with one tensor per sequence).
_e9660_trn_all = get_embeddings(e9660_trn_seq, use_last=False)
_e9660_gen_all = get_embeddings(e9660_gen_seq, use_last=False)
# NOTE(review): these are Python lists of tensors; if sequence lengths differ,
# np.save has to build a ragged array -- confirm this works with the installed NumPy.
np.save("./outputs/e9660_trn_all_em.npy", _e9660_trn_all)
np.save("./outputs/e9660_gen_all_em.npy", _e9660_gen_all)

# Keep only the final token's layer features of each sequence and save them.
_e9660_trn_end = torch.stack([emb[-1, :, :] for emb in _e9660_trn_all])
np.save("./outputs/e9660_trn_end_em.npy", _e9660_trn_end)
_e9660_gen_end = torch.stack([emb[-1, :, :] for emb in _e9660_gen_all])
np.save("./outputs/e9660_gen_end_em.npy", _e9660_gen_end)

# Drop the large intermediates before the next E-code is processed.
del _e9660_trn_all, _e9660_gen_all, _e9660_trn_end, _e9660_gen_end
gc.collect()

In [None]:
# E965.4: embeddings for every token of every kept sequence (use_last=False
# makes get_embeddings return a list with one tensor per sequence).
_e9654_trn_all = get_embeddings(e9654_trn_seq, use_last=False)
_e9654_gen_all = get_embeddings(e9654_gen_seq, use_last=False)
# NOTE(review): these are Python lists of tensors; if sequence lengths differ,
# np.save has to build a ragged array -- confirm this works with the installed NumPy.
np.save("./outputs/e9654_trn_all_em.npy", _e9654_trn_all)
np.save("./outputs/e9654_gen_all_em.npy", _e9654_gen_all)

# Keep only the final token's layer features of each sequence and save them.
_e9654_trn_end = torch.stack([emb[-1, :, :] for emb in _e9654_trn_all])
np.save("./outputs/e9654_trn_end_em.npy", _e9654_trn_end)
_e9654_gen_end = torch.stack([emb[-1, :, :] for emb in _e9654_gen_all])
np.save("./outputs/e9654_gen_end_em.npy", _e9654_gen_end)

# Drop the large intermediates before the next E-code is processed.
del _e9654_trn_all, _e9654_gen_all, _e9654_trn_end, _e9654_gen_end
gc.collect()

In [None]:
# E924.0: embeddings for every token of every kept sequence (use_last=False
# makes get_embeddings return a list with one tensor per sequence).
_e9240_trn_all = get_embeddings(e9240_trn_seq, use_last=False)
_e9240_gen_all = get_embeddings(e9240_gen_seq, use_last=False)
# NOTE(review): these are Python lists of tensors; if sequence lengths differ,
# np.save has to build a ragged array -- confirm this works with the installed NumPy.
np.save("./outputs/e9240_trn_all_em.npy", _e9240_trn_all)
np.save("./outputs/e9240_gen_all_em.npy", _e9240_gen_all)

# Keep only the final token's layer features of each sequence and save them.
_e9240_trn_end = torch.stack([emb[-1, :, :] for emb in _e9240_trn_all])
np.save("./outputs/e9240_trn_end_em.npy", _e9240_trn_end)
_e9240_gen_end = torch.stack([emb[-1, :, :] for emb in _e9240_gen_all])
np.save("./outputs/e9240_gen_end_em.npy", _e9240_gen_end)

# Drop the large intermediates once everything has been written.
del _e9240_trn_all, _e9240_gen_all, _e9240_trn_end, _e9240_gen_end
gc.collect()