In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 15.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 54.0 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 75.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.2 transformers-4.24.0


In [3]:
from transformers import AutoTokenizer, AutoModel, utils
import numpy as np

In [15]:
def load_model_tokenizer(model_path):
  """Load a pretrained model together with its tokenizer.

  Args:
    model_path: Passed unchanged to both ``AutoModel.from_pretrained`` and
      ``AutoTokenizer.from_pretrained``.

  Returns:
    A ``(model, tokenizer)`` tuple. The model is requested with
    ``output_attentions=True``.
  """
  # Tuple elements are evaluated left to right: model first, then tokenizer.
  return (
      AutoModel.from_pretrained(model_path, output_attentions=True),
      AutoTokenizer.from_pretrained(model_path),
  )

In [5]:
def text_tokenization(input_text, model, tokenizer):
  """Encode ``input_text``, run ``model`` on the ids and return its attention.

  Args:
    input_text: Text to encode.
    model: Callable invoked with the encoded ``input_ids`` only. The last
      element of its output is returned as the attention (assumes the model
      emits attention as its final output entry -- TODO confirm for the
      model in use).
    tokenizer: Callable tokenizer, invoked as
      ``tokenizer(input_text, return_tensors='pt')``; the result must
      support ``["input_ids"]``.

  Returns:
    ``(attention, tokenized_inputs, batch_encoding)``: the last model output,
    the ``input_ids`` entry of the encoding, and the full encoding object.
  """
  # Call the tokenizer directly: `encode_plus` is deprecated in favour of
  # `__call__`, which accepts the same arguments here.
  batch_encoding = tokenizer(input_text, return_tensors='pt')
  tokenized_inputs = batch_encoding["input_ids"]
  outputs = model(tokenized_inputs)
  # Attention is read positionally as the last output element.
  attention = outputs[-1]
  return attention, tokenized_inputs, batch_encoding

In [6]:
def calculate_total_attention(attention):
  """Sum attention column-wise over every head of every layer.

  Only the first batch entry (``layer[0]``) of each layer is used. Each head
  is converted with ``.detach().numpy()`` and summed along axis 0; the
  per-head sums are accumulated per layer, and the per-layer sums are
  accumulated into the grand total.

  Args:
    attention: Sequence of per-layer objects indexed as
      ``attention[layer][batch][head]``, where each head exposes ``shape``
      and ``.detach().numpy()`` (presumably torch tensors of shape
      (seq, seq) -- TODO confirm against the caller).

  Returns:
    1-D float NumPy array holding the accumulated sum for each column.
  """
  grand_total = np.zeros(attention[0][0][0].shape[0])
  for layer_attention in attention:
    first_batch = layer_attention[0]
    per_layer = np.zeros(first_batch[0].shape[1])
    for head_tensor in first_batch:
      per_layer += head_tensor.detach().numpy().sum(axis=0)
    grand_total += per_layer
  return grand_total

In [7]:
def filter_tokens(inputs, layer_sums, excluded_ids=None):
  """Drop tokens whose id is in an exclusion list, along with their scores.

  Args:
    inputs: Indexable whose first element exposes ``.detach().numpy()`` and
      yields the 1-D array of token ids.
    layer_sums: NumPy array aligned element-for-element with the token ids.
    excluded_ids: Iterable of token ids to remove. ``None`` (the default)
      uses the original hard-coded list.

  Returns:
    ``(ids, layer_sums, mask1)``: the kept ids, the kept scores, and the
    boolean mask over the original positions (``True`` = kept).
  """
  if excluded_ids is None:
    # Original hard-coded list; presumably the special-token and punctuation
    # ids of one specific vocabulary -- verify against the tokenizer in use.
    excluded_ids = [101, 102, 1010, 1011, 1012, 100, 1005, 1025, 1000]
  ids = inputs[0].detach().numpy()
  # list(...) lets sets/tuples work: np.isin treats a bare set as one object.
  mask1 = ~np.isin(ids, list(excluded_ids))
  return ids[mask1], layer_sums[mask1], mask1


In [8]:
def arbitrary_threshold(layer_sums, ids, threshold = 1.2):
  """Keep entries whose score is strictly above ``threshold`` times the mean.

  Args:
    layer_sums: 1-D array-like of scores.
    ids: NumPy array aligned element-for-element with ``layer_sums``.
    threshold: Multiplier applied to the mean of ``layer_sums``; an entry is
      kept only when its score is strictly greater than ``threshold * mean``.

  Returns:
    ``(ids, layer_sums, mask2)``: the kept ids, the kept scores, and the
    boolean mask over the input positions (``True`` = kept). Empty input
    yields three empty results.
  """
  layer_sums = np.asarray(layer_sums)
  if layer_sums.size == 0:
    # The mean of an empty array is NaN and emits a RuntimeWarning; nothing
    # could pass the cut anyway, so skip the computation.
    mask2 = np.zeros(layer_sums.shape, dtype = bool)
  else:
    mask2 = layer_sums > threshold * np.mean(layer_sums)
  return ids[mask2], layer_sums[mask2], mask2

In [9]:
def get_word_indices(mask1, mask2):
  """Map two successive masks back to positions in the original sequence.

  Args:
    mask1: Mask over the original positions (length sets the index range).
    mask2: Mask over the positions that survive ``mask1``.

  Returns:
    NumPy array of the original positions selected by applying ``mask1``
    and then ``mask2``.
  """
  positions = np.arange(0, len(mask1))
  return positions[mask1][mask2]

In [10]:
def get_corresponding_spans(batch_encoding, indices):
  """Look up the character span of each token index.

  Args:
    batch_encoding: Object exposing ``token_to_chars(index)``; the result is
      read as ``span[0]`` and ``span[1]`` (assumes an indexable pair is
      returned for every index passed in -- TODO confirm for special tokens).
    indices: Iterable of token indices.

  Returns:
    List of ``[start, end]`` lists, one per index, in input order.
  """
  all_spans = []
  for i in indices:
    # One lookup per token (previously two). The plain-int cast keeps the
    # call independent of NumPy-scalar support in the encoding.
    span = batch_encoding.token_to_chars(int(i))
    all_spans.append([span[0], span[1]])
  return all_spans

In [11]:
def spans_to_words(all_spans, input_text):
  """Slice ``input_text`` once per span.

  Args:
    all_spans: Iterable of indexable spans; element 0 is the slice start and
      element 1 the slice end.
    input_text: Sequence to slice.

  Returns:
    List of ``input_text[start:end]`` slices, in span order.
  """
  return [input_text[span[0]:span[1]] for span in all_spans]