# Visualizing BERT Attentions
BertViz from this [repository](https://github.com/jessevig/bertviz)

checked 27.02.2024 GPaass

extend by

In [None]:
!pip install bertviz

In [None]:
# tag: parameters for papermill. View > Cell Toolbar > Tags. Need papermill library
prm = "small"              # small: just use 1 epoch

In [None]:
import sys, os
# insert at 1, 0 is the script path (or '' in REPL)
#sys.path.insert(1, '../bertviz-master')
#os.listdir('../bertviz-master')

from bertviz import head_view
from transformers import BertTokenizer, BertModel

In [None]:
import tensorflow as tf

print(tf.config.list_physical_devices())
import torch
print(torch.cuda.is_available())
! nvidia-smi

In [None]:
# clear GPU memory
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
import transformers
print(transformers.__version__)

## Predicting Masked Words

In [None]:
from transformers import pipeline

Pipelines are made of:
- A tokenizer in charge of mapping raw textual input to tokens.
- A model to make predictions from the inputs.
- Some (optional) post-processing for enhancing the model's output.

The first task argument determines the task:
* `"fill-mask"`: returns a `transformers.FillMaskPipeline`. <br>
    Masked language modeling prediction pipeline. This pipeline only works for inputs with exactly one token masked.

### Predict English Tokens

In [None]:
nlp = pipeline("fill-mask", model="bert-base-uncased", top_k=10)

In [None]:
nlp(f"This is the best thing I've {nlp.tokenizer.mask_token} in my life.")

### What does Pipeline do?
Before the model is trained, a **tokenizer** is estimated from the training data.

Steps for model application.

* tokenize the input to a sequence of integers
* apply the model to the input
* extract the output and decode the predicted tokens

In [None]:
# Masked example sentence; the mask placeholder comes from the pipeline's tokenizer.
inputs = f"This is the best thing I've {nlp.tokenizer.mask_token} in my life."

# Step 1 of the pipeline: turn the text into model inputs.
model_inputs = nlp.tokenizer(inputs, return_tensors=nlp.framework)
print(model_inputs)

# Decode the ids of the first (only) sequence back to text as a sanity check.
nlp.tokenizer.decode(model_inputs["input_ids"][0].numpy())

In [None]:
# Step 2 of the pipeline: run the model on the tokenized sentence.
model_outputs = nlp.model(**model_inputs)
print("predictions for each token")
print(model_outputs, "\n")

# Keep the input ids next to the outputs, then pull out the ids and the logits.
model_outputs["input_ids"] = model_inputs["input_ids"]
input_ids = model_outputs["input_ids"][0]
outputs = model_outputs["logits"]

# Step 3: find the position of the mask token and select its logits.
masked_index = (input_ids == nlp.tokenizer.mask_token_id).nonzero(as_tuple=False).squeeze(-1)
logits = outputs[0, masked_index, :]
print("logits.shape", logits.shape)

# Turn the logits into probabilities and keep the five most probable tokens.
probs = torch.softmax(logits, dim=-1)
values, predictions = torch.topk(probs, 5)
values = values.detach().numpy()[0]
predictions = predictions.detach().numpy()[0]
for score, token_id in zip(values, predictions):
    print(score, "\t", nlp.tokenizer.decode(token_id))

In [None]:
def print_res(res, nlp, max_rows=10):
    """Print score, token id and decoded token for fill-mask predictions.

    Args:
        res: Sequence of prediction dicts, each with a ``'score'`` and a
            ``'token'`` entry (as returned by the fill-mask pipeline calls
            in this notebook).
        nlp: Object whose ``tokenizer.decode`` turns a token id into text.
        max_rows: Maximum number of predictions to print; ``None`` prints all.
    """
    # Slice instead of indexing 0..9 so that fewer than ``max_rows``
    # predictions no longer raise an IndexError.
    for r in res[:max_rows]:
        print("{:6.4f}".format(r['score']), '\t', r['token'], '\t', nlp.tokenizer.decode(r['token']))

In [None]:
print_res(nlp(f"The man went to the {nlp.tokenizer.mask_token} to buy some food."), nlp)

### Predict German Tokens

In [None]:
gnlp = pipeline("fill-mask", model="bert-base-german-cased", top_k=10)

In [None]:
# Use the German pipeline's own mask token rather than the English pipeline's.
print_res(gnlp(f"Dies ist das Beste, was ich je in meinem Leben {gnlp.tokenizer.mask_token} getan habe."), gnlp)

In [None]:
# Use the German pipeline's own mask token rather than the English pipeline's.
print_res(gnlp(f"Ich gehe zur {gnlp.tokenizer.mask_token} , um Geld abzuheben."), gnlp)

In [None]:
# Use the German pipeline's own mask token rather than the English pipeline's.
print_res(gnlp(f"Ich gehe zur {gnlp.tokenizer.mask_token} und setze mich."), gnlp)

## Show Attention Strength
### Show Attention Strength for All Heads
[notebook](https://github.com/jessevig/bertviz/blob/master/notebooks/model_view_bert.ipynb)

In [None]:
from bertviz import model_view
from bertviz.neuron_view import show
from transformers import BertTokenizer, BertModel

In [None]:
def show_model_view(model, tokenizer, sentence_a, sentence_b=None, hide_delimiter_attn=False, display_mode="dark"):
    """Render the BertViz model view for one sentence or a sentence pair.

    Args:
        model: Model called as ``model(input_ids[, token_type_ids=...])``. The
            last element of its output is used as the attention weights, so
            the model is expected to return attentions (the notebook loads it
            with ``output_attentions=True``).
        tokenizer: Tokenizer matching ``model``.
        sentence_a: First input sentence.
        sentence_b: Optional second sentence. If falsy, no ``token_type_ids``
            are passed to the model and no start index for a second sentence
            is given to the view.
        hide_delimiter_attn: If true, zero the attention entries indexed by
            the positions of "[SEP]" and "[CLS]" tokens before rendering.
        display_mode: Passed through to ``model_view``.
    """
    # Calling the tokenizer directly replaces the deprecated ``encode_plus``.
    inputs = tokenizer(sentence_a, text_pair=sentence_b, return_tensors='pt', add_special_tokens=True)
    input_ids = inputs['input_ids']
    # Inference only: no autograd graph is needed for the visualization.
    with torch.no_grad():
        if sentence_b:
            token_type_ids = inputs['token_type_ids']
            attention = model(input_ids, token_type_ids=token_type_ids)[-1]
            # First position whose segment id is 1, i.e. where sentence B starts.
            sentence_b_start = token_type_ids[0].tolist().index(1)
        else:
            attention = model(input_ids)[-1]
            sentence_b_start = None
    input_id_list = input_ids[0].tolist()  # Batch index 0
    tokens = tokenizer.convert_ids_to_tokens(input_id_list)
    if hide_delimiter_attn:
        for i, t in enumerate(tokens):
            if t in ("[SEP]", "[CLS]"):
                for layer_attn in attention:
                    # Blank both index i of the third axis and index i of the
                    # fourth axis for batch element 0 (presumably the "from"
                    # and "to" token axes -- TODO confirm).
                    layer_attn[0, :, i, :] = 0
                    layer_attn[0, :, :, i] = 0
    model_view(attention, tokens, sentence_b_start, display_mode=display_mode)

In [None]:
# Sentence pair to visualize. An alternative pair to try:
#   "I go to the bank to get money" / "I go to the bank of the river"
sentence_a = "The cat sat on the mat"
sentence_b = "The cat lay on the rug"

# Load the uncased English BERT model (returning attentions) and its tokenizer.
model_version, do_lower_case = 'bert-base-uncased', True
model = BertModel.from_pretrained(model_version, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)

show_model_view(
    model,
    tokenizer,
    sentence_a,
    sentence_b,
    hide_delimiter_attn=False,
    display_mode="dark",
)

### Show Attention Strength for a Single Head

In [None]:
def show_head_view(model, tokenizer, sentence_a, sentence_b=None, layer=None, heads=None):
    """Render the BertViz head view for one sentence or a sentence pair.

    Args:
        model: Model called as ``model(input_ids[, token_type_ids=...])``. The
            last element of its output is used as the attention weights, so
            the model is expected to return attentions (the notebook loads it
            with ``output_attentions=True``).
        tokenizer: Tokenizer matching ``model``.
        sentence_a: First input sentence.
        sentence_b: Optional second sentence. If falsy, no ``token_type_ids``
            are passed to the model and no start index for a second sentence
            is given to the view.
        layer: Passed through to ``head_view``.
        heads: Passed through to ``head_view``.
    """
    # Calling the tokenizer directly replaces the deprecated ``encode_plus``.
    inputs = tokenizer(sentence_a, text_pair=sentence_b, return_tensors='pt', add_special_tokens=True)
    input_ids = inputs['input_ids']
    # Inference only: no autograd graph is needed for the visualization.
    with torch.no_grad():
        if sentence_b:
            token_type_ids = inputs['token_type_ids']
            attention = model(input_ids, token_type_ids=token_type_ids)[-1]
            # First position whose segment id is 1, i.e. where sentence B starts.
            sentence_b_start = token_type_ids[0].tolist().index(1)
        else:
            attention = model(input_ids)[-1]
            sentence_b_start = None
    input_id_list = input_ids[0].tolist()  # Batch index 0
    tokens = tokenizer.convert_ids_to_tokens(input_id_list)
    head_view(attention, tokens, sentence_b_start, layer=layer, heads=heads)

In [None]:
# Sentence pair to visualize.
sentence_a = "the rabbit quickly hopped"
sentence_b = "The turtle slowly crawled"

# Load the uncased English BERT model (returning attentions) and its tokenizer.
model_version, do_lower_case = 'bert-base-uncased', True
model = BertModel.from_pretrained(model_version, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)

# Head view with the default layer/head selection.
show_head_view(model, tokenizer, sentence_a, sentence_b)

In [None]:
# Sentence pair to visualize.
sentence_a = "The boy met the girl"
sentence_b = "She looked very pretty"

# Load the uncased English BERT model (returning attentions) and its tokenizer.
model_version, do_lower_case = 'bert-base-uncased', True
model = BertModel.from_pretrained(model_version, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)

show_model_view(
    model,
    tokenizer,
    sentence_a,
    sentence_b,
    hide_delimiter_attn=False,
    display_mode="dark",
)

### Show Attention Strength for German Sentence

In [None]:
# German sentence pair to visualize.
sentence_a = "Der Junge traf das Mädchen"
sentence_b = "Sie sah sehr schön aus"

# Load the cased German BERT model (returning attentions) and its tokenizer;
# lower-casing is switched off for the cased vocabulary.
model_version, do_lower_case = 'bert-base-german-cased', False
model = BertModel.from_pretrained(model_version, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)

show_model_view(
    model,
    tokenizer,
    sentence_a,
    sentence_b,
    hide_delimiter_attn=False,
    display_mode="dark",
)

The next visualization shows the association to different parts of the vectors.

In [None]:
# German sentence pair to visualize.
sentence_a = "Der Junge traf das Mädchen"
sentence_b = "Sie sah sehr schön aus"

# Load the cased German BERT model (returning attentions) and its tokenizer;
# lower-casing is switched off for the cased vocabulary.
model_version, do_lower_case = 'bert-base-german-cased', False
model = BertModel.from_pretrained(model_version, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)

# Head view with the default layer/head selection.
show_head_view(model, tokenizer, sentence_a, sentence_b)