Given a multiple-choice question, we seek to determine which words are most important for the (BERT-style) model's answer selection.

Model: RoBERTa adapted for multiple choice (non-causal attention). If you want to specialize the model, you can fine-tune it (e.g., specifically for physics questions) as described [here](https://huggingface.co/docs/transformers/en/tasks/multiple_choice).

In [16]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModel, utils

from captum.attr import visualization as viz
from captum.attr import LayerConductance, LayerIntegratedGradients

# Fix PyTorch's global RNG seed so repeated runs of the notebook are comparable.
torch.manual_seed(42)

# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [17]:
from transformers import RobertaForMultipleChoice, RobertaTokenizer

# Hugging Face Hub checkpoint used for both the tokenizer and the multiple-choice model.
model_name = "LIAMF-USP/roberta-large-finetuned-race"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

# Load the weights and place the model on the selected device in one step.
model = RobertaForMultipleChoice.from_pretrained(model_name).to(device)
model.eval()       # inference mode: disables dropout
model.zero_grad()  # start from clean gradients before any attribution run

In [18]:
def predict(inputs, token_type_ids=None, position_ids=None, attention_mask=None):
    """Run the global multiple-choice ``model`` and return its raw logits.

    Args:
        inputs: Token ids passed to the model as its first positional argument.
        token_type_ids: Optional token type ids, forwarded unchanged.
        position_ids: Optional position ids, forwarded unchanged.
        attention_mask: Optional attention mask, forwarded unchanged.

    Returns:
        The ``logits`` attribute of the model output.
    """
    forward_kwargs = dict(
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        attention_mask=attention_mask,
    )
    return model(inputs, **forward_kwargs).logits

def squad_pos_forward_func(inputs, token_type_ids=None, position_ids=None, attention_mask=None):
    """Return the largest logit along dimension 1 for each row of ``predict``'s output.

    Args:
        inputs: Token ids forwarded to ``predict``.
        token_type_ids: Optional token type ids forwarded to ``predict``.
        position_ids: Optional position ids forwarded to ``predict``.
        attention_mask: Optional attention mask forwarded to ``predict``.

    Returns:
        A tensor holding the maximum logit value over dimension 1.
    """
    scores = predict(
        inputs,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        attention_mask=attention_mask,
    )
    # ``max`` over a dimension yields (values, indices); only the values are needed.
    best_scores, _ = scores.max(dim=1)
    return best_scores

In [19]:
# Question, candidate answers, and the expected answer for the first example.
question = ["Two metal balls are the same size but one weighs twice as much as the other. The balls \
are dropped from the roof of a single story building at the same instant of time. The time it takes \
the balls to reach the ground below will be:"]

choices = ["About half as long for the heavier ball as for the lighter one",
           "About half as long for the lighter ball as for the heavier one",
           "About the same for both balls",
           "Considerably less for the heavier ball, but not necessarily half as long",
           "Considerably less for the lighter ball, but not necessarily half as long"]

ground_truth = "About the same for both balls"

# Tokenizing: [CLS] question [SEP] [SEP] choice [SEP] for each choice
encoding = tokenizer(
    question * len(choices),  # Repeat context for each choice
    choices,
    padding=True,
    truncation=True,
    return_tensors="pt"
)

# Layout used by the attribution cells below: (num_choices, 1, seq_length).
# The model treats this as num_choices examples with a single choice each, so
# the logits come back with shape (num_choices, 1) -- one score per choice.
# Tensors are moved to `device` so the cell also works when the model is on GPU.
inputs = encoding["input_ids"].unsqueeze(1).to(device)

# Attention mask to ignore padding tokens (same layout and device as `inputs`).
attention_mask = encoding["attention_mask"].unsqueeze(1).to(device)

# Pure inference: no autograd graph is needed to pick the predicted answer.
with torch.no_grad():
    logits = predict(inputs, attention_mask=attention_mask)

# One logit per row, so the arg-max over dim 0 is the index of the chosen answer.
choice_idx = torch.argmax(logits, dim=0).item()
choice = choices[choice_idx]

print('Question:', question[0])
print('Predicted Answer:', choice)
print('Ground Truth:', ground_truth)

print('Logits:', logits)  # To gauge conviction strength

Question: Two metal balls are the same size but one weighs twice as much as the other. The balls are dropped from the roof of a single story building at the same instant of time. The time it takes the balls to reach the ground below will be:
Predicted Answer: Considerably less for the heavier ball, but not necessarily half as long
Ground Truth: About the same for both balls
Logits: tensor([[-0.7417],
        [ 1.9860],
        [-3.9501],
        [ 5.7018],
        [ 5.5120]], grad_fn=<ViewBackward0>)


In [20]:
# Helper
def print_tokens(input_ids):
    """Print the tokenizer's token strings for the given token ids."""
    print("Tokens:", tokenizer.convert_ids_to_tokens(input_ids))

cls_token_id = tokenizer.cls_token_id  # Start
ref_token_id = tokenizer.pad_token_id  # Padding (also the baseline token for attribution)
sep_token_id = tokenizer.sep_token_id  # Separator


input_ids = inputs  # question + choice ids, shape (num_choices, 1, seq_length)
seq_length = input_ids.size(-1)  # the token axis is the LAST one in this layout

# Reference tokens: [CLS] [PAD] ... [PAD] [SEP] [SEP] [PAD] ... [PAD] [SEP] [PAD] ... for each choice
# Built vectorised so the baseline inherits dtype and device from `input_ids`:
# start from all-padding, restore separators where the input has them, and put
# the start token in the first position of every sequence.
ref_input_ids = torch.full_like(input_ids, ref_token_id)
ref_input_ids[input_ids == sep_token_id] = sep_token_id
ref_input_ids[..., 0] = cls_token_id

token_type_ids = None  # No token type ids for RoBERTa

# Position ids, following RoBERTa's convention: real tokens are numbered
# padding_idx + 1, padding_idx + 2, ... and padding tokens get padding_idx.
# (A plain arange over `input_ids.size(1)` would read the singleton axis and
# expand to all zeros.) They are computed from the real input and passed
# explicitly so that input and baseline share the same positions and the
# attribution reflects token identity only.
padding_idx = model.config.pad_token_id
not_pad = input_ids.ne(padding_idx).long()
position_ids = torch.cumsum(not_pad, dim=-1) * not_pad + padding_idx

In [21]:
print(input_ids.shape, ref_input_ids.shape, position_ids.shape, attention_mask.shape)

# Integrated gradients with respect to the output of the embedding layer.
lig = LayerIntegratedGradients(predict, model.roberta.embeddings)

# With the (num_choices, 1, seq_length) layout, `predict` returns logits of
# shape (N, 1): each row holds the single score of one (question, choice) pair.
# The only valid output column is therefore 0 (indexing column `choice_idx`
# would be out of range). Row i of the result attributes choice i's own logit
# to its tokens, so `attributions_start[choice_idx]` explains the predicted answer.
#
# `internal_batch_size` splits the n_steps * num_choices interpolated inputs
# into chunks of num_choices sequences instead of one huge forward/backward
# pass, which keeps memory bounded for a large model.
attributions_start, delta_start = lig.attribute(inputs=input_ids,
                                  baselines=ref_input_ids,
                                  additional_forward_args=(token_type_ids, position_ids, attention_mask),
                                  target=0,
                                  internal_batch_size=input_ids.size(0),
                                  return_convergence_delta=True)

torch.Size([5, 1, 68]) torch.Size([5, 1, 68]) torch.Size([5, 1, 68]) torch.Size([5, 1, 68])


KeyboardInterrupt: 

In [8]:
# Second example: the car is still accelerating while pushing the truck.
question = ["A large truck breaks down out on the road and receives a push back into town by a small compact car. \
While the car, still pushing the truck, is speeding up to get up to cruising speed:"]
choices = [
"The amount of force with which the car pushes on the truck is equal to that with which the truck pushes back on the car.",
"The amount of force with which the car pushes on the truck is smaller than that with which the truck pushes back on the car.",
"The amount of force with which the car pushes on the truck is greater than that with which the truck pushes back on the car.",
"The car's engine is running so the car pushes against the truck, but the truck's engine is not running so the truck cannot \
push back against the car. The truck is pushed forward simply because it is in the way of the car.",
"Neither the car nor the truck exert any force on the other. The truck is pushed forward simply because it is in the way of the car."]

encoding = tokenizer(
    question * len(choices),  # Repeat context for each choice
    choices,
    padding=True,
    truncation=True,
    return_tensors="pt"
)

# Add batch dimension and move to the model's device so the cell also runs on GPU.
input_ids = encoding["input_ids"].unsqueeze(0).to(device)  # shape: (1, num_choices, seq_length)
attention_mask = encoding["attention_mask"].unsqueeze(0).to(device)

# Inference only: skip building the autograd graph.
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
logits = outputs.logits  # shape: (batch_size, num_choices)

# Predicted answer
predicted_choice = torch.argmax(logits, dim=1).item()
print("Predicted:", choices[predicted_choice])

Predicted: Neither the car nor the truck exert any force on the other. The truck is pushed forward simply because it is in the way of the car.


In [None]:
# Third example: same scenario, but the car has reached a constant cruising speed.
question = ["A large truck breaks down out on the road and receives a push back into town by a small compact car. \
After the car reaches the constant cruising speed at which its driver wishes to push the truck:"]

choices = [
"The amount of force with which the car pushes on the truck is equal to that with which the truck pushes back on the car.",
"The amount of force with which the car pushes on the truck is smaller than that with which the truck pushes back on the car.",
"The amount of force with which the car pushes on the truck is greater than that with which the truck pushes back on the car.",
"The car's engine is running so the car pushes against the truck, but the truck's engine is not running so the truck cannot \
push back against the car. The truck is pushed forward simply because it is in the way of the car.",
"Neither the car nor the truck exert any force on the other. The truck is pushed forward simply because it is in the way of the car."
]

encoding = tokenizer(
    question * len(choices),  # Repeat context for each choice
    choices,
    padding=True,
    truncation=True,
    return_tensors="pt"
)

# Add batch dimension and move to the model's device so the cell also runs on GPU.
input_ids = encoding["input_ids"].unsqueeze(0).to(device)  # shape: (1, num_choices, seq_length)
attention_mask = encoding["attention_mask"].unsqueeze(0).to(device)

# Inference only: skip building the autograd graph.
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
logits = outputs.logits  # shape: (batch_size, num_choices)

# Predicted answer
predicted_choice = torch.argmax(logits, dim=1).item()
print("Predicted:", choices[predicted_choice])

Predicted: The amount of force with which the car pushes on the truck is smaller than that with which the truck pushes back on the car.
