In [1]:
import torch
from torch.distributions import Categorical
from transformers import RobertaTokenizer, RobertaForMaskedLM

In [2]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Tokenizer and masked-LM model are loaded from the same pretrained checkpoint.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMaskedLM.from_pretrained('roberta-base')

# Inference only: switch to evaluation mode, then move the model to `device`.
model.eval().to(device)

## Predicting a masked token

In [3]:
# Encode the sentence containing one mask placeholder. reshape(1, -1) forces a
# (batch=1, seq_len) layout regardless of whether encode() already added the
# batch dimension.
tokens_tensor = tokenizer.encode(
    f"I don't {tokenizer.mask_token} you!", return_tensors='pt'
).reshape(1, -1)

# 1-D boolean mask over sequence positions: True where the mask token sits.
# Computed before the move to `device`, so it stays on the CPU.
masked_idx = tokens_tensor[0] == tokenizer.mask_token_id

tokens_tensor = tokens_tensor.to(device)

# Forward pass without gradient tracking; the first element of the model
# output is used as the per-position prediction scores.
with torch.no_grad():
    outputs = model(input_ids=tokens_tensor)
    predictions = outputs[0]

# predictions[0, masked_idx] has one row per masked position. Take the argmax
# along the last (vocabulary) axis rather than over the flattened tensor, so
# the result is always a valid vocabulary id; [0] selects the first mask.
predicted_index = torch.argmax(predictions[0, masked_idx], dim=-1)[0].item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
print(predicted_token)


## Finding the probability of a given token at the mask position

In [4]:
# Softmax over dim 0: `preds` below is squeezed down to a single vector of
# scores, one entry per vocabulary item.
sm = torch.nn.Softmax(dim=0).to(device)

# Keep only the scores at the masked position.
preds = predictions[0, masked_idx].squeeze()

# RoBERTa's byte-level BPE encodes a word differently depending on whether it
# is preceded by a space. The mask sits mid-sentence (after "don't "), so the
# candidate words are encoded with a leading space to look up the vocabulary
# entries that can actually fill that slot.
orig_str = 'banana'
orig_token = tokenizer.encode(f' {orig_str}', add_special_tokens=False)
edit_str = 'like'
edit_token = tokenizer.encode(f' {edit_str}', add_special_tokens=False)

# Convert the raw scores into a probability distribution over the vocabulary.
soft_predictions = sm(preds)

def get_token_prob(_preds, _token_ids):
    """Return the summed value of `_preds` at the given indices as a Python number.

    Args:
        _preds: tensor indexed along its first dimension (here, the
            probability vector for the masked position).
        _token_ids: indices to select, e.g. the list of token ids a word
            was encoded into.

    Returns:
        The sum of the selected entries, converted with ``.item()``.
    """
    selected = _preds[_token_ids]
    total = selected.sum()
    return total.item()

# Report each candidate word next to its probability at the masked position.
for _label, _ids in ((orig_str, orig_token), (edit_str, edit_token)):
    print(_label, get_token_prob(soft_predictions, _ids))

## Calculate the perplexity of the predicted distribution at a token position (here, the mask)

In [5]:
# Categorical.entropy() is computed with the natural logarithm, so the value is
# in nats. Perplexity is therefore exp(entropy); 2 ** entropy would only be
# correct for an entropy measured in bits.
entropy_nats = Categorical(probs=soft_predictions).entropy()
entropy = entropy_nats.item()
perplexity = torch.exp(entropy_nats).item()
print('entropy', entropy)
print('perplexity', perplexity)