In [1]:
# Code based on HuggingFace examples

In [2]:
from transformers import RobertaTokenizer, RobertaForMaskedLM, pipeline
import torch

In [3]:
model = RobertaForMaskedLM.from_pretrained('roberta-base')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

Masked word prediction using RoBERTa

In [None]:
tokenizer("Canberra is the capital of Australia.")["input_ids"]

In [14]:
inputs = tokenizer("Canberra is the capital of Australia.", return_tensors="pt")
outputs = model(**inputs)
print(outputs)

MaskedLMOutput(loss=None, logits=tensor([[[33.3342, -3.7221, 18.9761,  ...,  2.8163,  5.5494, 10.8367],
         [ 3.8586, -3.8070, 15.2940,  ...,  1.0708,  1.6881,  3.2313],
         [ 7.3351, -2.8813,  6.5793,  ...,  1.8068,  3.7640,  3.7938],
         ...,
         [ 4.2476, -3.3607,  9.0899,  ...,  1.9920,  1.5115,  4.0887],
         [19.0980, -4.1622, 20.4981,  ...,  1.3208,  3.8108,  6.7397],
         [12.2155, -3.4433, 31.3971,  ...,  2.2916,  0.1929,  8.6062]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)


In [15]:
inputs = tokenizer("Canberra is the capital of <mask>.", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# retrieve index of <mask>
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
tokenizer.decode(predicted_token_id)

' Australia'

In [16]:
inputs = tokenizer("<mask> is the capital of Australia.", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# retrieve index of <mask>
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
tokenizer.decode(predicted_token_id)

'Victoria'

In [17]:
inputs = tokenizer("Canberra is the <mask> of Australia.", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# retrieve index of <mask>
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
tokenizer.decode(predicted_token_id)

' capital'

Using a fine-tuned model

In [20]:
import torch
from transformers import AutoTokenizer, RobertaForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

1


In [21]:
inputs = tokenizer("You are adorable!", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
print(predicted_class_id)

1


In [22]:
inputs = tokenizer("You suck!", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
print(predicted_class_id)

0
