In [1]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
text = ("After Abraham Lincoln won the November 1860 presidential election on an "
"anti-slavery platform, an initial seven slave states declared their "
"secession from the country to form the Confederacy. War broke out in "
"April 1861 when secessionist forces attacked Fort Sumter in South " 
"Carolina, just over a month after Lincoln's inauguration. ")

In [8]:
inputs = tokenizer(text, return_tensors='pt')
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [9]:
inputs.input_ids

tensor([[  101,  2044,  8181,  5367,  2180,  1996,  2281,  7313,  4883,  2602,
          2006,  2019,  3424,  1011,  8864,  4132,  1010,  2019,  3988,  2698,
          6658,  2163,  4161,  2037, 22965,  2013,  1996,  2406,  2000,  2433,
          1996, 18179,  1012,  2162,  3631,  2041,  1999,  2258,  6863,  2043,
         22965,  2923,  2749,  4457,  3481,  7680,  3334,  1999,  2148,  3792,
          1010,  2074,  2058,  1037,  3204,  2044,  5367,  1005,  1055, 17331,
          1012,   102]])

In [10]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [12]:
rand = torch.rand(inputs.input_ids.shape)
rand.shape, inputs.input_ids.shape

(torch.Size([1, 62]), torch.Size([1, 62]))

In [13]:
rand

tensor([[0.8981, 0.6243, 0.2427, 0.8994, 0.0690, 0.5407, 0.0463, 0.6064, 0.6985,
         0.8371, 0.4570, 0.9200, 0.8638, 0.4466, 0.2016, 0.3331, 0.3562, 0.1827,
         0.6555, 0.4130, 0.6602, 0.8679, 0.1477, 0.9939, 0.5743, 0.6230, 0.2482,
         0.0192, 0.1428, 0.8060, 0.0637, 0.0351, 0.9526, 0.0058, 0.0798, 0.5983,
         0.0612, 0.2884, 0.1708, 0.3819, 0.5890, 0.8879, 0.9492, 0.4336, 0.6549,
         0.3554, 0.7457, 0.6070, 0.9591, 0.3312, 0.3053, 0.9616, 0.8504, 0.9749,
         0.3397, 0.9793, 0.7917, 0.6869, 0.8663, 0.8160, 0.8905, 0.6767]])

In [14]:
rand < 0.15

tensor([[False, False, False, False,  True, False,  True, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False,  True, False, False, False, False,  True,  True, False,
          True,  True, False,  True,  True, False,  True, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False]])

In [18]:
# Count the number of True values
(rand < 0.15).sum()

tensor(10)

In [19]:
10/62

0.16129032258064516

In [20]:
masked_arr = (rand < 0.15) * (inputs.input_ids != tokenizer.cls_token_id) * (inputs.input_ids != tokenizer.sep_token_id)

In [21]:
masked_arr

tensor([[False, False, False, False,  True, False,  True, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False,  True, False, False, False, False,  True,  True, False,
          True,  True, False,  True,  True, False,  True, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False]])

In [23]:
selection = torch.flatten(masked_arr[0].nonzero()).tolist()
selection

[4, 6, 22, 27, 28, 30, 31, 33, 34, 36]

In [24]:
# Apply the mask to the input IDs
inputs.input_ids[0, selection] = tokenizer.mask_token_id
inputs.input_ids

tensor([[  101,  2044,  8181,  5367,   103,  1996,   103,  7313,  4883,  2602,
          2006,  2019,  3424,  1011,  8864,  4132,  1010,  2019,  3988,  2698,
          6658,  2163,   103,  2037, 22965,  2013,  1996,   103,   103,  2433,
           103,   103,  1012,   103,   103,  2041,   103,  2258,  6863,  2043,
         22965,  2923,  2749,  4457,  3481,  7680,  3334,  1999,  2148,  3792,
          1010,  2074,  2058,  1037,  3204,  2044,  5367,  1005,  1055, 17331,
          1012,   102]])

In [25]:
outputs = model(**inputs)
outputs.keys()

odict_keys(['loss', 'logits'])

In [26]:
outputs.loss

tensor(0.8126, grad_fn=<NllLossBackward0>)

In [27]:
outputs.logits

tensor([[[ -6.9592,  -6.9119,  -6.9288,  ...,  -6.2163,  -5.9599,  -4.2301],
         [ -8.1888,  -8.1237,  -8.2197,  ...,  -7.9586,  -7.3091,  -6.8609],
         [ -9.1328,  -9.5236,  -8.8170,  ...,  -7.6412,  -7.6413,  -8.4924],
         ...,
         [ -1.8444,  -1.7051,  -1.7062,  ...,  -1.2344,  -0.8263,  -8.2977],
         [-14.2312, -14.0290, -14.2061,  ..., -10.7771, -11.0169,  -8.8218],
         [-13.5270, -13.7070, -13.3891,  ..., -11.5713, -11.2686,  -8.2835]]],
       grad_fn=<ViewBackward0>)

In [28]:
outputs.logits.shape

torch.Size([1, 62, 30522])