In [1]:
from transformers import BertTokenizer, BertForMaskedLM
import torch




In [23]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

text = ("After Abraham lincoln won the November 1840 presendtial election on an "
        "anti-slavery platform, an inital seven slave states declared their "
        "secession from the country to form the Confederacy. War broke out in "
        "April 1861 when secessionist forces attacked Fort Sumter in South "
        "Carolina, just over a month after Lincoln's inauguration.")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
inputs = tokenizer(text,return_tensors='pt')
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [25]:
inputs.input_ids

tensor([[  101,  2044,  8181,  5367,  2180,  1996,  2281,  8905,  3653,  5054,
         11927,  4818,  2602,  2006,  2019,  3424,  1011,  8864,  4132,  1010,
          2019,  1999, 18400,  2698,  6658,  2163,  4161,  2037, 22965,  2013,
          1996,  2406,  2000,  2433,  1996, 18179,  1012,  2162,  3631,  2041,
          1999,  2258,  6863,  2043, 22965,  2923,  2749,  4457,  3481,  7680,
          3334,  1999,  2148,  3792,  1010,  2074,  2058,  1037,  3204,  2044,
          5367,  1005,  1055, 17331,  1012,   102]])

In [26]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [27]:
inputs

{'input_ids': tensor([[  101,  2044,  8181,  5367,  2180,  1996,  2281,  8905,  3653,  5054,
         11927,  4818,  2602,  2006,  2019,  3424,  1011,  8864,  4132,  1010,
          2019,  1999, 18400,  2698,  6658,  2163,  4161,  2037, 22965,  2013,
          1996,  2406,  2000,  2433,  1996, 18179,  1012,  2162,  3631,  2041,
          1999,  2258,  6863,  2043, 22965,  2923,  2749,  4457,  3481,  7680,
          3334,  1999,  2148,  3792,  1010,  2074,  2058,  1037,  3204,  2044,
          5367,  1005,  1055, 17331,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1,

In [28]:
rand = torch.rand(inputs.input_ids.shape)
rand.shape

torch.Size([1, 66])

In [29]:
rand

tensor([[0.9760, 0.5645, 0.0032, 0.4449, 0.8277, 0.5206, 0.8263, 0.4495, 0.0427,
         0.6178, 0.3137, 0.1408, 0.2302, 0.5203, 0.6687, 0.1195, 0.1849, 0.3276,
         0.4345, 0.6191, 0.9331, 0.1485, 0.4683, 0.9611, 0.4113, 0.8342, 0.1580,
         0.9325, 0.8956, 0.1992, 0.0721, 0.5748, 0.9541, 0.9652, 0.7009, 0.8720,
         0.1904, 0.2269, 0.4281, 0.1996, 0.6364, 0.2936, 0.8253, 0.5746, 0.8629,
         0.1866, 0.0626, 0.9037, 0.7765, 0.6995, 0.8253, 0.8831, 0.7801, 0.8572,
         0.5187, 0.8377, 0.5399, 0.1830, 0.3269, 0.1035, 0.5963, 0.2721, 0.2778,
         0.7476, 0.7700, 0.1508]])

In [31]:
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * (inputs.input_ids != 102) 
mask_arr

tensor([[False, False,  True, False, False, False, False, False,  True, False,
         False,  True, False, False, False,  True, False, False, False, False,
         False,  True, False, False, False, False, False, False, False, False,
          True, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False,  True, False, False, False,
         False, False, False, False, False, False, False, False, False,  True,
         False, False, False, False, False, False]])

In [36]:
selection = torch.flatten(mask_arr[0].nonzero()).tolist()
selection

[2, 8, 11, 15, 21, 30, 46, 59]

In [41]:
inputs.input_ids[0,5]

tensor(1996)

In [37]:
inputs.input_ids[0,selection] = 103
inputs.input_ids

tensor([[  101,  2044,   103,  5367,  2180,  1996,  2281,  8905,   103,  5054,
         11927,   103,  2602,  2006,  2019,   103,  1011,  8864,  4132,  1010,
          2019,   103, 18400,  2698,  6658,  2163,  4161,  2037, 22965,  2013,
           103,  2406,  2000,  2433,  1996, 18179,  1012,  2162,  3631,  2041,
          1999,  2258,  6863,  2043, 22965,  2923,   103,  4457,  3481,  7680,
          3334,  1999,  2148,  3792,  1010,  2074,  2058,  1037,  3204,   103,
          5367,  1005,  1055, 17331,  1012,   102]])

In [42]:
outputs = model(**inputs)

In [43]:
outputs.keys()

odict_keys(['loss', 'logits'])

In [44]:
outputs.loss

tensor(0.9248, grad_fn=<NllLossBackward0>)