# Training BERT: Masked Language Modeling




### Objective: The objective is to learn contextual representations for each token in the sentence. By predicting the masked tokens, the model learns to understand the relationships between words and their context.

In [None]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

In [None]:
# Load the pretrained tokenizer and the BERT model with its masked-LM head.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Adjacent string literals are concatenated with no separator, so every piece
# except the last must end with an explicit space; otherwise words at the
# seams fuse together (e.g. "justover" instead of "just over").
text = ("After Abraham Lincoln won the November 1860 presidential [MASK] on an anti-slavery platform, "
        "an initial seven slave states declared their secession from the country to form the Confederacy. "
        "War broke out in April 1861 when secessionist forces [MASK] Fort Sumter in South Carolina, just "
        "over a month after Lincoln’s inauguration.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identica

# 1. Tokenization — Tokenization is simple: we’ve already initialized a BertTokenizer, so all we need to do now is tokenize our input text.



In [None]:
# Encode the text as PyTorch tensors; the trailing expression displays the
# names of the fields the tokenizer returned.
inputs = tokenizer(text, return_tensors="pt")
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
# Display the token ids produced by the tokenizer.
inputs["input_ids"]

tensor([[  101,  2044,  8181,  5367,  2180,  1996,  2281,  7313,  4883,   103,
          2006,  2019,  3424,  1011,  8864,  4132,  1010,  2019,  3988,  2698,
          6658,  2163,  4161,  2037, 22965,  2013,  1996,  2406,  2000,  2433,
          1996, 18179,  1012,  2162,  3631,  2041,  1999,  2258,  6863,  2043,
         22965,  2923,  2749,   103,  3481,  7680,  3334,  1999,  2148,  3792,
          1010,  2074,  7840,  1037,  3204,  2044,  5367,  1521,  1055, 17331,
          1012,   102]])

# 2. Create labels — The next step is easy: all we need to do here is clone our input_ids tensor into a new labels tensor. We’ll store this within the inputs variable too.



In [None]:
# Keep an independent copy of the token ids as the training targets, so that
# later in-place edits to input_ids do not change the labels.
inputs["labels"] = inputs["input_ids"].clone().detach()

In [None]:
# Display the full encoding, which now also carries the 'labels' entry.
inputs

{'input_ids': tensor([[  101,  2044,  8181,  5367,  2180,  1996,  2281,  7313,  4883,   103,
          2006,  2019,  3424,  1011,  8864,  4132,  1010,  2019,  3988,  2698,
          6658,  2163,  4161,  2037, 22965,  2013,  1996,  2406,  2000,  2433,
          1996, 18179,  1012,  2162,  3631,  2041,  1999,  2258,  6863,  2043,
         22965,  2923,  2749,   103,  3481,  7680,  3334,  1999,  2148,  3792,
          1010,  2074,  7840,  1037,  3204,  2044,  5367,  1521,  1055, 17331,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([

# 3. Masking — Now we need to mask a random selection of tokens in our input_ids tensor.

In [None]:
# Draw one random float per token position (same shape as input_ids); the
# trailing expression displays that shape.
rand = torch.rand(inputs["input_ids"].size())
rand.shape

torch.Size([1, 62])

In [None]:
# Display the random values drawn for each token position.
rand

tensor([[0.9800, 0.4654, 0.8416, 0.5915, 0.4076, 0.1713, 0.1807, 0.4198, 0.2993,
         0.8467, 0.2840, 0.5585, 0.5689, 0.2452, 0.8187, 0.3732, 0.0650, 0.3584,
         0.4466, 0.6148, 0.1163, 0.6702, 0.9334, 0.8375, 0.0562, 0.3789, 0.8837,
         0.1572, 0.7461, 0.4864, 0.9627, 0.2180, 0.2571, 0.0957, 0.4333, 0.5788,
         0.4881, 0.9576, 0.5462, 0.7327, 0.2004, 0.9011, 0.4318, 0.4640, 0.2967,
         0.5253, 0.7868, 0.3934, 0.5132, 0.9406, 0.6474, 0.5167, 0.5293, 0.0209,
         0.8263, 0.6886, 0.9791, 0.1328, 0.9305, 0.7169, 0.8051, 0.5472]])

In [None]:
# Choose roughly 15% of positions to mask. `torch.rand` samples uniformly from
# [0, 1), so the test must be `rand < 0.15`; `rand > 0.15` would select about
# 85% of the tokens instead. Special tokens ([CLS], [SEP], [PAD]) are never
# masked; their ids come from the tokenizer rather than being hard-coded.
mask_arr = (
    (rand < 0.15)
    & (inputs.input_ids != tokenizer.cls_token_id)
    & (inputs.input_ids != tokenizer.sep_token_id)
    & (inputs.input_ids != tokenizer.pad_token_id)
)
mask_arr

tensor([[False,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True, False,  True,  True,  True,
         False,  True,  True,  True, False,  True,  True,  True,  True,  True,
          True,  True,  True, False,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True, False,  True,  True,  True, False,  True,  True,
          True, False]])

In [None]:
# Indices of the True entries in the first (only) row of the mask, converted
# to a plain Python list so they can be used for indexing.
selection = mask_arr[0].nonzero().flatten().tolist()
selection

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 17,
 18,
 19,
 21,
 22,
 23,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 54,
 55,
 56,
 58,
 59,
 60]

In [None]:
# Overwrite the selected positions in place with the tokenizer's [MASK] id
# (103 for bert-base-uncased) rather than hard-coding the number, so the cell
# keeps working if a different checkpoint/vocabulary is loaded.
inputs.input_ids[0, selection] = tokenizer.mask_token_id
inputs.input_ids

tensor([[  101,   103,   103,   103,   103,   103,   103,   103,   103,   103,
           103,   103,   103,   103,   103,   103,  1010,   103,   103,   103,
          6658,   103,   103,   103, 22965,   103,   103,   103,   103,   103,
           103,   103,   103,  2162,   103,   103,   103,   103,   103,   103,
           103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
           103,   103,   103,  1037,   103,   103,   103,  1521,   103,   103,
           103,   102]])

# 4. Calculate loss — Our final step here is no different from the typical model training process.



In [None]:
# Forward pass: every entry of `inputs` (including 'labels') is passed to the
# model as a keyword argument.
outputs = model(**inputs)

In [None]:
# Display which fields the model returned.
outputs.keys()

odict_keys(['loss', 'logits'])

In [None]:
# Display the loss returned by the forward pass.
outputs.loss

tensor(8.2910, grad_fn=<NllLossBackward0>)