## Importing Packages

In [1]:
import torch
from tqdm.auto import tqdm
from transformers import AdamW, BertTokenizer, BertForMaskedLM

## Initializing The Model

In [2]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased", return_dict = True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Importing Text Data

In [3]:
with open("the_fire_flower.txt","r",encoding='utf-8') as f:
    data = f.read().split("\n")

In [4]:
print(len(data))

3948


In [5]:
for line in data:
    if len(line)<50:
        data.remove(line)

## Tokenizing The Text Data

In [6]:
inputs = tokenizer(
    data,
    max_length = 512,
    truncation = True,
    padding = "max_length",
    return_tensors = "pt"
                  )

In [7]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [8]:
inputs['labels'] = inputs['input_ids'].detach().clone()

## Masking The Input Ids

In [9]:
random_tensor = torch.rand(inputs["input_ids"].shape)

In [10]:
random_tensor

tensor([[0.2341, 0.7141, 0.4427,  ..., 0.6825, 0.8144, 0.8477],
        [0.6947, 0.6240, 0.7078,  ..., 0.3451, 0.1804, 0.7094],
        [0.5421, 0.3907, 0.1965,  ..., 0.5516, 0.8670, 0.5303],
        ...,
        [0.5047, 0.6257, 0.2561,  ..., 0.6738, 0.6401, 0.1721],
        [0.7809, 0.7423, 0.9326,  ..., 0.3374, 0.1962, 0.6097],
        [0.6259, 0.6667, 0.2767,  ..., 0.5031, 0.2945, 0.4608]])

In [11]:
# creating a mask tensor of float values ranging from 0 to 1 and avoiding special tokens
masked_tensor = (random_tensor < 0.15)*(inputs['input_ids'] != 101)*(inputs['input_ids'] != 102)*(inputs['input_ids'] != 0)

In [12]:
masked_tensor

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [13]:
nonzero_indices = []
for i in range(len(masked_tensor)):
    nonzero_indices.append(torch.flatten(masked_tensor[i].nonzero()).tolist())

In [14]:
nonzero_indices

[[4, 5],
 [4, 14],
 [8, 13],
 [8, 9, 15],
 [10, 12],
 [2],
 [],
 [1, 3, 9, 10],
 [],
 [5],
 [7],
 [],
 [5, 12],
 [2, 5, 6, 13, 15, 16],
 [1, 2, 8, 12],
 [2, 4],
 [4, 6, 7, 13, 14],
 [3],
 [1, 4],
 [1, 3, 9],
 [1, 4, 10, 12],
 [6, 16],
 [1, 12, 16],
 [2, 8],
 [1, 3, 5, 9, 10, 15],
 [6, 8],
 [6, 15],
 [9, 12, 14],
 [1, 3, 4, 6, 9],
 [11, 12],
 [2],
 [2, 8],
 [10],
 [11],
 [4],
 [10, 12, 13],
 [9, 12, 14, 15, 17, 18, 19, 20, 22],
 [2, 7, 9, 11, 13, 14],
 [1, 5, 6, 10, 15, 16],
 [4, 6, 8],
 [],
 [7, 9, 11, 14],
 [5, 6],
 [3, 6, 9, 15, 16],
 [3, 10],
 [7, 16, 17],
 [5, 9, 10, 12],
 [10, 11],
 [3, 9],
 [2, 3, 6],
 [1, 4, 10, 17],
 [3, 4],
 [6, 9, 11],
 [8, 13, 14],
 [7, 8],
 [5, 6, 11],
 [7, 11],
 [1, 14, 18],
 [4],
 [11],
 [5],
 [],
 [19],
 [3, 4],
 [2],
 [3, 5, 6, 9, 13],
 [2, 8, 9, 12, 18],
 [14],
 [4, 10, 11],
 [12, 13],
 [],
 [1, 11, 16],
 [8, 13],
 [12],
 [12],
 [3, 4],
 [4, 6, 10, 13],
 [1, 8, 9, 13],
 [6, 7, 16],
 [4, 7, 13],
 [1, 9, 12],
 [6],
 [5, 7],
 [4, 7],
 [5, 17],
 [5, 6, 17]

In [15]:
# setting the values at those indices to be a MASK token (103) for every row in the original input_ids.
for i in range(len(inputs["input_ids"])):
    inputs["input_ids"][i,nonzero_indices[i]] = 103

In [16]:
inputs["input_ids"]

tensor([[  101,  1996,  2622,  ...,     0,     0,     0],
        [  101,  2023, 26885,  ...,     0,     0,     0],
        [  101,  2087,  2060,  ...,     0,     0,     0],
        ...,
        [  101,  2164,  2129,  ...,     0,     0,     0],
        [  101,  8756,  3192,  ...,     0,     0,     0],
        [  101,  4942, 29234,  ...,     0,     0,     0]])

## Pytorch Dataset And DataLoader

In [17]:
class BookDataset(torch.utils.data.Dataset):
    def __init__(self,encodings):
        self.encodings = encodings
    def __len__(self):
        return len(self.encodings["input_ids"])
    def __getitem__(self,index):
        input_ids = self.encodings["input_ids"][index]
        labels = self.encodings["labels"][index]
        attention_mask = self.encodings["attention_mask"][index]
        token_type_ids = self.encodings["token_type_ids"][index]
        return {
            'input_ids': input_ids,
            'labels': labels,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids
        }

In [18]:
dataset = BookDataset(inputs)

In [19]:
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size = 16,
    shuffle = True
)

In [20]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [21]:
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

## Model Parameters

In [22]:
epochs = 2
optimizer = AdamW(model.parameters(), lr=1e-5)



## Training The Model

In [None]:
model.train()

for epoch in range(epochs):
    loop = tqdm(dataloader)
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backwards()
        optimizer.step()

        loop.set_description("Epoch: {}".format(epoch))
        loop.set_postfix(loss=loss.item())