In [7]:
#! pip install transformers

In [8]:
import torch
from transformers import BertTokenizer, BertForPreTraining

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased')

In [10]:
#splitting has been done based on Paragraph
with open('data_for_pretraining.txt', 'r') as fp:
    text = fp.read().split('\n')

In [11]:
text[14]

'From Maximus I learned self-government, and not to be led aside by anything; and cheerfulness in all circumstances, as well as in illness; and a just admixture in the moral character of sweetness and dignity, and to do what was set before me without complaining. I observed that everybody believed that he thought as he spoke, and that in all that he did he never had any bad intention; and he never showed amazement and surprise, and was never in a hurry, and never put off doing a thing, nor was perplexed nor dejected, nor did he ever laugh to disguise his vexation, nor, on the other hand, was he ever passionate or suspicious. He was accustomed to do acts of beneficence, and was ready to forgive, and was free from all falsehood; and he presented the appearance of a man who could not be diverted from right rather than of a man who had been improved. I observed, too, that no man could ever think that he was despised by Maximus, or ever venture to think himself a better man. He had also the

# Preparing for NSP

In [12]:
bag = [item for sentence in text for item in sentence.split('.') if item != '']
bag_size = len(bag)

In [13]:
bag_size

1386

In [14]:
bag[0]

'From my grandfather Verus I learned good morals and the government of my temper'

In [15]:
import random

sentence_a = []
sentence_b = []
label = []

# for each paragraph in text we will create Sentence A & Sentence B
for paragraph in text:
    sentences = [sentence for sentence in paragraph.split('.') if sentence != '']

    num_sentences = len(sentences)
    
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        
        # this is IsNextSentence
        if random.random() >= 0.5:    
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start+1])
            label.append(0)
        
        # this is NotNextSentence
        else:
            index = random.randint(0, bag_size-1)
            
            sentence_a.append(sentences[start])
            sentence_b.append(bag[index])
            label.append(1)

In [16]:
# Check
for i in range(3):
    print(label[i])
    print('Sentence A ==>', sentence_a[i])
    print('Sentence B ==>', sentence_b[i])
    print('='*100)

1
Sentence A ==>  I observed, too, that no man could ever think that he was despised by Maximus, or ever venture to think himself a better man
Sentence B ==> - But this very thing is necessary, the observation of what a man is doing: for, it may be said, it is characteristic of the social animal to perceive that he is working in a social manner, and indeed to wish that his social partner also should perceive it
1
Sentence A ==>  He took a reasonable care of his body's health, not as one who was greatly attached to life, nor out of regard to personal appearance, nor yet in a careless way, but so that, through his own attention, he very seldom stood in need of the physician's art or of medicine or external applications
Sentence B ==>  It is thy duty then in the midst of such things to show good humour and not a proud air; to understand however that every man is worth just so much as the things are worth about which he busies himself
0
Sentence A ==>  Further, I owe it to the gods that I 

# BERT Tokenization for NSP

In [17]:
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt',
                   max_length=512, truncation=True, padding='max_length')

In [18]:
inputs

{'input_ids': tensor([[  101,  1045,  5159,  ...,     0,     0,     0],
        [  101,  2002,  2165,  ...,     0,     0,     0],
        [  101,  2582,  1010,  ...,     0,     0,     0],
        ...,
        [  101,  3459,  2185,  ...,     0,     0,     0],
        [  101,  2043, 15223,  ...,     0,     0,     0],
        [  101,  7887,  3288,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [19]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

# NSP Labels

In [20]:
inputs['next_sentence_label'] = torch.LongTensor([label]).T

In [21]:
inputs.next_sentence_label[:10]

tensor([[1],
        [1],
        [0],
        [0],
        [1],
        [1],
        [1],
        [0],
        [1],
        [1]])

# Masked LM

In [22]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [23]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [24]:
len(inputs['input_ids'])

317

In [25]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)

In [26]:
rand

tensor([[0.5397, 0.9912, 0.4956,  ..., 0.0365, 0.0505, 0.5613],
        [0.3651, 0.6568, 0.3211,  ..., 0.4971, 0.3506, 0.0239],
        [0.2462, 0.8782, 0.1243,  ..., 0.6907, 0.4809, 0.7539],
        ...,
        [0.3403, 0.4099, 0.9376,  ..., 0.2255, 0.0289, 0.6957],
        [0.2422, 0.1009, 0.9739,  ..., 0.5159, 0.4374, 0.1935],
        [0.9513, 0.7606, 0.0760,  ..., 0.1632, 0.3310, 0.3477]])

In [27]:
# create mask array 15% Masking
# [CLS] token is [101]
# [SEP] token is [102]
# [PAD] token is [0]
#we are ensuring that we don’t mask any special tokens — such as CLS (101), SEP (102), and PAD (0) tokens.

mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [28]:
mask_arr

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False,  True, False,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False]])

In [29]:
mask_arr.shape

torch.Size([317, 512])

In [30]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(torch.flatten(mask_arr[i].nonzero()).tolist())

In [31]:
#Masked indexes for sentence
selection[0]

[8, 10, 16, 38, 40, 46, 48, 58, 60, 62, 64, 69]

In [32]:
#Now replacing with masked token number which is 103
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

In [33]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [34]:
inputs.input_ids

tensor([[  101,  1045,  5159,  ...,     0,     0,     0],
        [  101,  2002,  2165,  ...,     0,     0,     0],
        [  101,  2582,   103,  ...,     0,     0,     0],
        ...,
        [  101,  3459,  2185,  ...,     0,     0,     0],
        [  101,   103, 15223,  ...,     0,     0,     0],
        [  101,  7887,   103,  ...,     0,     0,     0]])

# Pytorch Data Loader 

In [35]:
class AnyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [36]:
#initializing our dataset using the above CLASS

dataset = AnyDataset(inputs)

In [37]:
# Initialize the dataloader, which we'll be using to load our data into the model during training.
# The dataloader expects the __len__ method for checking the total number of samples within our dataset, 
# and the __getitem__ method for extracting samples.

loader = torch.utils.data.DataLoader(dataset, batch_size = 8, shuffle = True)

# Setup for Training

In [38]:
# GPU check

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [39]:
# moving model to the device

model.to(device)

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [40]:
# activate the training mode of our model

model.train()

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [41]:
# initialzing Optimizer (ADAM)

from transformers import AdamW

optim = AdamW(model.parameters(), lr = 5e-5)



In [42]:
torch.cuda.empty_cache()

In [43]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [44]:
from tqdm import tqdm  # for our progress bar

epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)

        # process through the model
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        next_sentence_label=next_sentence_label,
                        labels=labels)
        
        # extract loss
        loss = outputs.loss
        
        # calculate loss for every parameter that needs grad update
        loss.backward()
        
        # update parameters
        optim.step()
        
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 40/40 [35:39<00:00, 53.48s/it, loss=1.01]
Epoch 1: 100%|██████████| 40/40 [35:07<00:00, 52.68s/it, loss=0.361]


# We’ve fine-tuned our model using both MLM and NSP!