In [1]:
from transformers import BertForNextSentencePrediction, BertTokenizer
import torch
from torch.optim import AdamW
import random
from tqdm import tqdm

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
with open('All_Quiet_on_the_Western_Front.txt', 'r') as fp:
    text = fp.read().split('\n')

In [13]:
while '' in text:
    text.remove('')

In [16]:
text[:5]

['п»їThis book is to be neither an accusation nor a confession, and least of all an adventure, for death is not an adventure to those who stand face to face with it. It will try simply to tell of a generation of men who, even though they may have escaped its shells, were destroyed by the war.',
 "We are at rest five miles behind the front. Yesterday we were relieved, and now our bellies are full of beef and haricot beans. We are satisfied and at peace. Each man has another mess-tin full for the evening; and, what is more, there is a double ration of sausage and bread. That puts a man in fine trim. We have not had such luck as this for a long time. The cook with his carroty head is begging us to eat; he beckons with his ladle to every one that passes, and spoons him out a great dollop. He does not see how he can empty his stew-pot in time for coffee. Tjaden and MГјller have produced two washbasins and had them filled up to the brim as a reserve. In Tjaden this is voracity, in MГјller it

In [20]:
bag = [item for scentence in text for item in scentence.split('.') if item !='']
bag_size = len(bag)

In [22]:
sentence_a = []
sentence_b = []
label = []

for paragraph in text:
    sentences = [
        sentence for sentence in paragraph.split('.') if sentence != ''
    ]
    num_sentences = len(sentences)
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start+1])
            label.append(0)
        else:
            index = random.randint(0, bag_size-1)
            # this is NotNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(bag[index])
            label.append(1)

In [23]:
for i in range(5):
    print(f"{label[i]}\n{sentence_a[i]}\n{sentence_b[i]}")

0
п»їThis book is to be neither an accusation nor a confession, and least of all an adventure, for death is not an adventure to those who stand face to face with it
 It will try simply to tell of a generation of men who, even though they may have escaped its shells, were destroyed by the war
1
 What's more important still is the issue of a double ration of smokes
 There is a sort of hearth, an iron plate set on some bricks
0
It is true we have no right to this windfall
 The Prussian is not so generous
0
 It was fairly quiet on our sector, so the quartermaster who remained in the rear had requisitioned the usual quantity of rations and provided for the full company of one hundred and fifty men
 But on the last day an astonishing number of English heavies opened up on us with high-explosive, drumming ceaselessly on our position, so that we suffered severely and came back only eighty strong
1
Last night we moved back and settled down to get a good sleep for once: Katczinsky is right when 

In [25]:
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
inputs['labels'] = torch.LongTensor([label]).T

In [26]:
class MeditationsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [27]:
dataset = MeditationsDataset(inputs)

In [28]:
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [29]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)

BertForNextSentencePrediction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [30]:
# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=5e-6)
# set number of epochs
epochs = 2

In [31]:
for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  0%|          | 0/57 [01:12<?, ?it/s]


KeyboardInterrupt: 