In [23]:
from transformers import BertTokenizer, BertForNextSentencePrediction
import torch
import requests

In [24]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

In [25]:
with open('./data/clean.txt','r') as fp:
    text = fp.read().split('\n')

In [26]:
text[:3]

['From my grandfather Verus I learned good morals and the government of my temper.',
 'From the reputation and remembrance of my father, modesty and a manly character.',
 'From my mother, piety and beneficence, and abstinence, not only from evil deeds, but even from evil thoughts; and further, simplicity in my way of living, far removed from the habits of the rich.']

In [27]:
bag = [sentence for para in text for sentence in para.split('.') if sentence !='']

In [28]:
bag_size = len(bag)

In [29]:
bag_size

1372

In [31]:
import random

sentence_a = []
sentence_b = []
label =[]
for paragraph in text:
    sentences =[
        sentence for sentence in paragraph.split('.') if sentence !=''
    ]
    num_sentences = len(sentences)
    if num_sentences >1:
        start = random.randint(0,num_sentences-2)
        sentence_a.append(sentences[start])
        if random.random() > 0.5:
            sentence_b.append(bag[random.randint(0,bag_size-1)])
            label.append(1)
        else:
            sentence_b.append(sentences[start+1])
            label.append(0)

In [35]:
for i in range(3):
    print(label[i])
    print(sentence_a[i] + '\n---')
    print(sentence_b[i] + '\n---')

0
 He was accustomed to do acts of beneficence, and was ready to forgive, and was free from all falsehood; and he presented the appearance of a man who could not be diverted from right rather than of a man who had been improved
---
 I observed, too, that no man could ever think that he was despised by Maximus, or ever venture to think himself a better man
---
1
 There was in him nothing harsh, nor implacable, nor violent, nor, as one may say, anything carried to the sweating point; but he examined all things severally, as if he had abundance of time, and without confusion, in an orderly way, vigorously and consistently
---
About fame: Look at the minds of those who seek fame, observe what they are, and what kind of things they avoid, and what kind of things they pursue
---
1
To the gods I am indebted for having good grandfathers, good parents, a good sister, good teachers, good associates, good kinsmen and friends, nearly everything good
---
 But if usage has especially fixed these ter

In [36]:
inputs = tokenizer(sentence_a,sentence_b,return_tensors='pt',max_length=512,truncation=True,padding='max_length')

In [37]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [42]:
inputs

{'input_ids': tensor([[  101,  2002,  2001,  ...,     0,     0,     0],
        [  101,  2045,  2001,  ...,     0,     0,     0],
        [  101,  2000,  1996,  ...,     0,     0,     0],
        ...,
        [  101,  3459,  2185,  ...,     0,     0,     0],
        [  101,  2043, 15223,  ...,     0,     0,     0],
        [  101,  7887,  3288,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [46]:
inputs['labels'] = torch.LongTensor(label).unsqueeze(0).T

In [49]:
class MeditationDataset(torch.utils.data.Dataset):
    def __init__(self,encodings):
        self.encodings = encodings
        
    def __getitem__(self,idx):
        return{key: tensor[idx] for key,tensor in self.encodings.items()}
    def __len__(self):
        return self.encodings.input_ids.shape[0]

In [50]:
dataset = MeditationDataset(inputs)

In [51]:
dataloader = torch.utils.data.DataLoader(dataset,batch_size=2,shuffle=True)

In [52]:
from transformers import AdamW
optim = AdamW(model.parameters(),lr=1e-5)



In [54]:
from tqdm import tqdm

epochs =2 
for epoch in range(epochs):
    loop = tqdm(dataloader,leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        
        outputs = model(input_ids,attention_mask = attention_mask,labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()
        
        loop.set_description(f'epoch {epoch}')
        loop.set_postfix(loss=loss.item())

epoch 0: 100%|██████████| 159/159 [32:15<00:00, 12.17s/it, loss=1.06] 
epoch 1: 100%|██████████| 159/159 [33:01<00:00, 12.46s/it, loss=0.845] 
