In [1]:
import torch
from transformers import BertTokenizer, BertForPreTraining

In [44]:
tokenizer = BertTokenizer.from_pretrained('/data/wjb/pretrained_weight/bert-base-japanese')
model = BertForPreTraining.from_pretrained('/data/wjb/pretrained_weight/bert-base-japanese')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'BertTokenizer'.
Some weights of BertForPreTraining were not initialized from the model checkpoint at /data/wjb/pretrained_weight/bert-base-japanese and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [55]:
with open('/data/scarlett/Japan/train','r') as fp:
    text = fp.read().split('\n')

### NSP

In [61]:
import random

sentence_a = []
sentence_b = []
label = []

for paragraph in text:
    sentences = [
        sentence for sentence in paragraph.split('。') if sentence != ''
    ]
    num_sentences = len(sentences)
    if num_sentences > 1:
        for i in range(10):
            start = random.randint(0, num_sentences-2)
            # 50/50 whether is IsNextSentence or NotNextSentence
            if random.random() >= 0.5:
                # this is IsNextSentence
                sentence_a.append(sentences[start])
                sentence_b.append(sentences[start+1])
                label.append(0)
            else:
                index_neg_text = random.randint(0, len(text)-1)
                neg_text = text[index_neg_text]
                neg_sentences = neg_text.split('。')
                index_neg_sentence = random.randint(0, len(neg_sentences)-1)
                # this is NotNextSentence
                sentence_a.append(sentences[start])
                sentence_b.append(neg_sentences[index_neg_sentence])
                label.append(1)

In [62]:
len(label)

99270

In [6]:
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt',
                   max_length=512, truncation=True, padding='max_length')

In [7]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [8]:
inputs['next_sentence_label'] = torch.LongTensor([label]).T

In [9]:
inputs['labels'] = inputs.input_ids.detach().clone()

### MLM

In [47]:
# create random array of floats with equal dimensions to input_ids tensor mask=1, 
rand = torch.rand(inputs.input_ids.shape)
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 2) * \
           (inputs.input_ids != 3) * (inputs.input_ids != 0)

In [53]:
print(tokenizer.convert_ids_to_tokens(4))

[MASK]


In [12]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  """


In [51]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 4

In [52]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

### dataset

In [15]:
class MeditationsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [17]:
dataset = MeditationsDataset(inputs)

In [35]:
loader_train = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)
loader_test = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)de

### set device

In [4]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
#model.to(device)

NameError: name 'torch' is not defined

### set optimizer

In [37]:
from transformers import AdamW
# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=5e-5)

In [38]:
len(loader_train)

3

In [41]:
from tqdm.notebook import tqdm  # for our progress bar

epochs = 10

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    #loop = tqdm(loader, leave=True)
    #for batch in loop:
    for i,batch in enumerate(loader_train):
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        next_sentence_label=next_sentence_label,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        print('train_loss:', loss)
        if i%3==0:
            loss_all_eval = 0
            for i_eval,batch_eval in enumerate(loader_test):
                input_ids_eval = batch_eval['input_ids'].to(device)
                token_type_ids_eval = batch_eval['token_type_ids'].to(device)
                attention_mask_eval = batch_eval['attention_mask'].to(device)
                next_sentence_label_eval = batch_eval['next_sentence_label'].to(device)
                labels_eval = batch_eval['labels'].to(device)
                outputs_eval = model(input_ids_eval, 
                                     attention_mask = attention_mask_eval,
                                     token_type_ids = token_type_ids_eval,
                                     next_sentence_label = next_sentence_label_eval,
                                     labels = labels_eval)
                loss_eval = outputs_eval.loss
                loss_all_eval += loss_eval
            print('eval_loss:', loss_all_eval/(i_eval+1))
                
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        #loop.set_description(f'Epoch {epoch}')
        #loop.set_postfix(loss=loss.item())

  """


train_loss: tensor(0.0003, device='cuda:0', grad_fn=<AddBackward0>)
eval_loss: tensor(0.0003, device='cuda:0', grad_fn=<DivBackward0>)
train_loss: tensor(0.0003, device='cuda:0', grad_fn=<AddBackward0>)
train_loss: tensor(0.0003, device='cuda:0', grad_fn=<AddBackward0>)
train_loss: tensor(0.0003, device='cuda:0', grad_fn=<AddBackward0>)
eval_loss: tensor(0.0003, device='cuda:0', grad_fn=<DivBackward0>)
train_loss: tensor(0.0003, device='cuda:0', grad_fn=<AddBackward0>)
train_loss: tensor(0.0002, device='cuda:0', grad_fn=<AddBackward0>)
train_loss: tensor(0.0003, device='cuda:0', grad_fn=<AddBackward0>)
eval_loss: tensor(0.0002, device='cuda:0', grad_fn=<DivBackward0>)
train_loss: tensor(0.0002, device='cuda:0', grad_fn=<AddBackward0>)
train_loss: tensor(0.0002, device='cuda:0', grad_fn=<AddBackward0>)
train_loss: tensor(0.0002, device='cuda:0', grad_fn=<AddBackward0>)
eval_loss: tensor(0.0002, device='cuda:0', grad_fn=<DivBackward0>)
train_loss: tensor(0.0002, device='cuda:0', grad_fn=

In [3]:
epoch=7
str(epoch)

'7'