In [None]:
!unzip masked_validation_dataset.zip

In [37]:
from transformers import BertTokenizer, BertForMaskedLM
from datasets import Dataset
import torch

In [38]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [39]:
ds = Dataset.load_from_disk("masked_training_dataset")

In [40]:
tokenized_lengths = [len(tokenizer.encode(s)) for s in ds['text']]

# Find the maximum length
max_length = max(tokenized_lengths)
max_length

35

In [41]:
inputs = tokenizer(ds['text'], return_tensors='pt', max_length=35, truncation=True, padding='max_length')

In [42]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [43]:
labels = tokenizer(ds["label"], add_special_tokens=False)
labels = [item for sublist in labels['input_ids'] for item in sublist]

In [44]:
for tensor in inputs['labels']:
    for n, token in enumerate(tensor):
        if token.item() == tokenizer.mask_token_id:
            tensor[n] = torch.tensor(labels.pop(0))

In [45]:
tokenizer.decode(inputs['input_ids'][0])

'[CLS] some plant varieties can grow up to [MASK] feet tall. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [46]:
inputs['input_ids'][0]
inputs['labels'][0]
tokenizer.decode(3157)

'n i n e'

In [47]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [48]:
dataset = CustomDataset(inputs)

In [49]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)

In [50]:
#Training

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [51]:
model.train()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [52]:
from torch.optim import AdamW
optim = AdamW(model.parameters(), lr=1e-5)

In [53]:
import torch
torch.cuda.is_available()


True

In [54]:
writer = torch.utils.tensorboard.SummaryWriter()

In [55]:
from tqdm import tqdm
epochs = 20
step = 0
for epoch in range(epochs):
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        writer.add_scalar('Loss/train', loss, step)
        
        loss.backward()
        optim.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        step += 1

    model.save_pretrained(f'./checkpoint/bert_epoch_{epoch}')
    tokenizer.save_pretrained(f'./checkpoint/bert_epoch_{epoch}')
        

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0:   1%|          | 1/82 [00:00<00:35,  2.30it/s, loss=11]


OutOfMemoryError: CUDA out of memory. Tried to allocate 522.00 MiB (GPU 0; 15.77 GiB total capacity; 6.48 GiB already allocated; 310.38 MiB free; 6.67 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
ds_eval = Dataset.load_from_disk("masked_validation_dataset")
eval_list = []
for n, line in enumerate(ds_eval['text']):
    eval_list.append(line.replace("<mask>", "[MASK]"))

In [None]:
from transformers import pipeline
def evalModel(path, eval_list, ground_truth):
    fill = pipeline('fill-mask', model=path, tokenizer=path)
    
    results = []
    for line in eval_list:
        results.append(fill(line)[0]['token_str'])

    num_correct = 0
    total_elements = len(results)
    for i in range(total_elements):
        if results[i] == ground_truth[i]:
            num_correct += 1

    proportion_correct = num_correct / total_elements
    return proportion_correct

In [26]:
eval_acc=[]
for n in range(epochs):
    model_path = f"./checkpoint/bert_epoch_{n}"
    eval_acc.append(evalModel(model_path, eval_list, ds_eval['label']))
print(eval_acc)

[0.28, 0.305, 0.305, 0.33, 0.325, 0.335, 0.335, 0.345, 0.345, 0.35, 0.355, 0.37, 0.365, 0.35, 0.35, 0.36, 0.385, 0.38, 0.38, 0.385, 0.42, 0.435, 0.43, 0.43, 0.42, 0.43, 0.435, 0.47, 0.435, 0.45, 0.445, 0.455, 0.45, 0.455, 0.455, 0.465, 0.47, 0.46, 0.46, 0.475, 0.465, 0.475, 0.48, 0.455, 0.48, 0.465, 0.485, 0.495, 0.47, 0.495, 0.475, 0.47, 0.48, 0.495, 0.48, 0.475, 0.485, 0.485, 0.5, 0.49]


In [36]:
fill = pipeline('fill-mask', model='./checkpoint/bert_epoch_19', tokenizer='./checkpoint/bert_epoch_10')
    
results = []
for line in eval_list:
    results.append(fill(line)[0]['token_str'])

num_correct = 0
total_elements = len(results)
for i in range(total_elements):
    if results[i] == ds_eval['label'][i]:
        num_correct += 1
print(num_correct)
print(total_elements)
proportion_correct = num_correct / total_elements
proportion_correct

99
200


0.495

0.28

In [None]:
sequence = f"Positive numbers are more than [MASK] and negative numbers are less than zero."

fill(sequence)[0]['token_str']

In [None]:
ds_eval = Dataset.load_from_disk("masked_validation_dataset")
eval_list = []
for n, line in enumerate(ds_eval['text']):
    eval_list.append(line.replace("<mask>", "[MASK]"))


In [None]:
results = []
for line in eval_list:
    results.append(fill(line)[0]['token_str'])
    

In [None]:
ds_eval['label'][0]

In [None]:
num_correct = 0
total_elements = len(results)

for i in range(total_elements):
    if results[i] == ds_eval['label'][i]:
        num_correct += 1

proportion_correct = num_correct / total_elements
proportion_correct