In [None]:
!unzip masked_validation_dataset.zip

In [20]:
from transformers import BertTokenizer, BertForMaskedLM
from datasets import Dataset
import torch

In [2]:
torch.cuda.empty_cache()

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
ds = Dataset.load_from_disk("masked_training_dataset")
ds_eval = Dataset.load_from_disk("masked_validation_dataset")

In [5]:
tokenized_lengths = [len(tokenizer.encode(s)) for s in ds['text']]

# Find the maximum length
max_length = max(tokenized_lengths)
max_length

35

In [6]:
inputs = tokenizer(ds['text'], return_tensors='pt', max_length=35, truncation=True, padding='max_length')
inputs_eval = tokenizer(ds_eval['text'], return_tensors='pt', max_length=35, truncation=True, padding='max_length')

In [7]:
inputs['labels'] = inputs.input_ids.detach().clone()
inputs_eval['labels'] = inputs.input_ids.detach().clone()

In [8]:
labels = tokenizer(ds["label"], add_special_tokens=False)
labels = [item for sublist in labels['input_ids'] for item in sublist]

labels_eval = tokenizer(ds["label"], add_special_tokens=False)
labels_eval = [item for sublist in labels_eval['input_ids'] for item in sublist]

In [9]:
for tensor in inputs['labels']:
    for n, token in enumerate(tensor):
        if token.item() == tokenizer.mask_token_id:
            tensor[n] = torch.tensor(labels.pop(0))

for tensor in inputs_eval['labels']:
    for n, token in enumerate(tensor):
        if token.item() == tokenizer.mask_token_id:
            tensor[n] = torch.tensor(labels_eval.pop(0))

In [10]:
tokenizer.decode(inputs['input_ids'][0])

2024-03-27 15:16:00.142271: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'[CLS] some plant varieties can grow up to [MASK] feet tall. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [11]:
inputs['input_ids'][0]
inputs['labels'][0]
tokenizer.decode(3157)

'n i n e'

In [12]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [13]:
dataset = CustomDataset(inputs)
dataset_eval = CustomDataset(inputs)

In [14]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)
dataloader_eval = torch.utils.data.DataLoader(dataset_eval, batch_size=16, shuffle=True)

In [15]:
#Training
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [16]:
model.train()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [17]:
from torch.optim import AdamW
optim = AdamW(model.parameters(), lr=1e-5)

In [18]:
import torch
torch.cuda.is_available()


True

In [None]:
from tqdm import tqdm
epochs = 70
step = 0
train_loss = []
eval_loss = []
for epoch in range(epochs):
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        step += 1

    model.save_pretrained(f'./checkpoint/bert_epoch_{epoch}')
    tokenizer.save_pretrained(f'./checkpoint/bert_epoch_{epoch}')

    model.eval()
    loop_2 = tqdm(dataloader_eval, leave=True)
    for batch in loop_2:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss_eval = outputs.loss
        loop_2.set_description(f'Epoch {epoch}')
        loop_2.set_postfix(loss=loss_eval.item())

    train_loss.append(loss)
    eval_loss.append(loss_eval)
    model.train()
        

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 653/653 [00:38<00:00, 16.90it/s, loss=0.0571]
Epoch 0: 100%|██████████| 653/653 [00:12<00:00, 50.39it/s, loss=0.0347]
Epoch 1: 100%|██████████| 653/653 [00:38<00:00, 16.92it/s, loss=0.0463]
Epoch 1: 100%|██████████| 653/653 [00:12<00:00, 50.29it/s, loss=0.0328]
Epoch 2: 100%|██████████| 653/653 [00:38<00:00, 16.82it/s, loss=0.0297]
Epoch 2: 100%|██████████| 653/653 [00:13<00:00, 50.19it/s, loss=0.0303]
Epoch 3: 100%|██████████| 653/653 [00:38<00:00, 16.79it/s, loss=0.021] 
Epoch 3: 100%|██████████| 653/653 [00:13<00:00, 50.11it/s, loss=0.0319]
Epoch 4: 100%|██████████| 653/653 [00:38<00:00, 16.88it/s, loss=0.0458]
Epoch 4: 100%|██████████| 653/653 [00:12<00:00, 50.41it/s, loss=0.0297] 
Epoch 5: 100%|██████████| 653/653 [00:38<00:00, 16.83it/s, loss=0.0327]
Epoch 5: 100%|██████████| 653/653 [00:13<00:00, 49.69it/s, loss=0.0107] 
Epoch 6: 100%|██████████| 653/653 [00:39<00:00, 16.59it/

In [24]:
ds_eval = Dataset.load_from_disk("masked_validation_dataset")
eval_list = []
for n, line in enumerate(ds_eval['text']):
    eval_list.append(line.replace("<mask>", "[MASK]"))

In [None]:
from transformers import pipeline
def evalModel(path, eval_list, ground_truth):
    fill = pipeline('fill-mask', model=path, tokenizer=path)
    
    results = []
    for line in eval_list:
        results.append(fill(line)[0]['token_str'])

    num_correct = 0
    total_elements = len(results)
    for i in range(total_elements):
        if results[i] == ground_truth[i]:
            num_correct += 1

    proportion_correct = num_correct / total_elements
    return proportion_correct

In [23]:
eval_acc=[]
for n in range(epochs):
    model_path = f"./checkpoint/bert_epoch_{n}"
    eval_acc.append(evalModel(model_path, eval_list, ds_eval['label']))
print(eval_acc)

NameError: name 'evalModel' is not defined

In [26]:
from transformers import pipeline
path='./checkpoint/bert_epoch_9'

fill = pipeline('fill-mask', model=path, tokenizer=path)
    
results = []
for line in eval_list:
    results.append(fill(line)[0]['token_str'])

num_correct = 0
total_elements = len(results)
for i in range(total_elements):
    if results[i] == ds_eval['label'][i]:
        num_correct += 1
print(num_correct)
print(total_elements)
proportion_correct = num_correct / total_elements
proportion_correct

91
200


0.455

In [None]:
sequence = f"Positive numbers are more than [MASK] and negative numbers are less than zero."

fill(sequence)[0]['token_str']

In [None]:
ds_eval = Dataset.load_from_disk("masked_validation_dataset")
eval_list = []
for n, line in enumerate(ds_eval['text']):
    eval_list.append(line.replace("<mask>", "[MASK]"))


In [None]:
results = []
for line in eval_list:
    results.append(fill(line)[0]['token_str'])
    

In [None]:
ds_eval['label'][0]

In [None]:
num_correct = 0
total_elements = len(results)

for i in range(total_elements):
    if results[i] == ds_eval['label'][i]:
        num_correct += 1

proportion_correct = num_correct / total_elements
proportion_correct