In [1]:
# Import all necessary libraries
import torch
from torch.optim import AdamW

from transformers import BertForMaskedLM, BertTokenizer
from transformers import Trainer, TrainingArguments

from tqdm import tqdm

In [3]:
# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Read text from file
with open('jb_dialogue.txt', 'r') as rd:
    text = rd.read().split('\n')

In [5]:
# Get tokenized text of maximum length
max_len_ids = tokenizer(max(text), return_tensors='pt')

In [6]:
# Choose closest 2^x number as input's max length
len(max_len_ids['input_ids'][0])

49

In [7]:
# Tokenize texts
inputs = tokenizer(text, return_tensors='pt', max_length=64, truncation=True, padding='max_length')

In [8]:
# Copy set labels as copy of inputs
inputs['labels'] = inputs.input_ids.detach().clone()

In [9]:
# Show input keys
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [10]:
# Iniyialize random tensor
rand = torch.rand(inputs.input_ids.shape)
# Check if numver is less then 0.15 and exclude all separators ids
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [11]:
# Show mask
mask_arr

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [12]:
selection = [torch.flatten(mask_arr[i].nonzero()).tolist() for i in range(inputs.input_ids.shape[0])]

In [13]:
selection[:5]

[[6, 9, 10], [8, 17, 18], [], [1], [4, 12]]

In [14]:
# For each input in inputs
for i in range(inputs.input_ids.shape[0]):
    # Mask selected ids with 103
    inputs.input_ids[i, selection[i]] = 103

In [15]:
inputs.input_ids

tensor([[  101,  2508,  5416,  ...,     0,     0,     0],
        [  101, 13378, 14185,  ...,     0,     0,     0],
        [  101,  2508,  5416,  ...,     0,     0,     0],
        ...,
        [  101, 11338, 29370,  ...,     0,     0,     0],
        [  101, 19757,   103,  ...,     0,     0,     0],
        [  101,  7151, 16704,  ...,     0,     0,     0]])

In [16]:
# Initialize datoader
class QuotesDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, indx):
        return {key: torch.tensor(val[indx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [17]:
# Initialize instance of dataloader
dataset = QuotesDataset(inputs)

In [18]:
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [19]:
# Chose available device to train model on
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Push model to device
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [20]:
# Initialize optimizer
optim = AdamW(model.parameters(), lr=1e-5)

In [21]:
epochs = 2
# Train model
for epoch in range(epochs):
    loop = tqdm(loader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[indx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 118/118 [16:25<00:00,  8.35s/it, loss=0.346]
Epoch 1: 100%|██████████| 118/118 [16:15<00:00,  8.27s/it, loss=0.146]


In [22]:
# Initialize training arguments
args = TrainingArguments(
    output_dir='out',
    per_device_train_batch_size=16,
    num_train_epochs=2
)
# Initialize HuggingFace trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset
)

In [23]:
# Train model with trainer
trainer.train()

***** Running training *****
  Num examples = 1877
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 236
  Number of trainable parameters = 109514298


  0%|          | 0/236 [00:00<?, ?it/s]

  return {key: torch.tensor(val[indx]) for key, val in self.encodings.items()}


Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 2013.6732, 'train_samples_per_second': 1.864, 'train_steps_per_second': 0.117, 'train_loss': 0.09186936233003261, 'epoch': 2.0}


TrainOutput(global_step=236, training_loss=0.09186936233003261, metrics={'train_runtime': 2013.6732, 'train_samples_per_second': 1.864, 'train_steps_per_second': 0.117, 'train_loss': 0.09186936233003261, 'epoch': 2.0})