- Tokenize input.
- Create a label with input or clone input tensor.
- Randomly masked some token in input.
- Initialize the model and calculate the loss.
- Finally update weight.

In [1]:
import torch
import math
import numpy as np
from transformers import BertTokenizer
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn import TransformerEncoder, TransformerEncoderLayer

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_basic_tokenization = True)

In [2]:
def data_collate_fn(dataset_samples_list):
    # Makes the dataset a numpy array instead of a python list
    arr = np.array(dataset_samples_list)
    
    # Tokenizes the data using the BERT tokenizer
    inputs = tokenizer(text=arr.tolist(), padding='max_length', max_length=30, return_tensors='pt')
    return inputs

class MyDataset(Dataset):
    def __init__(self, txt, tokenizer):
        self.txt = txt
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.txt)

    def __getitem__(self, idx):
        txt = self.txt[idx]
        return txt

In [3]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        # Initializes the parameters od the model
        super(TransformerModel, self).__init__()
        # Initializes model type
        self.model_type = 'Transformer'
        # Gets the positional encoder 
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        # Sets the laters based on the hyperparameters
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout, batch_first=True)
        # 
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        # Creates an attention mask with zeros across the diagonal
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        # Initializes the weights of the model between the modes before training the dataset
        # Sets the range in which the values of the weights can occur
        initrange = 0.1
        # Sets the weights of the encoder
        self.encoder.weight.data.uniform_(-initrange, initrange)
        # Zeros the weights of the decoder
        self.decoder.bias.data.zero_()
        # Sets the weights of the decoder
        self.decoder.weight.data.uniform_(-initrange, initrange)
        
    def forward(self, txt, txt_mask):
        txt = self.encoder(txt) * math.sqrt(self.ninp)
        txt = self.pos_encoder(txt)
        output = self.transformer_encoder(txt, txt_mask)
        # Decodes the processed output
        output = self.decoder(output)
        return output


In [4]:
def train(model, dataloader):
    model.train()
    epochs = 500
    total_loss = 0
    # Sets how loss will be calcuated using Cross Entropy
    criterion = nn.CrossEntropyLoss()
    # Sets the optimizer to be AdamW
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)
    
    for epoch in range(epochs):
        for batch in dataloader:
            # Zeroes the gradient used for gradient descent
            optimizer.zero_grad()
            # Gets the input ids from the batch of the dataset
            input = batch['input_ids'].clone()
            # Generates a square subsequent mask of the batch of the dataset
            txt_mask = model.generate_square_subsequent_mask(batch['input_ids'].size(1))
            # Creates an array the size of input id's to mask out 15% of the dataset
            rand_value = torch.rand(batch.input_ids.shape)
            # This adds parameters to the random mask to make sure special tokens do not get masked
            rand_mask = (rand_value < 0.15) * (input != 101) * (input != 102) * (input != 0)
            # This gets the indexes of all of the tokens to be masked from the dataset
            mask_idx = (rand_mask.flatten() == True).nonzero().view(-1)
            # The inputs are flattened for multiplication
            input = input.flatten()
            # Sets which token is the [MASK] token
            input[mask_idx] = 103
            input = input.view(batch['input_ids'].size())
            
            # Gets the output by entering the masked input and unmasked input into the model infrastructure
            out = model(input.to(device), txt_mask.to(device))
            # Calculated the loss of the model (-log p(word|sentence))
            loss = criterion(out.view(-1, ntokens), batch['input_ids'].view(-1).to(device))
            # Adds the loss of each batch to the total loss
            total_loss += loss
            # Does backward propagation
            loss.backward()
            # Makes the optimizer take a step
            optimizer.step()
    
        # Prints the total loss and which step the model is on every 40 epochs
        if (epoch+1)%40==0 or epoch==0:
            print("Epoch: {} -> loss: {}".format(epoch+1, total_loss/(len(dataloader)*epoch+1)))
            
def predict(model, input):
    # Uses model.eval to get
    model.eval()
    # Generates a square subsequent mask for the example input
    txt_mask = model.generate_square_subsequent_mask(input.size(1))
    # Inputs the square subsequent mask of the input and the input into the model
    out = model(input.to(device), txt_mask.to(device))
    # Gets the word with the highest probability from the tokens
    out = out.topk(1).indices.view(-1)
    return out

In [5]:
text = ["Don't speak ill of others.",
"To speak ill of others is a great crime.",
"Rather rectify yourself through self-criticism.",
"In this way, if you rectify yourself, others will follow you.",
"To speak ill of others gives rise to more problems.",
"This does not do any good to society.",
"More than 80 percent people of our country live in villages.",
"Most of them are poor and illiterate.",
"Illiteracy is one of the causes of their poverty.",
"Many of the villagers are landless cultivators.",
"They cultivate the lands of other people throughout the year.",
"They get a very small portion of the crops.",
"They provide all of us with food.",
"But in want they only starve.",
"They suffer most.",
"The situation needs to be changed.",
"We live in the age of science.",
"We can see the influence of science everywhere.",
"Science is a constant companion of our life.",
"We have made the impossible things possible with the help of science.",
"Modern civilization is a contribution of science.",
"Science should be devoted to the greater welfare of mankind.",
"Rabindranath Tagore got the Nobel Prize in 1913 which is 98 years ago from today.",
"He was awarded this prize for the translation of the Bengali 'Gitanjali' into English.",
"This excellent rendering was the poet's own.",
"In the English version of Gitanjali there are 103 songs."]

dataset = MyDataset(text, tokenizer)
dataloader = DataLoader(dataset, batch_size=4, collate_fn=data_collate_fn)

In [6]:
ntokens = tokenizer.vocab_size # the size of vocabulary
emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

In [7]:
train(model, dataloader)

Epoch: 1 -> loss: 65.35458374023438


KeyboardInterrupt: 

In [None]:
print("Input: {}".format(text[0]))
pred_inp = tokenizer("[MASK] are an orange fruit.", return_tensors='pt')
out = predict(model, pred_inp['input_ids'])
print("Output: {}\n".format(tokenizer.decode(out)))