- Tokenize the input.
- Create the labels from the input by cloning the input tensor.
- Randomly mask some of the tokens in the input.
- Initialize the model and calculate the loss.
- Finally, update the weights.

In [1]:
# Import all necessary libraries
import torch
import math
import re
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
from torch.nn import TransformerEncoder, TransformerEncoderLayer

# Use a GPU for training when one is available; otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Retrieve the BERT tokenizer from the transformers library.
# `do_basic_tokenize` is the keyword BertTokenizer recognises; the previous
# spelling `do_basic_tokenization` was an unknown keyword and had no effect.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_basic_tokenize=True)

In [2]:
# Thin Dataset wrapper around the raw text lines
class DatasetClass(Dataset):
    """Map-style dataset that serves raw text entries by index.

    The tokenizer is only stored on the instance; none of the methods of
    this class apply it.
    """

    def __init__(self, txt, tokenizer):
        # Keep references to the text collection and to the tokenizer
        self.txt, self.tokenizer = txt, tokenizer

    def __len__(self):
        """Return how many entries the text collection holds."""
        return len(self.txt)

    def __getitem__(self, idx):
        """Return the entry stored at position ``idx``, unmodified."""
        return self.txt[idx]

In [3]:
# Class definition for the sinusoidal positional encoding added to token embeddings
class PositionalEncoding(nn.Module):
    """Add fixed sine/cosine position information to a sequence-first tensor.

    Args:
        d_model: Size of the embedding (last) dimension. Odd values are
            supported.
        dropout: Dropout probability applied after the encoding is added.
        max_len: Number of positions for which encodings are precomputed.

    The table is stored in the non-trainable buffer ``pe`` with shape
    ``(max_len, 1, d_model)``.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        # Dropout applied to the sum of embedding and positional encoding
        self.dropout = nn.Dropout(p=dropout)

        # Table with one row per position and one column per embedding dimension
        pe = torch.zeros(max_len, d_model)
        # Column vector holding the position indices 0 .. max_len - 1
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # One frequency per pair of embedding dimensions
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # Even columns hold the sine of position * frequency
        pe[:, 0::2] = torch.sin(position * div_term)
        # Odd columns hold the cosine. When d_model is odd there is one odd
        # column fewer than even columns, so the last frequency is dropped to
        # keep the shapes aligned.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Insert a singleton axis so the table is (max_len, 1, d_model) and
        # broadcasts over the second axis of the input
        pe = pe.unsqueeze(0).transpose(0, 1)
        # Register as a buffer: it follows the module across devices but is not trained
        self.register_buffer('pe', pe)

    def forward(self, txt):
        """Return ``txt`` with positional encodings added, followed by dropout.

        The first axis of ``txt`` is used as the position index and the last
        axis must have size ``d_model``, i.e. the input is expected to be
        shaped ``(seq_len, batch, d_model)``. A batch-first tensor has to be
        transposed by the caller before it is passed in.
        """
        # Add the encoding of position i to every element at index i of axis 0
        txt = txt + self.pe[:txt.size(0), :]
        return self.dropout(txt)

In [4]:
# Class definition for the transformer model
class TransformerModel(nn.Module):
    """Token embedding + positional encoding + Transformer encoder + linear output layer.

    Args:
        ntoken: Vocabulary size (number of embedding rows and output logits).
        num_input: Embedding dimension used throughout the encoder.
        heads: Number of attention heads per encoder layer.
        hidden: Width of the feed-forward sub-layer in each encoder layer.
        layers: Number of stacked encoder layers.
        dropout: Dropout probability for the positional encoder and the
            encoder layers.
        max_len: Longest sequence the positional encoder supports. The default
            of 6000 matches the padded length produced by ``data_collate_fn``.
    """

    def __init__(self, ntoken, num_input, heads, hidden, layers, dropout=0.5, max_len=6000):
        super().__init__()
        # Initializes model type
        self.model_type = 'Transformer'
        # Positional encoder; it works on sequence-first tensors (see forward)
        self.pos_encoder = PositionalEncoding(num_input, dropout, max_len)
        # Template encoder layer; batch_first=True means (batch, seq, feature) tensors
        encoder_layers = TransformerEncoderLayer(num_input, heads, hidden, dropout, batch_first=True)
        # Stack `layers` copies of the encoder layer
        self.transformer_encoder = TransformerEncoder(encoder_layers, layers)
        # Token embedding table (named "encoder" for historical reasons)
        self.encoder = nn.Embedding(ntoken, num_input)
        # Embedding dimension, needed to scale the embeddings in forward
        self.num_input = num_input
        # Output projection from the embedding dimension to vocabulary logits
        self.decoder = nn.Linear(num_input, ntoken)
        # Initializes the weights of the embedding and the output projection
        self.init_weights()

    def generate_square_subsequent_mask(self, size):
        """Return a ``(size, size)`` float mask for causal attention.

        Entries on and below the diagonal are ``0.0``; entries above the
        diagonal are ``-inf``, so a position cannot attend to later positions.
        """
        return torch.triu(torch.full((size, size), float('-inf')), diagonal=1)

    def init_weights(self):
        """Initialise the embedding and output weights uniformly in [-0.1, 0.1] and zero the output bias."""
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, txt, txt_mask):
        """Compute vocabulary logits for every position.

        Args:
            txt: Integer token ids shaped ``(batch, seq_len)``.
            txt_mask: Attention mask shaped ``(seq_len, seq_len)``.

        Returns:
            Tensor of logits shaped ``(batch, seq_len, ntoken)``.
        """
        # Embed the tokens and scale by sqrt(embedding dimension)
        txt = self.encoder(txt) * math.sqrt(self.num_input)
        # The positional encoder indexes positions along axis 0, while the
        # embeddings are batch-first. Transpose to (seq_len, batch, dim) for
        # the call and back afterwards; otherwise the encodings would be
        # selected by batch row instead of by token position.
        txt = self.pos_encoder(txt.transpose(0, 1)).transpose(0, 1)
        # Run the batch-first encoder stack with the attention mask
        output = self.transformer_encoder(txt, txt_mask)
        # Project every position onto the vocabulary
        output = self.decoder(output)
        return output


In [5]:
def data_collate_fn(dataset_samples_list):
    """Tokenize a batch of raw sentences for the DataLoader.

    Args:
        dataset_samples_list: The text samples collected for one batch.

    Returns:
        The tokenizer output as PyTorch tensors, with every sequence padded
        to 6000 tokens.
    """
    # The tokenizer accepts a plain list of strings, so no numpy round-trip is needed
    sentences = list(dataset_samples_list)

    # Pad every sample to the fixed length; truncation guards against a sample
    # that would exceed it and make the rows of the batch tensor unequal
    inputs = tokenizer(
        text=sentences,
        padding='max_length',
        truncation=True,
        max_length=6000,
        return_tensors='pt',
    )

    return inputs

def predict(model, input):
    """Return the most likely token id for every position of ``input``.

    Args:
        model: Model exposing ``generate_square_subsequent_mask`` and callable
            as ``model(token_ids, mask)``.
        input: Integer token ids shaped ``(batch, seq_len)``.

    Returns:
        1-D tensor with the highest-scoring token id of each position,
        flattened over the batch.

    The model is left in evaluation mode.
    """
    # Switch off dropout for inference
    model.eval()
    # Attention mask sized to the sequence length of the input
    txt_mask = model.generate_square_subsequent_mask(input.size(1))
    # No gradients are needed for prediction, so skip building the autograd graph
    with torch.no_grad():
        out = model(input.to(device), txt_mask.to(device))
    # Pick the highest-scoring vocabulary entry at each position
    return out.argmax(dim=-1).view(-1)

def mask_random(model, batch, token_list, mask_token, mask_percent=0.15):
    """Replace a random subset of the tokens in a batch with the mask token.

    Args:
        model: Object providing ``generate_square_subsequent_mask(size)``.
        batch: Mapping with an ``'input_ids'`` tensor shaped ``(batch, seq_len)``.
        token_list: Token ids that must never be replaced (special tokens).
        mask_token: Token id written at the selected positions.
        mask_percent: Probability with which each eligible token is replaced.

    Returns:
        Tuple ``(masked_ids, txt_mask)``: a masked copy of the input ids (the
        tensor in ``batch`` is left unchanged) and the attention mask for the
        batch's sequence length.
    """
    input_ids = batch['input_ids']
    # Work on a copy so the original ids remain available as labels
    masked = input_ids.clone()
    # Attention mask sized to the sequence length of the batch
    txt_mask = model.generate_square_subsequent_mask(input_ids.size(1))
    # Select each position independently with probability mask_percent
    rand_mask = torch.rand(input_ids.shape, device=input_ids.device) < mask_percent
    # Never select positions that hold one of the protected tokens
    for token in token_list:
        rand_mask &= (input_ids != token)
    # Overwrite the selected positions with the mask token
    masked[rand_mask] = mask_token

    return masked, txt_mask

def train(model, dataloader, epochs=500):
    """Train ``model`` on randomly masked batches and print the running mean loss.

    Args:
        model: Model callable as ``model(token_ids, mask)`` that returns
            per-position vocabulary logits.
        dataloader: Iterable of batches, each with an ``'input_ids'`` tensor.
        epochs: Number of passes over ``dataloader``.

    After every epoch the mean loss over all batches processed so far is
    printed.
    """
    model.train()
    # Id written at masked positions (103 is the [MASK] token here)
    mask_token = 103
    # Special token ids that must never be masked
    token_list = [101, 102, 0]
    # Cross-entropy between the predicted logits and the original token ids.
    # NOTE(review): the loss covers every position, including padding and
    # tokens that were not masked - confirm this is intended.
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)

    # Running sum of batch losses as a Python float, plus the number of batches
    total_loss = 0.0
    batches_seen = 0

    for epoch in range(epochs):
        for batch in dataloader:
            # Clear the gradients left over from the previous step
            optimizer.zero_grad()
            masked_ids, txt_mask = mask_random(model, batch, token_list, mask_token, 0.15)
            # Predict every position from the masked input
            out = model(masked_ids.to(device), txt_mask.to(device))
            # The unmasked ids are the labels; the class count is read from the logits
            labels = batch['input_ids'].view(-1).to(device)
            loss = criterion(out.view(-1, out.size(-1)), labels)
            # .item() detaches the value so the running sum holds no autograd graph
            total_loss += loss.item()
            batches_seen += 1
            loss.backward()
            optimizer.step()

        # Mean loss over all batches seen so far, printed once per epoch
        print("Epoch: {} -> loss: {}".format(epoch + 1, total_loss / max(batches_seen, 1)))

In [6]:
# Load the Rotten Tomatoes dataset
dataset = load_dataset("rotten_tomatoes")

# Keep only letters, digits and spaces in every training sentence. A new list
# is built instead of assigning into the object returned by the dataset, and
# the pattern is compiled once for the whole pass.
non_alnum_pattern = re.compile(r"[^a-zA-Z0-9 ]")
train_text = [non_alnum_pattern.sub("", sentence) for sentence in dataset['train']['text']]

# Drop the last 3530 sentences to shrink the training set
train_text = train_text[: -3530]

Found cached dataset rotten_tomatoes (/Users/dominikzeman/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Wrap the cleaned sentences so they can be indexed by the DataLoader
dataset = DatasetClass(txt=train_text, tokenizer=tokenizer)
# Serve two sentences per step; data_collate_fn turns each batch into model inputs
dataloader = DataLoader(dataset=dataset, batch_size=2, collate_fn=data_collate_fn)

In [8]:
# Number of entries in the tokenizer's vocabulary
vocab_size = tokenizer.vocab_size
# Number of passes over the training data
epochs = 100
# Width of the token embeddings
embedding = 100
# Width of the hidden (feed-forward) layer in the network
hidden = 200
# Number of stacked Transformer layers
layers = 2
# Number of heads used by multi-head attention
heads = 2
# Dropout probability used to reduce overfitting
dropout = 0.2
# Build the model from the hyperparameters above and move it to the selected device
model = TransformerModel(
    ntoken=vocab_size,
    num_input=embedding,
    heads=heads,
    hidden=hidden,
    layers=layers,
    dropout=dropout,
).to(device)

In [9]:
# Run the training loop for the configured number of epochs
train(model, dataloader, epochs=epochs)

KeyboardInterrupt: 

In [None]:
# Serialize the whole trained model object to the file 'models'
torch.save(obj=model, f='models')

In [None]:
# Loads the trained model
#model = torch.load('models')

# Example sentence whose [MASK] token the model should fill in. `text` was
# previously printed without ever being defined, which raised a NameError.
text = ["Illiterate [MASK] get in the way of science."]

print("Input: {}".format(text[0]))
pred_inp = tokenizer(text[0], return_tensors='pt')
out = predict(model, pred_inp['input_ids'])
print("Output: {}\n".format(tokenizer.decode(out)))