# Masked Language Model.

In MLM we input a sentence and optimize model weight to get the same output. But before sending input we randomly masked some tokens with [Mask] token. To train the MLM model we need to follow few steps
* Tokenize input.
* Create a label with input or clone input tensor.
* Randomly masked some token in input.
* Initialize the model and calculate the loss.
* Finally update weight.

In [None]:
!pip install torch

# Tokenizer and Dataset

We will be using BertTokenizer from transformers and create our own PyTorch Dataset class.

In [1]:
!pip install --quiet git+https://github.com/huggingface/transformers

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 67 kB 4.6 MB/s 
[K     |████████████████████████████████| 596 kB 33.1 MB/s 
[K     |████████████████████████████████| 6.8 MB 56.7 MB/s 
[K     |████████████████████████████████| 895 kB 69.9 MB/s 
[?25h  Building wheel for transformers (PEP 517) ... [?25l[?25hdone


In [2]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_basic_tokenization = True)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [3]:
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader

def data_collate_fn(dataset_samples_list):
    arr = np.array(dataset_samples_list)
    inputs = tokenizer(text=arr.tolist(), padding='max_length', max_length=30, return_tensors='pt')
    return inputs

class MyDataset(Dataset):
    def __init__(self, src, tokenizer):
        self.src = src
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        src = self.src[idx]
        return src

# Model


In [4]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer

device = "cuda" if torch.cuda.is_available() else "cpu"


class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout, batch_first=True)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, src_mask):
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)
        return output

In [78]:
def train(model, dataloader):
    model.train()
    epochs = 5000
    total_loss = 0
    criterion = nn.CrossEntropyLoss()
    optim = torch.optim.AdamW(model.parameters(), lr=0.0001)

    for epoch in range(epochs):
        for batch in dataloader:
            optim.zero_grad()
            input = batch['input_ids'].clone()
            src_mask = model.generate_square_subsequent_mask(batch['input_ids'].size(1))
            rand_value = torch.rand(batch.input_ids.shape)
            rand_mask = (rand_value < 0.15) * (input != 101) * (input != 102) * (input != 0)
            mask_idx=(rand_mask.flatten() == True).nonzero().view(-1)
            input = input.flatten()
            input[mask_idx] = 103
            input = input.view(batch['input_ids'].size())

            out = model(input.to(device), src_mask.to(device))
            loss = criterion(out.view(-1, ntokens), batch['input_ids'].view(-1).to(device))
            total_loss += loss
            loss.backward()
            optim.step()
    
        if (epoch+1)%40==0 or epoch==0:
            print("Epoch: {} -> loss: {}".format(epoch+1, total_loss/(len(dataloader)*epoch+1)))


def predict(model, input):
    model.eval()
    src_mask = model.generate_square_subsequent_mask(input.size(1))
    out = model(input.to(device), src_mask.to(device))
    out = out.topk(1).indices.view(-1)
    return out

Some texts for train model. Creating dataset and dataloader instances.

In [79]:
text = [
"तू कसा आहेस",
"तू आज कुठे जात आहेस",
"तुमचा दिनक्रम काय आहे",
"तुम्ही कुठून आलात?",
"तुम्ही कोणत्या हायस्कूलमध्ये शिकलात?",
"तुम्ही कशात मेजर जाणार आहात?",
"तुम्ही या शाळेत यायचे का ठरवले?",
]

dataset = MyDataset(text, tokenizer)
dataloader = DataLoader(dataset, batch_size=32, collate_fn=data_collate_fn)

Initialize model

In [81]:
ntokens = tokenizer.vocab_size # the size of vocabulary
emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

In [82]:
# Training Model

train(model, dataloader)

Epoch: 1 -> loss: 10.621495246887207
Epoch: 40 -> loss: 7.28769063949585
Epoch: 80 -> loss: 5.736351013183594
Epoch: 120 -> loss: 4.790868282318115
Epoch: 160 -> loss: 4.071025371551514
Epoch: 200 -> loss: 3.5004560947418213
Epoch: 240 -> loss: 3.0487160682678223
Epoch: 280 -> loss: 2.6950135231018066
Epoch: 320 -> loss: 2.411710500717163
Epoch: 360 -> loss: 2.1836118698120117
Epoch: 400 -> loss: 1.9982519149780273
Epoch: 440 -> loss: 1.8427908420562744
Epoch: 480 -> loss: 1.7116385698318481
Epoch: 520 -> loss: 1.598446249961853
Epoch: 560 -> loss: 1.5008268356323242
Epoch: 600 -> loss: 1.4150335788726807
Epoch: 640 -> loss: 1.3409045934677124
Epoch: 680 -> loss: 1.2743587493896484
Epoch: 720 -> loss: 1.215354561805725
Epoch: 760 -> loss: 1.1613343954086304
Epoch: 800 -> loss: 1.1122390031814575
Epoch: 840 -> loss: 1.06877601146698
Epoch: 880 -> loss: 1.0286227464675903
Epoch: 920 -> loss: 0.9918306469917297
Epoch: 960 -> loss: 0.9577449560165405
Epoch: 1000 -> loss: 0.9262234568595886

In [88]:
# Predict

print("Input: {}".format(text[2]))
pred_inp = tokenizer("तू आज कुठे जात [MASK]", return_tensors='pt')
out = predict(model, pred_inp['input_ids'])
print("Output: {}\n".format(tokenizer.decode(out)))

Input: तुमचा दिनक्रम काय आहे
Output: [CLS] त आज [UNK] जात आ [SEP]

