In [1]:
import re
from   random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
from numpy.random import default_rng

In [2]:
torch.cuda.empty_cache()

In [3]:
# os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# os.environ['http_proxy']  = 'http://192.41.170.23:3128'
# os.environ['https_proxy'] = 'http://192.41.170.23:3128'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# 1. Data

Source credit: https://huggingface.co/datasets/legacy-datasets/wikipedia

In [5]:
from datasets import load_dataset

dataset = load_dataset("wikipedia", "20220301.en")
dataset

  from .autonotebook import tqdm as notebook_tqdm
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 6458670
    })
})

In [6]:
dataset['train'][0] # check the first example

{'id': '12',
 'url': 'https://en.wikipedia.org/wiki/Anarchism',
 'title': 'Anarchism',
 'text': 'Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.\n\nHumans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment. During the latte

In [7]:
# random numnber generator
rand = default_rng(SEED)

# random index from the dataset
# reducing the size of the dataset to 100k
random_index = rand.choice(len(dataset['train']), 100000, replace=False)

dataset['train'] = dataset['train'].filter(lambda data, index: index in random_index, with_indices=True)

In [8]:
dataset # after filtering

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 100000
    })
})

In [9]:
# removing unwanted columns
dataset = dataset.remove_columns(['id', 'url', 'title'])
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 100000
    })
})

In [10]:
# dataset['train'][:5]

In [11]:
sentences = dataset['train']['text']
sentences = [x.replace("\n", " ") for x in sentences]
sentences = [x for x in sentences if len(x.split()) <= 500]
sentences[:5]

['Anaxarchus (; ; ) was a Greek philosopher of the school of Democritus. Together with Pyrrho, he accompanied Alexander the Great into Asia. The reports of his philosophical views suggest that he was a forerunner of Pyrrhonism. Aelian writes that he was called Eudaemonicus or "Happy Man" ().  Life   Anaxarchus was born at Abdera in Thrace. He was the companion and friend of Alexander the Great in his Asiatic campaigns. According to Diogenes Laërtius, in response to Alexander\'s claim to have been the son of Zeus-Ammon, Anaxarchus pointed to his bleeding wound and remarked, "See the blood of a mortal, not ichor, such as flows from the veins of the immortal gods." Aelian, writes that Anaxarchus laughed at Alexander for making himself a god and said, "The hopes of our god are in a porringer of broth", when the physician prescribed a broth to Alexander.  Plutarch tells a story that at Bactra, in 327\xa0BC in a debate with Callisthenes, he advised all to worship Alexander as a god even duri

In [12]:
# dataset = load_dataset('bookcorpus', split='train[:1%]')
# dataset

In [13]:
# sentences = dataset['text']

### Making vocabs

Before making the vocabs, let's remove all question marks and perios, etc, then turn everything to lowercase, and then simply split the text. 

In [14]:
text = [x.lower() for x in sentences] #lower case
text = [re.sub("[.,!?\\-]", '', x) for x in text] #clean all symbols
text[:5]

['anaxarchus (; ; ) was a greek philosopher of the school of democritus together with pyrrho he accompanied alexander the great into asia the reports of his philosophical views suggest that he was a forerunner of pyrrhonism aelian writes that he was called eudaemonicus or "happy man" ()  life   anaxarchus was born at abdera in thrace he was the companion and friend of alexander the great in his asiatic campaigns according to diogenes laërtius in response to alexander\'s claim to have been the son of zeusammon anaxarchus pointed to his bleeding wound and remarked "see the blood of a mortal not ichor such as flows from the veins of the immortal gods" aelian writes that anaxarchus laughed at alexander for making himself a god and said "the hopes of our god are in a porringer of broth" when the physician prescribed a broth to alexander  plutarch tells a story that at bactra in 327\xa0bc in a debate with callisthenes he advised all to worship alexander as a god even during his lifetime as t

In [15]:
from tqdm.auto import tqdm

# Combine everything into one to make vocab
word_list = list(set(" ".join(text).split()))
word2id = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}  # special tokens

# Create the word2id in a single pass
for i, w in tqdm(enumerate(word_list), desc="Creating word2id"):
    word2id[w] = i + 4  # because 0-3 are already occupied

# Precompute the id2word mapping (this can be done once after word2id is fully populated)
id2word = {v: k for k, v in word2id.items()}
vocab_size = len(word2id)
vocab_size

Creating word2id: 598003it [00:00, 2254539.16it/s]


598007

In [16]:
# List of all tokens for the whole text
token_list = []

# Process sentences more efficiently
for sentence in tqdm(text, desc="Processing sentences"):
    token_list.append([word2id[word] for word in sentence.split()])

Processing sentences:   0%|          | 0/74796 [00:00<?, ?it/s]

Processing sentences: 100%|██████████| 74796/74796 [00:02<00:00, 25284.24it/s]


In [17]:
#take a look at sentences
sentences[:2]

['Anaxarchus (; ; ) was a Greek philosopher of the school of Democritus. Together with Pyrrho, he accompanied Alexander the Great into Asia. The reports of his philosophical views suggest that he was a forerunner of Pyrrhonism. Aelian writes that he was called Eudaemonicus or "Happy Man" ().  Life   Anaxarchus was born at Abdera in Thrace. He was the companion and friend of Alexander the Great in his Asiatic campaigns. According to Diogenes Laërtius, in response to Alexander\'s claim to have been the son of Zeus-Ammon, Anaxarchus pointed to his bleeding wound and remarked, "See the blood of a mortal, not ichor, such as flows from the veins of the immortal gods." Aelian, writes that Anaxarchus laughed at Alexander for making himself a god and said, "The hopes of our god are in a porringer of broth", when the physician prescribed a broth to Alexander.  Plutarch tells a story that at Bactra, in 327\xa0BC in a debate with Callisthenes, he advised all to worship Alexander as a god even duri

In [18]:
#take a look at token_list
token_list[:2]

[[328282,
  31364,
  132416,
  59904,
  476163,
  477213,
  58507,
  563614,
  73354,
  59906,
  472397,
  73354,
  16106,
  515323,
  2060,
  309637,
  314573,
  330322,
  5639,
  59906,
  309546,
  57954,
  128633,
  59906,
  210758,
  73354,
  433651,
  597326,
  115270,
  210788,
  92999,
  314573,
  476163,
  477213,
  208762,
  73354,
  92230,
  410564,
  310797,
  92999,
  314573,
  476163,
  204475,
  440773,
  150909,
  460135,
  214728,
  207139,
  44260,
  328282,
  476163,
  455007,
  203528,
  54936,
  424106,
  107953,
  314573,
  476163,
  59906,
  537292,
  424141,
  103703,
  73354,
  5639,
  59906,
  309546,
  424106,
  433651,
  333685,
  225526,
  132982,
  286952,
  54303,
  384682,
  424106,
  295582,
  286952,
  83661,
  132199,
  286952,
  248288,
  57511,
  59906,
  99886,
  73354,
  267368,
  328282,
  167135,
  286952,
  433651,
  492374,
  124449,
  424141,
  263343,
  394819,
  59906,
  589872,
  73354,
  477213,
  172327,
  43030,
  336823,
  288853,
  588

In [19]:
#testing one sentence
for tokens in token_list[0]:
    print(id2word[tokens])

anaxarchus
(;
;
)
was
a
greek
philosopher
of
the
school
of
democritus
together
with
pyrrho
he
accompanied
alexander
the
great
into
asia
the
reports
of
his
philosophical
views
suggest
that
he
was
a
forerunner
of
pyrrhonism
aelian
writes
that
he
was
called
eudaemonicus
or
"happy
man"
()
life
anaxarchus
was
born
at
abdera
in
thrace
he
was
the
companion
and
friend
of
alexander
the
great
in
his
asiatic
campaigns
according
to
diogenes
laërtius
in
response
to
alexander's
claim
to
have
been
the
son
of
zeusammon
anaxarchus
pointed
to
his
bleeding
wound
and
remarked
"see
the
blood
of
a
mortal
not
ichor
such
as
flows
from
the
veins
of
the
immortal
gods"
aelian
writes
that
anaxarchus
laughed
at
alexander
for
making
himself
a
god
and
said
"the
hopes
of
our
god
are
in
a
porringer
of
broth"
when
the
physician
prescribed
a
broth
to
alexander
plutarch
tells
a
story
that
at
bactra
in
327
bc
in
a
debate
with
callisthenes
he
advised
all
to
worship
alexander
as
a
god
even
during
his
lifetime
as
they
would


# 2. Data loader

We gonna make dataloader.  Inside here, we need to make two types of embeddings: **token embedding** and **segment embedding**

1. **Token embedding** - Given “The cat is walking. The dog is barking”, we add [CLS] and [SEP] >> “[CLS] the cat is walking [SEP] the dog is barking”. 

2. **Segment embedding**
A segment embedding separates two sentences, i.e., [0 0 0 0 1 1 1 1 ]

3. **Masking**
As mentioned in the original paper, BERT randomly assigns masks to 15% of the sequence. In this 15%, 80% is replaced with masks, while 10% is replaced with random tokens, and the rest 10% is left as is.  Here we specified `max_pred` 

4. **Padding**
Once we mask, we will add padding. For simplicity, here we padded until some specified `max_len`. 

Note:  `positive` and `negative` are just simply counts to keep track of the batch size.  `positive` refers to two sentences that are really next to one another.

In [None]:
batch_size = 3 
max_mask   = 5  # max masked tokens when 15% exceed, it will only be max_pred
max_len    = 1000 # maximum of length to be padded; 

In [21]:
def make_batch():
    batch = []
    half_batch_size = batch_size // 2
    positive = negative = 0
    while positive != half_batch_size or negative != half_batch_size:

        #randomly choose two sentence
        tokens_a_index, tokens_b_index = np.random.randint(len(sentences), size=2)
        tokens_a, tokens_b            = token_list[tokens_a_index], token_list[tokens_b_index]

        #1. token embedding - add CLS and SEP
        input_ids = [word2id['[CLS]']] + tokens_a + [word2id['[SEP]']] + tokens_b + [word2id['[SEP]']]

        #2. segment embedding - which sentence is 0 and 1
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        n_pred = min(max_mask, max(1, int(round(len(input_ids) * 0.15))))
        #get all the pos excluding CLS and SEP
        candidates_masked_pos = [i for i, token in enumerate(input_ids) if token != word2id['[CLS]']
                                 and token != word2id['[SEP]']]
        np.random.shuffle(candidates_masked_pos)
        masked_tokens, masked_pos = [], []
        #simply loop and mask accordingly
        for pos in candidates_masked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            rand_val = np.random.random()
            if rand_val < 0.1:  #10% replace with random token
                index = np.random.randint(4, vocab_size - 1)  # random token should not involve [PAD], [CLS], [SEP], [MASK]
                input_ids[pos] = word2id[id2word[index]]
            elif rand_val < 0.8:  #80 replace with [MASK]
                input_ids[pos] = word2id['[MASK]']
            else:
                pass

        #4. pad the sentence to the max length
        n_pad = max_len - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        #5. pad the mask tokens to the max length
        if max_mask > n_pred:
            n_pad = max_mask - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)

        #6. check whether is positive or negative
        if tokens_a_index + 1 == tokens_b_index and positive < half_batch_size:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True])
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < half_batch_size:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False])
            negative += 1

    return batch

In [22]:
batch = make_batch()

In [23]:
len(batch)

2

In [24]:
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

In [25]:
input_ids.shape, segment_ids.shape, masked_tokens.shape, masked_pos.shape, isNext.shape

(torch.Size([2, 1000]),
 torch.Size([2, 1000]),
 torch.Size([2, 5]),
 torch.Size([2, 5]),
 torch.Size([2]))

# 3. Model

Recall that BERT only uses the encoder.

BERT has the following components:

- Embedding layers
- Attention Mask
- Encoder layer
- Multi-head attention
- Scaled dot product attention
- Position-wise feed-forward network
- BERT (assembling all the components)

## 3.1 Embedding

Here we simply generate the positional embedding, and sum the token embedding, positional embedding, and segment embedding together.

In [26]:
class Embedding(nn.Module):
    def __init__(self, vocab_size, max_len, n_segments, d_model, device):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding
        self.pos_embed = nn.Embedding(max_len, d_model)      # position embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)  # segment(token type) embedding
        self.norm = nn.LayerNorm(d_model)
        self.device = device

    def forward(self, x, seg):
        #x, seg: (bs, len)
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long).to(self.device)
        pos = pos.unsqueeze(0).expand_as(x)  # (len,) -> (bs, len)
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.norm(embedding)

## 3.2 Attention mask

In [27]:
def get_attn_pad_mask(seq_q, seq_k, device):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # eq(zero) is PAD token
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1).to(device)  # batch_size x 1 x len_k(=len_q), one is masking
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k

### Testing the attention mask

In [28]:
print(get_attn_pad_mask(input_ids, input_ids, device).shape)

torch.Size([2, 1000, 1000])


## 3.3 Encoder

The encoder has two main components: 

- Multi-head Attention
- Position-wise feed-forward network

First let's make the wrapper called `EncoderLayer`

In [29]:
class EncoderLayer(nn.Module):
    def __init__(self, n_heads, d_model, d_ff, d_k, device):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention(n_heads, d_model, d_k, device)
        self.pos_ffn       = PoswiseFeedForwardNet(d_model, d_ff)

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V
        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]
        return enc_outputs, attn

Let's define the scaled dot attention, to be used inside the multihead attention

In [30]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_k, device):
        super(ScaledDotProductAttention, self).__init__()
        self.scale = torch.sqrt(torch.FloatTensor([d_k])).to(device)

    def forward(self, Q, K, V, attn_mask):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / self.scale # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)
        return context, attn 

Let's define the parameters first

In [31]:
n_layers = 6    # number of Encoder of Encoder Layer
n_heads  = 8    # number of heads in Multi-Head Attention
d_model  = 768  # Embedding Size
d_ff = 768 * 4  # 4*d_model, FeedForward dimension
d_k = d_v = 64  # dimension of K(=Q), V
n_segments = 2

Here is the Multiheadattention.

In [32]:
class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, d_model, d_k, device):
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_k = d_k
        self.d_v = d_k
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, self.d_v * n_heads)
        self.device = device
    def forward(self, Q, K, V, attn_mask):
        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
        residual, batch_size = Q, Q.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, self.n_heads, self.d_k).transpose(1,2)  # q_s: [batch_size x n_heads x len_q x d_k]
        k_s = self.W_K(K).view(batch_size, -1, self.n_heads, self.d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]
        v_s = self.W_V(V).view(batch_size, -1, self.n_heads, self.d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]

        attn_mask = attn_mask.unsqueeze(1).repeat(1, self.n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]

        # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        context, attn = ScaledDotProductAttention(self.d_k, self.device)(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.n_heads * self.d_v) # context: [batch_size x len_q x n_heads * d_v]
        output = nn.Linear(self.n_heads * self.d_v, self.d_model, device=self.device)(context)
        return nn.LayerNorm(self.d_model, device=self.device)(output + residual), attn # output: [batch_size x len_q x d_model]

Here is the PoswiseFeedForwardNet.

In [33]:
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # (batch_size, len_seq, d_model) -> (batch_size, len_seq, d_ff) -> (batch_size, len_seq, d_model)
        return self.fc2(F.gelu(self.fc1(x)))

## 3.4 Putting them together

In [34]:
class BERT(nn.Module):
    def __init__(self, n_layers, n_heads, d_model, d_ff, d_k, n_segments, vocab_size, max_len, device):
        super(BERT, self).__init__()
        self.params = {'n_layers': n_layers, 'n_heads': n_heads, 'd_model': d_model,
                       'd_ff': d_ff, 'd_k': d_k, 'n_segments': n_segments,
                       'vocab_size': vocab_size, 'max_len': max_len}
        self.embedding = Embedding(vocab_size, max_len, n_segments, d_model, device)
        self.layers = nn.ModuleList([EncoderLayer(n_heads, d_model, d_ff, d_k, device) for _ in range(n_layers)])
        self.fc = nn.Linear(d_model, d_model)
        self.activ = nn.Tanh()
        self.linear = nn.Linear(d_model, d_model)
        self.norm = nn.LayerNorm(d_model)
        self.classifier = nn.Linear(d_model, 2)
        # decoder is shared with embedding layer
        embed_weight = self.embedding.tok_embed.weight
        n_vocab, n_dim = embed_weight.size()
        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)
        self.decoder.weight = embed_weight
        self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))
        self.device = device

    def forward(self, input_ids, segment_ids, masked_pos):
        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids, self.device)
        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask)
        # output : [batch_size, len, d_model], attn : [batch_size, n_heads, d_mode, d_model]
        
        # 1. predict next sentence
        # it will be decided by first token(CLS)
        h_pooled   = self.activ(self.fc(output[:, 0])) # [batch_size, d_model]
        logits_nsp = self.classifier(h_pooled) # [batch_size, 2]

        # 2. predict the masked token
        masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1)) # [batch_size, max_pred, d_model]
        h_masked = torch.gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]
        h_masked  = self.norm(F.gelu(self.linear(h_masked)))
        logits_lm = self.decoder(h_masked) + self.decoder_bias # [batch_size, max_pred, n_vocab]

        return logits_lm, logits_nsp
    
    def get_last_hidden_state(self, input_ids, segment_ids):
        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids, self.device)
        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask)

        return output

## 4. Training

In [35]:
from tqdm.auto import tqdm

n_layers = 12    # number of Encoder of Encoder Layer
n_heads  = 12    # number of heads in Multi-Head Attention
d_model  = 768  # Embedding Size
d_ff = d_model * 4  # 4*d_model, FeedForward dimension
d_k = d_v = 64  # dimension of K(=Q), V
n_segments = 2

num_epoch = 700
model = BERT(
    n_layers, 
    n_heads, 
    d_model, 
    d_ff, 
    d_k, 
    n_segments, 
    vocab_size, 
    max_len, 
    device
).to(device)  # Move model to GPU

In [36]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [37]:
batch = make_batch()
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

# Move inputs to GPU
input_ids = input_ids.to(device)
segment_ids = segment_ids.to(device)
masked_tokens = masked_tokens.to(device)
masked_pos = masked_pos.to(device)
isNext = isNext.to(device)

# Wrap the epoch loop with tqdm
for epoch in tqdm(range(num_epoch), desc="Training Epochs"):
    optimizer.zero_grad()
    logits_lm, logits_nsp = model(input_ids, segment_ids, masked_pos)    
    #logits_lm: (bs, max_mask, vocab_size) ==> (6, 5, 34)
    #logits_nsp: (bs, yes/no) ==> (6, 2)

    #1. mlm loss
    #logits_lm.transpose: (bs, vocab_size, max_mask) vs. masked_tokens: (bs, max_mask)
    loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens) # for masked LM
    loss_lm = (loss_lm.float()).mean()
    #2. nsp loss
    #logits_nsp: (bs, 2) vs. isNext: (bs, )
    loss_nsp = criterion(logits_nsp, isNext) # for sentence classification
    
    #3. combine loss
    loss = loss_lm + loss_nsp
    if epoch % 100 == 0:
        print('Epoch:', '%02d' % (epoch), 'loss =', '{:.6f}'.format(loss))
    loss.backward()
    optimizer.step()

Training Epochs:   0%|          | 0/700 [00:00<?, ?it/s]

Epoch: 00 loss = 132.776276


Training Epochs:  14%|█▍        | 101/700 [14:03<1:22:10,  8.23s/it]

Epoch: 100 loss = 3.888227


Training Epochs:  29%|██▊       | 201/700 [27:52<1:09:22,  8.34s/it]

Epoch: 200 loss = 3.263463


Training Epochs:  43%|████▎     | 301/700 [41:37<55:22,  8.33s/it]  

Epoch: 300 loss = 3.452553


Training Epochs:  57%|█████▋    | 401/700 [55:30<41:35,  8.35s/it]

Epoch: 400 loss = 3.065888


Training Epochs:  72%|███████▏  | 501/700 [1:09:19<27:32,  8.31s/it]

Epoch: 500 loss = 3.188833


Training Epochs:  86%|████████▌ | 601/700 [1:23:10<13:58,  8.47s/it]

Epoch: 600 loss = 3.055272


Training Epochs: 100%|██████████| 700/700 [1:36:52<00:00,  8.30s/it]


In [41]:
# Save the model after training
torch.save([model.params, model.state_dict()], 'model/model_bert.pth')
print("Model saved to model_bert.pth")

Model saved to model_bert.pth


# 5. Inference

Since our dataset is very small, it won't work very well, but just for the sake of demonstration.

In [42]:
# load the model and all its hyperparameters
params, state = torch.load('model/model_bert.pth')
model_bert = BERT(**params, device=device).to(device)
model_bert.load_state_dict(state)

<All keys matched successfully>

In [50]:
# print shape of batch
batch

[[[1,
   416144,
   63440,
   73354,
   107460,
   424141,
   18136,
   90369,
   150909,
   582273,
   59904,
   116645,
   477213,
   327895,
   63440,
   517581,
   424106,
   416144,
   43606,
   552187,
   116645,
   487361,
   82065,
   367333,
   59906,
   366721,
   73354,
   321807,
   366721,
   73354,
   424008,
   424141,
   316506,
   171086,
   284423,
   36366,
   528650,
   424141,
   5979,
   461576,
   409019,
   403148,
   59906,
   199817,
   286785,
   426125,
   63440,
   357938,
   59906,
   583138,
   547272,
   73354,
   61529,
   273526,
   328517,
   478877,
   428683,
   171878,
   424141,
   438972,
   537995,
   84324,
   377890,
   160702,
   286952,
   550615,
   63440,
   424141,
   93066,
   63440,
   424106,
   59906,
   60564,
   153975,
   424141,
   57651,
   343551,
   136842,
   63440,
   424106,
   59906,
   60564,
   324572,
   59906,
   63440,
   576831,
   576204,
   424106,
   59906,
   345747,
   73354,
   18136,
   340698,
   424141,
   34

In [51]:
# Predict mask tokens ans isNext
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[1]))
print([id2word[w.item()] for w in input_ids[0] if id2word[w.item()] != '[PAD]'])
input_ids = input_ids.to(device)
segment_ids = segment_ids.to(device)
masked_tokens = masked_tokens.to(device)
masked_pos = masked_pos.to(device)
isNext = isNext.to(device)

logits_lm, logits_nsp = model(input_ids, segment_ids, masked_pos)
#logits_lm:  (1, max_mask, vocab_size) ==> (1, 5, 34)
#logits_nsp: (1, yes/no) ==> (1, 2)

#predict masked tokens
#max the probability along the vocab dim (2), [1] is the indices of the max, and [0] is the first value
logits_lm = logits_lm.data.cpu().max(2)[1][0].data.numpy() 
#note that zero is padding we add to the masked_tokens
print('masked tokens (words) : ',[id2word[pos.item()] for pos in masked_tokens[0]])
print('masked tokens list : ',[pos.item() for pos in masked_tokens[0]])
print('masked tokens (words) : ',[id2word[pos.item()] for pos in logits_lm])
print('predict masked tokens list : ', [pos for pos in logits_lm])

#predict nsp
logits_nsp = logits_nsp.cpu().data.max(1)[1][0].data.numpy()
print(logits_nsp)
print('isNext : ', True if isNext else False)
print('predict isNext : ',True if logits_nsp else False)

['[CLS]', 'christian', 'enriquez', '(born', 'october', '25', '1998)', 'is', 'an', 'american', 'soccer', 'player', 'who', 'currently', 'plays', 'for', '(1996–1999', 'madison', 'in', 'the', 'usl', 'league', 'one', 'career', 'youth', 'enriquez', 'played', 'high', 'school', 'soccer', 'at', 'helix', 'high', 'school', 'and', 'club', 'soccer', 'for', 'ussda', 'side', 'nomads', 'sc', 'who', 'he', 'helped', 'to', 'us', 'youth', 'soccer', 'national', 'championships', 'in', 'both', '2013', 'and', '2014', 'college', 'in', '2016', 'enriquez', 'attended', 'california', 'polytechnic', 'state', 'university', 'to', 'play', 'college', 'soccer', 'in', 'two', 'seasons', 'with', 'the', 'mustangs', 'enriquez', 'made', '33', 'appearances', 'scoring', 'a', 'single', 'goal', 'and', 'tallying', '2', 'assists', 'in', 'january', '2018', 'enriquez', 'announced', 'he', 'would', 'leave', 'college', 'early', 'to', 'pursue', 'a', 'professional', 'career', 'professional', 'in', 'february', '2018', 'enriquez', 'signed',

> `Task-2` and `Task-3` are carried out in a seperate file.