In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from google.colab import drive
drive.mount('/content/drive')
import random
random.seed(10)
import re
import math
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from random import *
from IPython.display import Image, display

file_path = '/content/drive/MyDrive/text.txt'
with open(file_path, 'r') as file:
    text_data = file.read()

type(text_data)
print(text_data)

print("\n")

sentences = re.sub("[.,!?\\-]", '', text_data.lower()).split('\n')
word_list = list(set(" ".join(sentences).split()))
print(word_list)
print("---------------------------------------")


word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}
print(word_dict)

print("---------------------------------------")


for i, w in enumerate(word_list):
    word_dict[w] = i + 4
print(word_dict)
print("---------------------------------------")

number_dict = {i: w for i, w in enumerate(word_dict)}
print(number_dict)

vocab_size = len(word_dict)
print(vocab_size)

print("---------------------------------------")
token_list = list()

for sentence in sentences:
    arr = [word_dict[s] for s in sentence.split()]
    token_list.append(arr)
print(token_list)
print("---------------------------------------")

text_data[0:42]
token_list[0]



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
"Hello, how are you? I am Carol.\n"
"Hello, Carol, my name is Frank. Nice to meet you.\n"
"Nice to meet you too. How are you today?\n"
"Great. My football team won the competition.\n"
"Wow, congratulations Frank!\n"
"Thank you Carol.\n"
"Shall we have a pizza later to celebrate?\n"
"Sure. Do you recommend any restaurant, Carol?\n"
"Yes, a new restaurant opened, and they say the banana pizza is phenomenal.\n"
"Okay. Let's meet at the restaurant at seven PM, is that okay?\n"
"That's fine. See you later then."


['restaurant', 'congratulations', 'is', '"wow', 'and', 'say', 'meet', 'how', '"sure', 'am', 'you', 'then"', 'won', 'to', 'frank', 'opened', 'seven', '"yes', '"nice', 'name', 'too', 'i', 'a', 'any', 'at', 'do', 'pizza', "let's", 'carol\\n"', 'are', 'competition\\n"', '"thank', 'phenomenal\\n"', '"okay', 'the', 'banana', 'okay\\n"', '"that\'s', 'new', 'tha

[53, 11, 33, 14, 25, 13, 32]

In [3]:
# 24. Hyperparameters
batch_size = 10
n_segments = 2
dropout = 0.2

# Maximum length
maxlen = 100

# Maximum number of tokens to be predicted
max_pred = 7

# Number of layers
n_layers = 8

# Number of heads in multi-head attention
n_heads = 12

# Embedding size
d_model = 768

# Feedforward dimension size: 4 * d_model
d_ff = d_model * 4

# Dimension of K(=Q)V
d_k = d_v = 64

# Epochs
NUM_EPOCHS = 50

In [4]:
# 25. Define the function to create data batches
def make_batch():

    # Initialize the batch as an empty list
    batch = []

    # Initialize counters for positive and negative examples
    positive = negative = 0

    # Continue until half of the batch is positive examples and the other half is negative examples
    while positive != batch_size / 2 or negative != batch_size / 2:

        # Randomly select indices for two sentences
        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))

        # Retrieve the tokens corresponding to the indices
        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]

        # Prepare the input ids by adding special tokens [CLS] and [SEP]
        input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]

        # Define the segment ids to differentiate the two sentences
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        # Calculate the number of predictions to be made (15% of tokens)
        n_pred = min(max_pred, max(1, int(round(len(input_ids) * 0.15))))

        # Identify candidate positions for masking that are not [CLS] or [SEP]
        cand_masked_pos = [i for i, token in enumerate(input_ids) if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]

        # Shuffle the candidate positions
        shuffle(cand_masked_pos)

        # Initialize lists for masked tokens and their positions
        masked_tokens, masked_pos = [], []

        # Mask tokens until the desired number of predictions is reached
        for pos in cand_masked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])

            # Random mask
            if random() < 0.8:
                input_ids[pos] = word_dict['[MASK]']

            # Replace with another token 10% of the time (20% of the remaining time)
            elif random() < 0.5:
                index = randint(0, vocab_size - 1)
                input_ids[pos] = word_dict[number_dict[index]]

        # Add zero padding to the input ids and segment ids to reach the maximum length
        n_pad = maxlen - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        # Add zero padding to the masked tokens and their positions if needed
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)

        # Add to the batch as a positive example if the sentences are consecutive
        if tokens_a_index + 1 == tokens_b_index and positive < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True])
            positive += 1

        # Add to the batch as a negative example if the sentences are not consecutive
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False])
            negative += 1

    # Return the complete batch
    return batch

In [5]:
# 26. Function for padding
def get_attn_pad_masked(seq_q, seq_k):

    batch_size, len_q = seq_q.size()

    batch_size, len_k = seq_k.size()

    pad_attn_masked = seq_k.data.eq(0).unsqueeze(1)

    return pad_attn_masked.expand(batch_size, len_q, len_k)
# 27. Create a batch
batch = make_batch()
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))
input_ids
print("?????????????")
input_ids[0]

?????????????


tensor([ 1, 37, 31, 10, 28, 38,  3, 28, 20, 48,  6, 43,  3,  2, 53, 11, 33, 14,
        25, 13, 32,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [6]:
masked_tokens[0]

tensor([28, 40,  4,  0,  0,  0,  0])

In [7]:
masked_pos[0]

tensor([ 4, 12,  6,  0,  0,  0,  0])

In [8]:
isNext[0]

tensor(0)

In [9]:
get_attn_pad_masked(input_ids, input_ids)[0][0], input_ids[0]

(tensor([False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True]),
 tensor([ 1, 37, 31, 10, 28, 38,  3, 28, 20, 48,  6, 43,  3,  2, 53, 11, 33, 14,
         25, 13, 32,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  

In [10]:
# 35. GeLU activation function
def gelu(x):
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

In [11]:
# 36. Embedding Class
class Embedding(nn.Module):

    # Constructor method
    def __init__(self):

        super(Embedding, self).__init__()

        # Token embedding
        self.tok_embed = nn.Embedding(vocab_size, d_model)

        # Position embedding
        self.pos_embed = nn.Embedding(maxlen, d_model)

        # Segment (token type) embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)

        # Layer normalization
        self.norm = nn.LayerNorm(d_model)

    # Forward method
    def forward(self, x, seg):

        seq_len = x.size(1)

        pos = torch.arange(seq_len, dtype=torch.long)

        # (seq_len,) -> (batch_size, seq_len)
        pos = pos.unsqueeze(0).expand_as(x)

        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)

        return self.norm(embedding)

In [12]:
# 37. Define the class to perform Scaled Dot-Product Attention
class ScaledDotProductAttention(nn.Module):

    # Initialization method for the class
    def __init__(self):

        # Initialize the base class
        super(ScaledDotProductAttention, self).__init__()

    # Forward method to define the data pass through
    def forward(self, Q, K, V, attn_mask):

        # Calculate the attention scores as the dot product of Q and K, and normalize by the key size
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k)

        # Apply the attention mask to prevent attention to certain tokens
        scores.masked_fill_(attn_mask, -1e9)

        # Apply softmax to get normalized attention weights
        attn = nn.Softmax(dim=-1)(scores)

        # Multiply the attention weights by V to get the context
        context = torch.matmul(attn, V)

        # Return the context and the attention weights
        return context, attn

In [13]:
# 38. Define the class to perform multi-head attention
class MultiHeadAttention(nn.Module):

    def __init__(self) -> None:

        # Initialize the base class
        super(MultiHeadAttention, self).__init__()

        # Define the weight matrix for the queries Q
        self.W_Q = nn.Linear(d_model, d_k * n_heads)

        # Define the weight matrix for the keys K
        self.W_K = nn.Linear(d_model, d_k * n_heads)

        # Define the weight matrix for the values V
        self.W_V = nn.Linear(d_model, d_v * n_heads)

    # Forward method to define the data pass through
    def forward(self, Q, K, V, attn_mask):

        # Save the input Q for the residual and get the batch size
        residual, batch_size = Q, Q.size(0)

        # Process Q through W_Q and reshape the result to have [n_heads] in the second dimension
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)

        # Process K through W_K and reshape the result to have [n_heads] in the second dimension
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1, 2)

        # Process V through W_V and reshape the result to have [n_heads] in the second dimension
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1, 2)

        # Adapt attn_mask to be compatible with the dimensions of q_s, k_s, v_s
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)

        # Calculate the scaled dot-product attention and context for each attention head
        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)

        # Reshape the context to combine the attention heads and return to the original format
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)

        # Apply a linear transformation to the combined context
        output = nn.Linear(n_heads * d_v, d_model)(context)

        # Normalize the output layer and add the residual
        return nn.LayerNorm(d_model)(output + residual), attn

In [14]:
emb = Embedding()
embeds = emb(input_ids, segment_ids)
attenM = get_attn_pad_masked(input_ids, input_ids)
MHA = MultiHeadAttention()(embeds, embeds, embeds, attenM)
output, A = MHA
A[0][0]

tensor([[0.0525, 0.0465, 0.0283,  ..., 0.0000, 0.0000, 0.0000],
        [0.0427, 0.0414, 0.0452,  ..., 0.0000, 0.0000, 0.0000],
        [0.0507, 0.0469, 0.0286,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0412, 0.0487, 0.0704,  ..., 0.0000, 0.0000, 0.0000],
        [0.0496, 0.0552, 0.0680,  ..., 0.0000, 0.0000, 0.0000],
        [0.0397, 0.0436, 0.0572,  ..., 0.0000, 0.0000, 0.0000]],
       grad_fn=<SelectBackward0>)

In [15]:
# 45. Define the class for the Positional Feed Forward Network
class PoswiseFeedForward(nn.Module):

    def __init__(self) -> None:

        # Initialize the base class
        super(PoswiseFeedForward, self).__init__()

        # First linear layer that increases the data dimension from d_model to d_ff
        self.fc1 = nn.Linear(d_model, d_ff)

        # Second linear layer that reduces the dimension back from d_ff to d_model
        self.fc2 = nn.Linear(d_ff, d_model)

    # Forward method to define the data pass through
    def forward(self, x):

        # Apply the first linear transformation, followed by the GELU activation function,
        # and then the second linear transformation
        return self.fc2(gelu(self.fc1(x)))

In [16]:
# 46. Define the class for the encoder layer
class EncoderLayer(nn.Module):

    def __init__(self) -> None:

        # Initialize the base class
        super(EncoderLayer, self).__init__()

        # Instantiate multi-head attention for the encoder self-attention
        self.enc_self_attn = MultiHeadAttention()

        # Instantiate the Positional Feed Forward network to use after self-attention
        self.pos_ffn = PoswiseFeedForward()

    # Forward method to define the data pass through
    def forward(self, enc_inputs, enc_self_attn_mask):

        # Apply self-attention to the input data
        enc_inputs, atnn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)

        # After self-attention, pass the result through the Positional Feed Forward network
        enc_inputs = self.pos_ffn(enc_inputs)

        # Return the encoder output and attention weights
        return enc_inputs, atnn

In [17]:
# 47. BERT Model
class BERT(nn.Module):  # Define the BERT model

    def __init__(self) -> None:
        # Initialize the base class nn.Module
        super(BERT, self).__init__()

        # Embedding module for tokens, positions, and segments
        self.embedding = Embedding()

        # Encoder layers using the EncoderLayer class
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])

        # Fully connected layer for linear transformation
        self.fc = nn.Linear(d_model, d_model)

        # Activation function Tanh
        self.activ1 = nn.Tanh()

        # Additional linear layer
        self.linear = nn.Linear(d_model, d_model)

        # Activation function GeLU
        self.activ2 = gelu

        # Layer normalization
        self.norm = nn.LayerNorm(d_model)

        # Final classification layer
        self.classifier = nn.Linear(d_model, 2)

        # Initialize the decoder and share weights with token embeddings
        embed_weight = self.embedding.tok_embed.weight
        n_vocab, n_dim = embed_weight.size()
        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)
        self.decoder.weight = embed_weight

        # Initialize decoder bias
        self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))

    def forward(self, input_ids, segment_ids, masked_pos):
        # Generate embeddings for input tokens and segments
        output = self.embedding(input_ids, segment_ids)

        # Generate attention mask to handle padding tokens
        enc_self_attn_mask = get_attn_pad_masked(input_ids, input_ids)

        # Pass embeddings through encoder layers
        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask)

        # Apply pooling to the [CLS] token representation
        h_pooled = self.activ1(self.fc(output[:, 0]))

        # Generate classification logits
        logits_clsf = self.classifier(h_pooled)

        # Expand masked positions for attention
        masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1))

        # Gather masked token representations
        h_masked = torch.gather(output, 1, masked_pos)

        # Apply activation and normalization to the masked tokens
        h_masked = self.norm(self.activ2(self.linear(h_masked)))

        # Decode masked token representations to logits
        logits_lm = self.decoder(h_masked) + self.decoder_bias

        # Return logits for masked language modeling and classification
        return logits_lm, logits_clsf


In [18]:
model = BERT()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
batch = make_batch()
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

In [19]:
%%time

# Start the training loop for a defined number of epochs
for epoch in range(NUM_EPOCHS):

    # Zero the gradients of the optimizer to avoid accumulation from previous epochs
    optimizer.zero_grad()

    # Pass the input data through the model and get the logits for language masking
    # and next sentence classification
    logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)

    # Calculate the loss for the language modeling task by comparing the predicted logits
    # with the true tokens
    loss_lm = criterion(logits_lm.transpose(1,2), masked_tokens)

    # Compute the mean of the loss for normalization
    loss_lm = (loss_lm.float()).mean()

    # Calculate the loss for the next sentence classification task
    loss_clsf = criterion(logits_clsf, isNext)

    # Sum the losses from both tasks to get the total loss
    loss = loss_lm + loss_clsf

    # Display the current epoch and total loss
    print(f'Epoch: {epoch + 1} | Loss {loss:.4f}')

    # Perform backpropagation to compute the gradients
    loss.backward()

    # Update the model parameters based on the calculated gradients
    optimizer.step()

Epoch: 1 | Loss 84.7126
Epoch: 2 | Loss 122.1345
Epoch: 3 | Loss 284.4071
Epoch: 4 | Loss 73.3592
Epoch: 5 | Loss 195.4119
Epoch: 6 | Loss 173.6462
Epoch: 7 | Loss 148.2958
Epoch: 8 | Loss 115.4070
Epoch: 9 | Loss 87.5349
Epoch: 10 | Loss 67.6660
Epoch: 11 | Loss 46.9813
Epoch: 12 | Loss 29.2772
Epoch: 13 | Loss 25.2530
Epoch: 14 | Loss 29.2006
Epoch: 15 | Loss 30.7385
Epoch: 16 | Loss 30.0904
Epoch: 17 | Loss 29.1983
Epoch: 18 | Loss 26.2729
Epoch: 19 | Loss 23.9714
Epoch: 20 | Loss 20.8577
Epoch: 21 | Loss 20.2223
Epoch: 22 | Loss 18.7058
Epoch: 23 | Loss 18.3251
Epoch: 24 | Loss 17.2492
Epoch: 25 | Loss 17.1634
Epoch: 26 | Loss 15.8550
Epoch: 27 | Loss 15.4178
Epoch: 28 | Loss 14.7199
Epoch: 29 | Loss 13.8990
Epoch: 30 | Loss 15.1159
Epoch: 31 | Loss 13.2404
Epoch: 32 | Loss 11.9363
Epoch: 33 | Loss 11.9737
Epoch: 34 | Loss 11.6567
Epoch: 35 | Loss 11.0806
Epoch: 36 | Loss 10.0341
Epoch: 37 | Loss 9.6127
Epoch: 38 | Loss 8.8871
Epoch: 39 | Loss 8.1047
Epoch: 40 | Loss 7.5915
Epoch: 

In [20]:
# 53. Extract the batch
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[0]))
print(text_data)
print([number_dict[w.item()] for w in input_ids[0] if number_dict[w.item()] != '[PAD]'])
# 54. Extract predictions of the tokens
logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
logits_lm = logits_lm.data.max(2)[1][0].data.numpy()
print('List of Real Masked Tokens: ', [pos.item() for pos in masked_tokens[0] if pos.item() != 0])
print('List of Predicted Masked Tokens: ', [pos for pos in logits_lm if pos != 0])
# 55. Extract predictions of the next token
logits_clsf = logits_clsf.data.max(1)[1].data.numpy()[0]
print('isNext (True Value): ', True if isNext else False)
print('isNext (Predicted Value): ', True if logits_clsf else False)

"Hello, how are you? I am Carol.\n"
"Hello, Carol, my name is Frank. Nice to meet you.\n"
"Nice to meet you too. How are you today?\n"
"Great. My football team won the competition.\n"
"Wow, congratulations Frank!\n"
"Thank you Carol.\n"
"Shall we have a pizza later to celebrate?\n"
"Sure. Do you recommend any restaurant, Carol?\n"
"Yes, a new restaurant opened, and they say the banana pizza is phenomenal.\n"
"Okay. Let's meet at the restaurant at seven PM, is that okay?\n"
"That's fine. See you later then."
['[CLS]', '[MASK]', 'a', 'new', 'restaurant', 'opened', 'and', 'they', 'say', 'the', 'banana', 'pizza', 'is', 'phenomenal\\n"', '[SEP]', '"hello', 'carol', 'my', 'name', 'is', '[MASK]', 'nice', 'to', 'meet', '[MASK]', '[SEP]']
List of Real Masked Tokens:  [52, 39, 18, 21]
List of Predicted Masked Tokens:  []
isNext (True Value):  False
isNext (Predicted Value):  True
