## Imports

In [1]:
import random
random.seed(10)

In [2]:
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import numpy as np
import random
import torch
import math
import re



from random import *

## Loading Text Data

In [3]:
# Load the text data
text = open('text.txt', 'r').read()

## Text Data Pre-Processing and Vocabulary Construction

In [4]:
# filtering special characters: '.', ',', '?', '!'
sentences = re.sub("[.,!?\\-]", '', text.lower()).split('\n') 

# Breaking sentences into words and creating a word list
word_list = list(set(" ".join(sentences).split()))

# Initialize the word dictionary with BERT's special tokens
word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}


# We include the words in the dictionary and create indexes
for i, w in enumerate(word_list):
    word_dict[w] = i + 4
    

# Inverting the order and placing the indexes as key and the words as value in the dictionary
number_dict = {i: w for i, w in enumerate(word_dict)}

# Vocabulary size
vocab_size = len(word_dict)

# Creating a list for tokens
token_list = list()


# Loop through the sentences to create the list of tokens
for sentence in sentences:
    arr = [word_dict[s] for s in sentence.split()]
    token_list.append(arr)

## Hyperparameter Definition

In [5]:
# Hyperparameters
batch_size = 6
n_segments = 2
dropout = 0.2

# Maximum lenght
maxlen = 100 

# Maximum number of tokens that will be predicted
max_pred = 7

# Number of layers
n_layers = 6 

# Number of heads in multi-head attention
n_heads = 12

# Embedding size
d_model = 768

# Feedforward dimension size: 4 * d_model
d_ff = d_model * 4

# Dimension of K(=Q)V
d_k = d_v = 64 

# Epochs
NUM_EPOCHS = 50

## Creating Data Batches and Applying Special Tokens

In [6]:
# Defines the function to create batches of data
def make_batch():
    
    # Initialize the batch as an empty list
    batch = []
    
    # Initialize counters for positive and negative examples
    positive = negative = 0
    
    # Continue until half of the batch is positive examples and half is negative examples
    while positive != batch_size/2 or negative != batch_size/2:
        
        # Choose random indices for two sentences
        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))
        
        # Retrieves the tokens corresponding to the indexes
        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]
        
        # Prepare input ids by adding special tokens [CLS] and [SEP]
        input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]
        
        # Set the segment ids to differentiate the two sentences
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)
        
        # Calculate the number of predictions to make (15% of tokens)
        n_pred =  min(max_pred, max(1, int(round(len(input_ids) * 0.15))))
        
        # Identifies candidate positions for masking that are not [CLS] or [SEP]
        cand_maked_pos = [i for i, token in enumerate(input_ids) if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]
        
        # Shuffles the candidate positions
        shuffle(cand_maked_pos)
        
        # Initialize lists for masked tokens and their positions
        masked_tokens, masked_pos = [], []
        
        # Mask tokens until you reach the desired number of predictions
        for pos in cand_maked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            
            # Random mask
            if random() < 0.8:  
                input_ids[pos] = word_dict['[MASK]'] 
            
            # Replace with another token 10% of the time (20% of the remaining time)
            elif random() < 0.5:  
                index = randint(0, vocab_size - 1) 
                input_ids[pos] = word_dict[number_dict[index]] 
        
        # Add zero padding to input ids and segment ids to reach maximum length
        n_pad = maxlen - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)
        
        # Add zero padding to the masked tokens and their positions if necessary
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)
        
        # Add to the batch as a positive example if the sentences are consecutive
        if tokens_a_index + 1 == tokens_b_index and positive < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) 
            positive += 1
        
        # Add to the batch as a negative example if the sentences are not consecutive
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) 
            negative += 1
    
    # Returns  complete batch
    return batch

In [7]:
# Function for padding
def get_attn_pad_masked(seq_q, seq_k):
    
    batch_size, len_q = seq_q.size()
    
    batch_size, len_k = seq_k.size()
    
    pad_attn_masked = seq_k.data.eq(0).unsqueeze(1)
    
    return pad_attn_masked.expand(batch_size, len_q, len_k)

In [8]:
# Create a batch
batch = make_batch()

# Extract the elements from the batch
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

# Applies the padding function
get_attn_pad_masked(input_ids, input_ids)[0][0], input_ids[0]

(tensor([False, False, False, False, False, False, False, False, False, False,
         False, False, False, False,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True]),
 tensor([ 1, 40, 48, 20, 14,  3,  2,  3, 58, 41, 20, 38, 15,  2,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  

## Model Building

In [9]:
# GeLu activation function
def gelu(x):
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

- Embedding Module

In [11]:
# Embedding Class
class Embedding(nn.Module):
    
    # Constructor method
    def __init__(self):
        
        super(Embedding, self).__init__()
        
        # Token embedding
        self.tok_embed = nn.Embedding(vocab_size, d_model)  
        
        # Position embedding
        self.pos_embed = nn.Embedding(maxlen, d_model)  
        
        # Segment (token type) embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)  
        
        # Layer normalization
        self.norm = nn.LayerNorm(d_model)

    # Forward Method
    def forward(self, x, seg):
        
        seq_len = x.size(1)
        
        pos = torch.arange(seq_len, dtype = torch.long)
        
        # (seq_len,) -> (batch_size, seq_len)
        pos = pos.unsqueeze(0).expand_as(x)  
        
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        
        return self.norm(embedding)

- Scaled Dot Product Attention Module

In [12]:
# Defines the class to perform normalized dot product attention
class ScaledDotProductAttention(nn.Module):
    
    # Constructor method
    def __init__(self):
        
        # Initialize the base class
        super(ScaledDotProductAttention, self).__init__()

    # Forward method to define the forward passage of data
    def forward(self, Q, K, V, attn_mask):
        
        # Compute attention scores as the product of Q and K, and normalize by key size
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k)
        
        # Apply attention mask to avoid attention to certain tokens
        scores.masked_fill_(attn_mask, -1e9)
        
        # Apply softmax to obtain normalized attention weights
        attn = nn.Softmax(dim = -1)(scores)
        
        # Multiply the attention weights by V to get the context
        context = torch.matmul(attn, V)
        
        # Returns the context and attention weights
        return context, attn

- Multi-Head Attention Module

In [13]:
# Defines the class to perform multi-head attention
class MultiHeadAttention(nn.Module):
    
    def __init__(self) -> None:
        
        # Initialize the base class
        super(MultiHeadAttention, self).__init__()
        
        # Define the weight matrix for the Q queries
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        
        # Define the weight matrix for K keys
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        
        # Define the weight matrix for the V values
        self.W_V = nn.Linear(d_model, d_v * n_heads)

    # Forward method to define the forward passage of data
    def forward(self, Q, K, V, attn_mask):
        
        # Save the input Q for use in the residual and get the batch size
        residual, batch_size = Q, Q.size(0)
        
        # Processes Q through the W_Q and organizes the result to have [n_heads] in the second dimension
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)
        
        # Processes K through W_K and arranges the result to have [n_heads] in the second dimension
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)
        
        # Process V through W_V and arrange the result to have [n_heads] in the second dimension
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)
        
        # Adapt attn_mask to be compatible with the dimensions of q_s, k_s, v_s
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)
        
        # Compute the scaled attention from the dot product and context for each attention head
        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        
        # Rearrange the context to match the attention heads and return to the original format
        context = context.transpose(1,2).contiguous().view(batch_size, -1, n_heads * d_v)
        
        # Applies a linear transformation to the combined context
        output = nn.Linear(n_heads * d_v, d_model)(context)
        
        # Normalize the output layer and add the residual
        return nn.LayerNorm(d_model)(output + residual), attn

In [14]:
# Create the Embedding object
emb = Embedding()

# Generate the Embeddings
embeds = emb(input_ids, segment_ids)

# Generates the attention mask
attenM = get_attn_pad_masked(input_ids, input_ids)

# Generates MultiHeadAttention
MHA = MultiHeadAttention()(embeds, embeds, embeds, attenM)

# Output
output, A = MHA

- Positional Feedforward Module

In [15]:
# Defines a class for the Positional Feed Forward network
class PoswiseFeedForward(nn.Module):
    
    def __init__(self) -> None:
        
        # Initialize the base class
        super(PoswiseFeedForward, self).__init__()
        
        # First linear layer that increases the dimension of the data from d_model to d_ff
        self.fc1 = nn.Linear(d_model, d_ff)
        
        # Second linear layer that reduces the dimension back from d_ff to d_model
        self.fc2 = nn.Linear(d_ff, d_model)

    # Forward method to define the forward passage of data
    def forward(self, x):
        
        # Applies the first linear transformation, followed by the GELU activation function
        # and then the second linear transformation
        return self.fc2(gelu(self.fc1(x)))

- Encoder Layer Module

In [16]:
# Defines the class for the encoder layer
class EncoderLayer(nn.Module):
    
    def __init__(self) -> None:
        
        # Initialize the base class
        super(EncoderLayer, self).__init__()
        
        # Instantiate multi-head attention for encoder self-attention
        self.enc_self_attn = MultiHeadAttention()
        
        # Instantiate the Positional Feed Forward network for use after self-attention
        self.pos_ffn = PoswiseFeedForward()

    # Forward method to define the forward passage of data
    def forward(self, enc_inputs, enc_self_attn_mask):
        
        # Apply self-attention to input data
        enc_inputs, atnn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)
        
        # After self-attention, pass the result through the Positional Feed Forward network
        enc_inputs = self.pos_ffn(enc_inputs)
        
        # Returns the encoder output and attention weights
        return enc_inputs, atnn

- Final LLM Architecture (BERT Model)

In [19]:
# BERT Model
class BERT(nn.Module):
    
    def __init__(self) -> None:
        
        super(BERT, self).__init__()
        
        self.embedding = Embedding()
        
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
        
        self.fc = nn.Linear(d_model, d_model)
        
        self.activ1 = nn.Tanh()
        
        self.linear = nn.Linear(d_model, d_model)
        
        self.activ2 = gelu
        
        self.norm = nn.LayerNorm(d_model)
        
        self.classifier = nn.Linear(d_model, 2)
        
        embed_weight = self.embedding.tok_embed.weight
        
        n_vocab, n_dim = embed_weight.size()
        
        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)
        
        self.decoder.weight = embed_weight
        
        self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))

    def forward(self, input_ids, segment_ids, masked_pos):
        
        output = self.embedding(input_ids, segment_ids)
        
        enc_self_attn_mask = get_attn_pad_masked(input_ids, input_ids)
        
        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask)
        
        h_pooled = self.activ1(self.fc(output[:, 0]))
        
        logits_clsf = self.classifier(h_pooled)
        
        masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1))
        
        h_masked = torch.gather(output, 1, masked_pos)
        
        h_masked = self.norm(self.activ2(self.linear(h_masked)))
        
        logits_lm = self.decoder(h_masked) + self.decoder_bias
        
        return logits_lm, logits_clsf

## LLM Training and Assessment

In [20]:
# Create the model
BERT_model = BERT()

# Error function
criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer = optim.Adam(BERT_model.parameters(), lr = 0.001)

batch = make_batch()

input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

In [26]:
%%time

# Start the training loop for a defined number of epochs
for epoch in range(NUM_EPOCHS):
    
    # Resets the optimizer's gradients to avoid accumulation of gradients from previous epochs
    optimizer.zero_grad()
    
    # Pass the input data through the model and get the logits for language masking
    # and next sentence classification
    logits_lm, logits_clsf = BERT_model(input_ids, segment_ids, masked_pos)
    
    # Compute the loss for the language masking task by comparing the predicted logits
    # with the real tokens
    loss_lm = criterion(logits_lm.transpose(1,2), masked_tokens)
    
    # Calculate the average loss to normalize
    loss_lm = (loss_lm.float()).mean()
    
    # Compute the loss for the next sentence classification task
    loss_clsf = criterion(logits_clsf, isNext)
    
    # Add the losses of the two tasks to get the total loss
    loss = loss_lm + loss_clsf
    
    # Displays the current epoch and total loss
    print(f'Epoch: {epoch + 1} | Loss {loss:.4f}')
    
    # Perform backpropagation to calculate gradients
    loss.backward()
    
    # Update model parameters based on calculated gradients
    optimizer.step()

Epoch: 1 | Loss 179.2194
Epoch: 2 | Loss 96.9166
Epoch: 3 | Loss 37.0568
Epoch: 4 | Loss 35.6827
Epoch: 5 | Loss 23.2403
Epoch: 6 | Loss 38.5379
Epoch: 7 | Loss 11.5364
Epoch: 8 | Loss 12.9342
Epoch: 9 | Loss 15.1293
Epoch: 10 | Loss 14.0227
Epoch: 11 | Loss 15.9309
Epoch: 12 | Loss 18.2996
Epoch: 13 | Loss 19.4286
Epoch: 14 | Loss 17.5060
Epoch: 15 | Loss 16.3476
Epoch: 16 | Loss 25.5010
Epoch: 17 | Loss 18.4387
Epoch: 18 | Loss 14.7126
Epoch: 19 | Loss 16.3979
Epoch: 20 | Loss 18.8037
Epoch: 21 | Loss 16.5121
Epoch: 22 | Loss 14.0458
Epoch: 23 | Loss 13.3265
Epoch: 24 | Loss 13.8145
Epoch: 25 | Loss 14.9996
Epoch: 26 | Loss 12.8051
Epoch: 27 | Loss 14.5161
Epoch: 28 | Loss 12.6278
Epoch: 29 | Loss 13.2863
Epoch: 30 | Loss 11.7706
Epoch: 31 | Loss 11.8715
Epoch: 32 | Loss 11.8745
Epoch: 33 | Loss 11.6531
Epoch: 34 | Loss 10.8835
Epoch: 35 | Loss 10.0651
Epoch: 36 | Loss 10.0002
Epoch: 37 | Loss 10.4605
Epoch: 38 | Loss 9.3846
Epoch: 39 | Loss 9.4614
Epoch: 40 | Loss 9.5516
Epoch: 41 |

## Extracting Predictions from the Trained LLM

In [29]:
# Extract the batch
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[0]))

In [52]:
# Extract token predictions
logits_lm, logits_clsf = BERT_model(input_ids, segment_ids, masked_pos)
logits_lm = logits_lm.data.max(2)[1][0].data.numpy()
print('List of Real Masked Tokens: ', [pos.item() for pos in masked_tokens[0] if pos.item() != 0])
print('List of Predicted Masked Tokens: ', [pos for pos in logits_lm if pos != 0])

List of Real Masked Tokens:  [35, 22]
List of Predicted Masked Tokens:  [35, 35]


- Remember. The goal is not to have good accuracy.

In [53]:
# Extract next token predictions
logits_clsf = logits_clsf.data.max(1)[1].data.numpy()[0]
print('isNext (Real value): ', True if isNext else False)
print('isNext (Expected Value): ', True if logits_clsf else False)

isNext (Real value):  False
isNext (Expected Value):  False
