## Implementing a Custom BERT Model for Spam Prediction

### Import Libraries

In [2]:
# Pin PyTorch for this notebook (%pip targets the running kernel's environment).
# NOTE(review): only torch is installed here, but the import cell below also needs
# pandas, scikit-learn, transformers and matplotlib — confirm they are preinstalled.
%pip install torch==1.11

^C
Note: you may need to restart the kernel to use updated packages.


In [1]:
import json
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoModel, BertTokenizerFast

ModuleNotFoundError: No module named 'pandas'

#### Define GPU Here if Available

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs end-to-end on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

: 

#### Check if the GPU is in Use

In [None]:
# Shell escape: print the NVIDIA driver's GPU status (needs the nvidia-smi tool on PATH)
!nvidia-smi

: 

### Preprocessing

#### Import the Corpus of Raw Text

In [None]:
# Load the labelled corpus from the repository-relative path (local runs)
data = pd.read_csv(r"./assets/data/spam-data.csv")

# Alternative source when the notebook runs on Kaggle
# data = pd.read_csv(r"/kaggle/input/spamdatatest/spamdata_v2.csv")

# Preview the first rows; later cells read the 'text' and 'label' columns
data.head()

: 

### Check the Shape of Data

In [None]:
# (rows, columns) of the loaded frame
data.shape

: 

#### Check the Way Labels are Distributed

In [None]:
# Share of each label value (normalize=True yields proportions instead of counts)
data['label'].value_counts(normalize=True)

: 

#### Plot the Bar Plot for the Distribution

In [None]:
# Same label proportions, drawn as a bar chart
data['label'].value_counts(normalize=True).plot.bar()

: 

### Split the Data into Training, Testing and Validation Sets

#### Train-test Split

In [None]:
# Keep 70% of the corpus for training; the remaining 30% is held out.
# Stratifying on the label keeps the class proportions equal in both parts.
XTrain, XTest, yTrain, yTest = train_test_split(
    data['text'],
    data['label'],
    test_size=0.3,
    stratify=data['label'],
    random_state=42,
)

: 

#### Validation Split

In [None]:
# Halve the held-out data (stratified on its labels).
# Naming note: the *ValidationTrain pair is used as the validation set and the
# *ValidationTest pair as the final test set by the tokenisation cells.
XValidationTrain, XValidationTest, yValidationTrain, yValidationTest = train_test_split(
    XTest, yTest,
    test_size=0.5,
    stratify=yTest,
    random_state=42,
)

: 

### Download and Import the Pre-trained BERT Model from Huggingface

In [None]:
# Load the pre-trained 'bert-base-uncased' encoder by checkpoint name
BERT = AutoModel.from_pretrained('bert-base-uncased')

# Load the fast tokenizer belonging to the same checkpoint
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

: 

### Get the Length of each Sequence of Text and Plot their Distributions

In [None]:
# Number of whitespace-separated words in every training message
sequenceLength = [
    len(message.split())
    for message in XTrain
]

# Histogram of the word counts (100 bins)
pd.Series(sequenceLength).hist(bins=100)

: 

### Tokenise and Encode Sequences

In [None]:
# Every message is padded or truncated to exactly this many tokens
MAX_SEQUENCE_LENGTH = 25


def _encodeTexts(texts):
    """
    Tokenize and encode one split of messages with the notebook's tokenizer.

    Args:
        texts: Object whose ``tolist()`` returns the raw message strings
            (the pandas Series produced by the split cells).

    Returns:
        The tokenizer's batch encoding; later cells read its 'input_ids' and
        'attention_mask' entries.
    """
    return tokenizer.batch_encode_plus(
        texts.tolist(),
        max_length=MAX_SEQUENCE_LENGTH,
        # `pad_to_max_length=True` is deprecated; this is its documented replacement
        padding='max_length',
        truncation=True,
    )


# Tokenize and encode sequences in the training set
trainTokens = _encodeTexts(XTrain)

# Tokenize and encode sequences in the validation set
validationTokens = _encodeTexts(XValidationTrain)

# Tokenize and encode sequences in the test set
testTokens = _encodeTexts(XValidationTest)

: 

In [None]:
# Inspect the container type returned by batch_encode_plus
type(trainTokens)

: 

### Convert these Lists to Tensors

In [None]:
def _toTensors(tokens, labels):
    """Return (input-id tensor, attention-mask tensor, label tensor) for one split."""
    return (
        torch.tensor(tokens['input_ids']),
        torch.tensor(tokens['attention_mask']),
        torch.tensor(labels.tolist()),
    )


# Training split
trainSequenceTensor, trainMaskTensor, trainYTensor = _toTensors(trainTokens, yTrain)

# Validation split
validationSequenceTensor, validationMaskTensor, validationYTensor = _toTensors(validationTokens, yValidationTrain)

# Test split
testSequenceTensor, testMaskTensor, testYTensor = _toTensors(testTokens, yValidationTest)

: 

#### Inspect the Created Tensors

In [None]:
# Encoded training token ids and their shape
trainSequenceTensor, trainSequenceTensor.shape

: 

In [None]:
# Encoded test token ids and their shape
testSequenceTensor, testSequenceTensor.shape

: 

In [None]:
# Encoded validation token ids and their shape
validationSequenceTensor, validationSequenceTensor.shape

: 

### Using the Data Loader in PyTorch to Load the Dataset

#### Define Hyper-parameter(s)

In [None]:
# Samples per mini-batch; used by both the training and the validation data loader
batchSize = 16

: 

#### Create Training Tensors

In [None]:
# Bundle token ids, attention masks and labels into a single dataset
trainingTensor = TensorDataset(trainSequenceTensor, trainMaskTensor, trainYTensor)

# Draw the training samples in random order
trainingSampler = RandomSampler(trainingTensor)

# Serve the sampled data in mini-batches of `batchSize`
trainingDataLoader = DataLoader(
    trainingTensor,
    batch_size=batchSize,
    sampler=trainingSampler,
)

: 

In [None]:
# Confirm the dataset, sampler and loader classes that were just created
type(trainingTensor), type(trainingSampler), type(trainingDataLoader)

: 

#### Now, the same for Validation Tensors

In [None]:
# Wrapping the validation tensors
validationTensor = TensorDataset(validationSequenceTensor, validationMaskTensor, validationYTensor)

# Walk the validation set in a fixed order. Shuffling brings no benefit for
# evaluation, and because the loss is averaged per batch (the last batch may be
# smaller) a random order makes the reported validation loss vary from run to
# run even for identical weights.
validationSampler = SequentialSampler(validationTensor)

# Putting the validation data in a data loader
validationDataLoader = DataLoader(validationTensor, sampler=validationSampler, batch_size=batchSize)

: 

### Write Tensors to a JSON File

In [None]:
def _toJSONCompatible(value):
    """Recursively turn tensors/arrays (anything exposing ``tolist``) into plain Python data."""
    if hasattr(value, 'tolist'):
        return value.tolist()
    if isinstance(value, dict):
        return {key: _toJSONCompatible(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_toJSONCompatible(item) for item in value]
    # Plain scalars (int, float, bool, str, None) are already serialisable
    return value


def saveToJSON(filePath, **kwargs):
    """
    Save named values (PyTorch tensors or plain Python data) to a JSON file.

    Args:
        filePath (str): Path to the JSON file to save to. Missing parent
            directories are created.
        **kwargs: Key-value pairs where the key is the name stored in the JSON
            object and the value is the data to save. Objects with a
            ``tolist()`` method (e.g. tensors) are converted through it; lists,
            tuples and dicts are converted element by element; other values are
            passed to ``json.dump`` unchanged.

    Raises:
        TypeError: If a value is still not JSON-serialisable after conversion.
    """
    toSave = {name: _toJSONCompatible(variable) for name, variable in kwargs.items()}

    # Create the destination folder on demand so that a missing directory does
    # not discard the results of a long run
    directory = os.path.dirname(filePath)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(filePath, 'w') as f:
        json.dump(toSave, f)

: 

### Construct the BERT Model

In [None]:
# Freeze all the parameters
for parameter in BERT.parameters():
    parameter.requires_grad = False

: 

In [None]:
class BERTArchitecture(nn.Module):
    """
    Two-class classifier head on top of a BERT encoder.

    The encoder's pooled output is passed through
    Linear -> ReLU -> Dropout(0.1) -> Linear(512, 2) -> LogSoftmax, so the
    module returns log-probabilities (suitable for ``nn.NLLLoss``).

    Args:
        bert: Encoder module. When called as
            ``bert(ids, attention_mask=mask, return_dict=False)`` it must return
            a 2-tuple whose second element is the pooled representation.
    """

    def __init__(self, bert):
        super().__init__()

        self.bert = bert

        # Width of the encoder's pooled output. Read it from the encoder's config
        # when present so that encoders of another size also work; 768 (the
        # previously hard-coded value) remains the fallback.
        hiddenSize = getattr(getattr(bert, 'config', None), 'hidden_size', 768)

        # Dropout layer
        self.dropout = nn.Dropout(0.1)

        # ReLU activation function
        self.relu = nn.ReLU()

        # Dense layer 1
        self.fullyConnected1 = nn.Linear(hiddenSize, 512)

        # Dense layer 2 (Output layer)
        self.fullyConnected2 = nn.Linear(512, 2)

        # Log-softmax over the class dimension
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """
        Compute class log-probabilities for a batch.

        Args:
            sent_id: Token ids, forwarded to the encoder as its first argument.
            mask: Attention mask, forwarded as ``attention_mask``.

        Returns:
            Tensor of log-probabilities with one row per sample and two columns.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output) is used
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)

        # Input layer
        x = self.fullyConnected1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)

        # Output layer
        x = self.fullyConnected2(x)

        # Normalise to log-probabilities
        return self.softmax(x)

: 

### Pass the Pre-trained BERT from Huggingface to our Defined Architecture

In [None]:
# Wrap the loaded BERT encoder with the classification layers defined in BERTArchitecture
model = BERTArchitecture(BERT)

: 

#### Push our Model to the Device

In [None]:
# Move the model's parameters to the selected device
model = model.to(device)

: 

### Create an Optimiser

In [None]:
# Define the optimizer with PyTorch's own AdamW. The `transformers.AdamW` class
# used previously is deprecated and missing from newer transformers releases,
# which would make this cell fail on import.
# eps and weight_decay are set explicitly to that class's defaults (1e-6 and 0.0)
# so the optimisation settings stay as close as possible to the earlier ones.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

: 

### Compute Class Weights

In [None]:
# One weight per distinct training label; the 'balanced' mode gives rarer
# classes a larger weight
weightsList = compute_class_weight(class_weight='balanced', classes=np.unique(yTrain), y=yTrain)

print("Class Weights:", weightsList)

: 

#### Convert Class Weights List to Tensor

In [None]:
# Converting list of class weights to a tensor
weights = torch.tensor(weightsList, dtype=torch.float)

# Push to GPU
weights = weights.to(device)

: 

### Define Hyper-parameters to Train

In [None]:
# Define the loss function
crossEntropy  = nn.NLLLoss(weight=weights) 

# Define the number of training epochs
EPOCHS = 500

# Define how many steps before printing an update
trainingStepsUpdate = 20
validationStepsUpdate = 10

: 

### Training the Model - Fine Tuning
Define a function to train the model.

In [None]:
def train():
    """
    Run one training epoch over ``trainingDataLoader``.

    Uses the notebook-level ``model``, ``optimizer``, ``crossEntropy`` and
    ``device``. Gradients are clipped to a norm of 1.0 before every step.

    Returns:
        tuple: ``(averageLoss, totalPredictions)`` where ``averageLoss`` is the
        mean of the per-batch losses and ``totalPredictions`` is a NumPy array
        holding the model outputs of every batch, concatenated along axis 0.
    """
    model.train()
    totalLoss = 0

    # Collects the model outputs of every batch
    totalPredictions = []

    # Iterate over batches
    for step, batch in enumerate(trainingDataLoader):
        # Progress update every `trainingStepsUpdate` batches
        if step % trainingStepsUpdate == 0 and not step == 0:
            print('\tBatch {:>3,} of {:>3,}.'.format(step, len(trainingDataLoader)))

        # Push the batch to the selected device
        batch = [r.to(device) for r in batch]

        sent_id, mask, labels = batch

        # Clear previously calculated gradients
        model.zero_grad()

        # Get model predictions for the current batch
        preds = model(sent_id, mask)

        # Compute the loss between actual and predicted values
        loss = crossEntropy(preds, labels)

        # Add on to the total loss
        totalLoss = totalLoss + loss.item()

        # Backward pass to calculate the gradients
        loss.backward()

        # Clip the gradients to 1.0. It helps in preventing the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters
        optimizer.step()

        # Detach from the graph, move to CPU and keep this batch's outputs.
        # This must happen inside the loop: appending after it kept only the
        # final batch.
        totalPredictions.append(preds.detach().cpu().numpy())

    # Compute the training loss of the epoch
    averageLoss = totalLoss / len(trainingDataLoader)

    # Stack the per-batch arrays into one (number of samples, no. of classes) array
    totalPredictions = np.concatenate(totalPredictions, axis=0)

    # Returns the loss and predictions
    return averageLoss, totalPredictions

: 

### Evaluating the Model - Using the Validation Set
Define a function to evaluate the model.

In [None]:
def evaluate():
    """
    Score the model on ``validationDataLoader`` without updating any weights.

    Uses the notebook-level ``model``, ``crossEntropy`` and ``device``.

    Returns:
        tuple: ``(averageLoss, totalPredictions)`` — the mean of the per-batch
        validation losses and a NumPy array with the model outputs of every
        batch concatenated along axis 0.
    """
    print("\nEvaluating...")

    # Switch to inference behaviour (e.g. dropout disabled)
    model.eval()

    lossSum = 0
    batchOutputs = []

    # No gradients are needed anywhere in the evaluation pass
    with torch.no_grad():
        for index, batch in enumerate(validationDataLoader):
            # Periodic progress report, skipping the very first batch
            if index % validationStepsUpdate == 0 and index != 0:
                print('\tBatch {:>3,} of {:>3,}.'.format(index, len(validationDataLoader)))

            # Move ids, masks and labels to the selected device
            sent_id, mask, labels = [item.to(device) for item in batch]

            # Forward pass and loss for this batch
            outputs = model(sent_id, mask)
            lossSum = lossSum + crossEntropy(outputs, labels).item()

            # Keep a CPU copy of the outputs
            batchOutputs.append(outputs.detach().cpu().numpy())

    # Mean loss over all validation batches
    averageLoss = lossSum / len(validationDataLoader)

    # One array of shape (number of samples, no. of classes)
    totalPredictions = np.concatenate(batchOutputs, axis=0)

    return averageLoss, totalPredictions

: 

### Running the Model to Train and Evaluate

In [None]:
# Set initial loss to infinite
bestValidationLoss = float('inf')

# Empty lists to store training and validation loss of each epoch
trainingLosses = []
validationLosses = []

# Initialize total time taken to 0
totalTimeTaken = 0

# Where the best weights are written when saving locally.
# When running the notebook on a Kaggle kernel use r'/kaggle/working/weights.pt' instead.
weightsPath = r'./assets/weights/weights.pt'

# Make sure the target folder exists before training starts; torch.save does
# not create missing directories and would abort the run at the first save.
os.makedirs(os.path.dirname(weightsPath), exist_ok=True)

# For each epoch
for epoch in range(EPOCHS):
    print('\nEpoch {:} of {:}'.format(epoch + 1, EPOCHS))

    # Train model and record time taken
    startTime = time.time()
    trainingLoss, _ = train()
    trainingTimeTaken = time.time() - startTime

    # Evaluate model and record time taken
    startTime = time.time()
    validationLoss, _ = evaluate()
    validationTimeTaken = time.time() - startTime

    # Save the best model: only overwrite the checkpoint when the validation
    # loss improves on the best value seen so far
    if validationLoss < bestValidationLoss:
        bestValidationLoss = validationLoss
        torch.save(model.state_dict(), weightsPath)

    # Append training and validation losses
    trainingLosses.append(trainingLoss)
    validationLosses.append(validationLoss)

    # Print epoch results and times taken
    print(f'\nTraining Loss: {trainingLoss:.3f}')
    print(f'Training Time Taken: {trainingTimeTaken:.2f} seconds')
    print(f'Validation Loss: {validationLoss:.3f}')
    print(f'Validation Time Taken: {validationTimeTaken:.2f} seconds')

    # Update total time taken
    totalTimeTaken += trainingTimeTaken + validationTimeTaken

: 

In [None]:
# Print total time taken for all epochs
# Report the accumulated training + validation time in seconds, minutes and hours
print(f'\nTotal Time Taken: {totalTimeTaken:.2f} seconds | {(totalTimeTaken / 60):.2f} minutes | {(totalTimeTaken / 3600):.2f} hours')

: 

In [None]:
# Save tensors to a JSON file
# Persist the test-split tensors, the per-epoch loss histories and the epoch count
saveToJSON(
    './assets/tensors/tensors.json',
    testSequenceTensor=testSequenceTensor,
    testMaskTensor=testMaskTensor,
    testYTensor=testYTensor,
    trainingLossTensor=trainingLosses,
    validationLossTensor=validationLosses,
    epochs=EPOCHS,
)

: 

---