Here are the functions we will be using during our model training: 

In [None]:
## Load every .txt file of a folder into a single data frame
def txt_retrieval(folder_path):
    """Read each ``.txt`` file in *folder_path* as JSON and stack the rows.

    Every file is parsed with ``pd.read_json`` and tagged with a
    ``source_file`` column holding the file name it came from.

    Args:
        folder_path: Directory to scan (not recursive); entries that do not
            end in ``.txt`` are ignored.

    Returns:
        One data frame with a fresh 0..n-1 index containing the rows of all
        files, or an empty data frame when the folder has no ``.txt`` files.
    """
    frames = []
    for name in os.listdir(folder_path):
        if not name.endswith(".txt"):
            continue
        frame = pd.read_json(os.path.join(folder_path, name))
        frame["source_file"] = name
        frames.append(frame)
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


# Pair every question with each of its candidate options for the model
def qa_pairs(questions, options):
    """Return a flat list of ``(question, option)`` tuples.

    Args:
        questions: Sequence of questions.
        options: Sequence aligned with *questions*; each item is the iterable
            of options belonging to the question at the same position.

    Returns:
        A list with one tuple per option, in question order and then option
        order. Extra items in the longer of the two inputs are ignored
        because the inputs are walked with ``zip``.
    """
    return [
        (question, choice)
        for question, choices in zip(questions, options)
        for choice in choices
    ]


## Tokenize text with the module-level tokenizer
def encode(data_component):
    """Run the global ``tokenizer`` over *data_component*.

    The tokenizer is asked for PyTorch tensors (``return_tensors='pt'``) and
    for padding (``padding=True``). No truncation argument is passed, so
    long inputs are not shortened by this call.

    Args:
        data_component: Text input forwarded unchanged to ``tokenizer``.

    Returns:
        Whatever the tokenizer call returns for the given input.
    """
    return tokenizer(data_component, padding=True, return_tensors='pt')


## Scaled dot-product attention on query, key and value tensors (see PyTorch docs)
def scaled_dot_product(q, k, v, mask=None):
    """Compute ``softmax(q @ k^T / sqrt(d_k)) @ v``.

    Args:
        q: Query tensor; its last dimension is used as ``d_k``.
        k: Key tensor; its last two dimensions are transposed before the
            matrix product with *q*.
        v: Value tensor multiplied by the attention weights.
        mask: Optional additive mask that is summed onto the scaled scores
            before the softmax (e.g. ``-inf`` at positions to hide).

    Returns:
        A ``(values, attention)`` tuple: the weighted values and the softmax
        attention weights (normalised over the last dimension).
    """
    # Imported locally because the notebook never imports `math` at top level;
    # without this the function raised NameError on first use.
    import math

    d_k = q.size(-1)
    scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
    if mask is not None:
        # Out-of-place add: lets the mask broadcast in either direction and
        # never mutates a tensor in place.
        scaled = scaled + mask
    attention = F.softmax(scaled, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention


def accuracy(predictions, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        predictions: Tensor of per-class scores; the arg-max is taken over
            dimension 1.
        labels: Tensor of class indices compared element-wise with the
            predicted classes.

    Returns:
        The mean of the element-wise matches as a Python float.
    """
    predicted_classes = predictions.argmax(dim=1)
    hits = predicted_classes.eq(labels).float()
    return hits.mean().item()



## Training loop (one pass over the batches of `training_data`)
def train(bert, device, training_data, criterion, optimizer, epoch):
    """Run one optimisation pass over *training_data*.

    Fixes over the previous version: the loop now iterates the
    ``training_data`` argument and calls the ``bert`` argument, instead of
    silently using the notebook globals ``trainloader`` and
    ``bert_classifier``.

    Args:
        bert: Model to train. It is called with a dict holding
            ``"input_ids"`` and ``"attention_mask"`` and must return logits
            accepted by *criterion*.
        device: Device the inputs and labels are moved to.
        training_data: Iterable yielding ``(inputs, labels)`` batches, where
            ``inputs`` is a tensor of token ids.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        epoch: Value shown in the progress message; it does not control how
            many passes are made.

    Returns:
        None. Running loss/accuracy averages are printed every 200 batches.
    """
    # Set model to training mode
    bert.train()
    # Running loss and accuracy for the current 200-batch window
    training_loss = 0.0
    training_acc = 0.0
    for i, (inputs, labels) in enumerate(training_data):
        # Move the batch to the device and build the model inputs
        input_ids = inputs.to(device)
        model_inputs = {
            "input_ids": input_ids,
            # Token id 0 is treated as padding and masked out
            "attention_mask": (input_ids != 0).long(),
        }
        labels = labels.to(device)
        # Zero the parameter gradients
        optimizer.zero_grad()
        # Forward pass: output logits from the model, then the loss
        outputs = bert(model_inputs)
        loss = criterion(outputs, labels)
        # Backward pass and parameter update
        loss.backward()
        optimizer.step()
        # Accumulate the statistics
        training_loss += loss.item()
        training_acc += accuracy(outputs, labels)
        if (i + 1) % 200 == 0:  # report every two hundred batches
            print(f'Epoch {epoch}, Batch {i+1}, Loss: {training_loss / 200:.4f}, Accuracy: {training_acc / 200:.4f}')

            training_loss = 0.0
            training_acc = 0.0

# Testing loop
def test(bert, device, training_data, criterion, epoch):
    """Evaluate *bert* on the batches of *training_data* without gradients.

    Fixes over the previous version:
      * the call ``pd.test_acc(...)`` (not a pandas function) is removed;
        the averages are returned so the caller can record them per epoch;
      * the undefined name ``test_loader`` is replaced by a count of the
        batches actually consumed;
      * the model receives the same ``input_ids`` / ``attention_mask`` dict
        that the training loop builds, instead of a bare tensor.

    Args:
        bert: Model to evaluate; called with a dict of ``"input_ids"`` and
            ``"attention_mask"``.
        device: Device the inputs and labels are moved to.
        training_data: Iterable yielding ``(inputs, labels)`` batches to
            evaluate on (the parameter name is kept for compatibility).
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        epoch: Epoch identifier supplied by the caller; kept for interface
            compatibility and not used in the computation.

    Returns:
        ``(average_loss, average_accuracy)`` over the evaluated batches.

    Raises:
        ValueError: If *training_data* yields no batches.
    """
    # Set the model to evaluation mode
    bert.eval()
    # Initialize loss, accuracy and batch counter
    test_loss = 0.0
    test_acc = 0.0
    num_batches = 0
    with torch.no_grad():
        for inputs, labels in training_data:
            # Move the batch to the device and build the model inputs
            input_ids = inputs.to(device)
            model_inputs = {
                "input_ids": input_ids,
                # Token id 0 is treated as padding and masked out
                "attention_mask": (input_ids != 0).long(),
            }
            labels = labels.to(device)
            # Forward pass
            outputs = bert(model_inputs)
            loss = criterion(outputs, labels)
            # Accumulate the statistics
            test_loss += loss.item()
            test_acc += accuracy(outputs, labels)
            num_batches += 1

    if num_batches == 0:
        raise ValueError("training_data yielded no batches to evaluate")

    avg_loss = test_loss / num_batches
    avg_acc = test_acc / num_batches
    # Print the average loss and accuracy
    print(f'Test Loss: {avg_loss:.4f}, Test Accuracy: {avg_acc:.4f}')
    return avg_loss, avg_acc



 



Import the question sets that will be used to train the model. The first dataset is the RACE dataset, which consists of multiple-choice questions split into M (middle school) and H (high school) sets.

In [None]:
import pandas as pd
import os
import numpy as np



# Folder locations of the two splits, relative to the working directory
middle = "middle"
# Built with os.path.join instead of the literal "high\high": "\h" is an
# invalid escape sequence and the backslash is only a separator on Windows.
high = os.path.join("high", "high")


# Load each split into its own data frame
m_qa = txt_retrieval(middle)
h_qa = txt_retrieval(high)

Let's take a look at the column names to see how the data is structured.

In [None]:
# Show the column labels of the middle-school data frame
m_qa.columns

In [None]:
# Display the high-school data frame
h_qa

Separate the data into its components.

In [None]:
# Pull each column of the two data frames out into plain Python lists.
# The first name of every pair comes from m_qa, the second from h_qa.
m_questions, h_questions = (frame["questions"].values.tolist() for frame in (m_qa, h_qa))
m_options, h_options = (frame["options"].values.tolist() for frame in (m_qa, h_qa))
m_article, h_article = (frame["article"].values.tolist() for frame in (m_qa, h_qa))
m_id, h_id = (frame["id"].values.tolist() for frame in (m_qa, h_qa))
m_answers, h_answers = (frame["answers"].values.tolist() for frame in (m_qa, h_qa))

Import Bert from transformers and use the tokenizer specialized for the model to input text as tokens. 

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertConfig, BertModel, AutoModel, AutoTokenizer
# Load the pretrained "bert-base-uncased" encoder and the tokenizer from the
# same checkpoint name so token ids match the model's vocabulary.
bert = AutoModel.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  


#m_qa_pairs = qa_pairs(m_questions, m_options)
#m_qa_pairs



#m_qa_inputs = encode(m_qa_pairs)
#torch.save(m_qa_inputs, "m_qa_inputs.pt")


#m_answers_inputs = encode(m_answers)
#torch.save(m_answers_inputs, "m_answers_inputs.pt")


#m_readings_inputs = encode(m_article)
#torch.save(m_readings_inputs, "m_readings_inputs.pt")

Encode the question, option, and answer components. Save the encoded QA inputs to disk to avoid repeating the time-consuming tokenization.


In [None]:
#h_qa_pairs = qa_pairs(h_questions, h_options)
#h_qa_pairs

In [None]:

#h_qa_inputs = encode(h_qa_pairs)
#torch.save(h_qa_inputs, "h_qa_inputs.pt")


#h_answers_inputs = encode(h_answers)
#torch.save(h_answers_inputs, "h_answers_inputs.pt")


#h_readings_inputs = encode(h_article)
#torch.save(h_readings_inputs, "h_readings_inputs.pt")

We will start training our model by using two Multi-Head Attention networks to compare the questions and the answer sequences.  We will first set up the training parameters for the networks. 

Sequence length is set to the maximum input length in the high school dataset.
Batch size is the number of examples the network processes in a single training step.
Input dim is the dimensionality of each input vector fed to the network.
D model is the dimensionality of the attention model's output for every input position.
The QA training inputs come from the tokenized questions and answers that we saved earlier to disk.

In [None]:
# Longest article in the high-school set, measured in characters
# (string length via .str.len()), not in tokens.
h_qa['article'].str.len().max()

Here I load in and label the data from before and split the data into training, testing, and validation data.  

In [None]:
from torch.utils.data import TensorDataset, random_split

# NOTE(security): weights_only=False unpickles arbitrary Python objects.
# Only load .pt files that were produced by this notebook / a trusted source.
m_qa_inputs = torch.load("m_qa_inputs.pt", weights_only=False)
h_qa_inputs = torch.load("h_qa_inputs.pt", weights_only=False)
m_qa_tensors = m_qa_inputs['input_ids']
h_qa_tensors = h_qa_inputs['input_ids']

# One label per encoded row: which school level the row came from
m_labels = torch.zeros(len(m_qa_tensors), dtype=torch.long)  # Middle school = 0
h_labels = torch.ones(len(h_qa_tensors), dtype=torch.long)   # High school = 1
labels = torch.cat([m_labels, h_labels], dim=0)

# Right-pad both id tensors with zeros to a common width so they can be stacked
max_seq_len = max(m_qa_tensors.shape[1], h_qa_tensors.shape[1])
m_qa_tensors = torch.nn.functional.pad(m_qa_tensors, (0, max_seq_len - m_qa_tensors.shape[1]))
h_qa_tensors = torch.nn.functional.pad(h_qa_tensors, (0, max_seq_len - h_qa_tensors.shape[1]))
training_data = torch.cat([m_qa_tensors, h_qa_tensors], dim=0)

# 60% train / 30% validation / remainder test
total_size = training_data.shape[0]
train_size = int(.6 * total_size)
val_size = int(.3 * total_size)
test_size = total_size - train_size - val_size

# Split (input, label) pairs together. Splitting the bare input tensor, as
# before, produced subsets without labels, which the evaluation loop cannot
# unpack into (inputs, labels).
labeled_data = TensorDataset(training_data, labels)
train_data, val_data, test_data = random_split(labeled_data, [train_size, val_size, test_size])



 

Create the neural network:

In [None]:
class MultiheadAttention_1(nn.Module):
    """Multi-head self-attention layer.

    Fixes over the previous version:
      * ``super().__init()`` (a typo that raised AttributeError) is now
        ``super().__init__()``;
      * the fused q/k/v projection outputs ``3 * d_model`` features, which is
        what the reshape to ``(batch, seq, num_heads, 3 * head_dim)`` needs
        (it was ``3 * head_dim`` and failed for ``num_heads > 1``);
      * the per-head outputs are permuted back to ``(batch, seq, heads,
        head_dim)`` before the heads are merged, so positions and heads are
        no longer interleaved by the reshape.

    Args:
        input_dim: Size of the last dimension of the input ``x``.
        d_model: Total width of the attention output across all heads.
        num_heads: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, input_dim, d_model, num_heads):
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError("d_model must be divisible by num_heads")
        self.input_dim = input_dim
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        # Single projection that produces q, k and v for every head at once
        self.qkv_layer = nn.Linear(input_dim, 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x`` of shape (batch, seq, input_dim).

        Args:
            x: Input tensor with three dimensions.
            mask: Optional additive mask forwarded to ``scaled_dot_product``.

        Returns:
            Tensor of shape (batch, seq, d_model).
        """
        batch_size, sequence_length, _ = x.size()

        qkv = self.qkv_layer(x)  # (batch, seq, 3 * d_model)
        qkv = qkv.reshape(batch_size, sequence_length, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)  # (batch, heads, seq, 3 * head_dim)
        q, k, v = qkv.chunk(3, dim=-1)

        values, attention = scaled_dot_product(q, k, v, mask)

        # (batch, heads, seq, head_dim) -> (batch, seq, heads, head_dim)
        # so the reshape concatenates the heads of each position.
        values = values.permute(0, 2, 1, 3)
        values = values.reshape(batch_size, sequence_length, self.d_model)

        h_1 = self.linear_layer(values)

        return h_1

class MultiheadAttention_2(nn.Module):
    """Second multi-head self-attention layer (same structure as the first).

    Fixes over the previous version:
      * ``super().__init()`` (a typo that raised AttributeError) is now
        ``super().__init__()``;
      * the fused q/k/v projection outputs ``3 * d_model`` features, which is
        what the reshape to ``(batch, seq, num_heads, 3 * head_dim)`` needs
        (it was ``3 * head_dim`` and failed for ``num_heads > 1``);
      * the per-head outputs are permuted back to ``(batch, seq, heads,
        head_dim)`` before the heads are merged.

    Args:
        input_dim: Size of the last dimension of the input ``x``.
        d_model: Total width of the attention output across all heads.
        num_heads: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, input_dim, d_model, num_heads):
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError("d_model must be divisible by num_heads")
        self.input_dim = input_dim
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        # Single projection that produces q, k and v for every head at once
        self.qkv_layer = nn.Linear(input_dim, 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x`` of shape (batch, seq, input_dim).

        Args:
            x: Input tensor with three dimensions.
            mask: Optional additive mask forwarded to ``scaled_dot_product``.

        Returns:
            Tensor of shape (batch, seq, d_model).
        """
        batch_size, sequence_length, _ = x.size()

        qkv = self.qkv_layer(x)  # (batch, seq, 3 * d_model)
        qkv = qkv.reshape(batch_size, sequence_length, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)  # (batch, heads, seq, 3 * head_dim)
        q, k, v = qkv.chunk(3, dim=-1)

        values, attention = scaled_dot_product(q, k, v, mask)

        # (batch, heads, seq, head_dim) -> (batch, seq, heads, head_dim)
        # so the reshape concatenates the heads of each position.
        values = values.permute(0, 2, 1, 3)
        values = values.reshape(batch_size, sequence_length, self.d_model)

        h = self.linear_layer(values)

        return h
    

Create your hyperparameters for training:

In [None]:
import torch.optim as optim

# --- Optimisation settings -------------------------------------------------
epochs = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
criterion = nn.CrossEntropyLoss()
# NOTE(review): only the parameters of `bert` are handed to the optimizer here.
optimizer = optim.Adam(
    bert.parameters(),
    lr=2e-5,
    betas=(0.9, 0.999),
    weight_decay=1e-5,
)

# --- Tensor dimensions -----------------------------------------------------
d_model = 512
input_dim = 500
batch_size = 50
# presumably the maximum article length reported earlier - TODO confirm
sequence_length = 3714
# Random tensor of shape (input_dim, batch_size, sequence_length)
m_qa_training_parameters = torch.randn(input_dim, batch_size, sequence_length)


Wrap the data in a DataLoader

In [None]:

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import Subset
from torch.utils.data import TensorDataset

# Inputs and labels paired row by row
dataset = TensorDataset(training_data, labels)

# Train only on the rows selected for the training split. The loader used to
# cover the whole dataset, so the validation and test rows leaked into training.
# `train_data.indices` holds the row numbers chosen by random_split.
trainloader = DataLoader(Subset(dataset, train_data.indices), batch_size=32, shuffle=True)

# Build the test loader from the labelled dataset restricted to the test rows,
# so every batch is an (inputs, labels) pair as the evaluation loop expects.
testloader = DataLoader(Subset(dataset, test_data.indices), batch_size=batch_size,
                        shuffle=False, num_workers=2)

In [None]:
# Sanity check: print the shapes of the label tensor and the input tensor
print(labels.shape, training_data.shape)


Move the model to GPU

In [None]:
# Move the BERT encoder to the device selected above
bert.to(device)

Call the function for training and pass the parameters: 

In [None]:
class BertClassifier(nn.Module):
    """Encoder followed by a single linear classification layer.

    Args:
        bert: Callable encoder invoked as ``bert(**inputs)``; its result must
            expose a ``pooler_output`` attribute.
        hidden_dim: Width of ``pooler_output`` (input size of the linear layer).
        num_classes: Number of output classes (adjust as needed).
    """

    def __init__(self, bert, hidden_dim=768, num_classes=2):
        super().__init__()
        self.bert = bert
        # Maps the pooled representation (hidden_dim) to class logits
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, inputs):
        """Return class logits of shape (batch_size, num_classes).

        Args:
            inputs: Mapping of keyword arguments unpacked into the encoder.
        """
        encoder_out = self.bert(**inputs)
        return self.classifier(encoder_out.pooler_output)


In [None]:
bert_classifier = BertClassifier(bert).to(device)
# The optimizer was created from bert.parameters() only, so the new linear
# head would never be updated. Register its parameters as an extra group;
# the group inherits the optimizer's default lr / betas / weight_decay.
# (Re-running this cell registers the head of the newly built classifier.)
optimizer.add_param_group({"params": list(bert_classifier.classifier.parameters())})


In [None]:
# `train` performs a single pass and uses its last argument only as the epoch
# label, so passing `epochs` ran one pass reported as "Epoch 100". Run one
# pass per epoch instead and pass the current epoch number.
for epoch in range(1, epochs + 1):
    train(bert_classifier, device, trainloader, criterion, optimizer, epoch)

Save the model weights to use the classifier at another time.

In [None]:
import torch
# Persist only the classifier's state_dict (not the module object) to the
# file 'Bert_Classifier' in the working directory.
torch.save(bert_classifier.state_dict(), 'Bert_Classifier')