Here are the functions used during data preprocessing, training, and testing:

In [None]:
##Use the tokenizer to encode the text
def encode(data_component):
    """Tokenize ``data_component`` into padded PyTorch tensors.

    Args:
        data_component: Text input accepted by the module-level ``tokenizer``
            (a string, a list of strings, or a list of text pairs).

    Returns:
        The tokenizer output, requested with ``return_tensors='pt'`` and
        padded so every sequence in the batch has the same length.
    """
    # truncation=True caps each sequence at the tokenizer's maximum length.
    # Inputs that already fit are encoded exactly as before; over-long ones
    # no longer produce more positions than the model can embed.
    return tokenizer(
        data_component,
        return_tensors='pt',
        padding=True,
        truncation=True,
    )


def qa_pairs(questions, options):
    """Pair every question with each of its options.

    Args:
        questions: Iterable of questions.
        options: Iterable whose i-th item is the iterable of options that
            belong to the i-th question.

    Returns:
        A flat list of ``(question, option)`` tuples, in input order.
        Iteration stops at the shorter of the two inputs.
    """
    return [
        (question, candidate)
        for question, candidates in zip(questions, options)
        for candidate in candidates
    ]


##outputs the accuracy of the model
def accuracy(predictions, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        predictions: Tensor of per-class scores; the arg-max is taken along
            dimension 1.
        labels: Tensor compared element-wise with the predicted class indices.

    Returns:
        The mean of the element-wise matches as a Python float.
    """
    predicted_classes = predictions.argmax(dim=1)
    matches = predicted_classes == labels
    return matches.float().mean().item()



##Training Loop
def train(bert, device, training_data, criterion, optimizer, epoch):
    """Train ``bert`` on ``training_data`` for ``epoch`` passes.

    Args:
        bert: Model to optimise. It is called with a dict holding
            ``'input_ids'`` and ``'attention_mask'`` and must return the
            logits consumed by ``criterion`` and ``accuracy``.
        device: Device every batch is moved to before the forward pass.
        training_data: Iterable of batches, each indexable as
            ``(input_ids, attention_mask, labels)``.
        criterion: Loss, called as ``criterion(logits, labels)``.
        optimizer: Optimizer stepped once per batch.
        epoch: Number of passes to make over ``training_data``.
    """
    # Work on the arguments, not on notebook globals, so the function trains
    # exactly the model and data the caller handed in.
    bert.train()

    for epoch_idx in range(epoch):
        # Running sums for the current 200-batch reporting window.
        running_loss = 0.0
        running_acc = 0.0

        for i, batch in enumerate(training_data):
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)
            inputs = {
                'input_ids': input_ids,
                'attention_mask': attention_mask
            }

            # Zero the parameter gradients
            optimizer.zero_grad()

            # One forward pass; the same logits feed the loss and the accuracy.
            logits = bert(inputs)
            loss = criterion(logits, labels)

            # Backward pass and parameter update
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            running_acc += accuracy(logits, labels)

            if (i + 1) % 200 == 0:  # report every two hundred batches
                print(f'Epoch {epoch_idx}, Batch {i+1}, Loss: {running_loss / 200:.4f}, Accuracy: {running_acc / 200:.4f}')

                # Start a fresh window so the next averages stay per-200-batch.
                running_loss = 0.0
                running_acc = 0.0


#Testing Loop
def test(model, device, testloader, criterion, epoch):
    """Evaluate ``model`` once over ``testloader`` without tracking gradients.

    Args:
        model: Model to evaluate. It is called with a dict holding
            ``'input_ids'`` and ``'attention_mask'`` and must return logits.
        device: Device every batch is moved to before the forward pass.
        testloader: Iterable of batches, each indexable as
            ``(input_ids, attention_mask, labels)``.
        criterion: Loss, called as ``criterion(logits, labels)``.
        epoch: Value shown as the epoch in the progress messages.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the batches, or
        ``(nan, nan)`` when ``testloader`` yields no batch.
    """
    # Switch the model that was passed in (not a notebook global) to eval mode.
    model.eval()

    total_loss = 0.0
    total_acc = 0.0
    num_batches = 0
    # Running sums for the current 200-batch reporting window.
    running_loss = 0.0
    running_acc = 0.0

    # A single pass is enough: nothing is updated between passes, so repeating
    # the loop would only recompute the same metrics.
    with torch.no_grad():
        for i, batch in enumerate(testloader):
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)
            inputs = {
                'input_ids': input_ids,
                'attention_mask': attention_mask
            }

            # One forward pass; the same logits feed the loss and the accuracy.
            logits = model(inputs)
            batch_loss = criterion(logits, labels).item()
            batch_acc = accuracy(logits, labels)

            total_loss += batch_loss
            total_acc += batch_acc
            num_batches += 1
            running_loss += batch_loss
            running_acc += batch_acc

            if (i + 1) % 200 == 0:  # report every two hundred batches
                print(f'Epoch {epoch}, Batch {i+1}, Loss: {running_loss / 200:.4f}, Accuracy: {running_acc / 200:.4f}')

                # Reset so the next report again averages exactly 200 batches.
                running_loss = 0.0
                running_acc = 0.0

    if num_batches == 0:
        return float('nan'), float('nan')
    return total_loss / num_batches, total_acc / num_batches

Import all the necessary modules:

In [2]:
import os

import kagglehub
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from kagglehub import KaggleDatasetAdapter
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data import Subset
from torch.utils.data import TensorDataset
from torch.utils.data import random_split
from transformers import AutoModel, AutoTokenizer
from transformers.models.bert.modeling_bert import BertEncoder
from transformers.models.bert.modeling_bert import BertIntermediate, BertOutput
from transformers.models.bert.modeling_bert import BertModel, BertConfig
from transformers.models.bert.modeling_bert import BertSelfAttention, BertSelfOutput

# Load the tokenizer and the pretrained encoder from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert = AutoModel.from_pretrained("bert-base-uncased")

Import the question sets that will be used to train the model. The first dataset used was the RACE dataset, which consists of multiple-choice questions split into M (middle school) and H (high school) subsets. The second dataset is QxGrade_dataset, a set of 75k questions scraped from PDF textbooks. These textbooks were chosen based on their alignment with the Common Core State Standards, to identify a framework that we can use when training the model with additional data.

In [None]:
#df = pd.read_csv('QxGrade_Dataset.csv')

The two most important columns we will be using and labeling are Grade and Question. Using `.values` and `.tolist()`, we collect all of the grade values (3-12) into the `grades` list, and we do the same with the question values to build the `questions` list.

In [None]:
#questions = df.Question.values.tolist()
#grades = df.Grade.values.tolist()

Use QA pairs to create a list of questions and their grade levels.  

In [None]:
#qa_pairs = qa_pairs(grades, questions)

Tokenize the data using the encode function

In [None]:
#qa_pairs = encode(qa_pairs)

{'input_ids': tensor([[ 101, 1017,  102, 1015,  102],
        [ 101, 1017,  102, 1012,  102],
        [ 101, 1017,  102, 1014,  102],
        ...,
        [ 101, 1022,  102, 1050,  102],
        [ 101, 1022,  102, 1037,  102],
        [ 101, 1022,  102, 1050,  102]]), 'token_type_ids': tensor([[0, 0, 0, 1, 1],
        [0, 0, 0, 1, 1],
        [0, 0, 0, 1, 1],
        ...,
        [0, 0, 0, 1, 1],
        [0, 0, 0, 1, 1],
        [0, 0, 0, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        ...,
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]])}

Save the qa_pairs using torch.save

In [None]:
#torch.save(qa_pairs, "qa_pairs.pt")

Now set the grade values to 0

In [12]:
# File name of the tokenized pairs written with torch.save. This is only the
# path string; the file itself is not loaded here.
categories = 'qa_pairs.pt'

In [14]:
# `categories` holds the path of the saved encoding, so load it and count its
# rows; len() of the path string is just the number of characters in the name.
# NOTE(review): assumes the file stores the tokenizer output with an
# 'input_ids' tensor, as saved by the torch.save cell — confirm.
glabels = torch.zeros(len(torch.load(categories, weights_only=False)['input_ids']), dtype=torch.float32)

Set the question values to 1

In [20]:
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects — only load files that come from a trusted source.
questions = torch.load("questions.pt", weights_only=False)

In [26]:
# One label of 1.0 per entry in `questions`.
# NOTE(review): nn.CrossEntropyLoss reads integer (long) targets as class
# indices and float targets as class probabilities — confirm float32 is intended.
qlabels = torch.ones(len(questions), dtype=torch.float32)

Concatenate the two

In [29]:
# Question labels (ones) first, then grade labels (zeros), joined into one 1-D tensor.
# NOTE(review): this tensor holds only the 0/1 labels, not token ids — confirm
# it is really meant to serve as the model input further down.
training_data = torch.cat([qlabels, glabels])

Set up the proportion of the data you want to use for training, validation, and testing

In [31]:
# 70 % of the samples go to training and 20 % to validation; the test split
# takes whatever is left, so the three sizes always add up to total_size.
total_size = training_data.shape[0]
train_size, val_size = (int(fraction * total_size) for fraction in (.7, .2))
test_size = total_size - (train_size + val_size)

Now use `random_split` to divide the data into training, validation, and testing subsets.

In [32]:
# Randomly partition the samples into three subsets of the sizes computed above.
# No generator is passed, so the partition differs between runs unless torch's
# global seed is fixed beforehand.
train_data, val_data, test_data = random_split(training_data, [train_size, val_size, test_size])


Now we will set the training hyperparameters

In [33]:
epochs = 5  ## Number of full passes over the training data
criterion = nn.CrossEntropyLoss()  ## Cross-entropy between the predicted logits and the target classes
# NOTE(review): only bert.parameters() are registered here; a classification
# head created later is not updated by this optimizer unless it is rebuilt.
optimizer = optim.Adam(bert.parameters(), lr=1e-5, betas=(0.9, 0.999), weight_decay=1e-5)  ## Adam optimizer with weight decay
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  ## Use the GPU when one is available, otherwise fall back to the CPU

# NOTE(review): the four values below are not referenced by any other cell
# visible in this notebook (the DataLoaders hard-code batch_size=32) — confirm
# whether they are still needed.
sequence_length = 200   ## Intended maximum number of tokens per sequence
batch_size = 64  ## Intended number of training examples per forward/backward pass
input_dim = 500  ## Intended input dimensionality
d_model = 512  ## Intended number of model features

Change BERT configuration to implement Dual Multihead Attention Mechanism.  This is also where we are implementing our customized neural network. 

In [40]:
##Runs two independent copies of the BERT self-attention in parallel and joins their outputs at the end
class DualBertAttention(nn.Module):
    """Two parallel self-attention branches whose outputs are summed.

    Each branch owns a ``BertSelfAttention`` and a ``BertSelfOutput``; both
    branches read the same hidden states.
    """

    def __init__(self, config):
        super().__init__()

        self.attention1 = BertSelfAttention(config)
        self.attention2 = BertSelfAttention(config)

        self.output1 = BertSelfOutput(config)
        self.output2 = BertSelfOutput(config)

    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        """Return ``relu(branch1 + branch2)`` for ``hidden_states``.

        Args:
            hidden_states: Input passed to both attention branches and, as
                the second argument, to both output modules.
            attention_mask: Forwarded unchanged to both attention modules.
            head_mask: Forwarded unchanged to both attention modules.
        """
        # The self-attention module returns a tuple whose length depends on
        # its flags and on the transformers release. Only the first element
        # is needed; two-value unpacking fails when it returns a 1-tuple.
        attn_output1 = self.attention1(hidden_states, attention_mask, head_mask)[0]
        attn_output1 = self.output1(attn_output1, hidden_states)

        attn_output2 = self.attention2(hidden_states, attention_mask, head_mask)[0]
        attn_output2 = self.output2(attn_output2, hidden_states)

        return F.relu(attn_output1 + attn_output2)
    
##Implements the dual attention in the Neural Network
class DualBertLayer(nn.Module):
    """Encoder layer: dual attention followed by BERT's feed-forward modules."""

    def __init__(self, config):
        super().__init__()
        self.attention = DualBertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    ##Moves the data through the Neural Network
    def forward(self, hidden_states, attention_mask=None, head_mask=None, *args, **kwargs):
        """Run one layer and return its output wrapped in a 1-tuple.

        Args:
            hidden_states: Input to the dual attention block.
            attention_mask: Forwarded to the dual attention block.
            head_mask: Forwarded to the dual attention block.
            *args, **kwargs: Further per-layer arguments supplied by the
                calling encoder (cross-attention inputs, cache, attention
                output flag, ...). They are accepted so the call does not
                fail, but this layer ignores them.

        Returns:
            ``(layer_output,)``.
            NOTE(review): BertEncoder reads each layer's result with ``[0]``
            in the transformers releases that provide BertSdpaSelfAttention —
            confirm against the installed version. Attention probabilities
            are never returned.
        """
        attention_output = self.attention(hidden_states, attention_mask, head_mask)  ##Layer 1
        intermediate_output = self.intermediate(attention_output)  ##Layer 2
        layer_output = self.output(intermediate_output, attention_output)
        # Returning the bare tensor would make the encoder's [0] indexing pick
        # the first batch element instead of the whole layer output.
        return (layer_output,)


##Encoder whose layer stack is built from DualBertLayer instead of the stock BERT layer
class DualBertEncoder(BertEncoder):
    """``BertEncoder`` with ``self.layer`` replaced by ``DualBertLayer`` modules."""

    def __init__(self, config):
        super().__init__(config)
        # One dual-attention layer per configured hidden layer.
        dual_layers = (DualBertLayer(config) for _ in range(config.num_hidden_layers))
        self.layer = nn.ModuleList(dual_layers)


##BERT model whose encoder is swapped for the DualBertEncoder
class DualBertModel(BertModel):
    """``BertModel`` that uses ``DualBertEncoder`` as its ``encoder``."""

    def __init__(self, config):
        super().__init__(config)
        # Replace the encoder created by BertModel with the dual-attention one.
        dual_encoder = DualBertEncoder(config)
        self.encoder = dual_encoder


##Classifier head over the 10 grade options (3rd grade to 12th grade).

class BertClassifier(nn.Module):
    """Linear classification head on top of a BERT-style encoder.

    Args:
        bert: Encoder called as ``bert(**inputs)``; its result must expose
            ``pooler_output``.
        hidden_dim: Size of the pooled vector fed to the linear layer.
        num_classes: Number of output logits.
    """

    def __init__(self, bert, hidden_dim=768, num_classes=10):
        super().__init__()
        self.bert = bert
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, inputs):
        """Return the class logits for the keyword-argument dict ``inputs``."""
        pooled = self.bert(**inputs).pooler_output
        return self.classifier(pooled)

Wrap the data in a Dataloader:

In [37]:
# NOTE(review): `labels` is not defined in any cell visible above — confirm
# where it comes from and that it has one entry per row of `training_data`.
dataset = TensorDataset(training_data, (training_data != 0).long(), labels)

# Reuse the index partition produced by random_split so each loader only sees
# its own subset. Wrapping the full dataset in both loaders would evaluate the
# model on the very samples it was trained on.
trainloader = DataLoader(Subset(dataset, train_data.indices), batch_size=32, shuffle=True)

# Evaluation metrics do not depend on batch order, so no shuffling is needed.
testloader = DataLoader(Subset(dataset, test_data.indices), batch_size=32, shuffle=False)

Move the model to the GPU

In [38]:
# Move the pretrained encoder to `device`; .to() returns the module, which the
# notebook echoes as the cell output.
bert.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

Move the classifier to the GPU as well

In [39]:
bert_classifier = BertClassifier(bert).to(device)

# Rebuild the optimizer over the whole classifier. The optimizer created with
# the hyperparameters only registered bert.parameters(), so the new linear
# head would otherwise never be updated during training.
optimizer = optim.Adam(bert_classifier.parameters(), lr=1e-5, betas=(0.9, 0.999), weight_decay=1e-5)

Use Bert to classify the data

In [None]:
#train(bert_classifier, device, trainloader, criterion, optimizer, epochs)