Here are the functions we will be using for the training:

In [16]:
##Create a loop to add the txt files into a data frame
def txt_retrieval(folder_path):
    qa = []
    txt_files = [f for f in os.listdir(folder_path) if f.endswith(".txt")]
    for file in txt_files:
        file_path = os.path.join(folder_path, file) 
        df = pd.read_json(file_path) 
        df["source_file"] = file  
        qa.append(df)
    return pd.concat(qa, ignore_index=True) if qa else pd.DataFrame()


#Combine questions and answers to pass to the model
def qa_pairs(questions, options):
    pairs = []
    for q, opts in zip(questions, options):
        for opt in opts:
            pairs.append((q,opt))
    return pairs


##Use the tokenizer to encode the text
def encode(data_component):
        encoded_data = tokenizer(data_component, return_tensors='pt', padding=True)
        return encoded_data


##outputs the accuracy of the model
def accuracy(predictions, labels):
    preds = torch.argmax(predictions, dim=1)  # Get class with highest probability
    return (preds == labels).float().mean().item()



##Training Loop
def train(bert, device, training_data, criterion, optimizer, epoch):
    #Set Model to training mode
    bert_classifier.train()
    
    
    for epoch in range(epochs):
        epoch_loss = 0.0  # To accumulate loss for this epoch
        training_acc = 0.0  # Accumulate accuracy
        training_loss = 0.0
    
    #loop over the batches of data:
        for i, batch in enumerate(trainloader):
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)
            inputs = {
                'input_ids': input_ids,
                'attention_mask': attention_mask
            }
        
        
            # Zero the parameter gradients
            optimizer.zero_grad()
            
            
            #forward pass
            outputs = bert(inputs) #get the output lgits from the model
            logits = bert_classifier(inputs)
            
            
            #calculate the loss
            loss = criterion(logits, labels) # Calculate the loss
            
            
            #Backward pass and optimize
            loss.backward() #compute the gradients
            optimizer.step() # Update the parameters
            
            
            # Accumulate loss for this batch
            epoch_loss += loss.item()
            
            #Print the statistics
            training_loss += loss.item()
            training_acc+= accuracy(outputs, labels)# accumulate the accuracy and loss
            if (i + 1) % 200 == 0: # print every two hundred batches
                print(f'Epoch {epoch}, Batch {i+1}, Loss: {training_loss / 200:.4f}, Accuracy: {training_acc / 200:.4f}')

                training_loss = 0.0
                training_acc = 0.0


#Testing Loop        
def test(model, device, testloader, criterion, epoch):
    # Set the model to evaluation mode
    bert_classifier.eval()
    TestAccuracy = []
    correct_predictions = ()
    
    
    for epoch in range(epochs):
        epoch_loss = 0.0  # To accumulate loss for this epoch
        testing_acc = 0.0  # Accumulate accuracy
        testing_loss = 0.0
        
        

    #loop over the batches of data:
        with torch.no_grad():
            for i, batch in enumerate(testloader):
                input_ids = batch[0].to(device)
                attention_mask = batch[1].to(device)
                labels = batch[2].to(device)
                inputs = {
                    'input_ids': input_ids,
                    'attention_mask': attention_mask
                }
                
                
                #forward pass
                outputs = model(inputs)
                logits = model(inputs)
            
                
                #calculate the loss
                loss = criterion(logits, labels) # Calculate the loss
               
                
                # Accumulate loss for this batch
                epoch_loss += loss.item()
                testing_loss += loss.item()
                testing_acc+= accuracy(outputs, labels)

                                
                if (i + 1) % 200 == 0: # print every two hundred batches
                    print(f'Epoch {epoch}, Batch {i+1}, Loss: {testing_loss / 200:.4f}, Accuracy: {testing_acc / 200:.4f}')
                
            
            
            
            
            


Import all the necessary modules:

In [17]:
import pandas as pd
import torch
import os
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import random_split 
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from transformers.models.bert.modeling_bert import BertIntermediate, BertOutput
from transformers.models.bert.modeling_bert import BertEncoder
from transformers.models.bert.modeling_bert import BertModel, BertConfig
import matplotlib.pyplot as plt


bert = AutoModel.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

Import the question sets that will be used to train the model.  The first dataset is the RACE dataseet, which consists of multiple choice questions separated between M (middle school) and H (high school)

In [18]:
middle = "middle"
high = "high\high"


# Assign separate outputs based on the variable names
m_qa = txt_retrieval(middle)
h_qa = txt_retrieval(high)

Let's take a look at the column names to see how the data is structured and to make sure the data has correctly imported.

In [19]:
h_qa.columns
m_qa.columns

Index(['answers', 'options', 'questions', 'article', 'id', 'source_file'], dtype='object')

Focus on the question/answers, isolate them from the other columns.

In [20]:
m_questions = m_qa.questions.values.tolist()
h_questions = h_qa.questions.values.tolist()
m_options = m_qa.options.values.tolist()
h_options = h_qa.options.values.tolist()
m_answers = m_qa.answers.values.tolist()
h_answers = h_qa.answers.values.tolist()

Using the tokenizer specialized for Bert, encode the questions/answers

In [21]:
m_qa_pairs = qa_pairs(m_questions, m_options)
m_qa_inputs = encode(m_qa_pairs)
torch.save(m_qa_inputs, "m_qa_inputs.pt")


h_qa_pairs = qa_pairs(h_questions, h_options)
h_qa_inputs = encode(h_qa_pairs)
torch.save(h_qa_inputs, "h_qa_inputs.pt")


Load in and label data and split the data into testing, training, and validation data.

In [22]:
m_qa_inputs = torch.load("m_qa_inputs.pt", weights_only=False)
h_qa_inputs = torch.load("h_qa_inputs.pt", weights_only=False)
m_qa_tensors = m_qa_inputs['input_ids']
h_qa_tensors = h_qa_inputs['input_ids']

m_labels = torch.zeros(len(m_qa_tensors), dtype=torch.long)  # Middle school = 0
h_labels = torch.ones(len(h_qa_tensors), dtype=torch.long)   # High school = 1
labels = torch.cat([m_labels, h_labels], dim=0)

max_seq_len = max(m_qa_tensors.shape[1], h_qa_tensors.shape[1])
m_qa_tensors = torch.nn.functional.pad(m_qa_tensors, (0, max_seq_len - m_qa_tensors.shape[1]))
h_qa_tensors = torch.nn.functional.pad(h_qa_tensors, (0, max_seq_len - h_qa_tensors.shape[1]))
training_data = torch.cat([m_qa_tensors, h_qa_tensors], dim=0)

total_size = training_data.shape[0]
train_size = int(.6 * total_size)
val_size = int(.3 * total_size)
test_size = total_size - train_size - val_size

train_data, val_data, test_data = random_split(training_data, [train_size, val_size, test_size])

epochs = 5
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(bert.parameters(), lr=1e-5, betas=(0.9, 0.999), weight_decay=1e-5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

sequence_length = 200
batch_size = 64
input_dim = 500
d_model = 512


Change BERT configuration to implement Dual Multihead Attention Mechanism.  This is where we are implementing the neural network. 

In [23]:
class DualBertAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        
        self.attention1 = BertSelfAttention(config)
        self.attention2 = BertSelfAttention(config)
        
        self.output1 = BertSelfOutput(config)
        self.output2 = BertSelfOutput(config)
        
    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        attn_output1, _ = self.attention1(hidden_states, attention_mask, head_mask)
        attn_output1 = self.output1(attn_output1, hidden_states)
        
        attn_output2, _ = self.attention2(hidden_states, attention_mask, head_mask)
        attn_output2 = self.output2(attn_output2, hidden_states)
        dual_attention_output = F.relu(attn_output1 + attn_output2)
        return dual_attention_output
    

class DualBertLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = DualBertAttention(config)  # Use dual attention
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        attention_output = self.attention(hidden_states, attention_mask, head_mask)
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class DualBertEncoder(BertEncoder):
    def __init__(self, config):
        super().__init__(config)
        self.layer = nn.ModuleList([DualBertLayer(config) for _ in range(config.num_hidden_layers)])


class DualBertModel(BertModel):
    def __init__(self, config):
        super().__init__(config)
        self.encoder = DualBertEncoder(config)


class BertClassifier(nn.Module):
    def __init__(self, bert, hidden_dim=768, num_classes=2):  # Adjust num_classes as needed
        super(BertClassifier, self).__init__()
        self.bert = bert
        self.classifier = nn.Linear(hidden_dim, num_classes)  # Maps hidden_dim to num_classes

    def forward(self, inputs):
        outputs = self.bert(**inputs)
        pooled_output = outputs.pooler_output
        logits = self.classifier(pooled_output)  # Pass pooled output to classifier
        return logits  # Now shaped (batch_size, num_classes)
        


Wrap the data in a DataLoader:

In [24]:
dataset = TensorDataset(training_data, (training_data != 0).long(), labels) 

trainloader = DataLoader(dataset, batch_size=32, shuffle=True)



testloader = DataLoader(dataset, batch_size=32, shuffle=True )

Move the model to the GPU

In [25]:
bert.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [26]:
bert_classifier = BertClassifier(bert).to(device)

Use bert to classify the data

In [27]:
#train(bert_classifier, device, trainloader, criterion, optimizer, epochs)

Save the mdoel weights to avoid retraining: 

In [28]:
#torch.save(bert_classifier.state_dict(), 'Bert_Classifier.pt')

Import the saved model

In [29]:
model = BertClassifier(bert)

#Load Saved State Dictionary
state_dict = torch.load("Bert_Classifier.pt", map_location=torch.device("cpu"))
model.load_state_dict(state_dict)
model.to(device)
# Set model to evaluation mode
model.eval()

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

Begin Testing:

In [30]:
test(model, device, testloader, criterion, epochs)

Epoch 0, Batch 200, Loss: 0.0069, Accuracy: 0.9984
Epoch 1, Batch 200, Loss: 0.0058, Accuracy: 0.9988
Epoch 2, Batch 200, Loss: 0.0049, Accuracy: 0.9989
Epoch 3, Batch 200, Loss: 0.0062, Accuracy: 0.9988
Epoch 4, Batch 200, Loss: 0.0061, Accuracy: 0.9988
