Here are the functions we will be using during our model training: 

In [1]:
##Create a loop to add the txt files into a data frame
def txt_retrieval(folder_path):
    qa = []
    txt_files = [f for f in os.listdir(folder_path) if f.endswith(".txt")]
    for file in txt_files:
        file_path = os.path.join(folder_path, file) 
        df = pd.read_json(file_path) 
        df["source_file"] = file  
        qa.append(df)
    return pd.concat(qa, ignore_index=True) if qa else pd.DataFrame()


#Combine questions and answers to pass to the model
def qa_pairs(questions, options):
    pairs = []
    for q, opts in zip(questions, options):
        for opt in opts:
            pairs.append((q,opt))
    return pairs


##Use the tokenizer to encode the text
def encode(data_component):
    for i in data_component: 
        encoded_data = tokenizer(data_component, return_tensors='pt', padding=True)
    return encoded_data







Import the question sets that will be used to train the model.  The first dataset is the RACE dataseet, which consists of multiple choice questions separated between M (middle school) and H (high school)

In [2]:
import pandas as pd
import os

middle = "middle"
high = "high\high"


# Assign separate outputs based on the variable names
m_qa = txt_retrieval(middle)
h_qa = txt_retrieval(high)
    

Let's take a look at the column names to see how the data is structured. 

In [3]:
m_qa.columns

Index(['answers', 'options', 'questions', 'article', 'id', 'source_file'], dtype='object')

In [4]:
h_qa

Unnamed: 0,answers,options,questions,article,id,source_file
0,B,[affected only the companies doing business wi...,The Sherman Antitrust Act _ .,One thinks of princes and presidents as some o...,high10024.txt,10024.txt
1,A,[are more likely to exist in a competitive mar...,One might infer from this passage that lower p...,One thinks of princes and presidents as some o...,high10024.txt,10024.txt
2,D,[believed that the trusts had little influence...,It seems likely that many Americans _ .,One thinks of princes and presidents as some o...,high10024.txt,10024.txt
3,C,"[buy high-quality products, communicate with f...",Bargaining is a skill to _ .,"Everything in China is negotiable, so goes the...",high10042.txt,10042.txt
4,A,"[rising incomes, an increasing number of produ...","In China, the younger generation is losing int...","Everything in China is negotiable, so goes the...",high10042.txt,10042.txt
...,...,...,...,...,...,...
812,C,"[The sensor size., The zoom range., The shutte...",What will contribute to a satisfactory photo o...,If you are a traditional traveller who believe...,high15028.txt,15028.txt
813,A,"[How to choose ideal travel cameras?, How to p...",Which of the following can be the best title o...,If you are a traditional traveller who believe...,high15028.txt,15028.txt
814,D,"[A growth mindstet means no failure., People n...",What is the author's opinion of people' s mind...,Fixed or growth mindset -- which do you have?\...,high15042.txt,15042.txt
815,C,"[You are clever., You are skillful., You have ...",Which judgment seems more encouraging?,Fixed or growth mindset -- which do you have?\...,high15042.txt,15042.txt


Separate the data into its components.

In [5]:
m_questions = m_qa.questions.values.tolist()
h_questions = h_qa.questions.values.tolist()
m_options = m_qa.options.values.tolist()
h_options = h_qa.options.values.tolist()
m_article = m_qa.article.values.tolist()
h_article = h_qa.article.values.tolist()
m_id = m_qa.id.values.tolist()
h_id = h_qa.id.values.tolist()
m_answers = m_qa.answers.values.tolist()
h_answers = h_qa.answers.values.tolist()

Import Bert from transformers and use the tokenizer specialized for the model to input text as tokens. 

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertConfig, BertModel, AutoModel, AutoTokenizer
##bert = AutoModel.from_pretrained("bert-base-uncased")
##tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  

In [7]:
      
##m_qa_pairs = qa_pairs(m_questions, m_options)
##m_qa_pairs

Encode questions, option, and answer components.   Save the encoded QA inputs to avoid the time consumption required from the tokenization.  

In [8]:
##m_qa_inputs = encode(m_qa_pairs)

In [9]:
##torch.save(m_qa_inputs, "m_qa_inputs.pt")

Now do the same for the correct answers:

In [10]:
##m_answers_inputs = encode(m_answers)

In [11]:
##torch.save(m_answers_inputs, "m_answers_inputs.pt")

And for the readings:

In [12]:
##m_readings_inputs = encode(m_article)


In [13]:
##torch.save(m_readings_inputs, "m_readings_inputs.pt")

We will start training our model by using two Multi-Head Attention networks to compare the questions and the answer sequences.  We will first set up the training parameters for the networks. 

Sequence length is set to be the max size input in the high school dataset.  
Batch Size is the number of times that the network will run through the data in a training session.
Input dim is the vector dimension.  This sets the number of dimensions that the network uses.  
D model is the output of attention model for all of the inputs
m qa training is coming from the tokenized questions and answers that we saved earlier to the 

In [14]:
h_qa['article'].str.len().max()

np.int64(3714)

In [15]:
sequence_length = 3714
batch_size = 10
input_dim = 500
d_model = 512
m_qa_training = torch.load("m_qa_inputs.pt", weights_only=False)
m_qa_training_parameters = torch.randn(input_dim, batch_size, sequence_length)

Make sure the dimensions are equal.  

In [16]:
m_qa_training_parameters.size()

torch.Size([500, 10, 3714])

Build the Query Key Value processing layer

In [17]:
qkv_layer = nn.Linear(input_dim , 3 * d_model)

In [18]:
qkv = qkv_layer(x)

NameError: name 'x' is not defined

In [None]:
qkv.shape

torch.Size([10, 3714, 1536])

Create the eight attention heads, set up their dimensions, apply the qkv layer.  Reorder the data so it is inputed by batchsize, numheads sequence length, and head_dim by the QKV value

In [None]:
num_heads = 8
head_dim = d_model//num_heads
qkv = qkv.reshape(batch_size, sequence_length, num_heads, 3*head_dim)

In [None]:
qkv.permute(0,2,1,3)

In [None]:
qkv.shape

torch.Size([10, 3714, 8, 192])

In [None]:
q, k, v = qkv.chunk(3, dim=-1)
q.shape, k.shape, v.shape

(torch.Size([10, 3714, 8, 64]),
 torch.Size([10, 3714, 8, 64]),
 torch.Size([10, 3714, 8, 64]))

$$
\text{self attention} = \text{softmax} \left( \frac{Q K^T}{\sqrt{d_k}} + M \right)
$$

Then, the updated value matrix is obtained as:

$$
\text{new } V = \text{self attention} \cdot V
$$

In [None]:
import math
d_k = q.size()[-1]
scaled = torch.matmul(q, k.transpose(-2,-1)) / math.sqrt(d_k)
scaled.shape


torch.Size([10, 3714, 8, 8])

Create the masking layer:

In [None]:
mask = torch.full(scaled.size() , float('-inf'))
mask = torch.triu(mask, diagonal=1)
mask[0][1]

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [None]:
(scaled + mask)[0][0]

tensor([[-0.2609,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
        [ 0.1441, -0.4110,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
        [-0.1988,  0.2117,  0.3794,    -inf,    -inf,    -inf,    -inf,    -inf],
        [-0.2249,  0.1411,  0.4591, -0.4204,    -inf,    -inf,    -inf,    -inf],
        [ 0.1424, -0.7792,  0.0121, -0.4248, -0.0755,    -inf,    -inf,    -inf],
        [ 0.0371, -0.2330, -0.2173, -0.1348,  0.0426,  0.0537,    -inf,    -inf],
        [ 0.2725, -0.0192, -0.2573,  0.0139,  0.0182,  0.3217, -0.1797,    -inf],
        [ 0.3331, -0.4011, -0.7940,  0.3819,  0.3435,  0.2920,  1.2055, -0.2377]],
       grad_fn=<SelectBackward0>)

In [None]:
def scaled_dot_product(q, k, v, mask=None):
    d_k = q.size()[-1]
    scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
    if mask is not None:
        scaled += mask
    attention = F.softmax(scaled, dim = -1)
    values = torch.matmul(attention, v)
    return values, attention

In [None]:
values, attention = scaled_dot_product(q, k, v, mask=None)

In [None]:
attention.shape

torch.Size([10, 3714, 8, 8])

In [None]:
values.size()

torch.Size([10, 3714, 8, 64])

In [None]:
values = values.reshape(batch_size, sequence_length, num_heads * head_dim)
values.size()

torch.Size([10, 3714, 512])

In [None]:
linear_layer = nn.Linear(d_model, d_model)

In [None]:
out = linear_layer(values)

In [None]:
out.shape

torch.Size([10, 3714, 512])