# Application of BERT to Big-5 prediction from text

In this notebook I am re-implementing BERT step by step just for the kicks (then download the pre-trained model from HugginFace). Then I add a head for classification of Big-5 categories.

## References:
* https://arxiv.org/pdf/1810.04805.pdf - original BERT paper
* https://arxiv.org/pdf/1907.11692.pdf - RoBERTa paper
* https://openreview.net/pdf?id=rJ4km2R5t7 - GLUE paper
* https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial - hands-on implementation of BERT in PyTorch step by step
* https://pytorch.org/hub/huggingface_pytorch-transformers/ - PyTorch page on transformers
* https://habr.com/ru/post/680986/ - review of flavours of BERT (in Russian)
* https://discuss.pytorch.org/t/check-if-pytorch-is-using-metal-on-macbook/152481 - on accelerating PyTorch over Metal API on Mac M1

In [2]:
import math

import torch
import numpy as np
import pandas as pd


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


print(torch.__version__)
# print(torch.backends.mps.is_available())

1.12.1


## BERT implementation and training

First, I'll manually implement BERT architecture in PyTorch to make sure, we have a good feeling of how it works, how to train it etc.

In [163]:
text = (
   'Hello, how are you? I am Romeo.\n'
   'Hello, Romeo My name is Juliet. Nice to meet you.\n'
   'Nice meet you too. How are you today?\n'
   'Great. My baseball team won the competition.\n'
   'Oh Congratulations, Juliet\n'
   'Thanks you Romeo'
)

In [164]:
import re


# ugly, ugly pre-processing without glmnet
sentences = re.sub("[.,!?\\-]", '', text.lower()).split('\n')  # filter '.', ',', '?', '!'
word_list = list(set(" ".join(sentences).split()))
print(sentences)
print(word_list)

['hello how are you i am romeo', 'hello romeo my name is juliet nice to meet you', 'nice meet you too how are you today', 'great my baseball team won the competition', 'oh congratulations juliet', 'thanks you romeo']
['you', 'is', 'my', 'competition', 'won', 'great', 'team', 'hello', 'oh', 'too', 'romeo', 'the', 'i', 'am', 'how', 'meet', 'juliet', 'thanks', 'name', 'are', 'congratulations', 'baseball', 'to', 'today', 'nice']


In [165]:
word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}
for i, w in enumerate(word_list):
    word_dict[w] = i + 4
    
number_dict = {i: w for i, w in enumerate(word_dict)}
vocab_size = len(word_dict)

# calculate maxlen
maxlen = 0
for sentence in sentences:
    if len(sentence.split()) > maxlen:
        maxlen = len(sentence)
        
# build token list
token_list = []
for sentence in sentences:
    token_list.append([])
    for token in sentence.split():
        token_list[-1].append(word_dict[token])

In [166]:
from random import randrange, shuffle, randint, random


def make_batch(
        sentences: list,
        batch_size: int,
        token_list: list,
        word_dict: dict,
        number_dict: dict,
        vocab_size: int,
        max_pred: int,
        maxlen: int
):    
    batch = []
    positive = negative = 0
    while positive != batch_size/2 or negative != batch_size/2:
        # pick any two random sentences (not necessarily consecutive)
        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences)) 

        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]

        # generate input embeddings out of them
        input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]

        # mask first sentence with the goal of predicting, whether they are consecutive or not
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        # MASK LM
        n_pred =  min(max_pred, max(1, int(round(len(input_ids) * 0.15))))  # mask ~15 % of tokens in one sentence
        cand_masked_pos = [i for i, token in enumerate(input_ids)
                          if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]

        shuffle(cand_masked_pos)

        masked_tokens, masked_pos = [], []
        for pos in cand_masked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])

            if random() < 0.8:  # in 80% of the cases, mask the token
                input_ids[pos] = word_dict['[MASK]']  # make mask
            elif random() < 0.5:  # in 10% of the cases, replace the token with a random token from the vocabulary
                index = randint(4, vocab_size - 1)  # random index in vocabulary
                input_ids[pos] = index
            else:  # in 10% keep the token as is
                pass

        # Zero Paddings
        n_pad = maxlen - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        # Zero Padding (100% - 15%) tokens
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)

        if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext
            negative += 1

    return batch


batch = make_batch(
    sentences=sentences, 
    batch_size=64, 
    token_list=token_list, 
    word_dict=word_dict, 
    number_dict=number_dict, 
    vocab_size=vocab_size,
    max_pred=10,
    maxlen=maxlen
)
print(batch)

[[[1, 21, 3, 14, 2, 12, 24, 20, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [4, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 0, 0, 0, 0, 0, 0, 0, 0, 0], False], [[1, 3, 6, 25, 10, 8, 15, 7, 2, 11, 18, 23, 3, 16, 17, 14, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [7, 9, 4, 0, 0, 0, 0, 0, 0, 0], [7, 1, 12, 0, 0, 0, 0, 0, 0, 0], False], [[1, 9, 6, 25, 10, 8, 15, 7, 2, 3, 18, 23, 4, 16, 3, 24, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [17, 11, 14, 0, 0, 0, 0, 0, 0, 0], [14, 9, 15, 0, 0, 0, 0, 0, 0, 0], False], [[1, 12, 24, 20, 2, 11, 14, 6, 3, 5, 20, 28, 26, 19, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [4, 22, 0, 0, 0, 0, 0, 0, 0, 0], [14, 8, 0, 0, 0, 0, 0,

In [167]:
# MODEL PARAMETERS
# ----------------

d_model = 256  # parameter, used everywhere below
d_ff = 256  # dimensionality of intermediate layer on feed-forward parts of encoder blocks
n_segments = 2  # each training data point consists of two sequences, for which we predict, whether they are consecutive
d_k = d_model  # in attention heads d_k is the size of W_K * K
d_v = d_model  # same idea as d_k, but for attention values
n_layers = 6  # number of encoder block layers
n_heads = 32  # number of heads per layer, 768 in BERT base 

In [168]:
from torch import nn


class Embedding(nn.Module):
    def __init__(self):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding
        self.pos_embed = nn.Embedding(maxlen, d_model)  # position embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)  # segment(token type) embedding
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, seg):
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long)
        pos = pos.unsqueeze(0).expand_as(x)  # (seq_len,) -> (batch_size, seq_len)
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        
        return self.norm(embedding)

In [169]:
def get_attn_pad_mask(seq_q, seq_k):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
   
    # eq(zero) is PAD token
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
   
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k

In [170]:
class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)  # enc_inputs to same Q,K,V
        enc_outputs = self.pos_ffn(enc_outputs)  # enc_outputs: [batch_size x len_q x d_model]
        
        return enc_outputs, attn

In [171]:
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)

    def forward(self, Q, K, V, attn_mask):
        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
        residual, batch_size = Q, Q.size(0)
       
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size x n_heads x len_q x d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]

        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]

        # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        scores, context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]
        output = nn.Linear(n_heads * d_v, d_model)(context)

        return nn.LayerNorm(d_model)(output + residual), attn # output: [batch_size x len_q x d_model]


In [172]:
class PoswiseFeedForwardNet(nn.Module):
    """Copy-pasted from: https://github.com/Skumarr53/Attention-is-All-you-Need-PyTorch/blob/master/transformer/model.py"""
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.l1 = nn.Linear(d_model, d_ff)
        self.l2 = nn.Linear(d_ff, d_model)

        self.relu = gelu
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, inputs):
        residual = inputs

        output = self.l1(inputs)
        output = self.relu(output)
        output = self.l2(output)

        return self.layer_norm(output + residual)

In [173]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)
        
        return scores, context, attn

In [174]:
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

emb = Embedding()
embeds = emb(input_ids, segment_ids)

attenM = get_attn_pad_mask(input_ids, input_ids)

SDPA = ScaledDotProductAttention()(embeds, embeds, embeds, attenM)

S, C, A = SDPA

# print('Masks', masked[0][0])
print('Masks: \n', masked_tokens, masked_pos)
print()
print('Scores: \n', S[0][0],'\n\nAttention Scores after softmax: \n', A[0][0])

Masks: 
 tensor([[ 4,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 7,  9,  4,  0,  0,  0,  0,  0,  0,  0],
        [17, 11, 14,  0,  0,  0,  0,  0,  0,  0],
        [ 4, 22,  0,  0,  0,  0,  0,  0,  0,  0],
        [17,  4,  0,  0,  0,  0,  0,  0,  0,  0],
        [20, 24,  0,  0,  0,  0,  0,  0,  0,  0],
        [10,  9, 15,  0,  0,  0,  0,  0,  0,  0],
        [24, 14,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 5,  6,  0,  0,  0,  0,  0,  0,  0,  0],
        [14,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 6,  8,  9,  0,  0,  0,  0,  0,  0,  0],
        [ 7, 25,  0,  0,  0,  0,  0,  0,  0,  0],
        [26,  4,  0,  0,  0,  0,  0,  0,  0,  0],
        [14, 19, 19,  0,  0,  0,  0,  0,  0,  0],
        [18, 11, 28,  0,  0,  0,  0,  0,  0,  0],
        [ 6, 10, 14,  0,  0,  0,  0,  0,  0,  0],
        [10, 23,  4,  0,  0,  0,  0,  0,  0,  0],
        [ 8, 10,  0,  0,  0,  0,  0,  0,  0,  0],
        [19, 27,  0,  0,  0,  0,  0,  0,  0,  0],
        [23,  7, 15,  0,  0,  0,  0,  0, 

In [175]:
class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.embedding = Embedding()
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
        self.fc = nn.Linear(d_model, d_model)
        self.activ1 = nn.Tanh()
        self.linear = nn.Linear(d_model, d_model)
        self.activ2 = gelu
        self.norm = nn.LayerNorm(d_model)
        self.classifier = nn.Linear(d_model, 2)

        # decoder is shared with embedding layer
        embed_weight = self.embedding.tok_embed.weight
        n_vocab, n_dim = embed_weight.size()
        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)
        self.decoder.weight = embed_weight
        self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))

    def forward(self, input_ids, segment_ids, masked_pos):
        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)
        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask)
        # output : [batch_size, len, d_model], attn : [batch_size, n_heads, d_mode, d_model]
        # it will be decided by first token(CLS)
        h_pooled = self.activ1(self.fc(output[:, 0])) # [batch_size, d_model]
        logits_clsf = self.classifier(h_pooled) # [batch_size, 2]

        masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1)) # [batch_size, max_pred, d_model]

        # get masked position from final output of transformer.
        h_masked = torch.gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]
        h_masked = self.norm(self.activ2(self.linear(h_masked)))
        logits_lm = self.decoder(h_masked) + self.decoder_bias # [batch_size, max_pred, n_vocab]

        return logits_lm, logits_clsf

In [176]:
def gelu(x):
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

In [177]:
from torch import optim


model = BERT()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [179]:
batch = make_batch(
    sentences=sentences, 
    batch_size=64, 
    token_list=token_list, 
    word_dict=word_dict, 
    number_dict=number_dict, 
    vocab_size=vocab_size,
    max_pred=10,
    maxlen=maxlen
)
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

for epoch in range(100):
    optimizer.zero_grad()
    logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
    loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens)  # for masked LM
    loss_lm = (loss_lm.float()).mean()
    loss_clsf = criterion(logits_clsf, isNext)  # for sentence classification
    loss = loss_lm + loss_clsf
    if (epoch + 1) % 10 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
    loss.backward()
    optimizer.step()

# Predict mask tokens
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[0]))
print(text)
print([number_dict[w.item()] for w in input_ids[0] if number_dict[w.item()] != '[PAD]'])

logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
logits_lm = logits_lm.data.max(2)[1][0].data.numpy()
print('masked tokens list : ', [pos.item() for pos in masked_tokens[0] if pos.item() != 0])
print('predict masked tokens list : ', [pos for pos in logits_lm if pos != 0])

logits_clsf = logits_clsf.data.max(1)[1].data.numpy()[0]
print('isNext : ', True if isNext else False)
print('predict isNext : ', True if logits_clsf else False)

Epoch: 0010 cost = 1.441227
Epoch: 0020 cost = 1.213921
Epoch: 0030 cost = 1.255225
Epoch: 0040 cost = 1.193672
Epoch: 0050 cost = 1.179239
Epoch: 0060 cost = 1.156764
Epoch: 0070 cost = 1.120726
Epoch: 0080 cost = 1.106389
Epoch: 0090 cost = 1.079798
Epoch: 0100 cost = 1.094847
Hello, how are you? I am Romeo.
Hello, Romeo My name is Juliet. Nice to meet you.
Nice meet you too. How are you today?
Great. My baseball team won the competition.
Oh Congratulations, Juliet
Thanks you Romeo
['[CLS]', 'hello', 'how', '[MASK]', 'you', 'i', '[MASK]', 'romeo', '[SEP]', 'thanks', 'you', 'romeo', '[SEP]']
masked tokens list :  [23, 17]
predict masked tokens list :  [20, 17]
isNext :  False
predict isNext :  True


## Practical classification with pre-trained BERT

Now that we're done studying how BERT training works, let us proceed by downloading a pre-trained BERT model from hugging face.

In [15]:
import transformers
import torch
from torch import nn


class BERTClassification(nn.Module):
    def __init__ (self):
        # for reference on bert outputs for classification see:
        # https://huggingface.co/transformers/v3.0.2/model_doc/bert.html#bertmodel
        # https://stackoverflow.com/questions/61331991/bert-pooled-output-is-different-from-first-vector-of-sequence-output
        super(BERTClassification, self).__init__()
        self.bert = transformers.BertModel.from_pretrained('bert-base-cased')
        self.bert_dropout = nn.Dropout(p=0.4)
        self.classifier = nn.Linear(768, 1)
        
    def forward(self, input_ids, attention_mask, token_type_ids):
        sequence_output, pooled_output = self.bert(
            input_ids=input_ids, 
            attention_mask=attention_mask,
            token_type_ids=token_type_ids, 
            return_dict=False
        )
        bert_with_dropout = self.bert_dropout(pooled_output)
        output = self.classifier(bert_with_dropout)
        
        return output

In [16]:
from torch import optim


model = BERTClassification()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
import pandas as pd


essays = pd.read_csv("./data/essays.csv")

essays.loc[essays['cEXT'] == 'n', 'cEXT'] = 0
essays.loc[essays['cEXT'] == 'y', 'cEXT'] = 1

essays.loc[essays['cNEU'] == 'n', 'cNEU'] = 0
essays.loc[essays['cNEU'] == 'y', 'cNEU'] = 1

essays.loc[essays['cAGR'] == 'n', 'cAGR'] = 0
essays.loc[essays['cAGR'] == 'y', 'cAGR'] = 1

essays.loc[essays['cCON'] == 'n', 'cCON'] = 0
essays.loc[essays['cCON'] == 'y', 'cCON'] = 1

essays.loc[essays['cOPN'] == 'n', 'cOPN'] = 0
essays.loc[essays['cOPN'] == 'y', 'cOPN'] = 1

essays.astype({'cEXT': 'int32', 'cNEU': 'int32', 'cAGR': 'int32', 'cCON': 'int32', 'cOPN': 'int32'}).dtypes

essays

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",0,0,1,0,0
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,0,1,0,1,1
3,1997_568848.txt,I can't believe it! It's really happening! M...,1,0,1,1,0
4,1997_688160.txt,"Well, here I go with the good old stream of co...",1,0,1,0,1
...,...,...,...,...,...,...,...
2462,2004_493.txt,I'm home. wanted to go to bed but remembe...,0,1,0,1,0
2463,2004_494.txt,Stream of consiousnesssskdj. How do you s...,1,1,0,0,1
2464,2004_497.txt,"It is Wednesday, December 8th and a lot has be...",0,0,1,0,0
2465,2004_498.txt,"Man this week has been hellish. Anyways, now i...",0,1,0,0,1


In [18]:
train_X, train_y, test_X, test_y = essays['TEXT'][:2000], essays['cEXT'][:2000], essays['TEXT'][2000:], essays['cEXT'][2000:]

In [6]:
from datasets import Dataset
from transformers import AutoTokenizer


essays_training_dataset = Dataset.from_pandas(essays[:2000])
essays_validation_dataset = Dataset.from_pandas(essays[2000:])
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", num_labels=2)

# def preprocess_function(examples):
#     return tokenizer(examples['TEXT'], truncation=True)

tokenized_essays = tokenizer(essays_training_dataset['TEXT'], padding=True, truncation=True, return_tensors="pt")  # essays_dataset.map(preprocess_function, batched=True)

In [351]:
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# automated training via Trainer; to be used later

training_args = TrainingArguments(
    output_dir="./models",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=essays_training_dataset,
    eval_dataset=essays_validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

In [19]:
# I'm running this on Apple Silicon. Activate Metal "mps" device, if available:
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")

else:
    mps_device = torch.device("mps")


torch.device("mps")
model.to(mps_device)

BERTClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [20]:
from tqdm.auto import tqdm
from torch.optim import AdamW
from torch.utils.data import DataLoader, random_split, default_convert
from datasets import Dataset
from transformers import get_scheduler
from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", num_labels=2)

def tokenize_function(examples):
    return tokenizer(examples["TEXT"], padding="max_length", truncation=True)  # , return_tensors="pt")

essays_dataset = Dataset.from_pandas(essays)
tokenized_dataset = essays_dataset.map(tokenize_function, batched=True, batch_size=32)

train_dataset, test_dataset = random_split(tokenized_dataset, [2000, len(tokenized_dataset) - 2000])

# train_dataset = train_dataset.shuffle(seed=42).select(range(1000))
# test_dataset = test_dataset.shuffle(seed=42).select(range(1000))

tokenized_dataset = tokenized_dataset.rename_column("TEXT", "text")
tokenized_dataset = tokenized_dataset.rename_column("cNEU", "labels")

tokenized_dataset = tokenized_dataset.remove_columns(['#AUTHID', 'text', 'cEXT', 'cAGR', 'cCON', 'cOPN'])

train_dataloader = DataLoader(tokenized_dataset, shuffle=True, batch_size=32)

# parameters
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)

# optimizer, scheduler, loss, etc.
optimizer = AdamW(model.parameters(), lr=5e-5)
cross_entropy_loss = nn.CrossEntropyLoss()

lr_scheduler = get_scheduler(
    name="linear", 
    optimizer=optimizer, 
    num_warmup_steps=0, 
    num_training_steps=num_training_steps
)

progress_bar = tqdm(range(num_training_steps))

model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        print(batch.keys())
        labels = batch["labels"]
        del batch["labels"]
        batch = {k: torch.stack(default_convert(v)) for k, v in batch.items()}
        batch = {k: v.to(mps_device) for k, v in batch.items()}
        print(batch.keys())
        outputs = model(**batch)
        loss = cross_entropy_loss(outputs, labels)
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)


  0%|                                                                                                                                                                                                             | 0/78 [00:00<?, ?ba/s][A
 10%|████████████████████▏                                                                                                                                                                                | 8/78 [00:00<00:00, 78.54ba/s][A
 22%|██████████████████████████████████████████▋                                                                                                                                                         | 17/78 [00:00<00:00, 82.73ba/s][A
 33%|█████████████████████████████████████████████████████████████████▎                                                                                                                                  | 26/78 [00:00<00:00, 78.93ba/s][A
 44%|██████████████████████████████████████████████

dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


ValueError: Expected input batch_size (512) to match target batch_size (32).