## Imports ##

In [1]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from torch import nn

## Creating list of texts for classification ##

In [2]:
# Read the spreadsheet into a DataFrame. The path is relative, so it is
# resolved against the notebook's current working directory.
df = pd.read_excel("austen_test_data.xlsx")

In [3]:
# Preview the first rows to confirm the sheet loaded with the expected columns.
df.head()

Unnamed: 0,text,human_label
0,"Marianne was afraid of offending, and said no ...",mental
1,She would not wound the feelings of her sister...,mental
2,Marianne was rejoiced to find her sister so ea...,mental
3,She felt that Edward stood very high in her o...,mental
4,She believed the regard to be mutual; but she ...,mental


In [4]:
# Pull the `text` column out as a plain Python list, one entry per DataFrame row.
texts = df['text'].tolist()

In [5]:
# NOTE(review): this prints every entry of the list in one go; a slice such as
# texts[:5] would keep the cell output short if only a preview is needed.
print(texts)

['Marianne was afraid of offending, and said no more on the subject; but the kind of approbation which Elinor described as excited in him by the drawings of other people, was very far from that rapturous delight,\nwhich, in her opinion, could alone be called taste.', 'She would not wound the feelings of her sister on any account, and yet to say what she did not believe was impossible. ', 'Marianne was rejoiced to find her sister so easily pleased.', ' She felt that Edward stood very high in her opinion. ', 'She believed the regard to be mutual; but she required greater certainty of it to make Marianne’s conviction of their attachment agreeable to her.', 'Marianne was astonished to find how much the imagination of her mother and herself had outstripped the truth.', 'She could not consider her partiality for Edward in so prosperous a state as Marianne had believed it. ', 'Elinor had always thought it would be more prudent for them to settle at some distance from Norland, than immediately

## Loading up the classifier based on previously saved .pth file ##

In [13]:
#Creating a class for the classifier

class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and one linear classification layer.

    The attribute names ``bert``, ``dropout`` and ``fc`` are part of the
    module's ``state_dict`` keys, so they must not be renamed if previously
    saved weights are to be loaded into this class.
    """

    def __init__(self, bert_model_name, num_classes):
        """Build the network.

        Args:
            bert_model_name: Identifier passed to ``BertModel.from_pretrained``.
            num_classes: Number of output logits produced by the final layer.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(p=0.1)
        # The linear head maps the encoder's hidden size to one logit per class.
        self.fc = nn.Linear(
            in_features=self.bert.config.hidden_size,
            out_features=num_classes,
        )

    def forward(self, input_ids, attention_mask):
        """Return the raw (un-normalised) class logits for the given inputs.

        Args:
            input_ids: Token ids forwarded to the BERT encoder.
            attention_mask: Attention mask forwarded to the BERT encoder.

        Returns:
            The output of the linear head applied to the (dropout-regularised)
            ``pooler_output`` of the encoder.
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularised = self.dropout(encoder_out.pooler_output)
        return self.fc(regularised)

In [15]:
# Settings for the classifier. These must agree with the ones used for the
# existing fine-tuned classifier file (see classifier).
bert_model_name = 'bert-base-uncased'
num_classes = 2
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Run on a CUDA-enabled GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


In [16]:
# Build the classifier architecture defined above (pre-trained BERT encoder plus
# a freshly initialised linear head).
model = BERTClassifier(bert_model_name, num_classes)

# Overwrite those weights with the fine-tuned ones saved in the .pth file.
# map_location=device remaps the stored tensors onto the device selected above,
# so a checkpoint that was saved from a GPU still loads on a CPU-only machine
# (without it, torch.load tries to restore tensors to the device they were
# saved from and fails when CUDA is unavailable).
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
model.load_state_dict(
    torch.load("description_classifier/bert_classifier_1.pth", map_location=device)
)

# Move the model's parameters to `device`. predict_description moves its input
# tensors to the same device, so model and inputs must live in the same place.
model.to(device)

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

## Actually applying the classifier ##

In [17]:
#Define a function for applying the classifier.

def predict_description(text, model, tokenizer, device, max_length=256):
    """Classify one piece of text as "behaviour" or "mental".

    Args:
        text: The text to classify. Expected to yield exactly one prediction,
            because the result is read with ``.item()``.
        model: Callable module returning class logits for ``input_ids`` and
            ``attention_mask``; it is switched to evaluation mode here.
        tokenizer: Callable that turns ``text`` into a mapping containing
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        device: Device the encoded tensors are moved to before the forward pass.
        max_length: Length the input is padded / truncated to.

    Returns:
        "behaviour" when the highest-scoring class index is 1, otherwise "mental".
    """
    # Evaluation mode turns off training-only behaviour such as dropout.
    model.eval()

    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    # No gradients are needed for inference.
    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask)
        predicted_class = torch.max(logits, dim=1).indices

    if predicted_class.item() == 1:
        return "behaviour"
    return "mental"

In [18]:
# Sanity check: classify only the first sentence before running the whole list.
predict_description(texts[0], model, tokenizer, device, max_length=256)

'mental'

In [26]:
# Classify every sentence and keep each one next to its predicted label.
results = [
    [sentence, predict_description(sentence, model, tokenizer, device)]
    for sentence in texts
]

# One row per sentence: the original text and the label assigned by the model.
df_results = pd.DataFrame(results, columns=['Sentence', 'BERT_label'])
    

In [27]:
# Preview the first few predictions.
df_results.head()

Unnamed: 0,Sentence,BERT_label
0,"Marianne was afraid of offending, and said no ...",mental
1,She would not wound the feelings of her sister...,mental
2,Marianne was rejoiced to find her sister so ea...,mental
3,She felt that Edward stood very high in her o...,mental
4,She believed the regard to be mutual; but she ...,mental
