In [180]:
!pwd

/Users/claudiac/Library/CloudStorage/OneDrive-WashingtonUniversityinSt.Louis/social_behavior_classifier


In [181]:
!ls

HDW_classifier_testing-Copy1.ipynb     [34mcorpora-sentences[m[m
[31mHDW_classifier_testing.ipynb[m[m           [34mdescription_classifier[m[m
[31mHDW_final_training.csv[m[m                 [31mprepping_training_data.ipynb[m[m
HDW_final_training.xlsx                [31mrealism_bibliography.xlsx[m[m
[31mUntitled-laptop’s MacBook Air.ipynb[m[m    [31mrealism_bibliography_instructions.docx[m[m
[31mannotation tags.docx[m[m                   test1.csv
[31mapplying_fine-tuned_classifier.ipynb[m[m   test2_accuracy_scores.docx
austen_test_data.xlsx                  [34mtraining data[m[m
[34mclassifier_testing[m[m


## Setting test name and hyperparameters ##

In [182]:
test_name = "test4"

#Test hyperparameters
num_classes = 3
max_length = 512
bert_model_name = 'bert-base-uncased'
num_epochs = 5
learning_rate = 2e-5
batch_size = 16

## Imports ##

In [183]:
import os

In [184]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset


In [185]:
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

In [186]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

## Prepping training data ##

In [188]:
original_df = pd.read_excel("training data/classifier_annotations/2way_cross_validated.xlsx")
    
original_df.head()

Unnamed: 0.1,Unnamed: 0,text,eric_category,lucia_category,sada_category,filename_x,sub-categories,embedded?,notes,interaction?,...,? / Secondary,Interaction,Embedded,filename_y,subcategory,interactive,embedded,Questionable,filename,Unnamed: 13
0,0,That evening Mr. Utterson came home to his bac...,mental,mental,,stevenson-jekyl-1886.txt,emotion,,,,...,,,,,,,,,,
1,1,It offended him both as a lawyer and as a love...,mental,mental,,stevenson-jekyl-1886.txt,emotion,,,,...,,,,,,,,,,
2,2,It was a night of little ease to his toiling m...,mental,mental,,stevenson-jekyl-1886.txt,emotion; thought,,,,...,,,,,,,,,,
3,4,But his fear was only momentary.,mental,mental,,stevenson-jekyl-1886.txt,emotion,,,,...,,,,,,,,,,
4,5,"Cried Mr. Hyde, with a flush of anger.",mental,mental,,stevenson-jekyl-1886.txt,emotion,,,,...,,,,,,,,,,


In [189]:
original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6412 entries, 0 to 6411
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      6412 non-null   int64 
 1   text            6412 non-null   object
 2   eric_category   6409 non-null   object
 3   lucia_category  6410 non-null   object
 4   sada_category   2770 non-null   object
 5   filename_x      1660 non-null   object
 6   sub-categories  1660 non-null   object
 7   embedded?       51 non-null     object
 8   notes           50 non-null     object
 9   interaction?    138 non-null    object
 10  Unnamed: 4      4 non-null      object
 11  Subcategory     1368 non-null   object
 12  ? / Secondary   141 non-null    object
 13  Interaction     21 non-null     object
 14  Embedded        8 non-null      object
 15  filename_y      1888 non-null   object
 16  subcategory     2308 non-null   object
 17  interactive     77 non-null     object
 18  embedded

In [190]:
#Finding number of texts from which current training data is taken

a = original_df['filename'].nunique(dropna=True)
b = original_df['filename_x'].nunique(dropna=True)
c = original_df['filename_y'].nunique(dropna=True)
no_of_texts = a+b+c
print(no_of_texts)

25


In [191]:
#Creating a cleaned up dataframe for training

index1 = []

for index, row in original_df.iterrows():
    values = [row['eric_category'], row['lucia_category'], row['sada_category']]
    common_label = None
    
    # Find the label that appears at least twice
    for value in set(values):
        if values.count(value) >= 2:
            common_label = value
            break
    
    # If a common label is found, save it and the 'text' column to the list
    if common_label:
        index1.append({'text': row['text'], 'category': common_label})

# Create a new dataframe from the list of rows
df = pd.DataFrame(index1)

In [192]:
df.head()

Unnamed: 0,text,category
0,That evening Mr. Utterson came home to his bac...,mental
1,It offended him both as a lawyer and as a love...,mental
2,It was a night of little ease to his toiling m...,mental
3,But his fear was only momentary.,mental
4,"Cried Mr. Hyde, with a flush of anger.",mental


In [193]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6412 entries, 0 to 6411
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      6412 non-null   object
 1   category  6412 non-null   object
dtypes: object(2)
memory usage: 100.3+ KB


In [194]:
#Sanity check

df['category'].unique()

array(['mental', 'behavior', 'other'], dtype=object)

In [195]:
#Getting number of training samples for each class

count_behavior = (df['category'] == 'behavior').sum()
count_mental = (df['category'] == 'mental').sum()
count_other = (df['category'] == 'other').sum()


In [196]:
# Creating lists of training texts and numeric designations for classes

texts = df['text'].tolist()

designation_numeric = []

#df['designation']

for x in df['category']:
    if x == 'other':
        designation_numeric.append(0)
    elif x == 'mental':
        designation_numeric.append(1)
    elif x == 'behavior':
        designation_numeric.append(2)
    else:
        continue

In [197]:
labels = torch.tensor(designation_numeric)
type(labels)

torch.Tensor

In [200]:
#Sanity check
len(texts)


6412

In [199]:
len(labels)

6412

## Setting up classes and functions for classifier ##

In [169]:
#Creating a class (object and set of associate functions) to store the training data in a certain structure,
# and also query the training and output data. Class consists of the input texts, their integer labels, the BERT
# tokenizer used to prep the data for feeding to the classifier, and the max input length the model will take. 
#This class is a child class of the Pytorch "torch.utils.data.Dataset" parent/base class
#Sentences longer than the max input length will be truncated and the remainder discarded!

class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}

In [170]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

In [171]:
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

In [172]:
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

In [173]:
def predict_description(text, model, tokenizer, device, max_length):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)

    label_map = {0: "null", 1: "mental", 2: "behaviour"}
    return label_map[preds.item()]


In [174]:
# Set variables to save results. 

file_name = test_name + "_accuracy_scores.docx"
classifier_name = "bert_classifier_" + test_name + ".pth"
subdirectory = "classifier_testing"
file_path = os.path.join(subdirectory, file_name)
classifier_path = os.path.join(subdirectory, classifier_name)

## Assigning variables for classifier training ##

In [175]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [176]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



## Actually training the classifier ##

In [177]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    if epoch == num_epochs - 1:
        final_accuracy = accuracy
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)
    
    ##UPDATE FILE NAME FOR EACH TEST!
    with open(file_path, 'a') as f:
        f.write(report)
        
        

    

  return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}


Epoch 1/5
Validation Accuracy: 0.9174
              precision    recall  f1-score   support

           0       0.90      0.88      0.89       298
           1       0.90      0.95      0.93       425
           2       0.94      0.91      0.92       560

    accuracy                           0.92      1283
   macro avg       0.91      0.91      0.91      1283
weighted avg       0.92      0.92      0.92      1283

Epoch 2/5


  return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}


Validation Accuracy: 0.9127
              precision    recall  f1-score   support

           0       0.98      0.80      0.88       298
           1       0.89      0.95      0.92       425
           2       0.90      0.95      0.92       560

    accuracy                           0.91      1283
   macro avg       0.92      0.90      0.91      1283
weighted avg       0.92      0.91      0.91      1283

Epoch 3/5


  return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}


Validation Accuracy: 0.9236
              precision    recall  f1-score   support

           0       0.92      0.91      0.91       298
           1       0.92      0.92      0.92       425
           2       0.93      0.94      0.93       560

    accuracy                           0.92      1283
   macro avg       0.92      0.92      0.92      1283
weighted avg       0.92      0.92      0.92      1283

Epoch 4/5


  return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}


Validation Accuracy: 0.9252
              precision    recall  f1-score   support

           0       0.92      0.91      0.92       298
           1       0.90      0.94      0.92       425
           2       0.95      0.92      0.93       560

    accuracy                           0.93      1283
   macro avg       0.92      0.92      0.92      1283
weighted avg       0.93      0.93      0.93      1283

Epoch 5/5


  return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}


Validation Accuracy: 0.9260
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       298
           1       0.91      0.93      0.92       425
           2       0.94      0.93      0.93       560

    accuracy                           0.93      1283
   macro avg       0.93      0.92      0.92      1283
weighted avg       0.93      0.93      0.93      1283



## Saving classifier and hyperparameters ##

In [178]:
torch.save(model.state_dict(), classifier_path)

In [2]:
from openpyxl import load_workbook

file_path = 'classifier_testing/classifier_testing_parameters.xlsx'

# Load the workbook and select a sheet
workbook = load_workbook(filename=file_path)
sheet = workbook.active

type(sheet)

openpyxl.worksheet.worksheet.Worksheet

In [None]:
# Define the new data to be added
training_size = len(labels)
data = [[classifier_name, file_name, training_size, count_behavior, count_mental, count_other, bert_model_name, num_epochs, learning_rate, batch_size, max_length, final_accuracy, no_of_texts]]

# Find the next empty row in the sheet
next_row = sheet.max_row + 1

# Add the new data to the sheet
for row in data:
    sheet.append(row)

# Save the workbook
workbook.save(filename=file_path)

print(f"Data has been added to {file_path}")