# Before Running This Code

If you are running this code in Google Colab Pro, train on the A100, L4 or T4 GPU runtime. The A100 is the best but costs the most, the L4 is second best but costs more than the T4, and the T4 is the worst but costs the least.

In [None]:
# How much more true positives are valued than true negatives. Used as the
# multiplier on positive counts in the weighted normalised accuracy metric.
importance_constant = 5
# Seed shared by every random number generator and by the train/validation
# split, so that results are consistent between runs.
random_state = 3141

In [None]:
########## Imports ##########

import torch
from torch.utils.data import Dataset
from transformers import Trainer, TrainingArguments, DataCollatorForTokenClassification
import json
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from google.colab import drive, runtime
import random

from transformers import RobertaTokenizerFast, RobertaConfig, RobertaForTokenClassification

########## Ensure consistency ##########

# Seed Python, NumPy and PyTorch (CPU, plus every GPU when one is present)
# with the same value.
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_state)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
########## Mount Google Drive ##########

# Mounts Google Drive at /content/drive; the data files are later read from
# "/content/drive/My Drive/...". This will open a new tab and ask you to give
# Google Colab permission to read and write files in your Google Drive.

drive.mount('/content/drive')

In [None]:
########## Load Data ##########

# Folder on the mounted Google Drive that holds the labelled sermon files.
DATA_DIR = "/content/drive/My Drive/Capstone Project/data"


def _load_labelled_sermon(sermon_name, labelling_system):
    """Return the parsed JSON for one sermon under one labelling system.

    Args:
        sermon_name: File-name stem of the sermon, e.g. ``"a_true_father"``.
        labelling_system: Labelling system number used in the file name.

    Returns:
        Whatever ``json.load`` produces for
        ``<DATA_DIR>/<sermon_name>___labelling_system_<n>.json``.

    Raises:
        FileNotFoundError: If the file does not exist (e.g. Drive not mounted).
        json.JSONDecodeError: If the file is not valid JSON.
    """
    path = f"{DATA_DIR}/{sermon_name}___labelling_system_{labelling_system}.json"
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)


a_true_father_sermon_labelling_system_1 = _load_labelled_sermon("a_true_father", 1)
pride_is_your_enemy_sermon_labelling_system_1 = _load_labelled_sermon("pride_is_your_enemy", 1)

# Both sermons are concatenated into a single dataset per labelling system.
data_labelling_system_1 = a_true_father_sermon_labelling_system_1 + pride_is_your_enemy_sermon_labelling_system_1

a_true_father_sermon_labelling_system_2 = _load_labelled_sermon("a_true_father", 2)
pride_is_your_enemy_sermon_labelling_system_2 = _load_labelled_sermon("pride_is_your_enemy", 2)

data_labelling_system_2 = a_true_father_sermon_labelling_system_2 + pride_is_your_enemy_sermon_labelling_system_2

In [None]:
########## Define classes and functions ##########

class SermonDataset(Dataset):
    """Tokenised sermon examples paired with fixed-length label tensors.

    Every element of ``data`` must be a mapping with a ``"words"`` entry, which
    is passed unchanged to ``tokenizer``, and a ``"labels"`` entry holding a
    list of integer class ids. All examples are tokenised eagerly in the
    constructor and kept in memory.

    Args:
        data: Iterable of examples as described above.
        tokenizer: Callable accepting ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments and
            returning a mapping of tensors.
        max_length: Length that inputs and labels are truncated/padded to.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = []

        for json_item in data:
            # NOTE(review): "words" is handed to the tokenizer as-is, without
            # is_split_into_words; confirm this matches the JSON schema and
            # that the labels line up with the tokens the tokenizer produces.
            encoding = self.tokenizer(
                json_item["words"],
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors="pt"
            )

            # squeeze(0) removes the leading dimension when it has size 1
            # (presumably the batch dimension added by return_tensors="pt").
            item = {key: val.squeeze(0) for key, val in encoding.items()}

            # The tokenizer truncates the input to max_length, so the labels
            # must be truncated as well; otherwise an over-long example yields
            # a label tensor longer than its input.
            labels = list(json_item["labels"])[:self.max_length]
            # Padding positions are labelled 0, the same id as the negative
            # class, so they are counted by the loss and by compute_metrics.
            labels += [0] * (self.max_length - len(labels))
            item["labels"] = torch.tensor(labels)

            self.data.append(item)

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the pre-built tensor mapping for example ``index``."""
        return self.data[index]

def _safe_divide(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0.0 when undefined."""
    if denominator == 0:
        return 0.0
    return float(numerator) / float(denominator)


def compute_metrics(p, positive_weight=None):
    """Compute binary token-classification metrics for an evaluation pass.

    Args:
        p: Object with a ``predictions`` array of per-class scores (the class
            is taken with ``argmax`` over axis 2) and a ``label_ids`` array of
            integer labels with one entry per prediction.
        positive_weight: How much more a true positive is worth than a true
            negative in ``weighted_normalised_accuracy``. Defaults to the
            module-level ``importance_constant``.

    Returns:
        dict with the four confusion-matrix counts (Python ints) and
        ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``weighted_normalised_accuracy`` (Python floats). Any ratio whose
        denominator is zero is reported as 0.0 instead of NaN.
    """
    if positive_weight is None:
        positive_weight = importance_constant

    preds = np.argmax(p.predictions, axis=2).flatten()
    labels = np.asarray(p.label_ids).flatten()

    # Count each cell directly so that all four values exist even when a class
    # is absent from the batch (a label-inferred confusion matrix would shrink
    # to 1x1 in that case). Labels other than 0/1 are not counted.
    true_negatives = int(np.sum((labels == 0) & (preds == 0)))
    false_positives = int(np.sum((labels == 0) & (preds == 1)))
    false_negatives = int(np.sum((labels == 1) & (preds == 0)))
    true_positives = int(np.sum((labels == 1) & (preds == 1)))

    total_actual_negatives = true_negatives + false_positives
    total_actual_positives = true_positives + false_negatives
    total = total_actual_negatives + total_actual_positives

    accuracy = _safe_divide(true_positives + true_negatives, total)
    precision = _safe_divide(true_positives, true_positives + false_positives)
    recall = _safe_divide(true_positives, total_actual_positives)
    f1 = _safe_divide(2 * precision * recall, precision + recall)

    # Weighted accuracy, shifted by the size of the larger (weighted) class and
    # clipped at zero: a score of 0 is what always predicting that larger class
    # would obtain, and 1 is a perfect prediction.
    higher_category_size = max(positive_weight * total_actual_positives, total_actual_negatives)
    weighted_correct = (positive_weight * true_positives) + true_negatives
    weighted_total = (positive_weight * total_actual_positives) + total_actual_negatives
    weighted_normalised_accuracy = max(
        _safe_divide(weighted_correct - higher_category_size, weighted_total - higher_category_size),
        0.0,
    )

    return {
        'true_negatives': true_negatives,
        'false_positives': false_positives,
        'false_negatives': false_negatives,
        'true_positives': true_positives,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'weighted_normalised_accuracy': weighted_normalised_accuracy
    }

class RobertaLargeBibleQuoteRecognition():
    """RoBERTa-large token classifier with two labels, plus train/predict helpers."""

    def __init__(self):
        """Load the pretrained ``roberta-large`` tokenizer, config and model."""
        self.tokenizer = RobertaTokenizerFast.from_pretrained('roberta-large')
        self.config = RobertaConfig.from_pretrained('roberta-large', num_labels=2)
        self.model = RobertaForTokenClassification.from_pretrained('roberta-large', config=self.config)

    def train(self, data, learning_rate, dropout_prob, batch_size, num_train_epochs, train_size, compute_metrics):
        """Fine-tune the model on ``data`` and return the per-epoch log entries.

        Args:
            data: Sequence of examples accepted by ``SermonDataset``.
            learning_rate: Learning rate passed to ``TrainingArguments``.
            dropout_prob: Dropout probability applied to the model.
            batch_size: Per-device batch size for training and evaluation.
            num_train_epochs: Number of training epochs.
            train_size: ``train_size`` argument of ``train_test_split``.
            compute_metrics: Metric callback handed to the ``Trainer``; its
                result must contain ``weighted_normalised_accuracy``, which is
                used to select the best checkpoint.

        Returns:
            The trainer's log history without its last entry, restricted to
            entries whose ``epoch`` value is a whole number.
        """
        self.data = data
        self.learning_rate = learning_rate
        self.dropout_prob = dropout_prob
        self.batch_size = batch_size
        self.num_train_epochs = num_train_epochs
        self.train_size = train_size

        self.model.config.hidden_dropout_prob = self.dropout_prob
        self.model.config.attention_probs_dropout_prob = self.dropout_prob

        # The model was already built in __init__, and nn.Dropout keeps the
        # probability it was constructed with, so editing the config alone
        # would leave the dropout rate unchanged. Update the live modules too.
        # NOTE(review): attention implementations that read the probability
        # from somewhere other than an nn.Dropout module are not affected.
        for module in self.model.modules():
            if isinstance(module, torch.nn.Dropout):
                module.p = self.dropout_prob

        self.data_collator = DataCollatorForTokenClassification(self.tokenizer)

        self.train_data, self.val_data = train_test_split(self.data, train_size=self.train_size, random_state=random_state) # random_state set for consistency
        self.train_dataset = SermonDataset(self.train_data, self.tokenizer)
        self.val_dataset = SermonDataset(self.val_data, self.tokenizer)

        ########## Define training arguments ##########

        # NOTE(review): newer transformers releases rename `evaluation_strategy`
        # to `eval_strategy`; confirm against the installed version.
        self.training_args = TrainingArguments(
            learning_rate = self.learning_rate,
            output_dir='./results',
            num_train_epochs=self.num_train_epochs,
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            logging_dir='./logs',
            logging_steps=10,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            load_best_model_at_end=True,
            metric_for_best_model="weighted_normalised_accuracy"
        )

        ########## Instantiate trainer ##########

        self.trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.val_dataset,
            data_collator=self.data_collator,
            compute_metrics=compute_metrics
        )

        ########## Move to GPU to increase training speed ##########

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(device)

        ########## Train model ##########

        self.trainer.train()

        ########## Evaluate model ##########

        log_history = self.trainer.state.log_history
        log_history.pop() # Remove last entry from log_history, it isn't relevant
        log_history = [entry for entry in log_history if (entry['epoch'] % 1 == 0)] # Remove entries with non-integer epoch values

        return log_history

    def predict(self, texts):
        """Return the predicted class id of every token position in ``texts``.

        Args:
            texts: Input passed unchanged to the tokenizer.

        Returns:
            Nested list of class ids: one inner list per tokenised sequence,
            one id per token position (padding positions included).
        """
        self.model.eval()

        # Use the training sequence length when available; fall back to the
        # SermonDataset default so predict() also works before train().
        train_dataset = getattr(self, "train_dataset", None)
        max_length = train_dataset.max_length if train_dataset is not None else 512

        encoded_inputs = self.tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt"
        )

        with torch.no_grad():
            # Send the inputs to wherever the model actually is, rather than
            # guessing from CUDA availability.
            device = self.model.device
            encoded_inputs = {key: val.to(device) for key, val in encoded_inputs.items()}

            outputs = self.model(**encoded_inputs)

        predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()

        return predictions.tolist()

In [None]:
########## Hyperparameter Tuning ##########

import gc
import itertools

########## Hyperparameter options ##########

data_labelling_system_options = [1, 2]
learning_rate_options = [3e-5, 6e-5]
dropout_prob_options = [0.1, 0.25]
batch_size_options = [8, 16]
num_train_epochs_options = [15] # I am going to fix this because I personally don't have the computational resources to experiment any more in a reasonable timeframe.

########## Evaluations ##########

all_results = []
model = None

# Full grid search: itertools.product visits the combinations in the same
# order as nested loops over the option lists (last list varies fastest).
for data_labelling_system, learning_rate, dropout_prob, batch_size, num_train_epochs in itertools.product(
    data_labelling_system_options,
    learning_rate_options,
    dropout_prob_options,
    batch_size_options,
    num_train_epochs_options,
):
    data = data_labelling_system_1 if data_labelling_system == 1 else data_labelling_system_2

    # Drop the previous run's model before building the next one, so two
    # roberta-large models are not held in (GPU) memory at the same time.
    # The model of the final run stays bound to `model` after the loop.
    model = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    model = RobertaLargeBibleQuoteRecognition()
    log_history = model.train(data, learning_rate, dropout_prob, batch_size, num_train_epochs, 0.8, compute_metrics)

    result = {
        'hyperparameters': {
            'data_labelling_system': data_labelling_system,
            'learning_rate': learning_rate,
            'dropout_prob': dropout_prob,
            'batch_size': batch_size,
            'num_train_epochs': num_train_epochs
        },
        'results': log_history
    }

    all_results.append(result)

print(all_results)

In [None]:
########## Save Results ##########

import os

# NOTE(review): this path is relative to the current working directory, not
# to the mounted Google Drive; confirm that is where the results should go.
results_path = "../results/roberta_large_uncased.json"

# Create the target folder first; open(..., "w") does not create missing
# directories and would otherwise fail after all training has finished.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

with open(results_path, "w", encoding="utf-8") as file:
    json.dump(all_results, file)