## TOC:
* [Accuracy Score](#first-bullet)
* [Second Bullet Header](#second-bullet)

In [14]:
import pandas as pd
import os
import json
import torch
from transformers import AutoTokenizer
from classes.TwitterNeuralNet import TwitterNeuralNet
from classes.TwitterDataset import TwitterDataset

from torchmetrics.functional import accuracy, f1_score, auroc
from sklearn.metrics import classification_report


In [4]:

with open('./config/config.json', 'r') as f:
    config = json.load(f)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
print('---------LOADING MODEL---------')
PATH = './torch_model'
loaded_model = TwitterNeuralNet(bert_model_name=config['bert_model_name'])  # Task1->2 labels | Task2->3 Labels
loaded_model.load_state_dict(torch.load(
    os.path.join(PATH, config["trained_model_name"]), map_location=device))

loaded_model = loaded_model.to(device)
loaded_model.eval()
loaded_model.freeze()


cuda
---------LOADING MODEL---------


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
test_df = pd.read_csv('../../res/preprocessed/test_final.csv')
tokenizer = AutoTokenizer.from_pretrained(config['bert_model_name'])
test_dataset = TwitterDataset(
    test_df, tokenizer, max_token_len=config['max_token_len'])


In [6]:

LABEL_COLUMNS = list(test_df.columns)
LABEL_COLUMNS.remove('text')

TASK1_LABELS = LABEL_COLUMNS[:3]
TASK2_LABELS = LABEL_COLUMNS[3:4]
TASK3_LABELS = LABEL_COLUMNS[4:]


task1_id2label = {idx: label for idx, label in enumerate(TASK1_LABELS)}
task1_label2id = {label: idx for idx, label in enumerate(TASK1_LABELS)}

task2_label2id = {label: idx for idx, label in enumerate(TASK2_LABELS)}
task2_id2label = {idx: label for idx, label in enumerate(TASK2_LABELS)}

task3_label2id = {label: idx for idx, label in enumerate(TASK3_LABELS)}
task3_id2label = {idx: label for idx, label in enumerate(TASK3_LABELS)}


In [7]:
task1_predictions = []
task1_labels = []

task2_predictions = []
task2_labels = []

task3_predictions = []
task3_labels = []

predictions_labels_stacked_dict = {}

def get_accuracy_score():

    with torch.no_grad():
        for idx, item in enumerate(test_dataset):
            # if idx == 20:
            #     break
            _, prediction = loaded_model(
                item['input_ids'].unsqueeze(dim=0).to(device),
                item['attention_mask'].unsqueeze(dim=0).to(device)
            )

            task1_predictions.append(prediction[0].flatten())
            task2_predictions.append(prediction[1].flatten())
            task3_predictions.append(prediction[2].flatten())

            task1_labels.append(item['labels1'].int())
            task2_labels.append(item['labels2'].int())
            task3_labels.append(item['labels3'].int())

    task1_predictions_stacked = torch.stack(task1_predictions).detach().cpu()
    task1_labels_stacked = torch.stack(task1_labels).detach().cpu()
    predictions_labels_stacked_dict['task1_pred'] = task1_predictions_stacked
    predictions_labels_stacked_dict['task1_label'] = task1_labels_stacked

    task2_predictions_stacked = torch.stack(task2_predictions).detach().cpu()
    task2_labels_stacked = torch.stack(task2_labels).detach().cpu()
    predictions_labels_stacked_dict['task2_pred'] = task2_predictions_stacked
    predictions_labels_stacked_dict['task2_label'] = task2_labels_stacked

    task3_predictions_stacked = torch.stack(task3_predictions).detach().cpu()
    task3_labels_stacked = torch.stack(task3_labels).detach().cpu()
    predictions_labels_stacked_dict['task3_pred'] = task3_predictions_stacked
    predictions_labels_stacked_dict['task3_label'] = task3_labels_stacked

    THRESHOLD = 0.5

    task1_accuracy_score = accuracy(
        task1_predictions_stacked, task1_labels_stacked, threshold=THRESHOLD)

    task2_accuracy_score = accuracy(
        task2_predictions_stacked, task2_labels_stacked, threshold=THRESHOLD)

    task3_accuracy_score = accuracy(
        task3_predictions_stacked, task3_labels_stacked, threshold=THRESHOLD)

    print(f'task1_accuracy_score: {task1_accuracy_score}')
    print(f'task2_accuracy_score: {task2_accuracy_score}')
    print(f'task2_accuracy_score: {task3_accuracy_score}')
    
get_accuracy_score()


task1_accuracy_score: 0.774350106716156
task2_accuracy_score: 0.9362218379974365
task2_accuracy_score: 0.8890814781188965


In [9]:
print("AUROC per tag")
for i, name in enumerate(TASK1_LABELS):
  task1_auroc = auroc(predictions_labels_stacked_dict['task1_pred'][:,i].to(device), 
                      predictions_labels_stacked_dict['task1_label'][:, i].to(
      device), num_classes=len(TASK1_LABELS), pos_label=1)
  print(f"{name}: {task1_auroc}")


AUROC per tag
HATE: 0.8965604305267334
NOT: 0.824948787689209
OFFN: 0.7422533631324768


In [11]:
print("AUROC per tag")
for i, name in enumerate(TASK3_LABELS):
  task3_auroc = auroc(predictions_labels_stacked_dict['task3_pred'][:, i].to(device),
                      predictions_labels_stacked_dict['task3_label'][:, i].to(
      device), num_classes=len(TASK3_LABELS), pos_label=1)
  print(f"{name}: {task3_auroc}")


AUROC per tag
Race: 0.937118649482727
Religion: 0.9716429710388184
Gender: 0.9373893737792969
Other: 0.8594638109207153
None: 0.8242061734199524


In [16]:
print(classification_report(
    predictions_labels_stacked_dict['task3_pred'],
    predictions_labels_stacked_dict['task3_label'],
    target_names=TASK3_LABELS,
    zero_division=0
))


ValueError: Classification metrics can't handle a mix of continuous-multioutput and multilabel-indicator targets