In [77]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score
from datasets import Dataset

In [80]:
# Fine-tuned checkpoint directory; the model and the tokenizer are both
# loaded from this same path.
model_path = "./bert-finetuned-sem_eval/checkpoint-2780/"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Fixed seed for the row shuffle below, so that Restart & Run All visits the
# test rows in the same order on every run.
SHUFFLE_SEED = 42

# Load the test split from test.csv, shuffle it reproducibly, and renumber
# the rows from 0 so positional and label-based indexing agree.
test_df = (
    pd.read_csv("./data_temp/test.csv")
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

# Wrap the frame as a `datasets.Dataset` for the evaluation loop.
test_data = Dataset.from_pandas(test_df)


In [90]:
def evaluate(model, test_inputs):
    """Classify every test example one at a time and collect the results.

    Relies on the notebook-level ``tokenizer`` to encode each text.

    Args:
        model: Sequence-classification model. It is called with the
            tokenizer output as keyword arguments and must expose
            ``.logits`` on its result and ``config.id2label``.
        test_inputs: Mapping-like object with a ``'text'`` column and a
            ``'label'`` column of equal length.

    Returns:
        ``(all_preds, all_labels)``: the predicted label names (looked up in
        ``model.config.id2label``) and the ground-truth labels taken from
        ``test_inputs['label']``, both in input order.
    """
    model.eval()  # put the model in evaluation mode before inference

    # Read each column once instead of re-fetching it on every iteration.
    texts = test_inputs['text']
    true_labels = test_inputs['label']

    all_preds = []
    all_labels = []
    with torch.no_grad():
        for text, label in zip(texts, true_labels):
            inputs = tokenizer(text, padding="max_length", truncation=True, return_tensors="pt")
            logits = model(**inputs).logits
            # One example per forward pass, so the last-axis argmax holds a
            # single class id.
            predicted_class_id = logits.argmax(dim=-1).item()
            # Predictions go into all_preds and ground truth into all_labels,
            # so the returned tuple matches its names.
            all_preds.append(model.config.id2label[predicted_class_id])
            all_labels.append(label)
    return all_preds, all_labels

# Run the evaluation pass over the whole test set; the two returned lists are
# unpacked in the order `evaluate` returns them.
preds, labels = evaluate(model=model, test_inputs=test_data)

In [91]:
# Fraction of positions at which the two lists agree. Accuracy is symmetric,
# so the value does not depend on which list is passed as y_true or y_pred.
accuracy = accuracy_score(y_true=labels, y_pred=preds)
print("Accuracy:", accuracy)

Accuracy: 0.8371116708648195


In [7]:
# Single hand-written sentence used as a quick smoke test of the model.
text = "I'm happy I can finally train a model for multi-label text classification"
# Encode it as PyTorch tensors; no padding or truncation is requested here.
inputs = tokenizer(text, return_tensors="pt")

In [43]:
# Display the encoding: input_ids, token_type_ids and attention_mask tensors.
inputs

{'input_ids': tensor([[ 101, 1045, 1005, 1049, 3407, 1045, 2064, 2633, 3345, 1037, 2944, 2005,
         4800, 1011, 3830, 3793, 5579,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [8]:
# Forward pass on the single encoded sentence; gradient tracking is disabled
# because this is inference only.
with torch.no_grad():
    logits = model(**inputs).logits

In [10]:
# Take the index of the highest logit and look up its label name.
# NOTE(review): the sample sentence says "multi-label", but argmax yields
# exactly one class — confirm that single-label decoding is intended.
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'joy'