In [1]:
import csv

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Step 1: Define the emotion classes and the name/id lookup tables

In [2]:
# Emotion label vocabulary; a label's position in the list is its integer id.
classes = ['HAPPY', 'SAD', 'ANGRY', 'FEAR', 'SURPRISE', 'HATE', 'OTHER']
# Forward (name -> id) and reverse (id -> name) lookup tables.
class2id = {label: position for position, label in enumerate(classes)}
id2class = dict(enumerate(classes))

# Step 2: Encode a class name as a one-hot vector

In [3]:
def class2onehot(classname):
    """Encode a class name as a one-hot vector.

    Args:
        classname: A key of ``class2id``.

    Returns:
        A float NumPy array of length ``len(classes)`` holding 1 at the
        class's id and 0 everywhere else.

    Raises:
        KeyError: If ``classname`` is not a key of ``class2id``.
    """
    hot_index = class2id[classname]
    # Row ``hot_index`` of the identity matrix is exactly the one-hot vector.
    return np.eye(len(classes))[hot_index]

# Step 3: Convert the raw "text<TAB>label" file into an ID/Text/Label TSV

In [4]:
def preprocess_file(inpath, outpath):
    """Convert a raw ``text<TAB>label`` file into an ``ID/Text/Label`` TSV.

    Every non-blank line of ``inpath`` is split on its last tab into a text
    and a class name. The class name is one-hot encoded with
    ``class2onehot`` and the rows are written to ``outpath`` under the header
    ``ID<TAB>Text<TAB>Label``, where ``ID`` is the 0-based row number and
    ``Label`` is the one-hot vector joined with commas.

    Both files are handled as UTF-8 so the result does not depend on the
    platform's default encoding.

    Args:
        inpath: Path of the raw tab-separated input file.
        outpath: Path of the TSV file to create (overwritten if it exists).

    Raises:
        ValueError: If a non-blank line contains no tab separator.
        KeyError: If a class name is not a key of ``class2id``.
    """
    texts = []
    labels = []
    with open(inpath, 'r', encoding='utf-8') as f:
        for lineno, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no sample.
                continue
            # Split on the LAST tab so a stray tab inside the text does not
            # break the two-field unpacking.
            text, tab, classname = line.rpartition('\t')
            if not tab:
                raise ValueError(
                    f'{inpath}:{lineno}: expected "text<TAB>label", got {line!r}'
                )
            # A tab left inside the text would add a column to the output TSV.
            texts.append(text.replace('\t', ' '))
            labels.append(class2onehot(classname))

    # Write only after the whole input has parsed, so a malformed line never
    # leaves a half-written output file behind.
    with open(outpath, 'w', encoding='utf-8') as f:
        f.write('ID\tText\tLabel\n')
        for row_id, (text, label) in enumerate(zip(texts, labels)):
            label_str = ','.join(map(str, label))
            f.write(f'{row_id}\t{text}\t{label_str}\n')


# Step 4: Preprocess the test set, then evaluate xlm-roberta-base on it

In [10]:
preprocess_file('./test.tsv', 'pptest.tsv')

# preprocess_file writes raw, unquoted fields, so every field must be read
# back literally: QUOTE_NONE stops a '"' inside a text from being treated as
# CSV quoting, and keep_default_na=False stops texts such as "NA" or "null"
# from being turned into NaN.
dataset = pd.read_csv(
    'pptest.tsv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
    encoding='utf-8',
)

In [11]:
def onehot2class(onehot):
    """Decode a comma-separated vector string to the index of its largest entry.

    Args:
        onehot: String of numbers separated by commas, e.g. ``'0.0,1.0,0.0'``.

    Returns:
        The position of the maximum value (the first one on ties).
    """
    pieces = onehot.split(',')
    scores = np.asarray(pieces, dtype=float)
    return scores.argmax()

# Decode the one-hot strings to integer class ids. The guard keeps this cell
# safe to re-run: once the column holds integers, decoding it again would
# fail because integers have no ``split`` method.
if not pd.api.types.is_integer_dtype(dataset['Label']):
    dataset['Label'] = dataset['Label'].apply(onehot2class)
texts = dataset['Text'].tolist()
labels = dataset['Label'].tolist()

In [15]:
# Load the model and tokenizer
checkpoint = 'xlm-roberta-base'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# This checkpoint has no trained classification head, so the head is created
# with random weights on load. Seed first so those weights are reproducible.
# NOTE(review): metrics from this untrained head are only a baseline;
# fine-tune the model or load a fine-tuned checkpoint for real evaluation.
torch.manual_seed(42)
# Size the head from the label list and store the id <-> name mapping in the
# model config, instead of repeating the class count as a literal.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(classes),
    id2label=id2class,
    label2id=class2id,
).to(device)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def predict(texts, model, tokenizer, batch_size=32):
    """Return the arg-max class id for each text.

    The texts are tokenized and run through the model in mini-batches so
    memory use is bounded by ``batch_size`` instead of growing with the
    number of texts. Inputs are moved to the device the model's parameters
    live on, and the model is put in eval mode for the duration of the call
    (its previous train/eval mode is restored afterwards).

    Args:
        texts: A single string or an iterable of strings.
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Callable tokenizer matching ``model``.
        batch_size: Maximum number of texts per forward pass.

    Returns:
        A 1-D NumPy integer array with one predicted class id per text
        (empty when ``texts`` is empty).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')
    if isinstance(texts, str):
        # A bare string is one text, not a sequence of characters.
        texts = [texts]
    texts = list(texts)
    if not texts:
        return np.empty(0, dtype=np.int64)

    # Follow the model's own device rather than a module-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    was_training = model.training
    model.eval()  # disable dropout so predictions are deterministic
    batch_predictions = []
    try:
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch = texts[start:start + batch_size]
                inputs = tokenizer(
                    batch,
                    return_tensors='pt',
                    padding=True,
                    truncation=True,
                    max_length=512,
                ).to(target_device)
                logits = model(**inputs).logits
                batch_predictions.append(torch.argmax(logits, dim=-1).cpu().numpy())
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)
    return np.concatenate(batch_predictions)


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [16]:
# Evaluate on the first 100 examples of the test set.
sample_texts, sample_labels = texts[:100], labels[:100]

# Predicted class id for each selected text.
predictions = predict(sample_texts, model, tokenizer)

# Compare the predictions against the gold labels.
acc = accuracy_score(sample_labels, predictions)
f1 = f1_score(sample_labels, predictions, average='weighted')

print(f"Accuracy: {acc}")
print(f"F1-score: {f1}")


Accuracy: 0.24
F1-score: 0.0929032258064516
