In [2]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
 
 
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)


In [3]:

# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]



Downloading: 100%|█████████████████████████████████████████████████████████████████████| 747/747 [00:00<00:00, 374kB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████| 878k/878k [00:00<00:00, 1.72MB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████| 446k/446k [00:00<00:00, 2.35MB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████| 150/150 [00:00<00:00, 75.2kB/s]


In [14]:

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)


# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)

# text = "Good night 😊"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)


In [47]:
from datasets import load_dataset

dataset = load_dataset("tweet_eval", "sentiment", split="test")

Found cached dataset tweet_eval (C:/Users/jdh/.cache/huggingface/datasets/tweet_eval/sentiment/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


In [48]:
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 12284
})

In [60]:
LEN = 12284

from tqdm import tqdm
scores_lt = []
for text in tqdm(dataset['text'][:LEN]):
    text = preprocess(text)    
    encoded_input = tokenizer(text, return_tensors='pt')

    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_lt.append(scores)


100%|████████████████████████████████████████████████████████████████████████████| 12284/12284 [15:07<00:00, 13.53it/s]


In [61]:
prediction = np.argmax(scores_lt, axis=1)
accuracy = np.mean(prediction == dataset['label'][:LEN])

In [62]:
# dataset['label']
accuracy

0.7244382937154021

In [67]:
y_true_lt = dataset['label']
from sklearn.metrics import f1_score, precision_score, recall_score
precision = precision_score(y_true_lt, prediction, average='macro')
recall = recall_score(y_true_lt, prediction, average='macro')
f1 = f1_score(y_true_lt, prediction, average='macro')

print(f'accuracy: {accuracy}, precision: {precision}, recall: {recall}, f1: {f1}')


accuracy: 0.7244382937154021, precision: 0.7220946539955357, recall: 0.7275378975399002, f1: 0.7239042159699788


In [85]:
from sklearn.metrics import confusion_matrix
# import seaborn as sns
# import matplotlib.pyplot as plt
import pandas as pd
confusion_matrix = confusion_matrix(y_true_lt, prediction)
confusion_matrix = confusion_matrix.astype('float') / confusion_matrix.sum(axis=1)[:, np.newaxis]
confusion_matrix = pd.DataFrame(confusion_matrix, index=range(3), columns=range(3))
plt.figure(figsize=(3, 3))
sns.heatmap(confusion_matrix, annot=True, cmap='Blues')
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.savefig(self.output_dir.joinpath('confusion_matrix.jpg'))
plt.show()

NameError: name 'plt' is not defined

In [88]:
from sklearn.metrics import confusion_matrix        
# import matplotlib.pyplot as plt

confusion_matrix = confusion_matrix(y_true_lt, prediction)
print(confusion_matrix)
plt.imshow(confusion_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix')
plt.colorbar()
tick_marks = np.arange(10)
plt.xticks(tick_marks, range(10), rotation=45)
plt.yticks(tick_marks, range(10))
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
plt.savefig('confusion_matrix.jpg')
plt.close()

[[3115  805   52]
 [1224 4105  608]
 [  54  642 1679]]


NameError: name 'plt' is not defined

In [36]:
text = "So bad 😊. @hihi. Hello. Tokenizer. xtyiouj. bkl;nhtc"
text = preprocess(text)
print(text)
# encoded_input2 = tokenizer(text, return_tensors='pt')
encoded_input = tokenizer(text, return_tensors='pt')
# print(encoded_input2)
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)


So bad 😊. @user Hello. Tokenizer. xtyiouj. bkl;nhtc


In [37]:

ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = labels[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

1) negative 0.4509
2) neutral 0.436
3) positive 0.1131
