In [1]:
import csv
import urllib.request

import numpy as np
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification


# Preprocess text (username and link placeholders)
def preprocess(text):
    """Mask user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces. A token that starts with ``@`` and is
    longer than one character becomes ``'@user'``; a token that starts with
    ``http`` becomes ``'http'``; every other token is kept unchanged. The
    tokens are then joined back with single spaces.
    """
    def _mask(token):
        # A bare "@" is not treated as a mention.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))


# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Label mapping, hard-coded rather than downloaded (the download code is kept
# below for reference). `labels[i]` is used as the class name for model output
# index i, and `refl` converts that class name to the -1/0/1 coding.
# NOTE(review): the order is presumably that of the tweeteval mapping.txt -- confirm.
labels = ['negative', 'neutral', 'positive']
refl = {'negative':-1,'neutral':0,'positive':1}
# mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
# with urllib.request.urlopen(mapping_link) as f:
#     html = f.read().decode('utf-8').split("\n")
#     csvreader = csv.reader(html, delimiter='\t')
# labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# save_pretrained(MODEL) writes to a local directory whose path equals the hub
# id. Save the tokenizer there as well: otherwise a later
# AutoTokenizer.from_pretrained(MODEL) picks up that directory and fails
# because it contains no tokenizer files.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

# text = "Good night 😊"
# text = preprocess(text)
# encoded_input = tokenizer(text, return_tensors='pt')
# output = model(**encoded_input)
# scores = output[0][0].detach().numpy()
# scores = softmax(scores)

# score = np.argmax(scores)


In [2]:
import numpy as np
import pandas as pd
# Load the labelled tweets. Malformed rows are dropped with a warning:
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0.
df1 = pd.read_csv('Finished_vaccination_tweets_手动标记.csv', encoding='ISO-8859-1', on_bad_lines='warn')
# Keep only rows whose attitude code is one of the three valid values. The
# codes are compared as strings here and cast to int in a later cell.
# .copy() gives an independent frame, so later column assignments do not act
# on a slice of the raw frame (avoids SettingWithCopyWarning).
df1 = df1[df1['code-attitude'].isin(['0', '1', '-1'])].copy()
df1['text']

2       Facts are immutable, Senator, even when you're...
3       Explain to me again why we need a vaccine @Bor...
4       Does anyone have any useful advice/guidance fo...
5       it is a bit sad to claim the fame for success ...
6       There have not been many bright days in 2020 b...
                              ...                        
3101    #PublicHealth #COVID19 #Modernavaccine #Modern...
3102    @crashoverrideee #COVID19 Vaccine Update for #...
3103    Dr Fun's I Feel Good - My effort at a PSA, ple...
3104    @Writer_DG I got dose #2 Friday. Have to admit...
3105    Feeling very privileged to have had my first c...
Name: text, Length: 247, dtype: object

In [3]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import PorterStemmer
from nltk.stem import WordNetLemmatizer

def custom_preprocessor(text):
    '''
    Normalise raw text for bag-of-words style processing.

    Steps, in order: lowercase; drop text in square brackets; drop links
    (``http(s)://...`` and ``www....``); drop HTML-like tags; replace every
    remaining non-word character with a space; drop underscores; drop words
    containing digits.

    Links and tags are removed BEFORE non-word characters are replaced by
    spaces: once ``:``, ``/``, ``.``, ``<`` and ``>`` are gone, the link and
    tag patterns can no longer match.

    Parameters
    ----------
    text : str
        The raw text.

    Returns
    -------
    str
        The cleaned text. Removed characters leave spaces behind, so runs of
        several spaces are possible.
    '''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\W', ' ', text)  # remove special chars (also newlines)
    # Only "_" can still be present here, since it counts as a word character.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    return text

In [4]:
# Cast the columns directly; wrapping the Series in a DataFrame before
# assigning it back to a single column is unnecessary.
df1['text'] = df1['text'].astype(str)
# Mask mentions and links with the placeholders produced by preprocess().
df1['text_clean'] = df1['text'].apply(preprocess)
# The attitude codes were filtered as strings; store them as integers.
df1['code-attitude'] = df1['code-attitude'].astype(int)

In [5]:
# Predict one -1/0/1 code per cleaned tweet, in the row order of df1.
results = []
with torch.no_grad():  # inference only: do not build an autograd graph per tweet
    for tweet in df1['text_clean']:
        encoded_input = tokenizer(tweet, return_tensors='pt')
        output = model(**encoded_input)
        # Logits of the single sequence in the batch, turned into probabilities.
        scores = softmax(output.logits[0].numpy())
        # The highest-scoring class index selects the label name, which is
        # then mapped to its numeric code.
        label = labels[np.argmax(scores)]
        results.append(refl[label])

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
# Fraction of tweets whose predicted code equals the manually assigned code.
accuracy = accuracy_score(df1['code-attitude'], results)
print(f"Accuracy:{accuracy}")

Accuracy:0.8218623481781376


In [7]:
# Attach the model predictions as the last column. assign() returns a new
# frame and overwrites an existing column of the same name, so this cell can
# be re-run; df1.insert() raises ValueError once the column exists.
# NOTE: despite its name, 'truelabel' holds the PREDICTED codes; the manual
# codes are in 'code-attitude'. The name is kept so existing outputs match.
df1 = df1.assign(truelabel=results)

In [8]:
# Display the frame: manual codes ('code-attitude') next to the predictions ('truelabel').
df1

Unnamed: 0,code-attitude,id,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,...,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,text_clean,truelabel
2,0,1.34E+18,Charles Adler,"Vancouver, BC - Canada","Hosting ""CharlesAdlerTonight"" Global News Radi...",2010/9/8 11:28,49165,3933,21853,TRUE,...,,,,,,,,,"Facts are immutable, Senator, even when you're...",0
3,0,1.34E+18,Citizen News Channel,,Citizen News Channel bringing you an alternati...,2023/4/20 17:58,152,580,1473,FALSE,...,,,,,,,,,Explain to me again why we need a vaccine @use...,0
4,0,1.34E+18,Dee,"Birmingham, England","Gastroenterology trainee, Clinical Research Fe...",2026/1/20 21:43,105,108,106,FALSE,...,,,,,,,,,Does anyone have any useful advice/guidance fo...,0
5,-1,1.34E+18,Gunther Fehlinger,"Austria, Ukraine and Kosovo",End North Stream 2 now - the pipeline of corru...,2010/6/13 17:49,2731,5001,69344,FALSE,...,,,,,,,,,it is a bit sad to claim the fame for success ...,-1
6,1,1.34E+18,Dr.Krutika Kuppalli,,"ID, Global Health, VHF, Pandemic Prep, Emergin...",2025/3/19 4:14,21924,593,7815,TRUE,...,,,,,,,,,There have not been many bright days in 2020 b...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3101,0,1.35E+18,Tracey Muhammad,,"Believer, Sister",2018/9/12 2:46,195,599,2935,FALSE,...,,,,,,,,,#PublicHealth #COVID19 #Modernavaccine #Modern...,0
3102,0,1.35E+18,Canadian Advocacy Centre 4 Health Safety & Jus...,Canada,#CACHSJ advocates for #civilrights #humanright...,2010/9/20 19:03,606,2050,735,FALSE,...,,,,,,,,,@user #COVID19 Vaccine Update for #PfizerBioNT...,0
3103,1,1.35E+18,Dr. Fun,"Alabama, USA","Peds cardiologist, medical educator, wife, mom...",2021/1/17 0:03,239,299,3033,FALSE,...,,,,,,,,,"Dr Fun's I Feel Good - My effort at a PSA, ple...",1
3104,1,1.35E+18,Rest of the Dream,The Rose Garden,"Chronic insomniac, causer of bedlam, fixer of ...",2029/4/9 13:06,274,648,6107,FALSE,...,,,,,,,,,@user I got dose #2 Friday. Have to admit I fe...,0


In [9]:
# df1.to_csv('compare.csv',index=False)