In [4]:
# File  : pretrainedbertontweets.py
# Author: Shawn Li Xiaoyin
# Date  : 16/4/21
import csv
import urllib.request

import numpy as np
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification


# Preprocess text (username and link placeholders)
def preprocess(text):
    """Swap user mentions and links in ``text`` for fixed placeholders.

    ``text`` is split on single spaces. A token that starts with '@' and
    is longer than one character becomes '@user'; a token that starts
    with 'http' becomes 'http'. All other tokens are kept as they are,
    and the tokens are joined back together with single spaces.
    """
    def _placeholder(token):
        # A lone '@' is left alone; only real mentions are masked.
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_placeholder(token) for token in text.split(" "))


# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

# Which task's pretrained checkpoint to load (see the task list above).
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Label mapping, hard-coded rather than downloaded (the download code is kept
# below for reference). `labels` is indexed by the argmax of the model scores;
# `refl` converts each label name to the numeric value stored in the results.
labels = ['negative', 'neutral', 'positive']
refl = {'negative':-1,'neutral':0,'positive':1}
# mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
# with urllib.request.urlopen(mapping_link) as f:
#     html = f.read().decode('utf-8').split("\n")
#     csvreader = csv.reader(html, delimiter='\t')
# labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# save_pretrained(MODEL) writes into a local directory whose path equals the
# hub id. On the next run from_pretrained(MODEL) resolves to that directory,
# so the tokenizer files must be saved there too; otherwise the tokenizer
# load above fails on a re-run because the directory holds only the model.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

# text = "Good night 😊"
# text = preprocess(text)
# encoded_input = tokenizer(text, return_tensors='pt')
# output = model(**encoded_input)
# scores = output[0][0].detach().numpy()
# scores = softmax(scores)

# score = np.argmax(scores)


In [5]:
import numpy as np
import pandas as pd
# Read the tweets CSV from the working directory into `df`, then show it.
df = pd.read_csv(filepath_or_buffer='vaccination_tweets.csv')
df

Unnamed: 0,id,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,retweets,favorites,is_retweet
0,1.340000e+18,Rachel Roh,"La Crescenta-Montrose, CA",Aggregator of Asian American news; scanning di...,8/4/09 17:52,405,1692,3247,False,20/12/20 6:06,Same folks said daikon paste could treat a cyt...,['PfizerBioNTech'],Twitter for Android,0,0,False
1,1.340000e+18,Albert Fong,"San Francisco, CA","Marketing dude, tech geek, heavy metal & '80s ...",21/9/09 15:27,834,666,178,False,13/12/20 16:27,While the world has been on the wrong side of ...,,Twitter Web App,1,1,False
2,1.340000e+18,eli🇱🇹🇪🇺👌,Your Bed,"heil, hydra 🖐☺",25/6/20 23:30,10,88,155,False,12/12/20 20:33,#coronavirus #SputnikV #AstraZeneca #PfizerBio...,"['coronavirus', 'SputnikV', 'AstraZeneca', 'Pf...",Twitter for Android,0,0,False
3,1.340000e+18,Charles Adler,"Vancouver, BC - Canada","Hosting ""CharlesAdlerTonight"" Global News Radi...",10/9/08 11:28,49165,3933,21853,True,12/12/20 20:23,"Facts are immutable, Senator, even when you're...",,Twitter Web App,446,2129,False
4,1.340000e+18,Citizen News Channel,,Citizen News Channel bringing you an alternati...,23/4/20 17:58,152,580,1473,False,12/12/20 20:17,Explain to me again why we need a vaccine @Bor...,"['whereareallthesickpeople', 'PfizerBioNTech']",Twitter for iPhone,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5611,1.360000e+18,Dr. Melvin Sanicas 🩺🔬,"Zurich, Switzerland",Physician Scientist FRSPH FRSA • #GlobalHealth...,19/7/17 15:21,33265,2009,19074,False,8/2/21 16:37,Neutralization geometric mean titers (GMTs) of...,['PfizerBiontech'],Twitter Web App,10,25,False
5612,1.360000e+18,Ed. K.,"Athens, Greece","Edward Katsikian - PT\nMSc, Sports Injuries an...",3/2/12 11:44,287,1857,3063,False,8/2/21 16:35,I just got my 1st dose of #PfizerBiontech #COV...,"['PfizerBiontech', 'COVID19vaccine', 'EndThePa...",Twitter for Android,0,0,False
5613,1.360000e+18,Nancy Nahrwold (Burke),"Sarasota, FL","⚡️How do you spell love? You don’t spell it, y...",7/9/17 0:22,8,100,1090,False,8/2/21 15:50,Fully vaccinated and high af. #PfizerBiontech ...,"['PfizerBiontech', 'COVID19', 'mondaythoughts']",Twitter for iPhone,0,0,False
5614,1.360000e+18,Yovka Dimitrova,Brussels,Journalist. European with Bulgarian mother ton...,3/8/10 17:43,360,786,4981,False,8/2/21 15:39,The planned shipment of #PfizerBiontech #vacci...,"['PfizerBiontech', 'vaccine', 'Germany']",Twitter for iPad,0,0,False


In [6]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def custom_preprocessor(text):
    '''
    Normalise a piece of text.

    Steps, in order:
      1. lowercase;
      2. remove text enclosed in square brackets;
      3. remove links (http://..., https://..., www....);
      4. remove <...> tags;
      5. replace every remaining non-word character with a space;
      6. remove underscores (the only punctuation that survives step 5);
      7. remove words containing digits.

    Links and tags are removed before step 5 on purpose: once the
    non-word characters (':', '/', '.', '<', '>') have been turned into
    spaces, the link and tag patterns can no longer match.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text. Runs of spaces left behind by removed items
        are not collapsed.
    '''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # Must run while ':', '/', '.', '<' and '>' are still present.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Remaining special characters (newlines included) become spaces.
    text = re.sub(r'\W', ' ', text)
    # '_' counts as a word character, so it is stripped separately here.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    return text

In [7]:
# df['text'] = pd.DataFrame(df1['text'].astype(str))
# Mask @mentions and links in every tweet; keep the result in a new column.
df['text_clean'] = df['text'].map(preprocess)

In [8]:
# Classify every cleaned tweet and collect the numeric label
# (-1 / 0 / 1, as defined by `refl`) in `results`, in row order.
results = []
# Inference only: without autograd the forward passes need less memory
# and time, and the logits can be converted to numpy without detach().
with torch.no_grad():
    for tweet in df['text_clean']:
        # Truncate so an unusually long token sequence cannot exceed the
        # model's input limit (512 assumed for this RoBERTa-base
        # checkpoint -- confirm if MODEL is changed).
        encoded_input = tokenizer(
            tweet, return_tensors='pt', truncation=True, max_length=512
        )
        output = model(**encoded_input)
        # output[0] holds the logits; [0] selects the single example.
        scores = softmax(output[0][0].numpy())
        best_idx = int(np.argmax(scores))
        results.append(refl[labels[best_idx]])

In [9]:
# Attach the predictions as the last column. Plain assignment is used
# instead of df.insert(), which raises ValueError if the column already
# exists, so this cell can be re-run safely. On the first run the new
# column is appended at the end, exactly where insert() would have put it.
df['prelabel'] = results
df

Unnamed: 0,id,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,retweets,favorites,is_retweet,text_clean,prelabel
0,1.340000e+18,Rachel Roh,"La Crescenta-Montrose, CA",Aggregator of Asian American news; scanning di...,8/4/09 17:52,405,1692,3247,False,20/12/20 6:06,Same folks said daikon paste could treat a cyt...,['PfizerBioNTech'],Twitter for Android,0,0,False,Same folks said daikon paste could treat a cyt...,0
1,1.340000e+18,Albert Fong,"San Francisco, CA","Marketing dude, tech geek, heavy metal & '80s ...",21/9/09 15:27,834,666,178,False,13/12/20 16:27,While the world has been on the wrong side of ...,,Twitter Web App,1,1,False,While the world has been on the wrong side of ...,0
2,1.340000e+18,eli🇱🇹🇪🇺👌,Your Bed,"heil, hydra 🖐☺",25/6/20 23:30,10,88,155,False,12/12/20 20:33,#coronavirus #SputnikV #AstraZeneca #PfizerBio...,"['coronavirus', 'SputnikV', 'AstraZeneca', 'Pf...",Twitter for Android,0,0,False,#coronavirus #SputnikV #AstraZeneca #PfizerBio...,0
3,1.340000e+18,Charles Adler,"Vancouver, BC - Canada","Hosting ""CharlesAdlerTonight"" Global News Radi...",10/9/08 11:28,49165,3933,21853,True,12/12/20 20:23,"Facts are immutable, Senator, even when you're...",,Twitter Web App,446,2129,False,"Facts are immutable, Senator, even when you're...",0
4,1.340000e+18,Citizen News Channel,,Citizen News Channel bringing you an alternati...,23/4/20 17:58,152,580,1473,False,12/12/20 20:17,Explain to me again why we need a vaccine @Bor...,"['whereareallthesickpeople', 'PfizerBioNTech']",Twitter for iPhone,0,0,False,Explain to me again why we need a vaccine @use...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5611,1.360000e+18,Dr. Melvin Sanicas 🩺🔬,"Zurich, Switzerland",Physician Scientist FRSPH FRSA • #GlobalHealth...,19/7/17 15:21,33265,2009,19074,False,8/2/21 16:37,Neutralization geometric mean titers (GMTs) of...,['PfizerBiontech'],Twitter Web App,10,25,False,Neutralization geometric mean titers (GMTs) of...,0
5612,1.360000e+18,Ed. K.,"Athens, Greece","Edward Katsikian - PT\nMSc, Sports Injuries an...",3/2/12 11:44,287,1857,3063,False,8/2/21 16:35,I just got my 1st dose of #PfizerBiontech #COV...,"['PfizerBiontech', 'COVID19vaccine', 'EndThePa...",Twitter for Android,0,0,False,I just got my 1st dose of #PfizerBiontech #COV...,1
5613,1.360000e+18,Nancy Nahrwold (Burke),"Sarasota, FL","⚡️How do you spell love? You don’t spell it, y...",7/9/17 0:22,8,100,1090,False,8/2/21 15:50,Fully vaccinated and high af. #PfizerBiontech ...,"['PfizerBiontech', 'COVID19', 'mondaythoughts']",Twitter for iPhone,0,0,False,Fully vaccinated and high af. #PfizerBiontech ...,1
5614,1.360000e+18,Yovka Dimitrova,Brussels,Journalist. European with Bulgarian mother ton...,3/8/10 17:43,360,786,4981,False,8/2/21 15:39,The planned shipment of #PfizerBiontech #vacci...,"['PfizerBiontech', 'vaccine', 'Germany']",Twitter for iPad,0,0,False,The planned shipment of #PfizerBiontech #vacci...,-1


In [10]:
# Write the labelled frame to disk, leaving out the DataFrame index.
df.to_csv(path_or_buf="tweets_label.csv", index=False)