# Stance evaluation

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
from tqdm.notebook import tqdm

import pandas as pd
from pathlib import Path
import swifter

In [None]:
# Stance sub-tasks to evaluate. Each name is used both as the suffix of the
# model identifier built in get_model() and as the value matched against the
# `subtask` column of the evaluation data.
TOPICS = [
    "abortion",
    "atheism",
    "climate",
    "feminist",
    "hillary",
]

# Class index -> stance label.
# NOTE(review): not referenced by the other cells visible here; the label list
# used for predictions is downloaded separately — confirm whether this is
# still needed.
MAPPING = {
    0: "none",
    1: "against",
    2: "favor"
}


FILENAME = "Ewaluacja ChatGPT - zadania - TweetEval - stance detection.csv" # CSV file exported from the Google Sheet

In [None]:
def get_model(task: str):
    """Load the stance model and tokenizer for *task*, caching them on disk.

    The model identifier is ``cardiffnlp/twitter-roberta-base-stance-{task}``.
    A directory of the same relative path is used as a local cache: the model
    is saved directly in it and the tokenizer in its ``tokenizer``
    sub-directory.

    Args:
        task: Stance sub-task name used as the suffix of the model identifier.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Keep the identifier as a plain string: rendering it through ``Path``
    # would produce back-slashes on Windows, which is not a valid Hub id.
    repo_id = f"cardiffnlp/twitter-roberta-base-stance-{task}"
    local_dir = Path(repo_id)
    tokenizer_dir = local_dir / "tokenizer"

    # Record the cache state *before* loading or saving anything, so that
    # creating one directory cannot change how the other artefact is resolved.
    model_cached = local_dir.is_dir()
    tokenizer_cached = tokenizer_dir.exists()

    if tokenizer_cached:
        tokenizer = AutoTokenizer.from_pretrained(str(tokenizer_dir))
    else:
        tokenizer = AutoTokenizer.from_pretrained(repo_id)

    # ``from_pretrained`` picks up the local directory by itself when a
    # directory with this relative path exists; otherwise it is treated as a
    # Hub identifier.
    model = AutoModelForSequenceClassification.from_pretrained(repo_id)

    # Only write to disk on a cache miss instead of re-serialising the weights
    # on every call.
    if not model_cached:
        model.save_pretrained(str(local_dir))
    if not tokenizer_cached:
        tokenizer.save_pretrained(str(tokenizer_dir))

    return model, tokenizer

In [None]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in *text* with fixed placeholders.

    The text is split on single spaces. A token that starts with ``@`` and is
    longer than one character becomes ``@user``; a token that starts with
    ``http`` becomes ``http``. All other tokens are kept unchanged and the
    tokens are joined back together with single spaces.
    """
    def _mask(token):
        # A bare "@" is left alone; only "@something" counts as a mention.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

In [None]:
def preprocess_and_tokenize(row, tokenizer):
    """Run preprocess() on *row* and tokenize the result.

    Args:
        row: Raw text to encode.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt')``.

    Returns:
        Whatever *tokenizer* returns for the preprocessed text.
    """
    cleaned_text = preprocess(row)
    encoded = tokenizer(cleaned_text, return_tensors='pt')
    return encoded

In [None]:
# Download the TweetEval stance label mapping (tab-separated lines) and keep
# the second column of every line that has at least two columns, in file order.
mapping_link = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/stance/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
csvreader = csv.reader(html, delimiter='\t')

labels = []
for row in csvreader:
    # Skip blank or malformed lines that lack a label column.
    if len(row) > 1:
        labels.append(row[1])

In [None]:
# Load the evaluation sheet named by FILENAME and preview its first rows.
df = pd.read_csv(FILENAME)
df.head()

Unnamed: 0,id,prompt,text,annotation,taken,subtask,chatgpt_final,chatgpt_raw,comment
0,0,\nAssuming that you have to describe the stanc...,#mtp @user How is deleting emails -part of the...,1,1,hillary,1,1,
1,1,\nAssuming that you have to describe the stanc...,@user @user AndrewWhyDoYouCareAboutWhatIThink?...,1,1,hillary,0,0,
2,2,\nAssuming that you have to describe the stanc...,The white male vote is solidly GOP. The black ...,1,1,hillary,1,1,
3,3,\nAssuming that you have to describe the stanc...,@user big banker buds need to ratchet up their...,1,1,hillary,1,1,
4,4,\nAssuming that you have to describe the stanc...,@user Why should I believe you on this? The GO...,1,1,hillary,1,1,


In [None]:
# No earlier cell defines `tokenizer`, so load one explicitly here; otherwise
# a fresh top-to-bottom run fails with NameError.
# NOTE(review): this reuses the first topic's tokenizer for every row, which
# assumes all per-topic checkpoints share the same tokenizer — confirm.
_unused_model, tokenizer = get_model(TOPICS[0])
del _unused_model  # only the tokenizer is needed in this cell

# Pre-compute the encoded model inputs for every text once.
df["tokenized"] = df["text"].swifter.apply(lambda r: preprocess_and_tokenize(r, tokenizer))
df

Pandas Apply:   0%|          | 0/1249 [00:00<?, ?it/s]

Unnamed: 0,id,prompt,text,annotation,taken,subtask,chatgpt_final,chatgpt_raw,comment,tokenized
0,0,\nAssuming that you have to describe the stanc...,#mtp @user How is deleting emails -part of the...,1,1,hillary,1,1,,"[input_ids, attention_mask]"
1,1,\nAssuming that you have to describe the stanc...,@user @user AndrewWhyDoYouCareAboutWhatIThink?...,1,1,hillary,0,0,,"[input_ids, attention_mask]"
2,2,\nAssuming that you have to describe the stanc...,The white male vote is solidly GOP. The black ...,1,1,hillary,1,1,,"[input_ids, attention_mask]"
3,3,\nAssuming that you have to describe the stanc...,@user big banker buds need to ratchet up their...,1,1,hillary,1,1,,"[input_ids, attention_mask]"
4,4,\nAssuming that you have to describe the stanc...,@user Why should I believe you on this? The GO...,1,1,hillary,1,1,,"[input_ids, attention_mask]"
...,...,...,...,...,...,...,...,...,...,...
1244,275,\nAssuming that you have to describe the stanc...,@user @user I followed him before I watched hu...,0,1,abortion,0,0,,"[input_ids, attention_mask]"
1245,276,\nAssuming that you have to describe the stanc...,"For he who avenges blood remembers, he does no...",1,1,abortion,0,0,,"[input_ids, attention_mask]"
1246,277,\nAssuming that you have to describe the stanc...,Life is sacred on all levels. Abortion does no...,1,1,abortion,1,1,,"[input_ids, attention_mask]"
1247,278,\nAssuming that you have to describe the stanc...,"@user U refer to ""WE"" which =""YOU"" & a minorit...",1,1,abortion,1,1,,"[input_ids, attention_mask]"


In [None]:
import torch

# Column-oriented accumulator: one list per output column, appended to in
# lock-step for every classified row.
bert_results = {
    "text": [],
    "roberta": [],
    "confidence": [],
    "annotation": [],
    "subtask": []
}

for subtask in TOPICS:
    # Each sub-task has its own fine-tuned checkpoint.
    model, tokenizer = get_model(subtask)
    subset = df[df.subtask == subtask]

    # Inference only: disable autograd so no graph is built per forward pass.
    with torch.no_grad():
        # `total` lets the progress bar show real progress instead of "0it".
        for _row_id, row in tqdm(subset.iterrows(), total=len(subset), desc=subtask):
            encoded_input = row["tokenized"]
            output = model(**encoded_input)

            # Logits of the single sequence in the batch -> class probabilities.
            scores = softmax(output[0][0].numpy())

            # Only the top class is needed, so argmax replaces a full sort.
            best = int(np.argmax(scores))

            bert_results["roberta"].append(labels[best])
            bert_results["confidence"].append(scores[best])
            bert_results["subtask"].append(subtask)
            bert_results["text"].append(row["text"])
            bert_results["annotation"].append(row["annotation"])

0it [00:00, ?it/s]

Downloading config.json:   0%|          | 0.00/742 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/476M [00:00<?, ?B/s]

0it [00:00, ?it/s]

Downloading config.json:   0%|          | 0.00/742 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/476M [00:00<?, ?B/s]

0it [00:00, ?it/s]

Downloading config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/476M [00:00<?, ?B/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [None]:
# Build the results table and add a numeric class id for each predicted label:
# "none" -> 0, "against" -> 1, anything else -> 2.
_LABEL_TO_ID = {"none": 0, "against": 1}

results = pd.DataFrame(bert_results)
results["roberta_final"] = results["roberta"].swifter.apply(
    lambda val: _LABEL_TO_ID.get(val, 2)
)

Pandas Apply:   0%|          | 0/1249 [00:00<?, ?it/s]

In [None]:
# Display the results table.
results

Unnamed: 0,text,roberta,confidence,annotation,subtask,roberta_final
0,Need a ProLife R.E. Agent? - Support a ProLife...,against,0.931517,1,abortion,1
1,Where is the childcare program @user which you...,against,0.509997,1,abortion,1
2,I get several requests with petitions to save ...,against,0.527835,1,abortion,1
3,"we must always see others as Christ sees us,we...",none,0.653959,1,abortion,0
4,PRAYERS FOR BABIES Urgent prayer one in Lexing...,against,0.869390,1,abortion,1
...,...,...,...,...,...,...
1244,.@HillaryClinton Looking 4ward 2 hearing your ...,against,0.973792,2,hillary,1
1245,@user I'm loving it too! Draw that contrast!!!...,none,0.612639,0,hillary,0
1246,"@user Can't stand @user anymore, but hope @use...",none,0.591747,2,hillary,0
1247,Hillary can't create jobs! Last time she had a...,against,0.979370,1,hillary,1


In [None]:
# Write the results table to CSV without the index column.
results.to_csv("RobertaBase-stance.csv", index=False)