# Tweeteval stance detection

In [2]:
import pandas as pd
from pathlib import Path
import swifter
from scipy.stats import entropy
from collections import Counter

In [3]:
!git clone https://github.com/cardiffnlp/tweeteval.git

/bin/bash: /home/oliwier/anaconda3/envs/representation/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Cloning into 'tweeteval'...
remote: Enumerating objects: 370, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 370 (delta 13), reused 1 (delta 1), pack-reused 354[K
Receiving objects: 100% (370/370), 8.49 MiB | 12.01 MiB/s, done.
Resolving deltas: 100% (122/122), done.


In [4]:
DATA_DIR = Path("tweeteval") / "datasets"
STANCE_DIR = DATA_DIR / "stance"
GPT_INPUT_PATH = Path("gpt-input")

GPT_INPUT_PATH.mkdir(exist_ok=True)

In [5]:
TOPICS = [
    "abortion",
    "atheism",
    "climate",
    "feminist",
    "hillary",
]

TOPIC_MAP = {
    "abortion": "abortion",
    "atheism": "atheism",
    "feminist": "feminism",
    "hillary": "hillary",
    "climate": "climate"
}

MAPPING = {
    0: "none",
    1: "against",
    2: "favor"
}

RESULTS = ["0", "1", "2"]

## Structure

In [6]:
def load_test_data(dataset_path: Path):
    df_text = pd.read_csv(dataset_path / "test_text.txt", sep="delimiter", header=None, names=["text"])
    df_labels = pd.read_csv(dataset_path / "test_labels.txt", sep="delimiter", header=None, names=["labels"])
    return pd.concat([df_text, df_labels], axis=1)

In [7]:
df = load_test_data(STANCE_DIR / "hillary")
df

  df_text = pd.read_csv(dataset_path / "test_text.txt", sep="delimiter", header=None, names=["text"])
  df_labels = pd.read_csv(dataset_path / "test_labels.txt", sep="delimiter", header=None, names=["labels"])


Unnamed: 0,text,labels
0,#mtp @user How is deleting emails -part of the...,1
1,@user @user AndrewWhyDoYouCareAboutWhatIThink?...,1
2,The white male vote is solidly GOP. The black ...,1
3,@user big banker buds need to ratchet up their...,1
4,@user Why should I believe you on this? The GO...,1
...,...,...
290,.@HillaryClinton Looking 4ward 2 hearing your ...,2
291,@user I'm loving it too! Draw that contrast!!!...,0
292,"@user Can't stand @user anymore, but hope @use...",2
293,Hillary can't create jobs! Last time she had a...,1


In [8]:
entropy(list(Counter(df.labels).values())) # dataset entropy

0.9531036596165436

## Prompts preparation

In [10]:
MAIN_STORY = """
Assuming that you have to describe the stance from the given list which is evoked after 
reading the text about {} by majority of people, which one would you select? 
Map your answer with following none:  0, against: 1, favor: 2. Do not explain yourself.
Input data:

Text: {}

Possible stances: {}

"""

In [11]:
def process_dataframe(df: pd.DataFrame, output_path: str, topic: str):
    final_answer = ""
    for id_, row in df.iterrows():
        final_answer += MAIN_STORY.format(topic, row["text"], ", ".join(MAPPING.values()))

    with open(output_path, "w") as output_file:
        output_file.write(final_answer)

In [12]:
def generate_dataframe(df: pd.DataFrame, topic: str):
    df["prompt"] = df["text"].swifter.apply(lambda row: MAIN_STORY.format(topic, row, ", ".join(MAPPING.values())))
    df = df.drop(["text", "labels"], axis = 1)
    df = df.reset_index(names="id")
    return df
                                            

In [18]:
for topic, topic_name in TOPIC_MAP.items():
    new_df = generate_dataframe(load_test_data(STANCE_DIR / topic), topic_name)
    new_df.to_csv(GPT_INPUT_PATH / f"{topic}.csv", index=False)

  df_text = pd.read_csv(dataset_path / "test_text.txt", sep="delimiter", header=None, names=["text"])
  df_labels = pd.read_csv(dataset_path / "test_labels.txt", sep="delimiter", header=None, names=["labels"])


Pandas Apply:   0%|          | 0/280 [00:00<?, ?it/s]

  df_text = pd.read_csv(dataset_path / "test_text.txt", sep="delimiter", header=None, names=["text"])
  df_labels = pd.read_csv(dataset_path / "test_labels.txt", sep="delimiter", header=None, names=["labels"])


Pandas Apply:   0%|          | 0/220 [00:00<?, ?it/s]

  df_text = pd.read_csv(dataset_path / "test_text.txt", sep="delimiter", header=None, names=["text"])
  df_labels = pd.read_csv(dataset_path / "test_labels.txt", sep="delimiter", header=None, names=["labels"])


Pandas Apply:   0%|          | 0/285 [00:00<?, ?it/s]

  df_text = pd.read_csv(dataset_path / "test_text.txt", sep="delimiter", header=None, names=["text"])
  df_labels = pd.read_csv(dataset_path / "test_labels.txt", sep="delimiter", header=None, names=["labels"])


Pandas Apply:   0%|          | 0/295 [00:00<?, ?it/s]

  df_text = pd.read_csv(dataset_path / "test_text.txt", sep="delimiter", header=None, names=["text"])
  df_labels = pd.read_csv(dataset_path / "test_labels.txt", sep="delimiter", header=None, names=["labels"])


Pandas Apply:   0%|          | 0/169 [00:00<?, ?it/s]