In [14]:
import os
import pandas as pd
from datasets import load_dataset
import ast
import random

## Load data from local files

Data paths

In [15]:
# Folder holding the raw TSV files, relative to the working directory.
data_dir = os.path.join('dataset')

# One arguments file per split name.
(
    arguments_training_filepath,
    arguments_validation_filepath,
    arguments_validation_filepath_zhihu,
    arguments_test_filepath,
) = [
    os.path.join(data_dir, f'arguments-{split}.tsv')
    for split in ('training', 'validation', 'validation-zhihu', 'test')
]

# The matching labels file for each of the same split names.
(
    labels_training_filepath,
    labels_validation_filepath,
    labels_validation_filepath_zhihu,
    labels_test_filepath,
) = [
    os.path.join(data_dir, f'labels-{split}.tsv')
    for split in ('training', 'validation', 'validation-zhihu', 'test')
]

Load the data

In [16]:
# Argument tables: read each tab-separated file, taking the first line as the header.
arguments_training = pd.read_table(arguments_training_filepath, encoding='utf-8', header=0)
arguments_validation = pd.read_table(arguments_validation_filepath, encoding='utf-8', header=0)
arguments_validation_zhihu = pd.read_table(arguments_validation_filepath_zhihu, encoding='utf-8', header=0)
arguments_test = pd.read_table(arguments_test_filepath, encoding='utf-8', header=0)

# Label tables for the same splits, read the same way.
labels_training = pd.read_table(labels_training_filepath, encoding='utf-8', header=0)
labels_validation = pd.read_table(labels_validation_filepath, encoding='utf-8', header=0)
labels_validation_zhihu = pd.read_table(labels_validation_filepath_zhihu, encoding='utf-8', header=0)
labels_test = pd.read_table(labels_test_filepath, encoding='utf-8', header=0)

# Sanity check: show the first argument next to its label row.
print(arguments_training.iloc[0], '\n', labels_training.iloc[0])

Argument ID                                               A01001
Conclusion                        Entrapment should be legalized
Stance                                               in favor of
Premise        if entrapment can serve to more easily capture...
Name: 0, dtype: object 
 Argument ID                   A01001
Self-direction: thought            0
Self-direction: action             0
Stimulation                        0
Hedonism                           0
Achievement                        0
Power: dominance                   0
Power: resources                   0
Face                               0
Security: personal                 0
Security: societal                 1
Tradition                          0
Conformity: rules                  0
Conformity: interpersonal          0
Humility                           0
Benevolence: caring                0
Benevolence: dependability         0
Universalism: concern              0
Universalism: nature               0
Universalism

## Some helper functions

In [17]:
# Value-category names as they are written into the prompts
# (the dataset's column names with the ':' removed).
LABELS = [
    'Self-direction thought',
    'Self-direction action',
    'Stimulation',
    'Hedonism',
    'Achievement',
    'Power dominance',
    'Power resources',
    'Face',
    'Security personal',
    'Security societal',
    'Tradition',
    'Conformity rules',
    'Conformity interpersonal',
    'Humility',
    'Benevolence caring',
    'Benevolence dependability',
    'Universalism concern',
    'Universalism nature',
    'Universalism tolerance',
    'Universalism objectivity',
]

# Prompt templates. Each takes four positional fields, in this order:
# premise, stance, conclusion, comma-separated label list.
PROMPT_FORMATS = [
    (
        "The premise: '{}' is '{}'. The conclusion is '{}'. Value category: {}\n"
        " Question: Which value category does the argument belong to?\n"
    ),
    (
        "Premise: {}\nStance: {}\nConclusion: {}. Value category: {}\n"
        " Question: Which value category does the argument belong to?\n"
    ),
    (
        "Argument: {}. {}. {}. Value category: {}\n"
        " Question: Which value category does the argument belong to?\n"
    ),
]

def convert_binary_labels_to_string(df):
    """Add a 'String Labels' column listing, per row, the names of the columns set to 1.

    The first column is treated as the identifier and skipped; every other
    column is treated as a label flag. The column is added to ``df`` in place
    and the same frame is returned.
    """
    flag_columns = df.columns[1:]
    df['String Labels'] = [
        [name for name, flag in zip(flag_columns, record[1:]) if flag == 1]
        for record in df.itertuples(index=False, name=None)
    ]
    return df

"""Create multiple prompts for each argument to see which one works best"""
def ensemble_prompt(df, prompt_formats=None, labels=None):
    """Build one prompt per template for every argument, to compare which template works best.

    Args:
        df: DataFrame with 'Premise', 'Stance' and 'Conclusion' columns.
        prompt_formats: Optional list of templates, each with four positional
            ``{}`` fields (premise, stance, conclusion, label list).
            Defaults to the notebook-level ``PROMPT_FORMATS``.
        labels: Optional list of label names to offer in the prompt.
            Defaults to the notebook-level ``LABELS``.

    Returns:
        The same ``df`` object, with an 'ensemble_prompt' column added in place.
        Each cell holds a list with one formatted prompt per template.
    """
    templates = PROMPT_FORMATS if prompt_formats is None else prompt_formats
    # The label list is identical for every row and template, so join it once.
    label_list = ', '.join(LABELS if labels is None else labels)

    df['ensemble_prompt'] = [
        [template.format(premise, stance, conclusion, label_list) for template in templates]
        for premise, stance, conclusion in zip(df['Premise'], df['Stance'], df['Conclusion'])
    ]
    return df

def few_shot_prompt(df, num_shots=1, prompt_format=0, random_seed=42, prompt_formats=None, labels=None):
    """Build a few-shot prompt for every argument in ``df``.

    ``num_shots`` rows are sampled from ``df`` itself and turned into
    demonstrations that are prepended to every row's own prompt. Because the
    demonstrations come from ``df``, a row can appear as its own example.

    NOTE: the "Answer" in each demonstration is a randomly chosen label, not
    the argument's gold label (this function never sees the label table).
    The choice is drawn from a generator seeded with ``random_seed`` so the
    prompts are the same on every run.

    Args:
        df: DataFrame with 'Premise', 'Stance' and 'Conclusion' columns.
        num_shots: Number of demonstration arguments to sample.
        prompt_format: Index of the template to use.
        random_seed: Seed for both the row sampling and the placeholder answers.
        prompt_formats: Optional list of templates to index into.
            Defaults to the notebook-level ``PROMPT_FORMATS``.
        labels: Optional list of label names. Defaults to the notebook-level ``LABELS``.

    Returns:
        The same ``df`` object, with a 'few_shot_prompt' column added in place.
    """
    templates = PROMPT_FORMATS if prompt_formats is None else prompt_formats
    template = templates[prompt_format]
    label_names = list(LABELS if labels is None else labels)
    label_list = ', '.join(label_names)

    # Local generator: keeps the placeholder answers reproducible without
    # touching the global `random` state.
    rng = random.Random(random_seed)

    selected_arguments = df.sample(n=num_shots, random_state=random_seed)
    demonstrations = ''.join(
        template.format(row['Premise'], row['Stance'], row['Conclusion'], label_list)
        + f"Answer: {rng.choice(label_names)}\n"
        for _, row in selected_arguments.iterrows()
    )

    df['few_shot_prompt'] = [
        demonstrations + template.format(premise, stance, conclusion, label_list) + "Answer: \n"
        for premise, stance, conclusion in zip(df['Premise'], df['Stance'], df['Conclusion'])
    ]
    return df

def combine_columns(df_arguments, df_labels):
    """Join the arguments with their string labels on `Argument ID`.

    Only the 'Argument ID' and 'String Labels' columns of ``df_labels`` are
    used; 'String Labels' appears as 'Labels' in the result. Neither input
    frame is modified.
    """
    label_view = df_labels[['Argument ID', 'String Labels']].rename(
        columns={'String Labels': 'Labels'}
    )
    return df_arguments.merge(label_view, on='Argument ID')

def labels_to_multi_choice(labels=None):
    """Format labels as lettered multiple-choice options ('A: ...', 'B: ...', ...).

    Args:
        labels: Optional iterable of label names. Defaults to the
            notebook-level ``LABELS``.

    Returns:
        A list of strings, one per label, in input order.
    """
    options = LABELS if labels is None else labels
    # chr(65) is 'A'. Past 26 options the prefix runs beyond 'Z' into
    # punctuation characters.
    return [f"{chr(65 + position)}: {label}" for position, label in enumerate(options)]


In [32]:
# Derive the plain label names from the label columns: skip the ID column and the
# 'String Labels' helper column, and drop the ':' from names such as 'Security: societal'.
labels = [
    column.replace(':', '')
    for column in labels_training.columns.unique()
    if column not in ('Argument ID', 'String Labels')
]
multi_choice_options = labels_to_multi_choice()
print(multi_choice_options)

# Add one prompt per template to every training argument, then show the first argument's prompts.
ensemble_prompt_df = ensemble_prompt(arguments_training)
ensemble_prompt_df.loc[0, 'ensemble_prompt']

['A: Self-direction thought', 'B: Self-direction action', 'C: Stimulation', 'D: Hedonism', 'E: Achievement', 'F: Power dominance', 'G: Power resources', 'H: Face', 'I: Security personal', 'J: Security societal', 'K: Tradition', 'L: Conformity rules', 'M: Conformity interpersonal', 'N: Humility', 'O: Benevolence caring', 'P: Benevolence dependability', 'Q: Universalism concern', 'R: Universalism nature', 'S: Universalism tolerance', 'T: Universalism objectivity']


["The premise: 'if entrapment can serve to more easily capture wanted criminals, then why shouldn't it be legal?' is 'in favor of'. The conclusion is 'Entrapment should be legalized'. Value category: Self-direction thought, Self-direction action, Stimulation, Hedonism, Achievement, Power dominance, Power resources, Face, Security personal, Security societal, Tradition, Conformity rules, Conformity interpersonal, Humility, Benevolence caring, Benevolence dependability, Universalism concern, Universalism nature, Universalism tolerance, Universalism objectivity\n Question: Which value category does the argument belong to?\n",
 "Premise: if entrapment can serve to more easily capture wanted criminals, then why shouldn't it be legal?\nStance: in favor of\nConclusion: Entrapment should be legalized. Value category: Self-direction thought, Self-direction action, Stimulation, Hedonism, Achievement, Power dominance, Power resources, Face, Security personal, Security societal, Tradition, Conform

In [33]:
# Prefix every training argument with two sampled demonstrations (first template)
# and show the prompt built for the first row.
few_shot_promp_df = few_shot_prompt(df=arguments_training, num_shots=2, prompt_format=0)
few_shot_promp_df.loc[0, 'few_shot_prompt']

"The premise: 'the words &quot;he&quot; and &quot;she&quot; are a way of labeling human beings and not everyone feels like they fit into either of these categories. by doing away with gender specific words you will make everyone feel the same.' is 'in favor of'. The conclusion is 'We should adopt gender-neutral language'. Value category: Self-direction thought, Self-direction action, Stimulation, Hedonism, Achievement, Power dominance, Power resources, Face, Security personal, Security societal, Tradition, Conformity rules, Conformity interpersonal, Humility, Benevolence caring, Benevolence dependability, Universalism concern, Universalism nature, Universalism tolerance, Universalism objectivity\n Question: Which value category does the argument belong to?\nAnswer: Universalism tolerance\nThe premise: 'surrogacy is hugely emotionally damaging on the birth mother, and is often only resorted to as a result of financial hardship.' is 'in favor of'. The conclusion is 'Surrogacy should be b

In [36]:
# Turn the 0/1 label columns into a list of label names per argument.
# This adds 'String Labels' to `labels_training` in place.
converted_labels = convert_binary_labels_to_string(labels_training)
# Attach those labels to the arguments. `arguments_training` already carries the
# 'ensemble_prompt' and 'few_shot_prompt' columns here, because the prompt helpers
# called in the earlier cells add them in place.
few_shot_promp_df = combine_columns(arguments_training, converted_labels)

# arguments_training['Prompt'].iloc[0]
few_shot_promp_df

Unnamed: 0,Argument ID,Conclusion,Stance,Premise,ensemble_prompt,few_shot_prompt,Labels
0,A01001,Entrapment should be legalized,in favor of,if entrapment can serve to more easily capture...,[The premise: 'if entrapment can serve to more...,The premise: 'the words &quot;he&quot; and &qu...,[Security: societal]
1,A01002,We should ban human cloning,in favor of,we should ban human cloning as it will only ca...,[The premise: 'we should ban human cloning as ...,The premise: 'the words &quot;he&quot; and &qu...,[Security: societal]
2,A01003,We should abandon marriage,against,marriage is the ultimate commitment to someone...,[The premise: 'marriage is the ultimate commit...,The premise: 'the words &quot;he&quot; and &qu...,[Self-direction: action]
3,A01004,We should ban naturopathy,against,it provides a useful income for some people,[The premise: 'it provides a useful income for...,The premise: 'the words &quot;he&quot; and &qu...,[Security: personal]
4,A01005,We should ban fast food,in favor of,fast food should be banned because it is reall...,[The premise: 'fast food should be banned beca...,The premise: 'the words &quot;he&quot; and &qu...,[Security: personal]
...,...,...,...,...,...,...,...
5215,D27096,Nepotism exists in Bollywood,against,Star kids also have an upbringing which is sur...,[The premise: 'Star kids also have an upbringi...,The premise: 'the words &quot;he&quot; and &qu...,"[Self-direction: thought, Achievement, Face]"
5216,D27097,Nepotism exists in Bollywood,in favor of,Movie stars of Bollywood often launch their ch...,[The premise: 'Movie stars of Bollywood often ...,The premise: 'the words &quot;he&quot; and &qu...,[Benevolence: caring]
5217,D27098,India is safe for women,in favor of,Evil historic practices on women in the pre an...,[The premise: 'Evil historic practices on wome...,The premise: 'the words &quot;he&quot; and &qu...,"[Security: societal, Universalism: objectivity]"
5218,D27099,India is safe for women,in favor of,Women of our country have been and are achievi...,[The premise: 'Women of our country have been ...,The premise: 'the words &quot;he&quot; and &qu...,"[Achievement, Security: societal, Universalism..."


Combine the arguments and their string labels into a single DataFrame

In [None]:
# `training_arg_labels` is not assigned by any cell in this notebook, so a fresh
# kernel raised NameError here. Build it from the arguments and the string labels.
training_arg_labels = combine_columns(arguments_training, converted_labels)

# NOTE(review): later cells read a 'Prompt' column that no visible cell creates.
# It is reconstructed here from the wording of the hand-written few-shot example
# in the few-shot cell ('The premise "..." is "..." and the conclusion is "..."').
# Confirm against the original run.
if 'Prompt' not in training_arg_labels.columns:
    training_arg_labels['Prompt'] = [
        f'The premise "{premise}" is "{stance}" and the conclusion is "{conclusion}"'
        for premise, stance, conclusion in zip(
            training_arg_labels['Premise'],
            training_arg_labels['Stance'],
            training_arg_labels['Conclusion'],
        )
    ]

training_arg_labels.head()

## Load the data from disk

In [None]:
# NOTE(review): `load_dataset` is already imported in the first cell, and neither it
# nor `Dataset` is used in this cell.
from datasets import load_dataset, Dataset, load_from_disk
# Directory of a dataset saved to disk. No cell in this notebook writes it;
# presumably a separate preprocessing step does (TODO confirm).
dataset_path = 'dataset/processed/touche23'
dataset = load_from_disk(dataset_path)
train = dataset['train']
# Show the first training record.
train[0]

## Setup the model

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Tokenizer and model are loaded from the same checkpoint name.
checkpoint = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

## Select 10 random samples 

In [None]:
# Fixed seed so the same ten arguments are drawn on every Restart & Run All.
test_arguments = training_arg_labels.sample(10, random_state=42)
test_arguments['Prompt'].iloc[0]

## Zero-shot Prompting

In [None]:
def query_from_list(query, options):
    """Zero-shot query: ask which of ``options`` an argument with conclusion ``query`` draws on.

    Formats one instruction string, prints it, runs it through the
    notebook-level ``tokenizer`` and ``model`` (at most 20 new tokens), and
    returns the decoded generations as a list of strings.
    """
    instruction = f'Classify whether the argument, with the conclusion "{query}", draws on the value categories of {options}.'
    print(instruction)
    encoded = tokenizer(instruction, return_tensors="pt")
    generated = model.generate(**encoded, max_new_tokens=20)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

for index, row in test_arguments.iterrows():
    # Pass the conclusion text itself. Wrapping it in a list made the f-string in
    # query_from_list render the list repr (brackets and quotes) inside the prompt.
    result = query_from_list(row['Conclusion'], labels)
    print(f"{row['Prompt']:<24} \n Prediction: {result[0]}\n True Label: {row['Labels']}\n")

## Few-shot classification prompt

In [None]:
def query_from_list(query, options):
    """One-shot query: classify the argument text ``query`` into ``options``.

    The prompt is one worked example followed by the same instruction for
    ``query``. It is printed, run through the notebook-level ``tokenizer`` and
    ``model`` (at most 30 new tokens), and the decoded generations are
    returned as a list of strings.

    NOTE: this redefines the ``query_from_list`` from the zero-shot cell.
    """
    # The example answer uses the label spelling without ':' so that it matches
    # the colon-stripped names passed in as `options`.
    worked_example = (
        f'Classify the following argument into  "{options}". '
        'Context: "The premise "wikipedia would be more accurate if it was subsidized." '
        'is "in favor of" and the conclusion is "We should subsidize Wikipedia". '
        "Answer: ['Achievement', 'Benevolence caring']\n"
    )
    question = f' Classify the following argument into  "{options}". Context: "{query}. Answer: '
    t5query = worked_example + question
    print(t5query)
    inputs = tokenizer(t5query, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=30)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)
# Drop the first sampled argument, then run the query on the first remaining row only.
test_arguments_1 = test_arguments[1:]
for index, row in test_arguments_1.head(1).iterrows():
    result = query_from_list(row['Prompt'], labels)
    print(f"{row['Prompt']:<24} \n Prediction: {result[0]}\n True Label: {row['Labels']}\n")

## Zero-shot classification with single label

In [None]:
def query_from_list(query, options):
    """Yes/No query: does the argument in ``query['Prompt']`` draw on the value ``options``?

    ``query`` is indexed with the key 'Prompt'; ``options`` is inserted into
    the instruction as a single value-category name. The prompt is run through
    the notebook-level ``tokenizer`` and ``model`` (at most 30 new tokens) and
    the decoded generations are returned as a list of strings.
    """
    argument_text = query['Prompt']
    instruction = (
        f"Argument: {argument_text}\n"
        f" Prompt: Classify whether the argument draws on the {options} value category. Answer: Yes or No"
    )
    encoded = tokenizer(instruction, return_tensors="pt")
    generated = model.generate(**encoded, max_new_tokens=30)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

# Ask one Yes/No question per value category, always about the first sampled argument.
sample_argument = test_arguments.iloc[0]
for label in labels:
    result = query_from_list(sample_argument, label)
    print(f"{label} \n Prediction: {result[0]}\n True Label: {sample_argument['Labels']}\n")