In [1]:
import os
import pandas as pd
from datasets import load_dataset
import random
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Load data from local files

In [2]:
# # load json file
# data_dir = os.path.join('dataset')
# def load_json(file_path):
#     import json
#     with open(file_path, 'r') as f:
#         data = json.load(f)
#     return data

# values_path = os.path.join(data_dir, 'value-categories.json')
# values = load_json(values_path)
# values

Data PATH

In [3]:
data_dir = os.path.join('dataset')
arguments_training_filepath = os.path.join(data_dir, 'arguments-training.tsv')
arguments_validation_filepath = os.path.join(data_dir, 'arguments-validation.tsv')
arguments_validation_filepath_zhihu = os.path.join(data_dir, 'arguments-validation-zhihu.tsv')
arguments_test_filepath = os.path.join(data_dir, 'arguments-test.tsv')

labels_training_filepath = os.path.join(data_dir, 'labels-training.tsv')
labels_validation_filepath = os.path.join(data_dir, 'labels-validation.tsv')
labels_validation_filepath_zhihu = os.path.join(data_dir, 'labels-validation-zhihu.tsv')
labels_test_filepath = os.path.join(data_dir, 'labels-test.tsv')

Load the data

In [4]:
arguments_training = pd.read_csv(arguments_training_filepath, encoding='utf-8', sep='\t', header=0)
arguments_validation = pd.read_csv(arguments_validation_filepath, encoding='utf-8', sep='\t', header=0)
arguments_validation_zhihu = pd.read_csv(arguments_validation_filepath_zhihu, encoding='utf-8', sep='\t', header=0)
arguments_test = pd.read_csv(arguments_test_filepath, encoding='utf-8', sep='\t', header=0)

labels_training = pd.read_csv(labels_training_filepath, encoding='utf-8', sep='\t', header=0)
labels_validation = pd.read_csv(labels_validation_filepath, encoding='utf-8', sep='\t', header=0)
labels_validation_zhihu = pd.read_csv(labels_validation_filepath_zhihu, encoding='utf-8', sep='\t', header=0)
labels_test = pd.read_csv(labels_test_filepath, encoding='utf-8', sep='\t', header=0)

arguments_training.head()


Unnamed: 0,Argument ID,Conclusion,Stance,Premise
0,A01001,Entrapment should be legalized,in favor of,if entrapment can serve to more easily capture...
1,A01002,We should ban human cloning,in favor of,we should ban human cloning as it will only ca...
2,A01003,We should abandon marriage,against,marriage is the ultimate commitment to someone...
3,A01004,We should ban naturopathy,against,it provides a useful income for some people
4,A01005,We should ban fast food,in favor of,fast food should be banned because it is reall...


## Some helper functions

In [5]:
# higher_order_values = {'Opennes to change':['Self-direction: thought', 'Self-direction: action', 'Stimulation', 'Hedonism'], 
#                        'Self-transcendence':['Humility', 'Benevolence: caring','Benevolence: dependability', 'Universalism: concern', 'Universalism: nature', 'Universalism: tolerance', 'Universalism: objectivity'], 
#                        'Conservation':['Humility', 'Tradition','Conformity: interpresonal','Conformity: rules', 'Security: societal', 'Security: personal', 'Face'], 
#                        'Self-enhancement':['Face', 'Power: dominance', 'Power: resources', 'Achievement', 'Hedonism']}

In [6]:
HIGHER_ORDER_VALUES = ['Opennes to change', 'Self-transcendence', 'Conservation', 'Self-enhancement']

HIGHER_ORDER_VALUES_AND_SUB = {'Opennes to change':['Self-direction: thought', 'Self-direction: action', 'Stimulation', 'Hedonism'], 
                       'Self-transcendence':['Humility', 'Benevolence: caring','Benevolence: dependability', 'Universalism: concern', 'Universalism: nature', 'Universalism: tolerance', 'Universalism: objectivity'], 
                       'Conservation':['Humility', 'Tradition','Conformity: interpresonal','Conformity: rules', 'Security: societal', 'Security: personal', 'Face'], 
                       'Self-enhancement':['Face', 'Power: dominance', 'Power: resources', 'Achievement', 'Hedonism']}


LABELS = ['Self-direction thought', 'Self-direction action', 'Stimulation', 'Hedonism', 'Achievement', 'Power dominance', 'Power resources', 'Face', 'Security personal', 'Security societal', 'Tradition', 'Conformity rules', 'Conformity interpersonal', 'Humility', 'Benevolence caring', 'Benevolence dependability', 'Universalism concern', 'Universalism nature', 'Universalism tolerance', 'Universalism objectivity']
PROMPT_FORMATS = ["The premise: '{}' is '{}'. The conclusion is '{}'. Value category: {}\n Question: Which value category does the argument belong to?\n",
                  "Premise: {}\nStance: {}\nConclusion: {}. Value category: {}\n Question: Which value category does the argument belong to?\n",
                  "Argument: {}. {}. {}. Value category: {}\n Question: Which value category does the argument belong to?\n"]

ENSEMBLE_PROMPT = ["The premise '{}' is '{}'. The conclusion is '{}'. Which of the following higher order values does that support, there can be more than one option? Options: {}\n",
                   "The premise '{}' is '{}'. The conclusion is '{}'. This falls in the higher order value of '{}'. Which of the following value categories does that support? {}\n"]


def convert_binary_labels_to_string(df):
    label_names = df.columns[1:]
    string_labels = []

    for index, row in df.iterrows():
        binary_values = row.values[1:]
        string_labels.append([label_names[i] for i, value in enumerate(binary_values) if value == 1])

    df['Label'] = string_labels
    return df

def ensemble_prompt_higher_order(df):
    """Creates an ensemble prompt for each argument with the first prompt format"""
    
    template = ENSEMBLE_PROMPT[0]
    prompts = [
                template.format(row['Premise'], row['Stance'], row['Conclusion'], ', '.join(HIGHER_ORDER_VALUES))
                for _, row in df.iterrows()
    ]
    df['higher_order_prompt'] = prompts
    return df


def prompt_formats(df):
    prompts = [
        [
            prompt.format(row['Premise'], row['Stance'], row['Conclusion'], ', '.join(LABELS))
            for prompt in PROMPT_FORMATS
        ]
        for _, row in df.iterrows()
    ]

    df['prompt_formats'] = prompts
    return df

def few_shot_prompt(df, num_shots=1, prompt_format=0, random_seed=46):
    """Creates a few shot prompt for each argument"""

    prompt_format = PROMPT_FORMATS[prompt_format]
    
    selected_arguments = df.sample(n=num_shots, random_state=random_seed)
    few_shot_prompts = [
        prompt_format.format(row['Premise'], row['Stance'], row['Conclusion'], ', '.join(LABELS)) + f"Answer: {random.choice(LABELS)}\n"
        for _, row in selected_arguments.iterrows()
    ]
    df['few_shot_prompt'] = df.apply(lambda row: ''.join(few_shot_prompts) + prompt_format.format(row['Premise'], row['Stance'], row['Conclusion'], ', '.join(LABELS)) + f"Answer: \n", axis=1)
    return df

def combine_columns(df_arguments, df_labels):
    """Combines the two `DataFrames` on column `Argument ID`"""
    df_labels = df_labels[['Argument ID', 'Label']]
    df_labels.columns = ['Argument ID', 'Label']

    return pd.merge(df_arguments, df_labels, on='Argument ID')

def labels_to_multi_choice():
    """Converts the labels to a multi choice format"""
    multi_choice_format = "{}: {}"
    multi_choice_options = []

    for index, label in enumerate(LABELS):
        multi_choice_option = multi_choice_format.format(chr(65 + index), label)
        multi_choice_options.append(multi_choice_option)

    return multi_choice_options


## Convert the binary labels

In [7]:
labels = labels_training.columns.unique()
labels = [label for label in labels if label != 'Argument ID' and label != 'String Labels']
# labels = [label.replace(':', '') for label in labels]

## Combine the data and labels into a single dataframe

In [8]:
converted_labels_train = convert_binary_labels_to_string(labels_training)
train = combine_columns(arguments_training, converted_labels_train)

converted_labels_validation = convert_binary_labels_to_string(labels_validation)
validation = combine_columns(arguments_validation, converted_labels_validation)

converted_labels_validation_zhihu = convert_binary_labels_to_string(labels_validation_zhihu)
validation_zhihu = combine_columns(arguments_validation_zhihu, converted_labels_validation_zhihu)

converted_labels_test = convert_binary_labels_to_string(labels_test)
test = combine_columns(arguments_test, converted_labels_test)

In [9]:
train.head()

Unnamed: 0,Argument ID,Conclusion,Stance,Premise,Label
0,A01001,Entrapment should be legalized,in favor of,if entrapment can serve to more easily capture...,[Security: societal]
1,A01002,We should ban human cloning,in favor of,we should ban human cloning as it will only ca...,[Security: societal]
2,A01003,We should abandon marriage,against,marriage is the ultimate commitment to someone...,[Self-direction: action]
3,A01004,We should ban naturopathy,against,it provides a useful income for some people,[Security: personal]
4,A01005,We should ban fast food,in favor of,fast food should be banned because it is reall...,[Security: personal]


## Add higher level labels to dataframe

In [10]:
higher_order_values_df = pd.DataFrame(columns=['higher order value', 'value category'])
for higher_order_value in HIGHER_ORDER_VALUES_AND_SUB:
    for value_category in HIGHER_ORDER_VALUES_AND_SUB[higher_order_value]:
        higher_order_values_df = higher_order_values_df.append({'higher order value': higher_order_value, 'value category': value_category}, ignore_index=True)

# higher_order_values_df

In [11]:
# add the higher order values to the labels of the training, validation and test set
def add_higher_order_values(df):
    df['higher order value'] = df['Label'].apply(lambda x: higher_order_values_df[higher_order_values_df['value category'].isin(x)]['higher order value'].unique())
    return df

train = add_higher_order_values(train)
validation = add_higher_order_values(validation)
validation_zhihu = add_higher_order_values(validation_zhihu)
test = add_higher_order_values(test)

train.head()

Unnamed: 0,Argument ID,Conclusion,Stance,Premise,Label,higher order value
0,A01001,Entrapment should be legalized,in favor of,if entrapment can serve to more easily capture...,[Security: societal],[Conservation]
1,A01002,We should ban human cloning,in favor of,we should ban human cloning as it will only ca...,[Security: societal],[Conservation]
2,A01003,We should abandon marriage,against,marriage is the ultimate commitment to someone...,[Self-direction: action],[Opennes to change]
3,A01004,We should ban naturopathy,against,it provides a useful income for some people,[Security: personal],[Conservation]
4,A01005,We should ban fast food,in favor of,fast food should be banned because it is reall...,[Security: personal],[Conservation]


## Load the data from disk

In [12]:
# from datasets import load_dataset, Dataset, load_from_disk
# dataset_path = 'dataset/processed/processed_dataset'
# dataset = load_from_disk(dataset_path)

## Setup the model

In [13]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

2023-05-23 18:01:14.527788: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
train = prompt_formats(train)
train = few_shot_prompt(train, num_shots=1, prompt_format=0, random_seed=46)
train = ensemble_prompt_higher_order(train)

train.head()

Unnamed: 0,Argument ID,Conclusion,Stance,Premise,Label,higher order value,prompt_formats,few_shot_prompt,higher_order_prompt
0,A01001,Entrapment should be legalized,in favor of,if entrapment can serve to more easily capture...,[Security: societal],[Conservation],[The premise: 'if entrapment can serve to more...,The premise: 'people should keep their religio...,The premise 'if entrapment can serve to more e...
1,A01002,We should ban human cloning,in favor of,we should ban human cloning as it will only ca...,[Security: societal],[Conservation],[The premise: 'we should ban human cloning as ...,The premise: 'people should keep their religio...,The premise 'we should ban human cloning as it...
2,A01003,We should abandon marriage,against,marriage is the ultimate commitment to someone...,[Self-direction: action],[Opennes to change],[The premise: 'marriage is the ultimate commit...,The premise: 'people should keep their religio...,The premise 'marriage is the ultimate commitme...
3,A01004,We should ban naturopathy,against,it provides a useful income for some people,[Security: personal],[Conservation],[The premise: 'it provides a useful income for...,The premise: 'people should keep their religio...,The premise 'it provides a useful income for s...
4,A01005,We should ban fast food,in favor of,fast food should be banned because it is reall...,[Security: personal],[Conservation],[The premise: 'fast food should be banned beca...,The premise: 'people should keep their religio...,The premise 'fast food should be banned becaus...


## Select 10 random samples 

In [15]:
train_10 = train.sample(10)
train_10

Unnamed: 0,Argument ID,Conclusion,Stance,Premise,Label,higher order value,prompt_formats,few_shot_prompt,higher_order_prompt
4263,A12154,We should ban targeted killing,against,"based on the reason for targeted killing, in s...","[Security: societal, Conformity: rules, Univer...","[Self-transcendence, Conservation]",[The premise: 'based on the reason for targete...,The premise: 'people should keep their religio...,The premise 'based on the reason for targeted ...
929,A18069,We should legalize cannabis,against,legalizing marijuana leads to more marijuana-r...,"[Security: personal, Conformity: rules, Benevo...","[Self-transcendence, Conservation]",[The premise: 'legalizing marijuana leads to m...,The premise: 'people should keep their religio...,The premise 'legalizing marijuana leads to mor...
1828,A20134,We should legalize cannabis,in favor of,cannabis should be legalized because no link h...,"[Self-direction: thought, Self-direction: acti...","[Opennes to change, Self-transcendence, Conser...",[The premise: 'cannabis should be legalized be...,The premise: 'people should keep their religio...,The premise 'cannabis should be legalized beca...
514,A12157,We should ban the use of child actors,in favor of,child actors miss out on socialization with ki...,"[Security: personal, Benevolence: dependability]","[Self-transcendence, Conservation]",[The premise: 'child actors miss out on social...,The premise: 'people should keep their religio...,The premise 'child actors miss out on socializ...
2740,A22227,We should legalize cannabis,in favor of,cannabis is a holistic and natural cure for ca...,"[Achievement, Security: personal, Universalism...","[Self-transcendence, Conservation, Self-enhanc...",[The premise: 'cannabis is a holistic and natu...,The premise: 'people should keep their religio...,The premise 'cannabis is a holistic and natura...
2159,A21037,Intelligence tests bring more harm than good,against,intelligence test help students end up in clas...,"[Self-direction: thought, Achievement, Conform...","[Opennes to change, Self-transcendence, Conser...",[The premise: 'intelligence test help students...,The premise: 'people should keep their religio...,The premise 'intelligence test help students e...
1494,A19245,We should abolish zoos,in favor of,zoos are entrapping animals that are meant to ...,"[Benevolence: caring, Universalism: nature]",[Self-transcendence],[The premise: 'zoos are entrapping animals tha...,The premise: 'people should keep their religio...,The premise 'zoos are entrapping animals that ...
5214,D27095,Nepotism exists in Bollywood,in favor of,The standards of Indian cinemas fall at times ...,[Achievement],[Self-enhancement],[The premise: 'The standards of Indian cinemas...,The premise: 'people should keep their religio...,The premise 'The standards of Indian cinemas f...
4037,A25263,We should abandon marriage,against,Marriage often serves both culturally and reli...,"[Security: personal, Tradition]",[Conservation],[The premise: 'Marriage often serves both cult...,The premise: 'people should keep their religio...,The premise 'Marriage often serves both cultur...
1126,A18301,We should adopt gender-neutral language,in favor of,gender neutral language will allow both gender...,"[Power: dominance, Face, Universalism: concern...","[Self-transcendence, Conservation, Self-enhanc...",[The premise: 'gender neutral language will al...,The premise: 'people should keep their religio...,The premise 'gender neutral language will allo...


## Zero-shot Prompting - prompt_format column

In [16]:
def query_from_list(query):
    inputs = tokenizer(query, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=20)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

for index, row in train_10.iterrows():
    for index, prompt in enumerate(row['prompt_formats']):
        result = query_from_list(prompt)
        print(f"Template: {index} \n Prediction: {result[0]}\n True Label: {row['Label']}\n")
    break

Template: 0 
 Prediction: Self-direction action
 True Label: ['Security: societal', 'Conformity: rules', 'Universalism: concern', 'Universalism: objectivity']

Template: 1 
 Prediction: Value category
 True Label: ['Security: societal', 'Conformity: rules', 'Universalism: concern', 'Universalism: objectivity']

Template: 2 
 Prediction: Value category
 True Label: ['Security: societal', 'Conformity: rules', 'Universalism: concern', 'Universalism: objectivity']



## Few-shot classification promt

In [17]:
def query_from_list(query):
    inputs = tokenizer(query, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=30)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

for index, row in train_10.iterrows():
    result = query_from_list(row['few_shot_prompt'])
    print(f"{row['few_shot_prompt']:<24} {result[0]}\n True Label: {row['Label']}\n")
    break

The premise: 'people should keep their religion to themselves' is 'in favor of'. The conclusion is 'We should ban missionary work'. Value category: Self-direction thought, Self-direction action, Stimulation, Hedonism, Achievement, Power dominance, Power resources, Face, Security personal, Security societal, Tradition, Conformity rules, Conformity interpersonal, Humility, Benevolence caring, Benevolence dependability, Universalism concern, Universalism nature, Universalism tolerance, Universalism objectivity
 Question: Which value category does the argument belong to?
Answer: Power dominance
The premise: 'based on the reason for targeted killing, in some cases this does make sense.  if a person is a serial murderer, and he is caught in gun fire, this type of killing of the murderer is warranted.' is 'against'. The conclusion is 'We should ban targeted killing'. Value category: Self-direction thought, Self-direction action, Stimulation, Hedonism, Achievement, Power dominance, Power resou

## Prompt ensemble for higher order values

In [21]:
for index, row in train_10.iterrows():
    result = query_from_list(row['higher_order_prompt'])
    print(f"Prediction higher_order_value:\t {result[0]}\n True higher_order_value:\t{row['higher order value']}")
    # template = "The premise '{}' is '{}'. The conclusion is '{}'. This falls in the higher order value of '{}'. Which of the following value categories does that support? {}\n"
    template = ENSEMBLE_PROMPT[1]
    values = HIGHER_ORDER_VALUES_AND_SUB[result[0]]
    prompts = template.format(row['Premise'], row['Stance'], row['Conclusion'], result[0], values)
    result = query_from_list(prompts)
    print(f"Predicted value category:\t {result[0]}\n True value category:\t {row['Label']}\n")
    print(10*'-------------------')



Prediction higher_order_value:	 Self-transcendence
 True higher_order_value:	['Self-transcendence' 'Conservation']
Predicted value category:	 Universalism: objectivity
 True value category:	 ['Security: societal', 'Conformity: rules', 'Universalism: concern', 'Universalism: objectivity']

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Prediction higher_order_value:	 Self-transcendence
 True higher_order_value:	['Self-transcendence' 'Conservation']
Predicted value category:	 Universalism: objectivity
 True value category:	 ['Security: personal', 'Conformity: rules', 'Benevolence: caring', 'Benevolence: dependability']

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Prediction higher_order_value:	 Self-tr