In [4]:
import os
import pandas as pd
from datasets import load_dataset
import random
import warnings
from datasets import Dataset, DatasetDict

warnings.simplefilter(action='ignore', category=FutureWarning)



Data Path

In [5]:
data_dir = os.path.join('dataset')
arguments_training_filepath = os.path.join(data_dir, 'arguments-training.tsv')
arguments_validation_filepath = os.path.join(data_dir, 'arguments-validation.tsv')
arguments_validation_filepath_zhihu = os.path.join(data_dir, 'arguments-validation-zhihu.tsv')
arguments_test_filepath = os.path.join(data_dir, 'arguments-test.tsv')

labels_training_filepath = os.path.join(data_dir, 'labels-training.tsv')
labels_validation_filepath = os.path.join(data_dir, 'labels-validation.tsv')
labels_validation_filepath_zhihu = os.path.join(data_dir, 'labels-validation-zhihu.tsv')
labels_test_filepath = os.path.join(data_dir, 'labels-test.tsv')


In [6]:
HIGHER_ORDER_VALUES = ['Opennes to change', 'Self-transcendence', 'Conservation', 'Self-enhancement']

HIGHER_ORDER_VALUES_AND_SUB = {'Opennes to change':['Self-direction: thought', 'Self-direction: action', 'Stimulation', 'Hedonism'], 
                       'Self-transcendence':['Humility', 'Benevolence: caring','Benevolence: dependability', 'Universalism: concern', 'Universalism: nature', 'Universalism: tolerance', 'Universalism: objectivity'], 
                       'Conservation':['Humility', 'Tradition','Conformity: interpresonal','Conformity: rules', 'Security: societal', 'Security: personal', 'Face'], 
                       'Self-enhancement':['Face', 'Power: dominance', 'Power: resources', 'Achievement', 'Hedonism']}


LABELS = ['Self-direction thought', 'Self-direction action', 'Stimulation', 'Hedonism', 'Achievement', 'Power dominance', 'Power resources', 'Face', 'Security personal', 'Security societal', 'Tradition', 'Conformity rules', 'Conformity interpersonal', 'Humility', 'Benevolence caring', 'Benevolence dependability', 'Universalism concern', 'Universalism nature', 'Universalism tolerance', 'Universalism objectivity']
PROMPT_FORMATS = ["The premise: '{}' is '{}'. The conclusion is '{}'\n. Question: Which value category does the argument belong to? Options: {} \n",
                  "Premise: {}\nStance: {}\nConclusion: {}. Value category: {}\n Question: Which value category does the argument belong to?\n",
                  "Argument: {}. {}. {}. Value category: {}\n Question: Which value category does the argument belong to?\n"]

ENSEMBLE_PROMPT = ["The premise '{}' is '{}'. The conclusion is '{}'. Which of the following higher order values does that support, there can be more than one option? Options: {}\n",
                   "The premise '{}' is '{}'. The conclusion is '{}'. This falls in the higher order value of '{}'. Which of the following value categories does that support? {}\n"]


def convert_binary_labels_to_string(df):
    label_names = df.columns[1:]
    labels = []

    for index, row in df.iterrows():
        binary_values = row.values[1:]
        string_labels = ''
        for i, value in enumerate(binary_values):
            if value == 1:
                string_labels += label_names[i] + ', '
        labels.append(string_labels[:-2])
    return labels

def ensemble_prompt_higher_order(df):
    """Creates an ensemble prompt for each argument with the first prompt format"""
    
    template = ENSEMBLE_PROMPT[0]
    prompts = [
                template.format(row['Premise'], row['Stance'], row['Conclusion'], ', '.join(HIGHER_ORDER_VALUES))
                for _, row in df.iterrows()
    ]
    df['ensemble'] = prompts
    return df


def single_shot_prompt(df):
    """Creates a single shot prompt for each argument with the first prompt format"""
    
    template = PROMPT_FORMATS[0] # use the first template 
    prompts = [
                template.format(row['Premise'], row['Stance'], row['Conclusion'], ', '.join(LABELS))
                for _, row in df.iterrows()
    ]
    df['single_shot_prompt'] = prompts
    return df

def few_shot_prompt(df, num_shots=1, prompt_format=0, random_seed=46):
    """Creates a few shot prompt for each argument"""

    prompt_format = PROMPT_FORMATS[prompt_format]
    
    selected_arguments = df.sample(n=num_shots, random_state=random_seed)
    few_shot_prompts = [
        # prompt_format.format(row['Premise'], row['Stance'], row['Conclusion'], ', '.join(LABELS)) + f"Answer: {random.choice(LABELS)}\n"
        prompt_format.format(row['Premise'], row['Stance'], row['Conclusion'], ', '.join(LABELS)) + f"Answer: {', '.join(random.sample(LABELS, 2))}\n"
        for _, row in selected_arguments.iterrows()
    ]
    df['few_shot_prompt'] = df.apply(lambda row: ''.join(few_shot_prompts) + prompt_format.format(row['Premise'], row['Stance'], row['Conclusion'], ', '.join(LABELS)) + f"Answer: \n", axis=1)
    return df


# used for testing different prompt formats 
def prompt_formats(df):
    prompts = [
        [
            prompt.format(row['Premise'], row['Stance'], row['Conclusion'], ', '.join(LABELS))
            for prompt in PROMPT_FORMATS
        ]
        for _, row in df.iterrows()
    ]

    df['prompt_formats'] = prompts
    return df

def labels_to_multi_choice():
    """Converts the labels to a multi choice format"""
    multi_choice_format = "{}: {}"
    multi_choice_options = []

    for index, label in enumerate(LABELS):
        multi_choice_option = multi_choice_format.format(chr(65 + index), label)
        multi_choice_options.append(multi_choice_option)

    return multi_choice_options

def label_to_vector(df):
    """Converts the labels to a vector"""
    label_names = df.iloc[:, 1:]
    return label_names.values.tolist()

def add_labels_to_augmented_data(augmented_df, original_df):
    """add the label_vector and label_string of the original data to the augmented data by matching the Argument ID"""
    augmented_df['label_vector'] = augmented_df['Argument ID'].map(original_df.set_index('Argument ID')['label_vector'])
    augmented_df['label_string'] = augmented_df['Argument ID'].map(original_df.set_index('Argument ID')['label_string'])
    return augmented_df
    # augmented_df['Labels'] = augmented_df['Argument ID'].map(original_df.set_index('Argument ID')['label_vector'])
    # return augmented_df



Load the data

In [7]:
train = pd.read_csv(arguments_training_filepath, encoding='utf-8', sep='\t', header=0)
train_augmented = pd.read_csv(arguments_training_filepath, encoding='utf-8', sep='\t', header=0)

validation = pd.read_csv(arguments_validation_filepath, encoding='utf-8', sep='\t', header=0)
validation_zhihu = pd.read_csv(arguments_validation_filepath_zhihu, encoding='utf-8', sep='\t', header=0)
test = pd.read_csv(arguments_test_filepath, encoding='utf-8', sep='\t', header=0)

labels_training = pd.read_csv(labels_training_filepath, encoding='utf-8', sep='\t', header=0)
labels_validation = pd.read_csv(labels_validation_filepath, encoding='utf-8', sep='\t', header=0)
labels_validation_zhihu = pd.read_csv(labels_validation_filepath_zhihu, encoding='utf-8', sep='\t', header=0)
labels_test = pd.read_csv(labels_test_filepath, encoding='utf-8', sep='\t', header=0)


## add vector labels to the dataframes 

In [8]:
train['label_vector'] = label_to_vector(labels_training)
validation['label_vector'] = label_to_vector(labels_validation)
validation_zhihu['label_vector'] = label_to_vector(labels_validation_zhihu)
test['label_vector'] = label_to_vector(labels_test)
train.head()

Unnamed: 0,Argument ID,Conclusion,Stance,Premise,label_vector
0,A01001,Entrapment should be legalized,in favor of,if entrapment can serve to more easily capture...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
1,A01002,We should ban human cloning,in favor of,we should ban human cloning as it will only ca...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
2,A01003,We should abandon marriage,against,marriage is the ultimate commitment to someone...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,A01004,We should ban naturopathy,against,it provides a useful income for some people,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
4,A01005,We should ban fast food,in favor of,fast food should be banned because it is reall...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


## Combine the data and labels into a single dataframe

In [9]:
train['label_string'] = convert_binary_labels_to_string(labels_training)
validation['label_string'] = convert_binary_labels_to_string(labels_validation)
validation_zhihu['label_string'] = convert_binary_labels_to_string(labels_validation_zhihu)
test['label_string'] = convert_binary_labels_to_string(labels_test)

# train.iloc[7]['label_string']
train.iloc[:20]

Unnamed: 0,Argument ID,Conclusion,Stance,Premise,label_vector,label_string
0,A01001,Entrapment should be legalized,in favor of,if entrapment can serve to more easily capture...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",Security: societal
1,A01002,We should ban human cloning,in favor of,we should ban human cloning as it will only ca...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",Security: societal
2,A01003,We should abandon marriage,against,marriage is the ultimate commitment to someone...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Self-direction: action
3,A01004,We should ban naturopathy,against,it provides a useful income for some people,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",Security: personal
4,A01005,We should ban fast food,in favor of,fast food should be banned because it is reall...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",Security: personal
5,A01006,We should end the use of economic sanctions,against,sometimes economic sanctions are the only thin...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","Power: dominance, Security: societal"
6,A01007,We should abolish capital punishment,against,capital punishment is sometimes the only optio...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, ...","Security: societal, Conformity: rules, Univers..."
7,A01008,We should ban factory farming,against,factory farming allows for the production of c...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, ...","Security: personal, Benevolence: caring, Unive..."
8,A01009,We should fight for the abolition of nuclear w...,against,nuclear weapons help keep the peace in uncerta...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","Security: societal, Universalism: concern"
9,A01010,We should prohibit school prayer,against,it should be allowed if the student wants to p...,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","Self-direction: thought, Self-direction: actio..."


In [10]:
# testing augmenting the data

## Add higher level labels to dataframe

In [11]:
higher_order_values_df = pd.DataFrame(columns=['higher_order_value', 'value_category'])
for higher_order_value in HIGHER_ORDER_VALUES_AND_SUB:
    for value_category in HIGHER_ORDER_VALUES_AND_SUB[higher_order_value]:
        higher_order_values_df = higher_order_values_df.append({'higher_order_value': higher_order_value, 'value_category': value_category}, ignore_index=True)

# higher_order_values_df

In [12]:
# # add the higher order values to the labels of the training, validation and test set
# def add_higher_order_values(df):
#     df['higher_order_value'] = df['label_string'].apply(lambda x: higher_order_values_df[higher_order_values_df['value_category'].isin(x)]['higher_order_value'].unique())
#     return df

# train = add_higher_order_values(train)
# validation = add_higher_order_values(validation)
# validation_zhihu = add_higher_order_values(validation_zhihu)
# test = add_higher_order_values(test)

# train.head()

## Add promts to dataframe

In [13]:
train = single_shot_prompt(train)
train = few_shot_prompt(train, num_shots=1, prompt_format=0, random_seed=46)
train = ensemble_prompt_higher_order(train) # 

train.iloc[0]['few_shot_prompt']

"The premise: 'people should keep their religion to themselves' is 'in favor of'. The conclusion is 'We should ban missionary work'\n. Question: Which value category does the argument belong to? Options: Self-direction thought, Self-direction action, Stimulation, Hedonism, Achievement, Power dominance, Power resources, Face, Security personal, Security societal, Tradition, Conformity rules, Conformity interpersonal, Humility, Benevolence caring, Benevolence dependability, Universalism concern, Universalism nature, Universalism tolerance, Universalism objectivity \nAnswer: Power dominance, Power resources\nThe premise: 'if entrapment can serve to more easily capture wanted criminals, then why shouldn't it be legal?' is 'in favor of'. The conclusion is 'Entrapment should be legalized'\n. Question: Which value category does the argument belong to? Options: Self-direction thought, Self-direction action, Stimulation, Hedonism, Achievement, Power dominance, Power resources, Face, Security pe

## Setup the model

In [14]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

2023-05-30 15:31:45.495419: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Select 10 random samples 

In [19]:
train_10 = train.sample(10)
train_10.iloc[0]['single_shot_prompt']

"The premise: 'Legalizing marijuana would lead to a reduction in gang-related drug violence.' is 'in favor of'. The conclusion is 'We should legalize cannabis'\n. Question: Which value category does the argument belong to? Options: Self-direction thought, Self-direction action, Stimulation, Hedonism, Achievement, Power dominance, Power resources, Face, Security personal, Security societal, Tradition, Conformity rules, Conformity interpersonal, Humility, Benevolence caring, Benevolence dependability, Universalism concern, Universalism nature, Universalism tolerance, Universalism objectivity \n"

## Zero-shot

In [16]:
def query_from_list(query):
    inputs = tokenizer(query, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=20)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

for index, row in train_10.iterrows():
    result = query_from_list(row['single_shot_prompt'])
    print(f"Prompt: {row['single_shot_prompt']} \n Prediction: {result[0]}\n True Label: {row['label_string']}\n")
    print(60*'-')

Prompt: The premise: 'it's cruel for the animals to be caged' is 'in favor of'. The conclusion is 'We should abolish zoos'
. Question: Which value category does the argument belong to? Options: Self-direction thought, Self-direction action, Stimulation, Hedonism, Achievement, Power dominance, Power resources, Face, Security personal, Security societal, Tradition, Conformity rules, Conformity interpersonal, Humility, Benevolence caring, Benevolence dependability, Universalism concern, Universalism nature, Universalism tolerance, Universalism objectivity 
 
 Prediction: Universalism nature
 True Label: Conformity: rules, Universalism: nature

------------------------------------------------------------
Prompt: The premise: 'a zero tolerance policy is unfair for minimal infractions.' is 'against'. The conclusion is 'We should adopt a zero-tolerance policy in schools'
. Question: Which value category does the argument belong to? Options: Self-direction thought, Self-direction action, Stimu

## Few-shot

In [17]:
for index, row in train_10.iterrows():
    result = query_from_list(row['few_shot_prompt'])
    print(f"{row['few_shot_prompt']:<24} {result[0]}\n True Label: {row['label_string']}\n")
    # break

The premise: 'people should keep their religion to themselves' is 'in favor of'. The conclusion is 'We should ban missionary work'
. Question: Which value category does the argument belong to? Options: Self-direction thought, Self-direction action, Stimulation, Hedonism, Achievement, Power dominance, Power resources, Face, Security personal, Security societal, Tradition, Conformity rules, Conformity interpersonal, Humility, Benevolence caring, Benevolence dependability, Universalism concern, Universalism nature, Universalism tolerance, Universalism objectivity 
Answer: Power dominance, Power resources
The premise: 'it's cruel for the animals to be caged' is 'in favor of'. The conclusion is 'We should abolish zoos'
. Question: Which value category does the argument belong to? Options: Self-direction thought, Self-direction action, Stimulation, Hedonism, Achievement, Power dominance, Power resources, Face, Security personal, Security societal, Tradition, Conformity rules, Conformity inte

## Prompt ensemble for higher order values

In [18]:
for index, row in train_10.iterrows():
    result = query_from_list(row['ensemble'])
    print(f"Prompt1:\t{row['ensemble']} Prediction higher_order_value:\t {result[0]}\n True higher_order_value:\t{row['higher_order_value']}")
    # template = "The premise '{}' is '{}'. The conclusion is '{}'. This falls in the higher order value of '{}'. Which of the following value categories does that support? {}\n"
    template = ENSEMBLE_PROMPT[1]
    values = HIGHER_ORDER_VALUES_AND_SUB[result[0]]
    prompt = template.format(row['Premise'], row['Stance'], row['Conclusion'], result[0], values)
    result = query_from_list(prompt)
    print(f"Prompt2:\t {prompt}Predicted value category:\t {result[0]}\n True value category:\t {row['label_string']}\n")
    print(10*'-------------------')

KeyError: 'higher_order_value'

In [None]:
pwd

In [21]:
from datasets import load_dataset, Dataset, load_from_disk
dataset_path = '../datasets/touche23_prompt'
dataset = load_from_disk(dataset_path)
dataset


DatasetDict({
    train: Dataset({
        features: ['Argument ID', 'Conclusion', 'Stance', 'Premise', 'label_vector', 'label_string', 'single_shot_prompt', 'few_shot_prompt'],
        num_rows: 5220
    })
    validation: Dataset({
        features: ['Argument ID', 'Conclusion', 'Stance', 'Premise', 'label_vector', 'label_string', 'single_shot_prompt', 'few_shot_prompt'],
        num_rows: 1896
    })
    validation_zhihu: Dataset({
        features: ['Argument ID', 'Conclusion', 'Stance', 'Premise', 'label_vector', 'label_string', 'single_shot_prompt', 'few_shot_prompt'],
        num_rows: 100
    })
    test: Dataset({
        features: ['Argument ID', 'Conclusion', 'Stance', 'Premise', 'label_vector', 'label_string', 'single_shot_prompt', 'few_shot_prompt'],
        num_rows: 1576
    })
})