In [1]:
import os
import pandas as pd
from datasets import load_dataset
import random
import warnings
from datasets import Dataset, DatasetDict

warnings.simplefilter(action='ignore', category=FutureWarning)



Data Path

In [2]:
# Root folder holding the argument/label TSV files and the augmented CSVs.
data_dir = 'dataset/'

# Build the sub-folder with os.path.join instead of string concatenation, which
# used to produce a doubled separator ('dataset//augmented_data/...').
augmented_data_dir = os.path.join(data_dir, 'augmented_data')
lm_filepath = os.path.join(augmented_data_dir, 'LM.csv')
noise_filepath = os.path.join(augmented_data_dir, 'noise.csv')
thesaurus_filepath = os.path.join(augmented_data_dir, 'thesaurus.csv')

# Argument texts, one file per split.
arguments_training_filepath = os.path.join(data_dir, 'arguments-training.tsv')
arguments_validation_filepath = os.path.join(data_dir, 'arguments-validation.tsv')
arguments_validation_filepath_zhihu = os.path.join(data_dir, 'arguments-validation-zhihu.tsv')
arguments_test_filepath = os.path.join(data_dir, 'arguments-test.tsv')

# Label files matching the argument files above.
labels_training_filepath = os.path.join(data_dir, 'labels-training.tsv')
labels_validation_filepath = os.path.join(data_dir, 'labels-validation.tsv')
labels_validation_filepath_zhihu = os.path.join(data_dir, 'labels-validation-zhihu.tsv')
labels_test_filepath = os.path.join(data_dir, 'labels-test.tsv')


In [3]:
# The four higher order values offered as options in the first ensemble prompt.
# NOTE(review): 'Opennes to change' is misspelled ("Openness"). The spelling is
# kept because the same string is a key of HIGHER_ORDER_VALUES_AND_SUB and is
# part of the prompt text sent to the model; rename both together if desired.
HIGHER_ORDER_VALUES = ['Opennes to change', 'Self-transcendence', 'Conservation', 'Self-enhancement']

# Value categories grouped under each higher order value. Some categories
# ('Humility', 'Face', 'Hedonism') are listed under two groups.
HIGHER_ORDER_VALUES_AND_SUB = {
    'Opennes to change': [
        'Self-direction: thought', 'Self-direction: action', 'Stimulation', 'Hedonism',
    ],
    'Self-transcendence': [
        'Humility', 'Benevolence: caring', 'Benevolence: dependability',
        'Universalism: concern', 'Universalism: nature', 'Universalism: tolerance',
        'Universalism: objectivity',
    ],
    'Conservation': [
        # Spelling fixed from 'Conformity: interpresonal'.
        'Humility', 'Tradition', 'Conformity: interpersonal', 'Conformity: rules',
        'Security: societal', 'Security: personal', 'Face',
    ],
    'Self-enhancement': [
        'Face', 'Power: dominance', 'Power: resources', 'Achievement', 'Hedonism',
    ],
}

# Value category names as they are written into the prompts (no colons).
LABELS = [
    'Self-direction thought', 'Self-direction action', 'Stimulation', 'Hedonism',
    'Achievement', 'Power dominance', 'Power resources', 'Face',
    'Security personal', 'Security societal', 'Tradition', 'Conformity rules',
    'Conformity interpersonal', 'Humility', 'Benevolence caring',
    'Benevolence dependability', 'Universalism concern', 'Universalism nature',
    'Universalism tolerance', 'Universalism objectivity',
]

# Prompt templates; the four placeholders are filled with premise, stance,
# conclusion and the comma-separated LABELS, in that order.
PROMPT_FORMATS = [
    "The premise: '{}' is '{}'. The conclusion is '{}'\n. Question: Which value category does the argument belong to? Options: {} \n",
    "Premise: {}\nStance: {}\nConclusion: {}. Value category: {}\n Question: Which value category does the argument belong to?\n",
    "Argument: {}. {}. {}. Value category: {}\n Question: Which value category does the argument belong to?\n",
]

# Two-stage ensemble templates: [0] asks for the higher order value(s),
# [1] asks for the value categories within a given higher order value.
ENSEMBLE_PROMPT = [
    "The premise '{}' is '{}'. The conclusion is '{}'. Which of the following higher order values does that support, there can be more than one option? Options: {}\n",
    "The premise '{}' is '{}'. The conclusion is '{}'. This falls in the higher order value of '{}'. Which of the following value categories does that support? {}\n",
]


def convert_binary_labels_to_string(df):
    """Render each row's active labels as one comma-separated string.

    The first column of ``df`` is skipped; every remaining column is treated
    as a label named by its column header. A label counts as active for a row
    when its cell compares equal to 1.

    Returns:
        A list with one string per row, e.g. ``'A, B'``. Rows without any
        active label produce an empty string.
    """
    names = df.columns[1:]
    flag_rows = df.iloc[:, 1:].values
    return [
        ', '.join(name for name, flag in zip(names, flags) if flag == 1)
        for flags in flag_rows
    ]

def ensemble_prompt_higher_order(df):
    """Attach the first-stage ensemble prompt to every argument.

    ``ENSEMBLE_PROMPT[0]`` is filled with each row's ``Premise``, ``Stance``
    and ``Conclusion``, followed by the comma-separated ``HIGHER_ORDER_VALUES``
    as answer options. The prompts are written to a new ``'ensemble'`` column;
    ``df`` is modified in place and also returned.
    """
    first_stage = ENSEMBLE_PROMPT[0]
    options = ', '.join(HIGHER_ORDER_VALUES)

    rendered = []
    for _, argument in df.iterrows():
        rendered.append(
            first_stage.format(
                argument['Premise'], argument['Stance'], argument['Conclusion'], options
            )
        )

    df['ensemble'] = rendered
    return df


def single_shot_prompt(df):
    """Attach a prompt without demonstrations to every argument.

    ``PROMPT_FORMATS[0]`` is filled with each row's ``Premise``, ``Stance``
    and ``Conclusion``, followed by the comma-separated ``LABELS`` as options.
    The prompts are written to a new ``'single_shot_prompt'`` column; ``df``
    is modified in place and also returned.
    """
    first_format = PROMPT_FORMATS[0]
    options = ', '.join(LABELS)

    rendered = []
    for _, argument in df.iterrows():
        rendered.append(
            first_format.format(
                argument['Premise'], argument['Stance'], argument['Conclusion'], options
            )
        )

    df['single_shot_prompt'] = rendered
    return df

def few_shot_prompt(df, num_shots=1, prompt_format=0, random_seed=46):
    """Attach a few-shot prompt to every argument in ``df``.

    ``num_shots`` rows are sampled from ``df`` and rendered with the chosen
    template, each followed by an ``Answer:`` line. This shared block of
    demonstrations is prepended to every row's own prompt, which ends with an
    empty ``Answer:`` line for the model to complete.

    Args:
        df: Frame with ``Premise``, ``Stance`` and ``Conclusion`` columns.
        num_shots: Number of demonstration rows sampled from ``df``.
        prompt_format: Index into ``PROMPT_FORMATS``.
        random_seed: Seeds both the row sampling and the demonstration
            answers, so the same seed always produces the same prompts.

    Returns:
        ``df`` itself, modified in place with a ``'few_shot_prompt'`` column.
    """
    template = PROMPT_FORMATS[prompt_format]
    options = ', '.join(LABELS)

    # Dedicated generator: keeps the demonstration answers reproducible for a
    # given seed and leaves the global ``random`` state untouched.
    rng = random.Random(random_seed)

    demonstrations = df.sample(n=num_shots, random_state=random_seed)
    # NOTE(review): the demonstration answers are two randomly drawn labels,
    # not the gold labels of the sampled argument - confirm this is intended.
    shots = ''.join(
        template.format(row['Premise'], row['Stance'], row['Conclusion'], options)
        + f"Answer: {', '.join(rng.sample(LABELS, 2))}\n"
        for _, row in demonstrations.iterrows()
    )

    df['few_shot_prompt'] = [
        shots
        + template.format(row['Premise'], row['Stance'], row['Conclusion'], options)
        + "Answer: \n"
        for _, row in df.iterrows()
    ]
    return df


# used for testing different prompt formats 
def prompt_formats(df):
    """Render every argument with each template in ``PROMPT_FORMATS``.

    For each row, one prompt per template is built from ``Premise``,
    ``Stance``, ``Conclusion`` and the comma-separated ``LABELS``. The list of
    prompts is stored in a new ``'prompt_formats'`` column; ``df`` is modified
    in place and also returned.
    """
    options = ', '.join(LABELS)

    per_argument = []
    for _, argument in df.iterrows():
        fields = (argument['Premise'], argument['Stance'], argument['Conclusion'], options)
        per_argument.append([template.format(*fields) for template in PROMPT_FORMATS])

    df['prompt_formats'] = per_argument
    return df

def labels_to_multi_choice():
    """Return ``LABELS`` as lettered options: ``'A: <first label>'``, ``'B: ...'``, and so on."""
    return [
        f"{chr(ord('A') + position)}: {label}"
        for position, label in enumerate(LABELS)
    ]

def label_to_vector(df):
    """Return every row of ``df`` as a list of its values, skipping the first column.

    The result is a list of lists with one inner list per row.
    """
    return df.iloc[:, 1:].to_numpy().tolist()

def add_labels_to_augmented_data(augmented_df, original_df):
    """Copy ``label_vector`` and ``label_string`` onto the augmented rows.

    Rows are matched on ``'Argument ID'``: each augmented row receives the
    labels of the original row with the same ID. ``augmented_df`` is modified
    in place and also returned.
    """
    labels_by_id = original_df.set_index('Argument ID')
    argument_ids = augmented_df['Argument ID']
    for column in ('label_vector', 'label_string'):
        augmented_df[column] = argument_ids.map(labels_by_id[column])
    return augmented_df



Load the data

In [4]:
def _read_tsv(path):
    """Read a UTF-8, tab-separated file whose first row is the header."""
    return pd.read_csv(path, sep='\t', encoding='utf-8', header=0)


# Argument texts, one frame per split.
train = _read_tsv(arguments_training_filepath)
validation = _read_tsv(arguments_validation_filepath)
validation_zhihu = _read_tsv(arguments_validation_filepath_zhihu)
test = _read_tsv(arguments_test_filepath)

# Label tables for the same splits.
labels_training = _read_tsv(labels_training_filepath)
labels_validation = _read_tsv(labels_validation_filepath)
labels_validation_zhihu = _read_tsv(labels_validation_filepath_zhihu)
labels_test = _read_tsv(labels_test_filepath)


## Add vector labels to the dataframes

In [5]:
# Store each split's label row as a list in a 'label_vector' column.
# NOTE(review): values are assigned by position, so this assumes the label
# files list the arguments in the same order as the argument files - confirm.
for _arguments, _label_table in (
    (train, labels_training),
    (validation, labels_validation),
    (validation_zhihu, labels_validation_zhihu),
    (test, labels_test),
):
    _arguments['label_vector'] = label_to_vector(_label_table)
train.head()

Unnamed: 0,Argument ID,Conclusion,Stance,Premise,label_vector
0,A01001,Entrapment should be legalized,in favor of,if entrapment can serve to more easily capture...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
1,A01002,We should ban human cloning,in favor of,we should ban human cloning as it will only ca...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
2,A01003,We should abandon marriage,against,marriage is the ultimate commitment to someone...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,A01004,We should ban naturopathy,against,it provides a useful income for some people,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
4,A01005,We should ban fast food,in favor of,fast food should be banned because it is reall...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


## Combine the data and labels into a single dataframe

In [7]:
# Store each split's active label names as one comma-separated string in a
# 'label_string' column (assigned by position, like 'label_vector').
for _arguments, _label_table in (
    (train, labels_training),
    (validation, labels_validation),
    (validation_zhihu, labels_validation_zhihu),
    (test, labels_test),
):
    _arguments['label_string'] = convert_binary_labels_to_string(_label_table)

In [8]:
# Load the three augmented variants of the training set and give every
# augmented row the labels of the training argument with the same Argument ID.
lm = add_labels_to_augmented_data(pd.read_csv(lm_filepath), train)
noise = add_labels_to_augmented_data(pd.read_csv(noise_filepath), train)
thesaurus = add_labels_to_augmented_data(pd.read_csv(thesaurus_filepath), train)

# Spot check: show the LM-augmented rows whose Argument ID is A20376.
lm[lm['Argument ID'] == 'A20376']

Unnamed: 0.1,Unnamed: 0,Argument ID,Conclusion,Stance,Premise,label_vector,label_string
0,2030,A20376,Holocaust denial should be considered a crimin...,against,the holocaust is an unproven event and anyone ...,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, ...","Self-direction: thought, Security: personal, H..."
265,2030,A20376,Holocaust denial should be a criminal offence,against,the holocaust is a clearly documented event an...,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, ...","Self-direction: thought, Security: personal, H..."
1168,2030,A20376,Holocaust denial should be a criminal offence,against,anyone who denies its existence can be easily ...,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, ...","Self-direction: thought, Security: personal, H..."
14297,2030,A20376,Holocaust denial should be a criminal offence.,against,the holocaust is clearly documented in writing...,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, ...","Self-direction: thought, Security: personal, H..."
23285,2030,A20376,Holocaust denial should be a felony.,against,the holocaust is clearly documented. anyone wh...,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, ...","Self-direction: thought, Security: personal, H..."
23302,2030,A20376,Holocaust denial should be considered a crimin...,against,the holocaust is a clear evidence of its exist...,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, ...","Self-direction: thought, Security: personal, H..."
32842,2030,A20376,Holocaust denial should be a criminal offense.,against,the holocaust is a well documented event. anyo...,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, ...","Self-direction: thought, Security: personal, H..."
33015,2030,A20376,Holocaust denial should be a criminal offence.,against,the holocaust is clearly documented and anyone...,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, ...","Self-direction: thought, Security: personal, H..."


In [None]:
# Original training row for the same Argument ID, for comparison with the augmented rows.
train.loc[train['Argument ID'] == 'A20376']

## Add higher level labels to dataframe

In [None]:
# One row per (higher order value, value category) pair.
# The rows are collected first and the frame is built once: DataFrame.append
# was removed in pandas 2.0, and appending row by row copies the frame each time.
higher_order_values_df = pd.DataFrame(
    [
        {'higher_order_value': higher_order_value, 'value_category': value_category}
        for higher_order_value, value_categories in HIGHER_ORDER_VALUES_AND_SUB.items()
        for value_category in value_categories
    ],
    columns=['higher_order_value', 'value_category'],
)

# higher_order_values_df

In [None]:
# # add the higher order values to the labels of the training, validation and test set
# def add_higher_order_values(df):
#     df['higher_order_value'] = df['label_string'].apply(lambda x: higher_order_values_df[higher_order_values_df['value_category'].isin(x)]['higher_order_value'].unique())
#     return df

# train = add_higher_order_values(train)
# validation = add_higher_order_values(validation)
# validation_zhihu = add_higher_order_values(validation_zhihu)
# test = add_higher_order_values(test)

# train.head()

## Add prompts to dataframe

In [None]:
# Add the three prompt columns to the training set, in this order:
# 'single_shot_prompt', 'few_shot_prompt', 'ensemble'.
train = (
    train
    .pipe(single_shot_prompt)
    .pipe(few_shot_prompt, num_shots=1, prompt_format=0, random_seed=46)
    .pipe(ensemble_prompt_higher_order)
)
# The noise-augmented set only gets the prompt without demonstrations.
noise = single_shot_prompt(noise)

noise.head()

## Setup the model

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Single checkpoint name shared by the tokenizer and the model so the two
# cannot drift apart.
MODEL_NAME = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

## Select 10 random samples 

In [None]:
# Fixed seed so the same 10 arguments are evaluated on every run
# (46 matches the default seed used by few_shot_prompt).
train_10 = train.sample(10, random_state=46)
train_10.iloc[0]['single_shot_prompt']

## Zero-shot

In [None]:
def query_from_list(query):
    """Generate model output for one prompt or a list of prompts.

    Args:
        query: A prompt string, or a list of prompt strings.

    Returns:
        A list of decoded strings, one per generated sequence, with special
        tokens removed. At most 20 new tokens are generated per sequence.
    """
    # padding=True lets prompts of different lengths share one tensor batch;
    # it has no effect when a single prompt is passed.
    encoded = tokenizer(query, return_tensors="pt", padding=True)
    generated = model.generate(**encoded, max_new_tokens=20)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

# Query the model with each sampled prompt and print the prediction next to the true labels.
separator = 60*'-'
for prompt, true_labels in zip(train_10['single_shot_prompt'], train_10['label_string']):
    prediction = query_from_list(prompt)[0]
    print(f"Prompt: {prompt} \n Prediction: {prediction}\n True Label: {true_labels}\n")
    print(separator)

## Few-shot

In [None]:
# Same comparison as the zero-shot run, using the few-shot prompts.
for prompt, true_labels in zip(train_10['few_shot_prompt'], train_10['label_string']):
    prediction = query_from_list(prompt)[0]
    print(f"{prompt:<24} {prediction}\n True Label: {true_labels}\n")

## Prompt ensemble for higher order values

In [None]:
# Two-stage ensemble: first ask for the higher order value(s), then ask for the
# value categories within every higher order value the model named.
for index, row in train_10.iterrows():
    higher_order_output = query_from_list(row['ensemble'])[0]
    # No cell in this notebook creates a 'higher_order_value' column, so fall
    # back to a placeholder instead of raising KeyError when it is absent.
    true_higher_order_value = row.get('higher_order_value', 'n/a')
    print(f"Prompt1:\t{row['ensemble']} Prediction higher_order_value:\t {higher_order_output}\n True higher_order_value:\t{true_higher_order_value}")

    # The model may name several higher order values, or none that we know;
    # match the known names inside its answer rather than using the raw
    # answer as a dictionary key.
    predicted_values = [
        name for name in HIGHER_ORDER_VALUES_AND_SUB
        if name.lower() in higher_order_output.lower()
    ]
    if not predicted_values:
        print(f"No known higher order value found in {higher_order_output!r}; skipping the second prompt.\n")

    template = ENSEMBLE_PROMPT[1]
    for predicted_value in predicted_values:
        # Join the categories so the prompt lists them as plain text, like the
        # other prompts, rather than as a Python list repr.
        values = ', '.join(HIGHER_ORDER_VALUES_AND_SUB[predicted_value])
        prompt = template.format(row['Premise'], row['Stance'], row['Conclusion'], predicted_value, values)
        category_output = query_from_list(prompt)[0]
        print(f"Prompt2:\t {prompt}Predicted value category:\t {category_output}\n True value category:\t {row['label_string']}\n")
    print(10*'-------------------')

In [None]:
# Bare `pwd` relies on IPython automagic to show the current working directory;
# the relative dataset paths used in this notebook resolve against it.
pwd

In [13]:
# NOTE(review): load_dataset and Dataset are already imported in the first cell;
# only load_from_disk is new here.
from datasets import load_dataset, Dataset, load_from_disk
# Location of the stored prompt dataset, relative to the current working directory.
# NOTE(review): this folder is not written by any cell visible here - presumably
# it was saved earlier with save_to_disk; confirm.
dataset_path = '../datasets/touche23_prompt'
dataset = load_from_disk(dataset_path)
# Show the first record of the 'augmented_lm' split as a sanity check.
dataset['augmented_lm'][0]


{'Unnamed: 0': 2030,
 'Argument ID': 'A20376',
 'Conclusion': 'Holocaust denial should be considered a criminal offence.',
 'Stance': 'against',
 'Premise': "the holocaust is an unproven event and anyone who denies it's existence can be easily disproven so money and time should not be wasted on criminalizing it and creating laws.",
 'label_vector': [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
 'label_string': 'Self-direction: thought, Security: personal, Humility, Universalism: objectivity',
 'single_shot_prompt': "The premise: 'the holocaust is an unproven event and anyone who denies it's existence can be easily disproven so money and time should not be wasted on criminalizing it and creating laws.' is 'against'. The conclusion is 'Holocaust denial should be considered a criminal offence.'\n. Question: Which value category does the argument belong to? Options: Self-direction thought, Self-direction action, Stimulation, Hedonism, Achievement, Power dominance, Power res