In [1]:
import pandas as pd
import numpy as np

In [2]:
# Download the two figshare files used below; the first is saved locally as the
# texts TSV and the second as the annotations TSV.
!wget https://figshare.com/ndownloader/files/7038038 -O wikidetox_aggression_texts.tsv
!wget https://figshare.com/ndownloader/files/7394506 -O wikidetox_aggression_annotations.tsv

--2023-01-30 15:46:47--  https://figshare.com/ndownloader/files/7038038
Resolving figshare.com (figshare.com)... 52.51.33.13, 34.241.99.189, 2a05:d018:1f4:d000:a2bd:ffac:4123:16e9, ...
Connecting to figshare.com (figshare.com)|52.51.33.13|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/7038038/aggression_annotated_comments.tsv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20230130/eu-west-1/s3/aws4_request&X-Amz-Date=20230130T154647Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=6b076ceb72c6a2a7375bb7abef342ba285d75f61174750ddc8da3c936359e1f9 [following]
--2023-01-30 15:46:48--  https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/7038038/aggression_annotated_comments.tsv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20230130/eu-west-1/s3/aws4_request&X-Amz-Date=20230130T154647Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=6b076c

In [3]:
# Read both tab-separated downloads with identical parser settings.
texts_df, annotations_df = [
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ("wikidetox_aggression_texts.tsv", "wikidetox_aggression_annotations.tsv")
]

In [4]:
# Join every annotation row with the matching text row on the shared 'rev_id' key.
data = annotations_df.merge(texts_df, how='inner', on='rev_id')

# Remove the NEWLINE_TOKEN placeholder and '==' markers from the comment text.
# Both are deleted outright; nothing (not even a space) is inserted in their place.
data['comment'] = (
    data['comment']
    .str.replace(r'NEWLINE_TOKEN', '')
    .str.replace(r'==', '')
)

In [5]:
# Annotation (label) columns, named explicitly instead of being sliced by position
# out of data.columns, so a change in the merged column order cannot silently
# select a different column.
ann_columns = ['aggression']

In [6]:
# Display the selected annotation column names as one comma-separated string.
', '.join(ann_columns)

'aggression'

In [7]:
# Cast every annotation column to int with a single astype call (column -> dtype map).
data = data.astype({column: int for column in ann_columns})

In [8]:
# Seed NumPy's global RNG, which DataFrame/Series.sample draws from when no
# random_state is passed, so both samples below are reproducible.
np.random.seed(22)

# 1000 distinct comment texts drawn from the rows marked as the 'test' split.
is_test_split = data['split'] == 'test'
selected_texts = data.loc[is_test_split, 'comment'].drop_duplicates().sample(n=1000).tolist()

# 1000 annotation rows for those texts; every row of any other text goes to the
# "not selected" pool.
has_selected_text = data['comment'].isin(selected_texts)
selected_data = data.loc[has_selected_text].sample(n=1000)
not_selected_data = data.loc[~has_selected_text]

In [9]:
# Standard deviation of the annotation values per comment text, exposed as a 'std' column.
mean_stds = (
    not_selected_data
    .groupby('comment')[ann_columns]
    .std()
    .reset_index()
    .rename(columns={'aggression': 'std'})
)

# Drop any 'std' column left over from a previous run of this cell before merging.
# Without this, re-running the cell would produce 'std_x'/'std_y' columns and the
# later sort by 'std' would fail with a KeyError.
not_selected_data = pd.merge(
    not_selected_data.drop(columns=['std'], errors='ignore'),
    mean_stds,
    on='comment',
)

In [10]:
def get_personalized_prompt(rows):
    """Build the preamble listing texts together with the attributes one person gave them.

    Args:
        rows: DataFrame with a 'comment' column (text) and a 'labels' column whose
            entries are iterables of label strings.

    Returns:
        The fixed header sentence followed by one
        'Text: <comment>; Attribute: "<labels joined by ', '>"' line per row,
        with lines separated by newlines.
    """
    header = "Knowing that a certain person described each text with a given attribute:\n\n"
    example_lines = [
        "Text: " + comment + '; Attribute: ' + '"' + ', '.join(row_labels) + '"'
        for comment, row_labels in zip(rows['comment'], rows['labels'])
    ]
    return header + '\n'.join(example_lines)

def get_prompt(text, worker_id, n_examples=3):
    """Build the full personalized classification prompt for one text and one annotator.

    The annotator's rows in ``not_selected_data`` are sorted by the ``std`` column in
    descending order and the first ``n_examples`` of them are used as examples. They
    are followed by the instruction and the text to classify.

    Args:
        text: The comment text the prompt asks about.
        worker_id: Annotator id, matched against ``not_selected_data.worker_id``.
        n_examples: Number of the annotator's rows to include as examples (default 3).

    Returns:
        The prompt string: the personalized examples followed by the instruction and text.
    """
    worker_rows = not_selected_data.loc[not_selected_data.worker_id == worker_id]
    # Take an explicit copy of the top rows: the 'labels' column added below must be
    # written to an independent frame, not to a slice of not_selected_data (assigning
    # to the slice triggers pandas' SettingWithCopyWarning on every call).
    examples = worker_rows.sort_values(by='std', ascending=False).head(n_examples).copy()

    examples['labels'] = examples.apply(get_labels, axis=1)

    personalized_part = get_personalized_prompt(examples)
    prompt = f'\n\nwrite which one of the attributes: "aggressive", "non-aggressive" would this person describe a given text? Write your answer in the form of a Python list containing the appropriate attribute.\n\nText: {text}'

    return personalized_part + prompt

def get_labels(row, label_column=None):
    """Map a row's binary annotation value to its text label.

    Args:
        row: Series or mapping holding the annotation value under ``label_column``.
        label_column: Name of the annotation column. Defaults to the first entry of
            the notebook-level ``ann_columns`` list.

    Returns:
        A one-element NumPy array containing 'aggressive' when the annotation equals 1
        and 'non-aggressive' otherwise.
    """
    if label_column is None:
        label_column = ann_columns[0]
    # Compare a single scalar rather than a one-element array: calling int() on an
    # ndarray with ndim > 0 is deprecated since NumPy 1.25.
    is_aggressive = bool(row[label_column] == 1)
    return np.array(['aggressive' if is_aggressive else 'non-aggressive'])

# One prompt per sampled annotation row, built from that row's text and annotator id.
selected_data['prompt'] = [
    get_prompt(comment, worker_id)
    for comment, worker_id in zip(selected_data['comment'], selected_data['worker_id'])
]

In [11]:
# Keep the single label from the one-element array that get_labels returns.
selected_data['annotation'] = selected_data.apply(lambda row: get_labels(row)[0], axis=1)

# Write the prompt, its gold label and the source text to CSV. The index is reset
# first, so the file's leading index column is a fresh 0..n-1 range.
export_columns = ['prompt', 'annotation', 'comment']
(
    selected_data[export_columns]
    .reset_index(drop=True)
    .to_csv('wiki_personalized_prompts.csv')
)

In [12]:
# Class balance of the sampled annotations (number of rows per label).
selected_data['annotation'].value_counts()

non-aggressive    801
aggressive        199
Name: annotation, dtype: int64

In [13]:
from scipy.stats import entropy

def get_class(class_name):
    """Return the integer class id of a label: 0 for 'non-aggressive', 1 for 'aggressive'.

    Raises:
        ValueError: if ``class_name`` is neither of the two labels.
    """
    label_order = ['non-aggressive', 'aggressive']
    return label_order.index(class_name)

def _entropy(labels, base=None):
    _, counts = np.unique(labels, return_counts=True)
    return entropy(counts, base=base)

# Entropy of the sampled label distribution; labels are first mapped to their class ids.
_entropy(selected_data['annotation'].map(get_class).tolist())

0.49901300026020734