In [1]:
import shutil

import numpy as np
import pandas as pd

from pathlib import Path

from scipy.stats import entropy
from typing import List, Callable

In [2]:
# Locations of the exported annotation data, relative to this notebook.
data_dir = Path('..', '..', '..', 'storage', 'doccano', '2023_ecai_nlperspectives')
subset_dir = data_dir / 'data_with_min_20_annotations'
annotations_path = subset_dir / 'annotations.csv'
texts_path = subset_dir / 'data.csv'

# Both id columns and every rating column are read as 32-bit integers.
int32_columns = [
    'text_id', 'user_id',
    'Pozytywne', 'Negatywne', 'Radość', 'Zachwyt', 'Inspiruje', 'Spokój',
    'Zaskoczenie', 'Współczucie', 'Strach', 'Smutek', 'Wstręt', 'Złość',
    'Ironiczny', 'Żenujący', 'Wulgarny', 'Polityczny', 'Interesujący',
    'Zrozumiały', 'Zgadzam się z tekstem', 'Wierzę w tę informację',
    'Potrzebuję więcej informacji, aby ocenić ten tekst',
    'Czuję sympatię do autora', 'Obraża mnie',
    'Może kogoś atakować / obrażać / lekceważyć', 'Mnie bawi/śmieszy?',
    'Może kogoś bawić?',
]

annotations_df = pd.read_csv(
    annotations_path,
    sep=',',
    dtype=dict.fromkeys(int32_columns, np.int32),
)

texts_df = pd.read_csv(texts_path, sep=',', dtype={'text_id': np.int32, 'text': str})

annotations_df

Unnamed: 0,text_id,user_id,Pozytywne,Negatywne,Radość,Zachwyt,Inspiruje,Spokój,Zaskoczenie,Współczucie,...,Zrozumiały,Zgadzam się z tekstem,Wierzę w tę informację,"Potrzebuję więcej informacji, aby ocenić ten tekst",Czuję sympatię do autora,Obraża mnie,Może kogoś atakować / obrażać / lekceważyć,Mnie bawi/śmieszy?,Może kogoś bawić?,updated_at
0,0,25,0,2,0,0,0,0,0,2,...,10,1,3,0,0,0,8,-1,2,2022-12-21 17:31:49.631285+00
1,0,4,8,3,0,6,8,3,2,0,...,4,7,3,7,0,0,8,0,0,2022-12-06 20:37:25.096306+00
2,0,24,6,8,3,2,0,0,4,8,...,7,3,3,1,2,6,7,0,0,2022-12-07 10:38:29.634315+00
3,0,33,2,8,2,2,2,2,6,7,...,2,0,0,7,0,8,8,0,0,2022-12-11 22:04:28.854421+00
4,0,6,5,5,6,3,3,3,0,0,...,10,7,9,0,3,0,3,0,0,2022-12-20 16:19:10.727938+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31630,999,36,0,3,0,0,0,3,5,6,...,9,2,8,4,-1,2,3,0,0,2023-02-09 21:55:19.529524+00
31631,999,17,3,0,0,0,0,4,0,0,...,10,10,10,0,10,0,0,0,0,2023-02-07 11:45:56.639493+00
31632,999,18,6,0,0,0,0,0,0,0,...,8,10,10,0,8,0,0,0,0,2023-02-20 07:06:56.124013+00
31633,999,10,3,1,4,9,7,5,2,0,...,7,9,9,0,10,0,0,0,0,2023-02-02 14:41:27.420875+00


In [3]:
# Number of distinct texts labelled by each user.
annotations_df.groupby('user_id')['text_id'].agg('nunique')

user_id
0     1000
1      960
2      820
3     1000
4     1000
5      698
6      500
7      547
8     1000
9     1000
10    1000
11     160
12      50
13    1000
14     999
15    1000
16     732
17    1000
18     999
19     400
20    1000
21     360
22    1000
23     921
24     838
25     100
26     823
27    1000
28     570
29     965
30     958
31     860
32    1000
33    1000
34     300
35      75
36    1000
37    1000
38    1000
39    1000
Name: text_id, dtype: int64

In [4]:
# Count the -1 answers in every rating column, i.e. every column between the
# two leading id columns and the trailing `updated_at` column.
# Comparing against -1 directly (instead of picking the -1 column out of a
# value_counts table) also works when no column contains -1 at all.
rating_columns = annotations_df.columns[2:-1]
nominal_dont_know = (
    annotations_df[rating_columns]
    .eq(-1)
    .sum()
    .sort_values(ascending=False)
)

# Share of all annotation rows, expressed in percent.
percentage_dont_know = (nominal_dont_know / len(annotations_df) * 100).round(2)

# Show both series side by side, one row per rating column.
pd.DataFrame({'nominal': nominal_dont_know, 'percentage': percentage_dont_know})

Unnamed: 0,nominal,percentage
Zgadzam się z tekstem,10108.0,31.95
Czuję sympatię do autora,7128.0,22.53
Wierzę w tę informację,6991.0,22.1
"Potrzebuję więcej informacji, aby ocenić ten tekst",2965.0,9.37
Zrozumiały,1793.0,5.67
Interesujący,1359.0,4.3
Ironiczny,1194.0,3.77
Polityczny,655.0,2.07
Żenujący,568.0,1.8
Wulgarny,240.0,0.76


In [5]:
# methods for calculating controversy


def _filter_only_zero_texts(annotations: pd.DataFrame,
                            annotation_columns: List[str],
                            verbose: bool = False):
    """Keep only the texts that received at least one positive annotation.

    A text is dropped when none of its rows holds a value greater than zero in
    any of `annotation_columns`. The decision is made on the raw values rather
    than on per-text means, so negative values (such as the -1 entries that the
    other helpers in this notebook exclude) cannot cancel out real positive
    annotations.

    Args:
        annotations (pd.DataFrame): Dataframe with text annotations. It has to contain 'text_id' column.
        annotation_columns (List[str] or str): Column(s) inspected for positive values.
        verbose (bool): If true, print how many texts were seen, dropped and kept.

    Returns:
        pd.DataFrame: The rows of `annotations` that belong to the kept texts.
    """
    # Accept a single column name as well as a list of names.
    if isinstance(annotation_columns, str):
        annotation_columns = [annotation_columns]

    text_ids = annotations['text_id']

    if verbose:
        print(f'annotations texts: {len(text_ids.unique().tolist())}')

    # True for a row when any of the inspected columns is > 0, then reduced
    # to one flag per text.
    row_has_positive = (annotations[annotation_columns] > 0).any(axis=1)
    text_has_positive = row_has_positive.groupby(text_ids).any().astype(bool)

    nonzero_avg_texts = text_has_positive[text_has_positive].index.tolist()

    if verbose:
        zero_avg_texts = text_has_positive[~text_has_positive].index.tolist()
        print(f'zero_avg_texts: {len(zero_avg_texts)}')
        print(f'nonzero_avg_texts len: {len(nonzero_avg_texts)}')

    # Boolean indexing returns a new frame; the input is left untouched.
    return annotations[text_ids.isin(nonzero_avg_texts)]


def _get_text_controversy(annotations: pd.DataFrame,
                          annotation_columns: List[str], method: Callable,
                          mean: bool,
                          filter_only_zeros: bool,
                          verbose: bool = False) -> pd.DataFrame:
    """Apply `method` to the annotations of every text, column by column.

    Args:
        annotations (pd.DataFrame): Dataframe with text annotations. It has to contain 'text_id' column.
        annotation_columns (List[str] or str): Column(s) for which the measure is calculated.
        method (Callable): Called with the values one text received in one column; its
            result becomes that text's value in the '<column>_controversy' column.
        mean (bool): If true, return only the row-wise mean of all '<column>_controversy'
            columns, as 'mean_controversy'.
        filter_only_zeros (bool): If true, first pass the annotations through
            `_filter_only_zero_texts`.
        verbose (bool): Forwarded to `_filter_only_zero_texts`.

    Returns:
        pd.DataFrame: One row per text, with 'text_id' and either one
        '<column>_controversy' column per annotation column or 'mean_controversy'.
    """
    # Normalise a single column name BEFORE filtering, so that the filter
    # helper always receives a list of columns.
    if isinstance(annotation_columns, str):
        annotation_columns = [annotation_columns]

    if filter_only_zeros:
        annotations = _filter_only_zero_texts(annotations, annotation_columns, verbose)

    texts_controversy_df = annotations.loc[:, ['text_id']].drop_duplicates(
    ).reset_index(drop=True)

    controversy_columns = [col + '_controversy' for col in annotation_columns]

    for annotation_col, controversy_col in zip(annotation_columns,
                                               controversy_columns):
        # Rows holding -1 in this column are left out; only the two columns
        # that are actually needed are selected, not a copy of the whole frame.
        valid_rows = annotations.loc[annotations[annotation_col] != -1,
                                     ['text_id', annotation_col]]

        per_text = valid_rows.groupby('text_id')[annotation_col].apply(method)

        # Texts without any valid row in this column end up as NaN.
        texts_controversy_df[controversy_col] = texts_controversy_df[
            'text_id'].map(per_text)

    if mean:
        texts_controversy_df['mean_controversy'] = texts_controversy_df.loc[
            :, controversy_columns].mean(numeric_only=True, axis=1)
        texts_controversy_df = texts_controversy_df.loc[:, [
            'text_id', 'mean_controversy'
        ]]

    return texts_controversy_df

def get_texts_entropy(annotations: pd.DataFrame,
                      annotation_columns: List[str],
                      mean: bool =False,
                      filter_only_zeros: bool =False,
                      verbose: bool = False):
    """ Entropy of the label distribution each text received.
    Args:
        annotations (pd.DataFrame): Dataframe with text annotations. It has to contain 'text_id' column.
        annotation_columns (List[str] or str): Column(s) of the annotations dataframe for which the entropy is calculated.
        mean (bool): If true, the entropy is averaged over all columns.
        filter_only_zeros (bool): Forwarded to `_get_text_controversy`; intended to drop texts annotated only with zeros.
        verbose (bool): Forwarded to `_get_text_controversy`.
    Returns:
        Whatever `_get_text_controversy` returns for the entropy measure.
    """

    def _label_entropy(values, base=None):
        # Only how often each distinct label occurs matters, not its value.
        label_counts = np.unique(values, return_counts=True)[1]
        return entropy(label_counts, base=base)

    return _get_text_controversy(annotations,
                                 annotation_columns,
                                 method=_label_entropy,
                                 mean=mean,
                                 filter_only_zeros=filter_only_zeros,
                                 verbose=verbose)


def get_texts_std(annotations: pd.DataFrame,
                  annotation_columns: List[str],
                  mean=False,
                  filter_only_zeros=False,
                  verbose: bool = False):
    """ Standard deviation (`np.std`) of the annotations each text received.
    Args:
        annotations (pd.DataFrame): Dataframe with text annotations. It has to contain 'text_id' column.
        annotation_columns (List[str] or str): Column(s) of the annotations dataframe for which the std is calculated.
        mean (bool): If true, the std is averaged over all columns.
        filter_only_zeros (bool): Forwarded to `_get_text_controversy`.
        verbose (bool): Forwarded to `_get_text_controversy`.
    Returns:
        Whatever `_get_text_controversy` returns for the std measure.
    """
    spread_measure = np.std
    return _get_text_controversy(
        annotations,
        annotation_columns,
        spread_measure,
        mean,
        filter_only_zeros,
        verbose,
    )

In [6]:
# get values of human biases (HuBi) for each annotator
def get_annotator_biases(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """Average z-score of every annotator's ratings, per rating column.

    For each row and column the rating is standardised against the other
    ratings of the same text: (rating - text mean) / (text std + 1e-8).
    These z-scores are then averaged per annotator. Ratings equal to -1 are
    ignored both in the per-text statistics and in the per-annotator average.

    Args:
        df (pd.DataFrame): Annotations; has to contain 'text_id' and 'user_id' columns.
        columns (List[str] or str): Rating column(s) to compute the bias for.

    Returns:
        pd.DataFrame: One row per user, sorted by 'user_id', with a
        '<column>_bias' column per rating column and 'mean_bias', the
        row-wise mean of all bias columns.
    """
    # Accept a single column name as well as a list of names.
    if isinstance(columns, str):
        columns = [columns]

    text_ids = df['text_id']

    # Replace the -1 sentinel by NaN so that pandas skips it in every
    # aggregation below.
    ratings = df[columns].where(df[columns] != -1)

    # Per-text mean and population std (ddof=0, matching np.std).
    per_text = ratings.groupby(text_ids)
    text_means = per_text.mean()
    text_stds = per_text.std(ddof=0)

    # Look the statistics up by text_id VALUE (not by row position) and
    # broadcast them back onto the annotation rows.
    row_means = text_means.reindex(text_ids).to_numpy()
    row_stds = text_stds.reindex(text_ids).to_numpy()

    # The small epsilon avoids a division by zero for texts on which all
    # annotators agree (std == 0).
    biases = ((ratings - row_means) / (row_stds + 1e-8)).add_suffix('_bias')
    bias_columns = biases.columns.tolist()

    annotator_biases = df[['user_id']].drop_duplicates().reset_index(drop=True)
    annotator_biases = annotator_biases.join(
        biases.groupby(df['user_id']).mean(), on='user_id')

    annotator_biases['mean_bias'] = annotator_biases[bias_columns].mean(axis=1)

    return annotator_biases.sort_values('user_id', ascending=True)

In [7]:
# Rating columns: everything between the two id columns and `updated_at`.
task_columns = annotations_df.columns[2:-1].tolist()

# Per-text entropy for every rating column.
controversy_df = get_texts_entropy(annotations_df, task_columns, mean=False, filter_only_zeros=True)

# Mean entropy per text, derived from the columns computed above instead of
# running the whole entropy computation a second time with mean=True.
mean_controversy_df = controversy_df[['text_id']].assign(
    mean_controversy=controversy_df.drop(columns='text_id').mean(numeric_only=True, axis=1)
)

# merge controversy and mean controversy dataframes
controversy_df = controversy_df.merge(mean_controversy_df, on='text_id')

bias_df = get_annotator_biases(annotations_df, task_columns)

In [8]:
metric_dir = data_dir / 'metrics'

# Always start from an empty metrics directory: anything left from a previous
# run is removed before the new files are written.
if metric_dir.exists():
    shutil.rmtree(metric_dir)
metric_dir.mkdir()

for file_name, metric_frame in (('text_controversy.csv', controversy_df),
                                ('user_bias.csv', bias_df)):
    metric_frame.to_csv(metric_dir / file_name, index=False)
