In [1]:
import pandas as pd
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters
from sklearn.metrics import accuracy_score

In [2]:
task_2_dataset = pd.read_json('data/evaluation_data/task_2.json', lines=True)
task_2_dataset.rename(columns={'sentiment': 'sentiment_llm', 'aspect': 'aspect_llm', 'relevance_score': 'relevance_score_llm'}, inplace=True)

users = [i['user_id'] for i in task_2_dataset.loc[0]['responses']['relevance']]

questions = ["relevance", "sentiment", "aspect", "relevant_aspect", "relevance_score"]

#Create a column for each response field
for field in questions:
    task_2_dataset[field] = task_2_dataset['responses'].apply(lambda x: {i['user_id']: i['value'] for i in x[field]} if field in x else [])

group_all = task_2_dataset[task_2_dataset['metadata'].apply(lambda x: 'group-all' in x.values())]

In [3]:
from collections import Counter

sentiment_dict = {1: 'positive', 0: 'neutral', -1: 'negative'}
sentiment_dict_reverse = {'positive': 1, 'neutral': 0, 'negative': -1}

def find_overall_sentiment(row):
    '''
    We apply the rules described in the paper in order to determine the overall sentiment
    '''
    most_selected = row.max()
    columns_with_max = row[row == most_selected].index.tolist()
    
    if most_selected == 0:
        return []

    #Rules to determine sentiment if there are multiple different answers
    if len(columns_with_max) > 1:
        if 'count__negative' in columns_with_max and 'count__positive' in columns_with_max:
            #if we have same amount of pos and neg, take neutral
            return 'neutral'
        elif 'count__negative' in columns_with_max:
            #if we have more negative than positive, take negative
            return 'negative'
        elif 'count__positive' in columns_with_max:
            #If we have more positive than negative, take positive
            return 'positive'
    else:
        #If we have only one answer, return that
        return columns_with_max[0].split('__')[1]

def fill_relevant_aspect(row):
    '''
    If there was only one aspect selected by annotators, they didnt additionaly select a
    most relevant aspect - so in that case we have to fill with the selcted aspect
    '''

    if not row['aspect']:
        #in case they misclicked and only selected the relevant aspect, otherwise it should be []
        return row['relevant_aspect']

    new_relevant_aspect_dict = row['relevant_aspect']
    for user in users:
        if user not in row['aspect']:
            continue
        if len(row['aspect'][user]) == 1:
            if user in row['relevant_aspect']:
                continue
            else:
                if new_relevant_aspect_dict:
                    new_relevant_aspect_dict[user] = row['aspect'][user][0]
                else:
                    new_relevant_aspect_dict = {user: row['aspect'][user][0]}
    return new_relevant_aspect_dict

def find_overall_aspect(row, most_rel):
    '''
    We apply the rules described in the paper to find the overall aspect
    '''
    most_selected = row.max()
    columns_with_max = row[row == most_selected].index.tolist()
    
    if most_selected == 0:
        return []
    
    if len(columns_with_max) > 1:
        #if theres more than one answer, look at the most_relevant aspect selected by annotators
        if len([i for i in columns_with_max if i != 'aspcount__not_sure']) == 1:
            return [i for i in columns_with_max if i != 'aspcount__not_sure'][0].split('__')[1]
        elif len(list(most_rel.values())) > 1:
            #if we have more than one most relevant aspect, take the one that is most selected
            most_selected_2 = Counter(most_rel.values())
            max_count = max(most_selected_2.values())
            columns_with_max_2 = [value for value, count in most_selected_2.items() if count == max_count]
            if 'not_sure' in columns_with_max_2:
                if len([i for i in columns_with_max_2 if i != 'not_sure']) == 1:
                    return [i for i in columns_with_max_2 if i != 'not_sure'][0]
            if len(columns_with_max_2) > 1:
                #if the two most relevant aspects are selected equally, take the first one randomly
                return list(most_rel.values())[0]
            else:
                #take the most rel aspect that is most selected
                return columns_with_max_2[0]
        else:
            #if theres only one most relevant aspect, return that
            return list(most_rel.values())[0]
    else:
        #if theres only one answer, return that
        return columns_with_max[0].split('__')[1]

In [4]:
# irrelevant_samples are samples with at least one irrelevant label
#Vary the cutoff to determine how many irrelevant labels are needed to consider a sample irrelevant
cutoff_irrelevant = 5   #1 - one irrelevant is enough to consider a sample irrelevant
irrelevant_samples = [True if len([j for j in list(i.values()) if j == 'no']) > (cutoff_irrelevant-1) else False for i in group_all['relevance'].to_list() if i]

### To reproduce Relevancy

In [5]:
#ratio of relevant answers - reproduce results by varying cutoff_irrelevant above
1-len([i for i in irrelevant_samples if i])/len(irrelevant_samples)

0.9879518072289156

## For Sentiment and Aspect

In [6]:
field = 'relevance'
answer = 'no'
search_df = task_2_dataset.copy()

#We apply some rule-based logic to determine the overlapping final sentiment from all users
search_df['all_sentiments'] = search_df['sentiment'].apply(lambda x: sum(list(x.values()), []) if x else [])
search_df['count__positive'] = search_df['all_sentiments'].apply(lambda x: x.count('positive'))
search_df['count__negative'] = search_df['all_sentiments'].apply(lambda x: x.count('negative'))
search_df['count__neutral'] = search_df['all_sentiments'].apply(lambda x: x.count('neutral'))
search_df['count__not_sure'] = search_df['all_sentiments'].apply(lambda x: x.count('not_sure'))

search_df['sentiment_user'] = search_df.apply(lambda x: find_overall_sentiment(x[['count__positive', 'count__negative', 'count__neutral', 'count__not_sure']]), axis=1)


#There are different ways to find the overall aspect - either through majority voting of selected ones or considering most relevant one
search_df['all_aspects'] = search_df['aspect'].apply(lambda x: sum(list(x.values()), []) if x else [])
search_df['aspcount__environmental'] = search_df['all_aspects'].apply(lambda x: x.count('environmental'))
search_df['aspcount__social'] = search_df['all_aspects'].apply(lambda x: x.count('social'))
search_df['aspcount__governance'] = search_df['all_aspects'].apply(lambda x: x.count('governance'))
search_df['aspcount__not_sure'] = search_df['all_aspects'].apply(lambda x: x.count('not_sure'))

#if the user only selected one aspect and didnt select a relevant aspect, we fill the relevant aspect with the selected one
search_df['relevant_aspect'] = search_df.apply(lambda x: fill_relevant_aspect(x), axis=1)
search_df['aspect_user'] = search_df.apply(lambda x: find_overall_aspect(x[['aspcount__environmental', 'aspcount__social', 'aspcount__governance', 'aspcount__not_sure']], x['relevant_aspect']), axis=1)

relevant_samples = [True if len([j for j in list(i.values()) if j == 'no']) == 0 else False for i in search_df['relevance'].to_list() if i]

In [7]:
#Continue only with relevant samples
df = search_df[search_df[field].astype(bool)]
rel_df = df.loc[relevant_samples]

### To reproduce Sentiment

#### Interannotator Agreement

In [8]:
#Extract all sentiment answers into their own columns
sentiment_agreement = group_all.copy()
for user in users:
    sentiment_agreement['sentiment_' + user] = sentiment_agreement['sentiment'].apply(lambda x: x[user] if user in x else [])

for user in users:
    sentiment_agreement['sentiment_' + user] = sentiment_agreement['sentiment_' + user].apply(lambda x: [i for i in x if i in ['neutral', 'positive', 'negative']])

In [9]:
pos_sent_dict = {'negative': 0, 'neutral': 1, 'positive': 2}

z = []
for index, row in sentiment_agreement.iterrows():
    z_iter = []
    status=0
    for user in users:
        row_sentiment = [pos_sent_dict[i] for i in row['sentiment_' + user]]
        if len(row_sentiment) > 1:
            if 0 in row_sentiment and 2 in row_sentiment:
                row_sentiment = 1
            elif 0 in row_sentiment:
                row_sentiment = 0
            elif 2 in row_sentiment:
                row_sentiment = 2
        elif row_sentiment == []:
            status=1
            continue
        else:
            row_sentiment = row_sentiment[0]
        z_iter.append(row_sentiment)
    if status == 0:
        z.append(z_iter)

z, _ = aggregate_raters(z)

In [10]:
fleiss_kappa(z)

np.float64(0.8178652738717019)

#### Sentiment Accuracy

In [11]:
#accuracy of simplified user determined sentiment compared to the original sentiment

rel_df['sentiment_user_int'] = rel_df['sentiment_user'].apply(lambda x: sentiment_dict_reverse[x] if x in sentiment_dict_reverse else 0)
rel_df['sentiment_llm_int'] = rel_df['sentiment_llm'].apply(lambda x: sentiment_dict_reverse[x] if x in sentiment_dict_reverse else 0)
accuracy_score(rel_df['sentiment_user_int'], rel_df['sentiment_llm_int'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rel_df['sentiment_user_int'] = rel_df['sentiment_user'].apply(lambda x: sentiment_dict_reverse[x] if x in sentiment_dict_reverse else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rel_df['sentiment_llm_int'] = rel_df['sentiment_llm'].apply(lambda x: sentiment_dict_reverse[x] if x in sentiment_dict_reverse else 0)


0.7986798679867987

### Aspect

In [12]:
#For group all, find annotator agreement
aspect_dict = {'environmental': 1, 'social': 2, 'governance': 3}

#Extract all asepct answers into their own columns
aspect_agreement = group_all.copy()
for user in users:
    aspect_agreement['aspect_' + user] = aspect_agreement['aspect'].apply(lambda x: x[user] if user in x else [])

#Take most relevant aspect if there are multiple
aspect_agreement['user_decided_aspect'] = [''] * len(aspect_agreement)
for index, row in aspect_agreement.iterrows():
    agg_aspects = []
    for user in users:
        if 'not_sure' in row['aspect_' + user]:
            if len(row['aspect_' + user]) > 1:
                row_aspect = [i for i in row['aspect_' + user] if i != 'not_sure'][0]
            else:
                row_aspect = []
        elif len(row['aspect_' + user]) > 1:
            if user in row['relevant_aspect']:
                row_aspect = row['relevant_aspect'][user]
            else:
                row_aspect = row['aspect_' + user][0]
        else:
            row_aspect = row['aspect_' + user][0] if row['aspect_' + user] else []
        aspect_agreement.at[index, 'aspect_' + user] = row_aspect
        agg_aspects.append(row_aspect)
    aspect_agreement.at[index, 'user_decided_aspect'] = agg_aspects
    



for user in users:
    # aspect_agreement['aspect_' + user] = aspect_agreement['aspect_' + user].apply(lambda x: [aspect_dict[i] for i in x if i in aspect_dict])
    aspect_agreement['aspect_' + user] = aspect_agreement['aspect_' + user].apply(lambda x: aspect_dict[x] if x else x)


#### Interannotator Agreement

In [13]:
z = []
for index, row in aspect_agreement.iterrows():
    z_iter = []
    status=0
    for user in users:
        row_aspect = row['aspect_' + user]
        if row_aspect == []:
            status=1
            continue
        z_iter.append(row_aspect)
    if status == 0:
        z.append(z_iter)

z, _ = aggregate_raters(z)

In [14]:
fleiss_kappa(z)

np.float64(0.42705981362124895)

#### Accuracy

In [15]:
rel_df_no_unsure = rel_df[rel_df['aspect_user'] != 'not_sure']
accuracy_score(rel_df_no_unsure['aspect_user'], rel_df_no_unsure['aspect_llm'].str.lower())

0.7847682119205298

### To reproduce Relevance-Score

In [16]:
cutoff_irrelevant = 1   #1 - one irrelevant is enough to consider a sample irrelevant
irrelevant_samples = [True if len([j for j in list(i.values()) if j == 'no']) > (cutoff_irrelevant-1) else False for i in task_2_dataset['relevance'].to_list() if i]

In [17]:
all_counts = search_df[search_df[field].astype(bool)]['relevance_score_llm'].value_counts()
all_counts

relevance_score_llm
8    141
7    101
6     63
9     50
5      8
4      6
Name: count, dtype: int64

In [18]:
irrel_counts = search_df[search_df[field].astype(bool)].loc[irrelevant_samples]['relevance_score_llm'].value_counts()
irrel_counts

relevance_score_llm
6    33
7    19
8     7
4     3
5     3
9     1
Name: count, dtype: int64

In [19]:
#Ratio of irrelevant samples (judged by annotators) wrt relevance score
irrel_counts/all_counts

relevance_score_llm
4    0.500000
5    0.375000
6    0.523810
7    0.188119
8    0.049645
9    0.020000
Name: count, dtype: float64