In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame as df

In [2]:
# Each group label maps to its pair of CSV file names, always in the order
# [<name>_in_gold.csv, <name>_not_in_gold.csv].
groups = dict(
    A=['conll_2_in_gold.csv', 'conll_2_not_in_gold.csv'],
    B=['conll_3_in_gold.csv', 'conll_3_not_in_gold.csv'],
    C=['conll_3_train_in_gold.csv', 'conll_3_train_not_in_gold.csv'],
    D=['conll_4_in_gold.csv', 'conll_4_not_in_gold.csv'],
    E=['conll_4_train_in_gold.csv', 'conll_4_train_not_in_gold.csv'],
)

In [3]:
# Folders that each hold a copy of every group file. The trailing slash is required:
# file names are appended to these strings by plain concatenation.
sets = [
    'model_outputs/',
    'human_labels/',
    'human_labels_auditted/',
]

In [189]:
def _read_group_file(path, prefix, extra_cols):
    """Read one group CSV and rename `<prefix>_span` / `<prefix>_ent_type` to `span` / `ent_type`."""
    span_col = prefix + '_span'
    type_col = prefix + '_ent_type'
    frame = pd.read_csv(path, usecols=['num_models', span_col, type_col] + extra_cols, encoding='latin-1')
    return frame.rename(columns={span_col: 'span', type_col: 'ent_type'})


def combine_groups(group, current_set):
    """
    Combine the two files of one group (`_in_gold` and `_not_in_gold`) into a single dataframe.

    Parameters:
        - group -> sequence whose first item is the `_in_gold` file name and whose second item
          is the `_not_in_gold` file name
        - current_set -> folder prefix (including the trailing slash) prepended to both file names

    The `_in_gold` file contributes its `corpus_span` / `corpus_ent_type` columns and the
    `_not_in_gold` file its `model_span` / `model_ent_type` columns; both are renamed to
    `span` / `ent_type`. `num_models` values are taken from the files as-is (no rescaling).

    For every folder other than 'model_outputs/' the files must also have an `error_type`
    column; only rows whose error type is one of
    Wrong / Token / Sentence / Span / Tag / Both / Missing are kept, and the column is then dropped.

    Rows with missing values are removed *before* de-duplicating on (span, ent_type), so an
    incomplete row can never shadow a complete row for the same pair.

    Returns a dataframe with 3 columns: num_models, span, ent_type
    """
    print("Combining files {} from folder {}".format(group, current_set))
    # Only the human-labelled folders carry an `error_type` column.
    is_human_set = current_set != 'model_outputs/'
    extra_cols = ['error_type'] if is_human_set else []

    in_gold = _read_group_file(current_set + group[0], 'corpus', extra_cols)
    not_in_gold = _read_group_file(current_set + group[1], 'model', extra_cols)
    # ignore_index avoids duplicate index labels coming from the two source files.
    combined_df = pd.concat([in_gold, not_in_gold], ignore_index=True)

    if is_human_set:
        kept_error_types = ['Wrong', 'Token', 'Sentence', 'Span', 'Tag', 'Both', 'Missing']
        combined_df = combined_df.loc[combined_df['error_type'].isin(kept_error_types)]
        combined_df = combined_df.drop('error_type', axis=1)

    combined_df = combined_df.dropna()
    # keep the first occurrence, i.e. the `_in_gold` row wins over the `_not_in_gold` row
    combined_df = combined_df.drop_duplicates(subset=["span", "ent_type"])

    return combined_df

In [190]:
# Build the 5 combined group frames (A, B, C, D, E) for every folder listed in `sets`.
# final_set[folder] is a list of dataframes ordered like the keys of `groups`, i.e. [A, B, C, D, E].
# The keys are derived from `sets` so the dict cannot drift out of sync with that list.
final_set = {folder: [] for folder in sets}
for s in sets:
    for g in groups:
        final_set[s].append(combine_groups(groups[g], s))

Combining files ['conll_2_in_gold.csv', 'conll_2_not_in_gold.csv'] from folder model_outputs/
Combining files ['conll_3_in_gold.csv', 'conll_3_not_in_gold.csv'] from folder model_outputs/
Combining files ['conll_3_train_in_gold.csv', 'conll_3_train_not_in_gold.csv'] from folder model_outputs/
Combining files ['conll_4_in_gold.csv', 'conll_4_not_in_gold.csv'] from folder model_outputs/
Combining files ['conll_4_train_in_gold.csv', 'conll_4_train_not_in_gold.csv'] from folder model_outputs/
Combining files ['conll_2_in_gold.csv', 'conll_2_not_in_gold.csv'] from folder human_labels/
Combining files ['conll_3_in_gold.csv', 'conll_3_not_in_gold.csv'] from folder human_labels/
Combining files ['conll_3_train_in_gold.csv', 'conll_3_train_not_in_gold.csv'] from folder human_labels/
Combining files ['conll_4_in_gold.csv', 'conll_4_not_in_gold.csv'] from folder human_labels/
Combining files ['conll_4_train_in_gold.csv', 'conll_4_train_not_in_gold.csv'] from folder human_labels/
Combining files [

In [191]:
def pairwise_compare(df1, df2, return_details=False):
    """
    Compare two dataframes on their (span, ent_type) pairs.

    Both inputs are de-duplicated on (span, ent_type) first, so repeated rows in either
    frame cannot inflate the overlap (the score always stays within [0, 1]).
    The size of the union of pairs is printed as a side effect.

    Parameters:
        - df1, df2 -> dataframes with at least the columns `span` and `ent_type`
          (`num_models` is additionally required when return_details is True)
        - return_details -> when True, also return the two `num_models` statistics below

    Returns:
        - jaccard_score -> [0,1] number of shared pairs divided by the number of distinct pairs
          across both frames; 0.0 when both frames are empty
        and, only when return_details is True, a tuple (jaccard_score, mean_num_models, mean_num_models_diff):
        - mean_num_models -> average of the two frames' mean `num_models` over the shared pairs
        - mean_num_models_diff -> mean absolute difference in `num_models` for the same shared pair
    """
    keys = ['span', 'ent_type']
    left = df1.drop_duplicates(subset=keys)
    right = df2.drop_duplicates(subset=keys)

    # Inner join: one row per (span, ent_type) pair present in both frames.
    df_joined = pd.merge(left, right, on=keys)
    union_size = pd.concat([left[keys], right[keys]]).drop_duplicates().shape[0]
    print(union_size)

    # Guard against two empty frames instead of raising ZeroDivisionError.
    jaccard_score = df_joined.shape[0] / union_size if union_size else 0.0
    if not return_details:
        return jaccard_score

    # `num_models_x` / `num_models_y` are the merge-suffixed columns of df1 / df2.
    mean_num_models = 0.5 * (df_joined['num_models_x'].mean() + df_joined['num_models_y'].mean())
    mean_num_models_diff = (df_joined['num_models_x'] - df_joined['num_models_y']).abs().mean()
    return jaccard_score, mean_num_models, mean_num_models_diff
    

In [192]:
# Spot check: Jaccard overlap between groups C and E within the audited human labels (sets[2]).
audited_frames = final_set[sets[2]]
pairwise_compare(audited_frames[2], audited_frames[4])

535


0.7551401869158878

In [193]:
# Index pairs into each final_set[folder] list (ordered [A, B, C, D, E]) that get compared:
# (A, B), (A, D), (B, D) and (C, E).
COMPARED_GROUP_PAIRS = [(0, 1), (0, 3), (1, 3), (2, 4)]

# results[folder] holds one Jaccard score per entry of COMPARED_GROUP_PAIRS, in that order.
results = {}
for s in final_set:
    frames = final_set[s]
    results[s] = [pairwise_compare(frames[i], frames[j]) for i, j in COMPARED_GROUP_PAIRS]

24919
24810
17180
29224
658
631
675
544
613
629
636
535


In [194]:
# Jaccard scores per folder, in the order the comparisons were run: (A,B), (A,D), (B,D), (C,E)
results

{'model_outputs/': [0.5153096031140897,
  0.5178557033454252,
  0.8220023282887078,
  0.8706542567752532],
 'human_labels/': [0.24620060790273557,
  0.25039619651347067,
  0.6488888888888888,
  0.7481617647058824],
 'human_labels_auditted/': [0.2626427406199021,
  0.2480127186009539,
  0.6839622641509434,
  0.7551401869158878]}