# Reading in the data

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score

In [None]:
# Load the English WikiANN splits straight from the Hugging Face hub.
# `splits` maps each split name to its parquet file inside the dataset repo.

splits = {'validation': 'en/validation-00000-of-00001.parquet', 'test': 'en/test-00000-of-00001.parquet', 'train': 'en/train-00000-of-00001.parquet'}
hf_root = "hf://datasets/unimelb-nlp/wikiann/"
train = pd.read_parquet(hf_root + splits["train"])
val = pd.read_parquet(hf_root + splits["validation"])
test = pd.read_parquet(hf_root + splits["test"])

In [None]:
# Keep every 10th training row (index 0, 10, 20, ...) as the sample to annotate.
# NOTE(review): the "2000 rows" figure assumes the train split has 20,000 rows.

keep_row = (train.index % 10) == 0
subset = train[keep_row].copy()

In [None]:
# Build one human-readable header line per sampled sentence: the tokens joined
# by single spaces, prefixed with the "# New sentence = " marker.

sentences = ["# New sentence = " + " ".join(tokens) for tokens in subset['tokens']]

In [None]:
# Insert each sentence's header row directly in front of that sentence's row.
# Walking from the last sentence to the first keeps the positions of the rows
# that still need a header unchanged while we insert.
# NOTE(review): every insertion rebuilds the whole frame, so this is quadratic
# in the number of sentences.

expanded = subset.copy()
for pos in range(len(sentences) - 1, -1, -1):
    header_row = pd.DataFrame({'tokens': sentences[pos], 'ner_tags': [0], 'langs': ['en'], 'spans': ['']})
    pieces = [expanded.iloc[:pos], header_row, expanded.iloc[pos:]]
    expanded = pd.concat(pieces).reset_index(drop=True)

In [None]:
# Flatten to one token per row; header strings are not lists, so each stays a single row.
all_tokens = expanded.explode(column='tokens')
toks = all_tokens.loc[:, 'tokens']
# One-off export of the flattened token column (kept for reference):
# toks.to_csv('gathered_results.csv', index=False, encoding='UTF-8', sep=';')

# Dividing it between us

In [None]:
# Cut the first 4000 rows of `expanded` into four consecutive 1000-row chunks.
A, B, C, D = (expanded[start:start + 1000] for start in range(0, 4000, 1000))

In [None]:
# Annotation sheet for Amina: chunks A and D, one token per row, with a
# `labels` column pre-filled with 0.
Am = pd.concat([A, D])
Amina = Am.explode('tokens').assign(labels=0)[['tokens', 'labels']]
# One-off export of the sheet (kept for reference):
# Amina.to_csv('Amina_annotations.csv', index=False, encoding='UTF-8', sep=';')

In [None]:
# Annotation sheet for Lilja: chunks A and B, one token per row, with a
# `labels` column pre-filled with 0.
Li = pd.concat([A, B])
Lilja = Li.explode('tokens').assign(labels=0)[['tokens', 'labels']]
# One-off export of the sheet (kept for reference):
# Lilja.to_csv('Lilja_annotations.csv', index=False, encoding='UTF-8', sep=';')

In [None]:
# Annotation sheet for Mikolaj: chunks B and C, one token per row, with a
# `labels` column pre-filled with 0.
Mi = pd.concat([B, C])
Miko = Mi.explode('tokens').assign(labels=0)[['tokens', 'labels']]
# One-off export of the sheet (kept for reference):
# Miko.to_csv('Mikolaj_annotations.csv', index=False, encoding='UTF-8', sep=';')

In [None]:
# Annotation sheet for Zosia: chunks C and D, one token per row, with a
# `labels` column pre-filled with 0.
Zo = pd.concat([C, D])
Zosia = Zo.explode('tokens').assign(labels=0)[['tokens', 'labels']]
# One-off export of the sheet (kept for reference):
# Zosia.to_csv('Zosia_annotations.csv', index=False, encoding='UTF-8', sep=';')

# Reading the results back in

In [None]:
# Load the merged annotation sheet (semicolon-separated, UTF-8).
# NOTE(review): with pandas' default NA parsing, tokens such as "NA" or "null"
# are read as missing values -- confirm no real tokens are lost this way.
results = pd.read_csv('results.csv', encoding='utf-8', sep=';')

In [None]:
# Fill every missing cell (in all columns) with '0', then normalise both
# annotator label columns to string labels by casting to int and then to str.
# NOTE(review): the int step presumably strips float formatting such as "5.0".
results = results.fillna('0')
for label_col in ('labels1', 'labels2'):
    results[label_col] = results[label_col].astype(int)
for label_col in ('labels1', 'labels2'):
    results[label_col] = results[label_col].astype(str)

In [None]:
# Walk through the merged annotation sheet and classify every token-level
# agreement or disagreement between the two annotators.
#
# Row layout (by position): token, annotator-1 label, annotator-2 label.
# Labels are the string forms of the tag ids; the branches below treat
# '3'/'4' as organisation tags, '5'/'6' as location tags and '0' as "no entity".
#
# Outputs:
#   YAY                 tokens on which both annotators agree
#   FALSEPOSNEG         one annotator tagged an entity, the other chose '0'
#   ORGLOCERR           organisation-vs-location disagreements
#   LOCSPLITERR         '0'-vs-'6' or '5'-vs-'6' disagreements (details in locspliterrlist)
#   OTHERERR            any other disagreement (details in weird_results)
#   annotator1_results  per-token labels of annotator 1, header rows excluded
#   annotator2_results  per-token labels of annotator 2, header rows excluded
header_prefix = '# New sentence = '

sentence=''
YAY=0
FALSEPOSNEG=0
ORGLOCERR=0
LOCSPLITERR=0
locspliterrlist=[]
OTHERERR=0
weird_results=[]
annotator1_results=[]
annotator2_results=[]

location=False
for index, row in results.iterrows():

    token=row.iloc[0]
    label1=row.iloc[1]
    label2=row.iloc[2]

    # Sentence header rows carry no annotation; remember the sentence text for
    # error reporting and move on. Matching the full marker (not just '#')
    # keeps real tokens that happen to start with '#' in the statistics.
    if token.startswith(header_prefix):
        sentence=token[len(header_prefix):]
        continue

    # Collect the per-token labels (previously appended to undefined names,
    # which raised NameError on the first real token).
    annotator1_results.append(label1)
    annotator2_results.append(label2)

    if label1 == label2:
        YAY +=1
        continue

    pair = {label1, label2}

    if (pair & {'3', '4'}) and (pair & {'5', '6'}):
        ORGLOCERR+=1
        continue

    if '5' in pair:
        # A disagreement involving a '5' label was seen.
        location=True

    # Every remaining disagreement falls into exactly one category, so the
    # counters add up to the number of annotated tokens.
    if pair == {'0', '5'}:
        FALSEPOSNEG+=1
    elif pair <= {'0', '5', '6'}:
        # '0' vs '6', or '5' vs '6'.
        LOCSPLITERR+=1
        locspliterrlist.append((sentence, token, label1, label2))
    elif '0' in pair:
        FALSEPOSNEG+=1
    else:
        OTHERERR+=1
        weird_results.append((sentence, token, label1, label2))

In [None]:
# Report the agreement / disagreement counters gathered in the loop above.
summary = (
    ("Correct results: ", YAY),
    ("False positives or negatives: ", FALSEPOSNEG),
    ("Disagreements of organisation or location: ", ORGLOCERR),
    ("Location-splitting error (or perhaps just false negatives): ", LOCSPLITERR),
    ("Nr of other unidentified errors: ", OTHERERR),
)
for caption, count in summary:
    print(caption, count)

# Accuracy and Cohen's Kappa

In [None]:
# Function to calculate accuracy between two lists of labels
def calculate_accuracy(labels1, labels2):
    """Return the fraction of positions at which the two label sequences agree.

    Args:
        labels1: Sequence of labels from the first annotator.
        labels2: Sequence of labels from the second annotator, aligned
            position by position with ``labels1``.

    Returns:
        Number of matching positions divided by the sequence length.

    Raises:
        ValueError: If the sequences differ in length or are empty; a ratio
            computed from misaligned or missing data would be meaningless.
    """
    total_tokens = len(labels1)
    if total_tokens != len(labels2):
        raise ValueError("labels1 and labels2 must have the same length")
    if total_tokens == 0:
        raise ValueError("cannot compute accuracy for empty label sequences")

    correct_tokens = sum(1 for label1, label2 in zip(labels1, labels2) if label1 == label2)
    return correct_tokens / total_tokens

# Calculate accuracy on real tokens only. The "# New sentence = ..." header
# rows are not annotated, and both label columns were filled with '0' for
# them, so including them would inflate the agreement.
token_rows = ~results.iloc[:, 0].astype(str).str.startswith('# New sentence = ')
accuracy = calculate_accuracy(results.loc[token_rows, 'labels1'], results.loc[token_rows, 'labels2'])
print("Accuracy:", accuracy)

In [None]:
from collections import Counter

def cohen_kappa(annotation1, annotation2):
    """Compute Cohen's kappa for two aligned sequences of categorical labels.

    kappa = (P_o - P_e) / (1 - P_e), where P_o is the observed agreement and
    P_e the agreement expected by chance from each annotator's label
    frequencies.

    Args:
        annotation1: Labels assigned by the first annotator.
        annotation2: Labels assigned by the second annotator, aligned
            position by position with ``annotation1``.

    Returns:
        The kappa coefficient. When both annotators used one and the same
        single category (P_e == 1, which makes the formula 0/0), 1.0 is
        returned because the annotators agree on every item.

    Raises:
        ValueError: If the sequences differ in length or are empty.
    """
    total = len(annotation1)
    if total != len(annotation2):
        raise ValueError("annotation1 and annotation2 must have the same length")
    if total == 0:
        raise ValueError("cannot compute Cohen's kappa for empty annotations")

    # How often each annotator used each category.
    count1 = Counter(annotation1)
    count2 = Counter(annotation2)

    # Observed agreement (P_o)
    observed_agreement = sum((a == b) for a, b in zip(annotation1, annotation2)) / total

    # Expected agreement (P_e) over every category used by either annotator.
    categories = set(count1) | set(count2)
    expected_agreement = 0
    for category in categories:
        expected_agreement += (count1[category] / total) * (count2[category] / total)

    # P_e is exactly 1 only when both annotators used the same single
    # category; avoid the division by zero in that degenerate case.
    if expected_agreement == 1:
        return 1.0

    return (observed_agreement - expected_agreement) / (1 - expected_agreement)

# Calculate Cohen's Kappa on real tokens only. The "# New sentence = ..."
# header rows are not annotated, and both label columns were filled with '0'
# for them, so including them would inflate the agreement.
annotated_rows = ~results.iloc[:, 0].astype(str).str.startswith('# New sentence = ')
kappa = cohen_kappa(results.loc[annotated_rows, 'labels1'], results.loc[annotated_rows, 'labels2'])
print(f"Cohen's Kappa: {kappa}")

# F1 score

In [None]:
from sklearn.metrics import f1_score

# Relaxed F1 of the silver (dataset) tags against the two human annotators:
# a silver tag counts as correct when it matches EITHER annotator; otherwise
# annotator 1's label is taken as the reference for that token.
#
# NOTE: `silver_data_list` is built in the "Compare to silver data" cells
# further down, which must be run before this one.
#
# The silver tags are integers while the annotator labels were converted to
# strings when the results were loaded, so everything is normalised to str
# here; otherwise no prediction could ever equal a gold label and f1_score
# would receive a mix of string and numeric labels.
silver = [str(tag) for tag in silver_data_list]
gold_a = [str(label) for label in annotator1_results]
gold_b = [str(label) for label in annotator2_results]

# The three sequences must describe the same tokens in the same order.
if not (len(silver) == len(gold_a) == len(gold_b)):
    raise ValueError(
        "silver and gold label sequences differ in length: "
        f"{len(silver)}, {len(gold_a)}, {len(gold_b)}"
    )

y_true_relaxed = []
y_pred_relaxed = []

for pred, label_a, label_b in zip(silver, gold_a, gold_b):
    if pred == label_a or pred == label_b:
        y_true_relaxed.append(pred)  # it's a match, count it as correct
    else:
        y_true_relaxed.append(label_a)  # mark mismatch with one of the annotators
    y_pred_relaxed.append(pred)

print("Macro F1:", f1_score(y_true_relaxed, y_pred_relaxed, average='macro'))
print("Micro F1:", f1_score(y_true_relaxed, y_pred_relaxed, average='micro'))
print("Weighted F1:", f1_score(y_true_relaxed, y_pred_relaxed, average='weighted'))
print("Per-class F1:", f1_score(y_true_relaxed, y_pred_relaxed, average=None))


# Compare to silver data

In [None]:
# Keep only the tokens and the dataset's own NER tags for the sampled
# sentences, then preview the first rows.
silver_columns = ['tokens', 'ner_tags']
silver_data = subset[silver_columns]
silver_data.head()

In [None]:
# Flatten the per-sentence tag sequences into one token-level list of silver
# tags, echoing every (token, tag) pair along the way.
silver_data_list = []
for tokens, tags in zip(silver_data.iloc[:, 0], silver_data.iloc[:, 1]):
    for token, tag in zip(tokens, tags):
        print(token, tag)
        silver_data_list.append(tag)