In [19]:
BASELINE_CSV_FILE = '../annotated-dataset/matched_pairs.csv'
DIR_WITH_RESULTS = './detection-output'

SYSTEMS = ['single-zeroshot', 'single-fewshot', 'zeroshot', 'fewshot'] # Used for old system names
NEW_SYSTEMS = ['zeroshot-single', 'fewshot-single', 'zeroshot-two-pass', 'fewshot-two-pass'] # < use this for new system names


In [20]:
# Imports
import json
import os
from nltk.tokenize import word_tokenize
import pandas as pd

In [21]:
# Load the baseline CSV file
baseline = pd.read_csv(BASELINE_CSV_FILE, encoding='utf-8')
baseline['sentence'] = baseline['sentence_a'] # we take the sentence from the first annotator
baseline['text'] = baseline['text_a'] # we take the text from the first annotator
baseline = baseline.dropna(subset=['sentence', 'text'])

# Load the results into a DataFrame
valid = pd.DataFrame(columns=['minute_id', 'system', 'reference_id', 'text', 'sentence', 'document_type', 'reference_type'])
files = [file for file in os.listdir(DIR_WITH_RESULTS) if file.endswith('.json')
            and (('validation' in file) or ('single' in file))] #
print(f"Found {len(files)} files in {DIR_WITH_RESULTS} matching criteria.")
for file in files:
    file_path = file.split('_')
    minute_id = file_path[0]
    system = file_path[1].replace('.json', '').replace('-validation', '').replace('-detection', '')
    with open(os.path.join(DIR_WITH_RESULTS, file), 'r', encoding='utf-8') as infile:
        references = json.load(infile).get('validated_references')
        for item in references:
            # Ensure the item is valid as per LLM classifications
            if item.get('is_valid', False) and item.get('confidence_score') >= 75:
                valid.loc[len(valid)] = [
                    minute_id,
                    system,
                    item.get('reference_id', ''),
                    item.get('reference_text', ''),
                    item.get('sentence', ''),
                    item.get('document_type', ''),
                    item.get('reference_type', '')
                ]

print(valid.shape)
valid.head()

Found 20 files in ./detection-output matching criteria.
(844, 7)


Unnamed: 0,minute_id,system,reference_id,text,sentence,document_type,reference_type
0,h-tk-20182019-64-32,zeroshot,1,een aangehouden motie,Aan de orde is de stemming over een aangehoude...,Motie,impl-ext-parl-doc
1,h-tk-20182019-64-32,zeroshot,2,"de initiatiefnota van de leden Ploumen, Van Ge...",Aan de orde is de stemming over een aangehoude...,Initiatiefnota,impl-ext-parl-doc
2,h-tk-20182019-64-32,zeroshot,3,de motie-Van den Berg over onderzoek naar biom...,te weten: * -de motie-Van den Berg over onderz...,Motie,impl-ext-parl-doc
3,h-tk-20182019-64-32,zeroshot,4,"34834, nr. 9",te weten: * -de motie-Van den Berg over onderz...,Motie,explicit-parl-doc
4,h-tk-20182019-64-32,zeroshot,5,notaoverleg van 10 december 2018,(Zie notaoverleg van 10 december 2018.),Verslag,impl-ext-parl-doc


In [22]:
# See what annotations we are able to link to the baseline, and what not (calculate TP, FP, FN)
import numpy as np


def panoptic_overlap_match(text1, text2):
    """
    Returns True if the overlap between text1 ad text2 satsifes:
    |A ∩ B| > 0.5|A| and |A ∩ B| > 0.5|B|
    """
    set1 = set(word_tokenize(text1))
    set2 = set(word_tokenize(text2))
    if not set1 or not set2:
        return False
    intersection = set1 & set2
    return len(intersection) > 0.5 * len(set1) and len(intersection) > 0.5 * len(set2)

eval_df = pd.DataFrame(columns=['minute_id', 'system', 'TP', 'FP', 'FN', 'n_annotations', 'doctype_acc', 'reftype_acc', 'expl-vs-impl_acc'])

for i, system in enumerate(SYSTEMS):
    for minute_id in valid['minute_id'].unique():
        # Get the baseline annotations for this minute_id
        baseline_annotations = baseline[baseline['minute_id'] == minute_id]

        # Get the results for this system and minute_id
        system_results = valid[(valid['minute_id'] == minute_id) & (valid['system'] == system)]

        # Set the variables for the evaluation
        TP = 0; FP = 0; baseline_used = set()
        ref_correct = 0; ref_total = 0; doc_correct = 0; doc_total = 0; expl_impl_correct = 0;

        print(f"Evaluating system: {system}, minute_id: {minute_id}, len: baseline: {len(baseline_annotations)}, len: system: {len(system_results)}")

        for _, system_row in system_results.iterrows():
            best_match = None

            # Check each system result against the baseline annotations
            # If a match is found, it is a True Positive (TP)
            # If no match is found, it is a False Positive (FP)
            for _, baseline_row in baseline_annotations.iterrows():
                # See whether there is a match based on the panoptic overlap
                sentence_match = panoptic_overlap_match(system_row['sentence'], baseline_row['sentence'])
                text_match = panoptic_overlap_match(system_row['text'], baseline_row['text'])

                if sentence_match and text_match and baseline_row.to_string() not in baseline_used:
                    best_match = baseline_row.to_string()
                    break
            if best_match is not None:
                TP += 1
                baseline_used.add(best_match)

                # Check the document type and reference type for accuracy
                if system_row['document_type'] not in (None, '', 'nan') and baseline_row['document_type_b'] not in (None, '', 'nan'):
                    doc_total += 1
                    if system_row['document_type'] == baseline_row['document_type_b']:
                        doc_correct += 1

                if system_row['reference_type'] not in (None, '', 'nan') and baseline_row['reference_type_b'] not in (None, '', 'nan'):
                    ref_total += 1
                    if system_row['reference_type'] == baseline_row['reference_type_b']:
                        ref_correct += 1
                    if 'impl-' in system_row['reference_type'] and 'impl-' in baseline_row['reference_type_b']:
                        expl_impl_correct += 1
                    if 'explicit' in system_row['reference_type'] and 'explicit' in baseline_row['reference_type_b']:
                        expl_impl_correct += 1
            else:
                FP += 1

        # Append the results to the evaluation DataFrame
        FN = len(baseline_annotations) - TP
        doctype_acc = doc_correct / doc_total if doc_total > 0 else np.nan
        reftype_acc = ref_correct / ref_total if ref_total > 0 else np.nan
        expl_impl_acc = expl_impl_correct / ref_total if ref_total > 0 else np.nan

        eval_df.loc[len(eval_df)] = [minute_id, NEW_SYSTEMS[i], TP, FP, FN,  len(system_results), reftype_acc, doctype_acc, expl_impl_acc]

eval_df.head()


Evaluating system: single-zeroshot, minute_id: h-tk-20182019-64-32, len: baseline: 2, len: system: 13
Evaluating system: single-zeroshot, minute_id: h-tk-20022003-3055-3080, len: baseline: 75, len: system: 30
Evaluating system: single-zeroshot, minute_id: h-tk-20182019-35-8-n1, len: baseline: 48, len: system: 79
Evaluating system: single-zeroshot, minute_id: h-tk-20072008-2932-2933, len: baseline: 44, len: system: 37
Evaluating system: single-zeroshot, minute_id: h-tk-20012002-4369-4373, len: baseline: 22, len: system: 47
Evaluating system: single-fewshot, minute_id: h-tk-20182019-64-32, len: baseline: 2, len: system: 8
Evaluating system: single-fewshot, minute_id: h-tk-20022003-3055-3080, len: baseline: 75, len: system: 48
Evaluating system: single-fewshot, minute_id: h-tk-20182019-35-8-n1, len: baseline: 48, len: system: 76
Evaluating system: single-fewshot, minute_id: h-tk-20072008-2932-2933, len: baseline: 44, len: system: 37
Evaluating system: single-fewshot, minute_id: h-tk-20012

Unnamed: 0,minute_id,system,TP,FP,FN,n_annotations,doctype_acc,reftype_acc,expl-vs-impl_acc
0,h-tk-20182019-64-32,zeroshot-single,1,12,1,13,1.0,0.0,1.0
1,h-tk-20022003-3055-3080,zeroshot-single,5,25,70,30,1.0,0.6,1.0
2,h-tk-20182019-35-8-n1,zeroshot-single,45,34,3,79,0.955556,0.022222,0.955556
3,h-tk-20072008-2932-2933,zeroshot-single,23,14,21,37,1.0,0.73913,1.0
4,h-tk-20012002-4369-4373,zeroshot-single,7,40,15,47,1.0,0.857143,1.0


In [23]:
eval_df['precision'] = eval_df['TP'] / (eval_df['TP'] + eval_df['FP'])
eval_df['recall'] = eval_df['TP'] / (eval_df['TP'] + eval_df['FN'])
eval_df['f1_score'] = 2 * (eval_df['precision'] * eval_df['recall']) / (eval_df['precision'] + eval_df['recall'])
eval_df

Unnamed: 0,minute_id,system,TP,FP,FN,n_annotations,doctype_acc,reftype_acc,expl-vs-impl_acc,precision,recall,f1_score
0,h-tk-20182019-64-32,zeroshot-single,1,12,1,13,1.0,0.0,1.0,0.076923,0.5,0.133333
1,h-tk-20022003-3055-3080,zeroshot-single,5,25,70,30,1.0,0.6,1.0,0.166667,0.066667,0.095238
2,h-tk-20182019-35-8-n1,zeroshot-single,45,34,3,79,0.955556,0.022222,0.955556,0.56962,0.9375,0.708661
3,h-tk-20072008-2932-2933,zeroshot-single,23,14,21,37,1.0,0.73913,1.0,0.621622,0.522727,0.567901
4,h-tk-20012002-4369-4373,zeroshot-single,7,40,15,47,1.0,0.857143,1.0,0.148936,0.318182,0.202899
5,h-tk-20182019-64-32,fewshot-single,1,7,1,8,1.0,0.0,1.0,0.125,0.5,0.2
6,h-tk-20022003-3055-3080,fewshot-single,9,39,66,48,1.0,0.333333,1.0,0.1875,0.12,0.146341
7,h-tk-20182019-35-8-n1,fewshot-single,45,31,3,76,0.955556,0.333333,0.955556,0.592105,0.9375,0.725806
8,h-tk-20072008-2932-2933,fewshot-single,23,14,21,37,1.0,0.73913,1.0,0.621622,0.522727,0.567901
9,h-tk-20012002-4369-4373,fewshot-single,7,36,15,43,1.0,0.857143,1.0,0.162791,0.318182,0.215385


In [25]:
# Sum the results for each system
summary_df = eval_df.groupby('system').agg({
    'TP': 'sum',
    'FP': 'sum',
    'FN': 'sum',
    'n_annotations': 'sum',
    'precision': 'mean',
    'recall': 'mean',
    'f1_score': 'mean',
    'doctype_acc': 'mean',
    'reftype_acc': 'mean',
    'expl-vs-impl_acc': 'mean'
}).reset_index()

print(summary_df.to_markdown(index=False, floatfmt=".2f"))

| system            |   TP |   FP |   FN |   n_annotations |   precision |   recall |   f1_score |   doctype_acc |   reftype_acc |   expl-vs-impl_acc |
|:------------------|-----:|-----:|-----:|----------------:|------------:|---------:|-----------:|--------------:|--------------:|-------------------:|
| fewshot-single    |   85 |  127 |  106 |             212 |        0.34 |     0.48 |       0.37 |          0.99 |          0.45 |               0.99 |
| fewshot-two-pass  |  105 |  124 |   86 |             229 |        0.40 |     0.47 |       0.53 |          0.96 |          0.41 |               0.99 |
| zeroshot-single   |   81 |  125 |  110 |             206 |        0.32 |     0.47 |       0.34 |          0.99 |          0.44 |               0.99 |
| zeroshot-two-pass |   89 |  108 |  102 |             197 |        0.37 |     0.42 |       0.48 |          0.99 |          0.64 |               0.99 |
