In [1]:
BASELINE_CSV_FILE = '../annotated-dataset/matched_pairs.csv'
DIR_WITH_RESULTS = './reconciliation-experiments'

SYSTEM = ['zeroshot', 'fewshot']
NEW_SYSTEMS = ['zeroshot-two-pass', 'fewshot-two-pass']

FEATURE_COMBOS = [
    'text',
    'text+sentence',
    'text+sentence+keywords',
    'text+sentence+summary',
    'text+sentence+keywords',
    'text+sentence+keywords-year',
    'text+sentence+keywords-doctype',
    'text+sentence+keywords-year+doctype',
    'text+sentence+keywords+summary',
    'text+sentence+keywords+summary-year',
    'text+sentence+keywords+summary-doctype',
    'text+sentence+keywords+summary-year+doctype',
]


In [2]:
# Imports
import json
import os
from nltk.tokenize import word_tokenize
import pandas as pd

In [3]:
# Load the baseline CSV file
baseline = pd.read_csv(BASELINE_CSV_FILE, encoding='utf-8')
baseline['sentence'] = baseline['sentence_a'] # we take the sentence from the first annotator
baseline['text'] = baseline['text_a'] # we take the text from the first annotator
baseline = baseline.dropna(subset=['sentence', 'text'])

# Load the results into a DataFrame
valid_candidates = pd.DataFrame(columns=['minute_id', 'system', 'reference_id', 'combo', 'id', 'similarity_score', 'title'])
valid = pd.DataFrame(columns=['minute_id', 'system', 'reference_id', 'text', 'sentence', 'document_type', 'reference_type'])

files = [file for file in os.listdir(DIR_WITH_RESULTS) if file.endswith('.json')]
print(f"Found {len(files)} files in {DIR_WITH_RESULTS} matching criteria.")
for file in files:
    file_path = file.split('_')
    minute_id = file_path[0]
    system = file_path[1].replace('.json', '').replace('-validation', '').replace('-detection', '')
    feature_combo = file_path[len(file_path)-1].replace('.json', '')
    with open(os.path.join(DIR_WITH_RESULTS, file), 'r', encoding='utf-8') as infile:
        references = json.load(infile)
        for item in references:
            # Filter dossiers
            if item.get('reference_type') in ['impl-ext-third-party', 'impl-ext-dossier', 'explicit-dossier']:
                continue

            # Ensure the item is valid as per LLM classifications
            if item.get('is_valid', False) and item.get('confidence_score') >= 75:
                valid.loc[len(valid)] = [
                    minute_id,
                    system,
                    item.get('reference_id', ''),
                    item.get('reference_text', ''),
                    item.get('sentence', ''),
                    item.get('document_type', ''),
                    item.get('reference_type', '')
                ]
                for ranked_candidates in item.get('ranked_candidates', []):
                    valid_candidates.loc[len(valid_candidates)] = [
                        minute_id,
                        system,
                        item.get('reference_id', ''),
                        feature_combo,
                        ranked_candidates.get('id'),
                        ranked_candidates.get('similarity_score'),
                        ranked_candidates.get('title')
                    ]

print(valid.shape)
display(valid.head())
print(valid_candidates.shape)
valid_candidates.head()

Found 110 files in ./reconciliation-experiments matching criteria.
(4334, 7)


Unnamed: 0,minute_id,system,reference_id,text,sentence,document_type,reference_type
0,h-tk-20182019-35-8-n1,zeroshot,2,"de initiatiefnota van de leden Ploumen, Van Ge...",Ik stel voor dinsdag 18 december aanstaande oo...,Nota,impl-ext-parl-doc
1,h-tk-20182019-35-8-n1,zeroshot,3,"32813, nr. 257",Ik deel aan de Kamer mee dat de fractie van D6...,Motie,explicit-parl-doc
2,h-tk-20182019-35-8-n1,zeroshot,4,deze motie,Ik deel aan de Kamer mee dat de fractie van D6...,Motie,impl-local
3,h-tk-20182019-35-8-n1,zeroshot,5,21501-07-1559,Ik stel voor de volgende stukken van de stand ...,,explicit-parl-doc
4,h-tk-20182019-35-8-n1,zeroshot,6,25424-430,Ik stel voor de volgende stukken van de stand ...,,explicit-parl-doc


(38124, 7)


Unnamed: 0,minute_id,system,reference_id,combo,id,similarity_score,title
0,h-tk-20182019-35-8-n1,zeroshot,2,text+sentence+keywords-year,kst-34834-9,1.811558,"Initiatiefnota van de leden Ploumen, Van Gerve..."
1,h-tk-20182019-35-8-n1,zeroshot,2,text+sentence+keywords-year,kst-34834-5,1.655683,"Initiatiefnota van de leden Dijksma, Kooiman e..."
2,h-tk-20182019-35-8-n1,zeroshot,2,text+sentence+keywords-year,kst-21501-32-1079,1.629147,Landbouw- en Visserijraad; Motie; Motie van he...
3,h-tk-20182019-35-8-n1,zeroshot,2,text+sentence+keywords-year,kst-31765-309,1.605949,Kwaliteit van zorg ; Motie; Motie van het lid ...
4,h-tk-20182019-35-8-n1,zeroshot,2,text+sentence+keywords-year,kst-34960-7,1.600116,Voorjaarsnota 2018; Motie; Motie van de leden ...


In [6]:
# See what annotations we are able to link to the baseline, and what not (calculate TP, FP, FN)
import numpy as np


def panoptic_overlap_match(text1, text2):
    """
    Returns True if the overlap between text1 ad text2 satsifes:
    |A ∩ B| > 0.5|A| and |A ∩ B| > 0.5|B|
    """
    set1 = set(word_tokenize(text1))
    set2 = set(word_tokenize(text2))
    if not set1 or not set2:
        return False
    intersection = set1 & set2
    return len(intersection) > 0.5 * len(set1) and len(intersection) > 0.5 * len(set2)

eval_df = pd.DataFrame(columns=['minute_id', 'system', 'feature_combo', 'reference_id', 'hit@1', 'hit@10', 'mrr'])

for i, system in enumerate(SYSTEM):
    for minute_id in valid['minute_id'].unique():
        # Get the baseline annotations for this minute_id
        baseline_annotations = baseline[baseline['minute_id'] == minute_id]

        system_results = valid[(valid['minute_id'] == minute_id) & (valid['system'] == system)]

        # Set the variables for evaluation
        baseline_used = set()

        for _, system_row in system_results.iterrows():
            best_match = None
            best_match_hash = None

            # Check each system result against the baseline annotations
            # If a match is found, it is a True Positive (TP)
            # If no match is found, it is a False Positive (FP)
            for _, baseline_row in baseline_annotations.iterrows():
                # See whether there is a match based on the panoptic overlap
                sentence_match = panoptic_overlap_match(system_row['sentence'], baseline_row['sentence'])
                text_match = panoptic_overlap_match(system_row['text'], baseline_row['text'])

                if sentence_match and text_match and baseline_row.to_string() not in baseline_used:
                    best_match = baseline_row
                    best_match_hash = baseline_row.to_string()
                    break

            if best_match is not None:
                # If a match is found, we loop over the feature combinations generated
                # by the system and calculate the hit@1 and hit@10 metrics
                baseline_used.add(best_match_hash)
                all_candidates_for_reference = valid_candidates[
                    (valid_candidates['minute_id'] == minute_id) &
                    (valid_candidates['system'] == system) &
                    (valid_candidates['reference_id'] == system_row['reference_id'])
                ]

                for feature_combo in FEATURE_COMBOS:
                    candidates = all_candidates_for_reference[
                        all_candidates_for_reference['combo'] == feature_combo
                    ].copy()

                    # Calculate hit@1 and hit@10
                    hit_at_1 = 0
                    hit_at_10 = 0
                    mrr = 0.0

                    # Sort the candidates by similarity score
                    candidates = candidates.sort_values(by='similarity_score', ascending=False)
                    candidates = candidates.reset_index(drop=True)

                    for idx, candidate in candidates.iterrows():
                        if ((best_match['identifier_a'] not in (None, '', 'nan', np.nan) and best_match['identifier_a'] in candidate['id']) or \
                            (best_match['identifier_b'] not in (None, '', 'nan', np.nan) and best_match['identifier_b'] in candidate['id'])):
                            if idx == 0:
                                hit_at_1 = 1
                            if idx < 10:
                                hit_at_10 = 1
                            mrr += 1 / (idx + 1)
                            break


                    eval_df.loc[len(eval_df)] = [minute_id, NEW_SYSTEMS[i], feature_combo, system_row['reference_id'], hit_at_1, hit_at_10, mrr]

eval_df.head()


Unnamed: 0,minute_id,system,feature_combo,reference_id,hit@1,hit@10,mrr
0,h-tk-20182019-35-8-n1,zeroshot-two-pass,text,3,0,0,0.0
1,h-tk-20182019-35-8-n1,zeroshot-two-pass,text+sentence,3,0,0,0.0
2,h-tk-20182019-35-8-n1,zeroshot-two-pass,text+sentence+keywords,3,0,0,0.0
3,h-tk-20182019-35-8-n1,zeroshot-two-pass,text+sentence+summary,3,0,0,0.0
4,h-tk-20182019-35-8-n1,zeroshot-two-pass,text+sentence+keywords,3,0,0,0.0


In [8]:
# Add confidence intervals
import scipy.stats as stats

# Sum the results for each system and feature combination
summary_df = eval_df.groupby(['system', 'feature_combo']).agg({
    'hit@1': 'mean',
    'hit@10': 'mean',
    'mrr': 'mean',
}).reset_index()


# In the summary_df, calculate the confidence intervals (+-) given the hit@1 and hit@10 values and the following vars:
#  n_fewshot_two_pass = 105
#  n_zeroshot_two_pass = 81
n_fewshot_two_pass = 105
n_zeroshot_two_pass = 89

def apply_confidence_intervals(row):
    if row['system'] == 'fewshot-two-pass':
        n = n_fewshot_two_pass
    elif row['system'] == 'zeroshot-two-pass':
        n = n_zeroshot_two_pass
    else:
        raise ValueError(f"Unknown system: {row['system']}")

    # Use the following formula to calculate the confidence interval:
    # CI = z * (σ / √n)
    z = stats.norm.ppf(0.975)  # 95% confidence interval
    hit1_std = row['hit@1'] * (1 - row['hit@1'])  # Approximation of standard deviation for proportion
    hit10_std = row['hit@10'] * (1 - row['hit@10'])
    # Use t-distribution for more accurate CIs
    t_value = stats.t.ppf(0.975, df=n-1)  # 95% CI

    row['hit@1'] = f"{row['hit@1']:.2f} ± {z * (hit1_std / (n ** 0.5)):.2f}"
    row['hit@10'] = f"{row['hit@10']:.2f} ± {z * (hit10_std / (n ** 0.5)):.2f}"
    return row

summary_df = summary_df.apply(apply_confidence_intervals, axis=1)




# Rename the columns for clarity
summary_df.rename(columns={'hit@1': 'Hit@1 (95% CI)', 'hit@10': 'Hit@10 (95% CI)'}, inplace=True)


# Print the summary DataFrame in markdown format
print(f"Summary of Hit@1 and Hit@10 for each system and feature combination (fewshot-two-pass N = {n_fewshot_two_pass}, zeroshot-two-pass N = {n_zeroshot_two_pass}):")
print(summary_df.to_markdown(index=False, tablefmt='pipe', floatfmt=".2f"))

Summary of Hit@1 and Hit@10 for each system and feature combination (fewshot-two-pass N = 105, zeroshot-two-pass N = 89):
| system            | feature_combo                               | Hit@1 (95% CI)   | Hit@10 (95% CI)   |   mrr |
|:------------------|:--------------------------------------------|:-----------------|:------------------|------:|
| fewshot-two-pass  | text                                        | 0.00 ± 0.00      | 0.09 ± 0.01       |  0.02 |
| fewshot-two-pass  | text+sentence                               | 0.13 ± 0.02      | 0.22 ± 0.03       |  0.16 |
| fewshot-two-pass  | text+sentence+keywords                      | 0.13 ± 0.02      | 0.26 ± 0.04       |  0.17 |
| fewshot-two-pass  | text+sentence+keywords+summary              | 0.13 ± 0.02      | 0.22 ± 0.03       |  0.16 |
| fewshot-two-pass  | text+sentence+keywords+summary-doctype      | 0.20 ± 0.03      | 0.33 ± 0.04       |  0.24 |
| fewshot-two-pass  | text+sentence+keywords+summary-year         | 0.18 

|    | system            | feature_combo                               |     hit@1 |    hit@10 |
|---:|:------------------|:--------------------------------------------|----------:|----------:|
|  0 | fewshot-two-pass  | text                                        | 0         | 0.0380952 |
|  1 | fewshot-two-pass  | text+sentence                               | 0.0857143 | 0.161905  |
|  2 | fewshot-two-pass  | text+sentence+keywords                      | 0.0761905 | 0.180952  |
|  3 | fewshot-two-pass  | text+sentence+keywords+summary              | 0.0857143 | 0.161905  |
|  4 | fewshot-two-pass  | text+sentence+keywords+summary-doctype      | 0.12381   | 0.285714  |
|  5 | fewshot-two-pass  | text+sentence+keywords+summary-year         | 0.114286  | 0.342857  |
|  6 | fewshot-two-pass  | text+sentence+keywords+summary-year+doctype | 0.190476  | 0.47619   |
|  7 | fewshot-two-pass  | text+sentence+keywords-doctype              | 0.12381   | 0.295238  |
|  8 | fewshot-two-pass  | text+sentence+keywords-year                 | 0.133333  | 0.371429  |
|  9 | fewshot-two-pass  | text+sentence+keywords-year+doctype         | 0.2       | 0.466667  |
| 10 | fewshot-two-pass  | text+sentence+summary                       | 0.0761905 | 0.190476  |
| 11 | zeroshot-two-pass | text                                        | 0.037037  | 0.111111  |
| 12 | zeroshot-two-pass | text+sentence                               | 0.17284   | 0.246914  |
| 13 | zeroshot-two-pass | text+sentence+keywords                      | 0.17284   | 0.296296  |
| 14 | zeroshot-two-pass | text+sentence+keywords+summary              | 0.17284   | 0.246914  |
| 15 | zeroshot-two-pass | text+sentence+keywords+summary-doctype      | 0.185185  | 0.432099  |
| 16 | zeroshot-two-pass | text+sentence+keywords+summary-year         | 0.246914  | 0.469136  |
| 17 | zeroshot-two-pass | text+sentence+keywords+summary-year+doctype | 0.234568  | 0.493827  |
| 18 | zeroshot-two-pass | text+sentence+keywords-doctype              | 0.197531  | 0.395062  |
| 19 | zeroshot-two-pass | text+sentence+keywords-year                 | 0.234568  | 0.469136  |
| 20 | zeroshot-two-pass | text+sentence+keywords-year+doctype         | 0.259259  | 0.506173  |
| 21 | zeroshot-two-pass | text+sentence+summary                       | 0.148148  | 0.333333  |