#### This notebook computes the name-detection metrics (TP, FP, FN, Precision, Recall, F1, F5) reported in Table 11

In [1]:
import pandas as pd

def calculate_metrics(gold_csv, pred_csv):
    """
    Computes overall TP, FP, FN, Precision, Recall, F1 score, and F5 score
    for name detection by comparing two CSV files.

    Both files must contain the columns 'file_idx' and 'entity_text'; each row
    is one annotated entity occurrence. Occurrences are matched by exact
    equality of the ('file_idx', 'entity_text') pair, with multiplicity: for
    every pair, min(gold_count, pred_count) occurrences are true positives,
    surplus predictions are false positives and surplus gold annotations are
    false negatives.

    Parameters:
    - gold_csv: Path to the gold standard CSV file.
    - pred_csv: Path to the predicted annotations CSV file.

    Returns:
    - A dictionary with overall performance metrics. The keys are 'TP', 'FP',
      'FN' (plain ints) and 'Precision', 'Recall', 'F1 Score', 'F5 Score'
      (rounded to 4 decimals). A metric whose denominator is zero is
      reported as 0.
    """
    key_cols = ['file_idx', 'entity_text']

    # Read CSV files
    gold_df = pd.read_csv(gold_csv)
    pred_df = pd.read_csv(pred_csv)

    # Count occurrences of every (file, entity) pair.
    # NOTE: groupby drops rows whose key is NaN (pandas default), so rows with
    # a missing 'file_idx' or 'entity_text' do not contribute to any count.
    gold_counts = gold_df.groupby(key_cols).size().reset_index(name='gold_count')
    pred_counts = pred_df.groupby(key_cols).size().reset_index(name='pred_count')

    # Outer merge keeps pairs that appear on only one side; their missing
    # count on the other side is 0. Only the count columns are filled.
    merged = pd.merge(gold_counts, pred_counts, on=key_cols, how='outer')
    merged['gold_count'] = merged['gold_count'].fillna(0).astype(int)
    merged['pred_count'] = merged['pred_count'].fillna(0).astype(int)

    # Vectorised per-pair TP. Unlike a row-wise apply, this also yields an
    # empty Series (not a DataFrame) when there are no rows at all.
    tp_per_pair = merged[['gold_count', 'pred_count']].min(axis=1)

    # Sum up over all pairs; convert to plain ints so that every derived
    # metric is a plain Python float rather than a numpy scalar.
    TP = int(tp_per_pair.sum())
    FP = int((merged['pred_count'] - tp_per_pair).sum())
    FN = int((merged['gold_count'] - tp_per_pair).sum())

    # Calculate precision, recall, F1 score, and F5 score
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    F1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    # F-beta with beta = 5 weights recall far more heavily than precision.
    beta_sq = 5 ** 2
    f5_denominator = beta_sq * precision + recall
    F5 = (1 + beta_sq) * (precision * recall) / f5_denominator if f5_denominator > 0 else 0.0

    # Prepare results
    results = {
        'TP': TP,
        'FP': FP,
        'FN': FN,
        'Precision': round(precision, 4),
        'Recall': round(recall, 4),
        'F1 Score': round(F1, 4),
        'F5 Score': round(F5, 4)
    }

    return results


In [2]:
# Gold-standard labels that every model output is scored against
true_file_path = 'data/new_labels_test.csv'

# Display name of each evaluated system -> CSV file holding its predictions
models_to_evaluate = dict([
    ("1. Presidio (en_core_web_trf)", "output/1_presidio.csv"),
    ("2. Azure AI Language", "output/2_azure.csv"),
    ("3. GPT-4o-mini + Few-shot Prompting (3 shots)", "output/3_gpt+fewshot.csv"),
    ("4. Fine-tuned GPT-4o-mini + Zero-shot Prompting", "output/4_ftgpt+zeroshot.csv"),
    ("5. GPT-4o-mini + Fine-tuning (on 10 TSCC transcripts)", "output/5_gpt+ft.csv"),
    ("6. Fine-tuned GPT-4o-mini + Fine-tuning (on 10 TSCC transcripts)", "output/6_ftgpt+ft.csv"),
])

# Score each system in turn: print its name, its metrics dictionary,
# and a 120-character separator surrounded by blank lines.
for model_name in models_to_evaluate:
    detected_file_path = models_to_evaluate[model_name]
    print(f"{model_name}:")
    metrics = calculate_metrics(true_file_path, detected_file_path)
    print(metrics, "\n" + "="*120 + "\n", sep="\n")

1. Presidio (en_core_web_trf):
{'TP': 1513, 'FP': 2694, 'FN': 102, 'Precision': 0.3596, 'Recall': 0.9368, 'F1 Score': 0.5198, 'F5 Score': 0.8824}


2. Azure AI Language:
{'TP': 1320, 'FP': 1316, 'FN': 295, 'Precision': 0.5008, 'Recall': 0.8173, 'F1 Score': 0.621, 'F5 Score': 0.7979}


3. GPT-4o-mini + Few-shot Prompting (3 shots):
{'TP': 1604, 'FP': 641, 'FN': 11, 'Precision': 0.7145, 'Recall': 0.9932, 'F1 Score': 0.8311, 'F5 Score': 0.9785}


4. Fine-tuned GPT-4o-mini + Zero-shot Prompting:
{'TP': 1273, 'FP': 2, 'FN': 342, 'Precision': 0.9984, 'Recall': 0.7882, 'F1 Score': 0.881, 'F5 Score': 0.7947}


5. GPT-4o-mini + Fine-tuning (on 10 TSCC transcripts):
{'TP': 1561, 'FP': 26, 'FN': 54, 'Precision': 0.9836, 'Recall': 0.9666, 'F1 Score': 0.975, 'F5 Score': 0.9672}


6. Fine-tuned GPT-4o-mini + Fine-tuning (on 10 TSCC transcripts):
{'TP': 1598, 'FP': 48, 'FN': 17, 'Precision': 0.9708, 'Recall': 0.9895, 'F1 Score': 0.9801, 'F5 Score': 0.9887}


