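#### This notebook reproduces the results reported in Table 7: precision, recall, F1, and F5 per entity type and overall for each evaluated model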

In [1]:
from collections import Counter

import pandas as pd

def _count_exact_matches(gold_entities, pred_entities):
    """
    Counts one-to-one exact matches between gold and predicted entities.

    Two entities match when they share the same `file_idx` and the same
    `entity_text`. Each gold entity can be consumed by at most one prediction,
    so for every (file_idx, entity_text) pair the number of matches is the
    smaller of its gold count and its predicted count.

    Parameters:
    - gold_entities: DataFrame of gold entities of a single type.
    - pred_entities: DataFrame of predicted entities of the same type.

    Returns:
    - The number of matched (true positive) entities as an int.
    """
    key_columns = ['file_idx', 'entity_text']

    def key_counts(entities):
        # A missing file_idx or entity_text can never equal anything (NaN != NaN),
        # so such rows are excluded from matching. They still count as FP/FN
        # because the caller derives those from the unfiltered row counts.
        usable = entities.dropna(subset=key_columns)
        return Counter(zip(usable['file_idx'], usable['entity_text']))

    # Counter intersection keeps min(gold_count, pred_count) for every key
    shared = key_counts(gold_entities) & key_counts(pred_entities)
    return sum(shared.values())


def calculate_metrics(gold_csv, pred_csv):
    """
    Computes precision, recall, F1 score, and F5 score for each type and overall between two CSV files.

    Both CSV files must provide the columns `file_idx`, `entity_text` and `type`.
    A prediction is a true positive when a not-yet-matched gold entity exists with
    the same `file_idx`, the same `type` and exactly the same `entity_text`.

    Predicted types are first translated to the gold label set. The Azure mapping
    is used when the prediction path contains "azure" (case-insensitive); otherwise
    the Presidio mapping is used. Types that are not in the chosen mapping are left
    unchanged.

    Parameters:
    - gold_csv: Path to the gold standard CSV file.
    - pred_csv: Path (str or os.PathLike) to the predicted annotations CSV file.

    Returns:
    - A pandas DataFrame with TP, FP, FN, precision, recall, F1 score, and F5 score
      for each type and overall. Scores are rounded to 4 decimals; a score whose
      denominator is zero is reported as 0.
    """
    # Read CSV files
    gold_df = pd.read_csv(gold_csv)
    pred_df = pd.read_csv(pred_csv)

    # Mapping of Presidio-detected entity types to true entity types
    entity_type_mapping_pre = {
        "PERSON": "NAME_STUDENT",
        "EMAIL_ADDRESS": "EMAIL",
        "URL": "URL_PERSONAL",
        "PHONE_NUMBER": "PHONE_NUM"
    }

    # Mapping of Azure-detected entity types to true entity types
    entity_type_mapping_az = {
        "Person": "NAME_STUDENT",
        "Email": "EMAIL",
        "URL": "URL_PERSONAL",
        "PhoneNumber": "PHONE_NUM"
    }

    # Pick the mapping from the prediction path. str() makes pathlib.Path inputs
    # work, and lower() keeps the check independent of filename capitalization.
    if 'azure' in str(pred_csv).lower():
        type_mapping = entity_type_mapping_az
    else:
        type_mapping = entity_type_mapping_pre

    # Standardize the predicted types; unmapped types are kept as they are
    pred_df['type'] = pred_df['type'].map(type_mapping).fillna(pred_df['type'])

    # Define the types
    types = ['NAME_STUDENT', 'URL_PERSONAL', 'EMAIL', 'PHONE_NUM']

    # Compute counts for each type
    counts = {}
    for entity_type in types:
        gold_entities = gold_df[gold_df['type'] == entity_type]
        pred_entities = pred_df[pred_df['type'] == entity_type]

        TP = _count_exact_matches(gold_entities, pred_entities)
        counts[entity_type] = {
            'TP': TP,
            'FP': len(pred_entities) - TP,  # predictions with no gold partner
            'FN': len(gold_entities) - TP   # gold entities nobody predicted
        }

    # Overall counts are the sums over the individual types (micro-average)
    counts['Overall'] = {
        metric: sum(counts[entity_type][metric] for entity_type in types)
        for metric in ['TP', 'FP', 'FN']
    }

    # Prepare results
    results = []
    for entity_type in types + ['Overall']:
        TP = counts[entity_type]['TP']
        FP = counts[entity_type]['FP']
        FN = counts[entity_type]['FN']

        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        F1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        # F-beta with beta = 5, which weights recall far more heavily than precision
        F5 = (1 + 5**2) * (precision * recall) / ((5**2 * precision) + recall) if ((5**2 * precision) + recall) > 0 else 0

        results.append({
            'Entity Type': entity_type,
            'TP': TP,
            'FP': FP,
            'FN': FN,
            'Precision': round(precision, 4),
            'Recall': round(recall, 4),
            'F1 Score': round(F1, 4),
            'F5 Score': round(F5, 4)
        })

    # Create DataFrame
    results_df = pd.DataFrame(results)

    return results_df


In [2]:
# Gold-standard annotations that every system is scored against
true_file_path = 'data/test_set.csv'

# Prediction file of each system, keyed by the label printed above its table.
# Insertion order is the order in which the systems are reported.
models_to_evaluate = dict([
    ("1. Presidio (en_core_web_lg)", "output/1_presidio_lg.csv"),
    ("2. Presidio (en_core_web_trf)", "output/2_presidio_trf.csv"),
    ("3. Azure AI Language", "output/3_azure.csv"),
    ("4. Prompting GPT-4o-mini", "output/4_prompting.csv"),
    ("5. Fine-tuned GPT-4o-mini", "output/5_finetuned.csv"),
    ("6. Verifier Model I (Without CoT)", "output/6_verifier_I.csv"),
    ("7. Verifier Model II (With CoT)", "output/7_verifier_II.csv"),
])

# Visual divider printed after each system's table
section_break = "\n" + "="*80 + "\n"

# Score every system against the gold file and print its metrics table
for model_name in models_to_evaluate:
    detected_file_path = models_to_evaluate[model_name]
    print(f"{model_name}:")
    metrics_df = calculate_metrics(true_file_path, detected_file_path)
    print(metrics_df)
    print(section_break)

1. Presidio (en_core_web_lg):
    Entity Type    TP     FP   FN  Precision  Recall  F1 Score  F5 Score
0  NAME_STUDENT  1805   9294  805     0.1626  0.6916    0.2633    0.6147
1  URL_PERSONAL   181   2256   31     0.0743  0.8538    0.1367    0.6082
2         EMAIL    61     10    1     0.8592  0.9839    0.9173    0.9784
3     PHONE_NUM     8     37    1     0.1778  0.8889    0.2963    0.7704
4       Overall  2055  11597  838     0.1505  0.7103    0.2484    0.6214


2. Presidio (en_core_web_trf):
    Entity Type    TP    FP   FN  Precision  Recall  F1 Score  F5 Score
0  NAME_STUDENT  2172  6849  438     0.2408  0.8322    0.3735    0.7604
1  URL_PERSONAL   180  2257   32     0.0739  0.8491    0.1359    0.6049
2         EMAIL    61    10    1     0.8592  0.9839    0.9173    0.9784
3     PHONE_NUM     8    37    1     0.1778  0.8889    0.2963    0.7704
4       Overall  2421  9153  472     0.2092  0.8368    0.3347    0.7503


3. Azure AI Language:
    Entity Type    TP    FP   FN  Precision