In [8]:
import json
import statistics
import os

def extract_metrics(data, prefix="1ShotLlama3KeyphrasesKMeans", dataset="GroundTruthV2"):
    """Collect the headline metrics of every experiment whose name matches a prefix.

    Args:
        data: Parsed evaluation results. ``data[dataset]`` must map experiment
            names to result dicts containing ``'Exact-Matching'`` and
            ``'Embedding-Based'`` entries (each with a ``'mean'`` dict) and a
            ``'Diversity'`` entry holding a ``'Diversity'`` value.
        prefix: Only experiments whose name starts with this string are kept.
        dataset: Top-level key of ``data`` under which the experiments live.

    Returns:
        A dict mapping each selected experiment name to a flat dict with the
        keys ``'<family> P'``, ``'<family> R'``, ``'<family> F1'`` for both
        matching families, plus ``'Diversity'``.

    Raises:
        KeyError: If ``dataset``, a matching family, its ``'mean'`` entry or
            the diversity entry is missing.
    """
    metrics = {}
    for experiment, exp_data in data[dataset].items():
        if not experiment.startswith(prefix):
            continue

        row = {}
        for family in ('Exact-Matching', 'Embedding-Based'):
            mean_scores = exp_data[family]['mean']
            # An individual score that is absent from 'mean' is reported as 0.
            row[f'{family} P'] = mean_scores.get('Precision', 0)
            row[f'{family} R'] = mean_scores.get('Weighted Recall', 0)
            row[f'{family} F1'] = mean_scores.get('F1 Score', 0)
        row['Diversity'] = exp_data['Diversity']['Diversity']

        metrics[experiment] = row
    return metrics

def compute_stats(metrics):
    """Print the mean and sample standard deviation of each metric across experiments.

    Args:
        metrics: Dict mapping experiment names to flat metric dicts. Each
            metric dict must contain the seven keys listed in
            ``metric_names`` below.

    Returns:
        None. Results are written to standard output.

    Raises:
        KeyError: If an experiment lacks one of the expected metric keys.
    """
    if not metrics:
        # statistics.mean() raises StatisticsError on an empty sequence, so
        # report the empty case instead of crashing.
        print("No experiments to aggregate.")
        return

    metric_names = (
        'Exact-Matching P',
        'Exact-Matching R',
        'Exact-Matching F1',
        'Embedding-Based P',
        'Embedding-Based R',
        'Embedding-Based F1',
        'Diversity',
    )
    aggregated_metrics = {
        name: [experiment_metrics[name] for experiment_metrics in metrics.values()]
        for name in metric_names
    }

    for metric, values in aggregated_metrics.items():
        mean = statistics.mean(values)
        # Sample standard deviation needs at least two points; report 0 for one.
        std_dev = statistics.stdev(values) if len(values) > 1 else 0
        print(f"{metric}:")
        print(f"  Mean: {mean:.4f}")
        print(f"  Std Dev: {std_dev:.4f}")
        print()

# Load the JSON file. The encoding is pinned to UTF-8 (the standard JSON
# encoding) so decoding does not depend on the platform's locale default.
filename = "evaluation_results.json"  # Replace with your actual filename
with open(filename, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Extract the per-experiment metrics from the JSON data
metrics = extract_metrics(data)

# Compute and print the mean / standard deviation of each metric
compute_stats(metrics)

Exact-Matching P:
  Mean: 0.2514
  Std Dev: 0.0358

Exact-Matching R:
  Mean: 0.2146
  Std Dev: 0.0398

Exact-Matching F1:
  Mean: 0.2189
  Std Dev: 0.0426

Embedding-Based P:
  Mean: 0.5772
  Std Dev: 0.0330

Embedding-Based R:
  Mean: 0.5537
  Std Dev: 0.0471

Embedding-Based F1:
  Mean: 0.5600
  Std Dev: 0.0415

Diversity:
  Mean: 0.3378
  Std Dev: 0.0247

