# SOAP Note Evaluation Analysis

This notebook loads and analyzes the evaluation results from the SOAP note evaluation suite.


In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import sys

# Put the parent of the current working directory on sys.path so project
# modules (e.g. `src`) can be imported if needed. The membership check keeps
# the cell idempotent: re-running it does not pile up duplicate entries.
project_root = str(Path().resolve().parent)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Locations of the evaluation outputs, relative to the notebook's directory.
results_dir = Path("../results")
per_note_path = results_dir / "per_note.jsonl"
summary_path = results_dir / "summary.json"

# Load per-note results (JSON Lines: one JSON object per line).
# Whitespace-only lines are skipped, because json.loads() raises on them and
# a stray blank line at the end of a .jsonl file is common.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open(per_note_path, "r", encoding="utf-8") as f:
    results = [json.loads(line) for line in f if line.strip()]

# Load the aggregate summary (a single JSON document).
with open(summary_path, "r", encoding="utf-8") as f:
    summary = json.load(f)

print(f"Loaded {len(results)} evaluation results")
print(f"\nSummary:\n{json.dumps(summary, indent=2)}")


## Histogram of Overall Quality Scores


In [None]:
# Overall-quality score of every evaluated note, in file order.
overall_scores = [r["scores"]["overall_quality"] for r in results]

# Build the Series once and reuse it for the plot and every statistic.
# An explicit float dtype turns missing scores (None) into NaN and keeps an
# empty result set well-defined, so the summary below prints "nan" instead of
# raising the way the built-in min()/max() would.
overall_series = pd.Series(overall_scores, dtype="float64")

fig, ax = plt.subplots(figsize=(10, 6))
# NaN values are dropped first: a histogram range cannot be derived from NaN.
ax.hist(overall_series.dropna(), bins=20, edgecolor="black", alpha=0.7)
ax.set_xlabel("Overall Quality Score")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Overall Quality Scores")
ax.grid(True, alpha=0.3)
plt.show()

# Summary statistics; pandas reductions skip NaN by default.
print(f"Mean: {overall_series.mean():.3f}")
print(f"Std: {overall_series.std():.3f}")
print(f"Min: {overall_series.min():.3f}")
print(f"Max: {overall_series.max():.3f}")


## Coverage vs Faithfulness Scatter Plot


In [None]:
# Pull each note's "scores" mapping once, then read the two metrics from it.
note_scores = [r["scores"] for r in results]
coverage_scores = [s["coverage"] for s in note_scores]
faithfulness_scores = [s["faithfulness"] for s in note_scores]

# One point per note; both axes are pinned to the [0, 1] range.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(coverage_scores, faithfulness_scores, alpha=0.6)
ax.set(
    xlabel="Coverage Score",
    ylabel="Faithfulness Score",
    title="Coverage vs Faithfulness",
    xlim=(0, 1),
    ylim=(0, 1),
)
ax.grid(True, alpha=0.3)
plt.show()

# Correlation between the two metrics (pandas' default method for Series.corr).
coverage_series = pd.Series(coverage_scores)
faithfulness_series = pd.Series(faithfulness_scores)
correlation = coverage_series.corr(faithfulness_series)
print(f"Correlation: {correlation:.3f}")


## Worst 5 Notes by Overall Quality


In [None]:
# Maximum number of characters of a span to print before cutting it off.
SPAN_PREVIEW_CHARS = 100


def _preview(span, limit=SPAN_PREVIEW_CHARS):
    """Return `span` cut to at most `limit` items for display.

    The "..." suffix is appended only when something was actually cut off, so
    a short span is not shown as if it had been truncated.
    """
    snippet = span[:limit]
    return f"{snippet}..." if len(span) > limit else f"{snippet}"


# Rank notes from lowest to highest overall quality and keep the bottom five
# (fewer if there are fewer than five results).
sorted_results = sorted(results, key=lambda x: x["scores"]["overall_quality"])
worst_5 = sorted_results[:5]

print("=" * 80)
for i, result in enumerate(worst_5, 1):
    scores = result["scores"]
    issues = result["issues"]

    print(f"\n{i}. Example ID: {result['example_id']}")
    print(f"   Overall Quality: {scores['overall_quality']:.3f}")
    print(f"   Coverage: {scores['coverage']:.3f}")
    print(f"   Faithfulness: {scores['faithfulness']:.3f}")
    print(f"   Accuracy: {scores['accuracy']:.3f}")
    print(f"   Number of Issues: {len(issues)}")

    if issues:
        print("   Issues:")
        for issue in issues:
            print(f"     - [{issue['severity'].upper()}] {issue['category']}: {issue['description']}")
            # The spans are optional; they are printed only when present and non-empty.
            if issue.get("span_model"):
                print(f"       Model span: {_preview(issue['span_model'])}")
            if issue.get("span_source"):
                print(f"       Source span: {_preview(issue['span_source'])}")
    print("-" * 80)


## Issue Category Distribution


In [None]:
# Tally issues per category. The three listed categories start at zero so they
# always get a bar; any other category found in the data is added on first
# sight.
issue_counts = dict.fromkeys(
    ("missing_critical", "hallucination", "clinical_inaccuracy"), 0
)
for result in results:
    for issue in result["issues"]:
        category = issue["category"]
        issue_counts.setdefault(category, 0)
        issue_counts[category] += 1

# Bar heights follow the dict's insertion order.
categories = list(issue_counts)
counts = list(issue_counts.values())

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(categories, counts, color=["#ff6b6b", "#4ecdc4", "#ffe66d"], edgecolor="black")
ax.set(
    xlabel="Issue Category",
    ylabel="Count",
    title="Distribution of Issues by Category",
)
# Slant the category labels so long names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.grid(True, alpha=0.3, axis="y")
plt.show()

# Plain-text version of the same counts.
for category, count in issue_counts.items():
    print(f"{category}: {count}")
