In [None]:
# Google Colab Notebook: LLM Evaluation for Hierarchical Medical Ontology
# Upload your files: test.jsonl, entity_lexicon.json, answers.txt to the same folder

# Import required packages

import json
import random
import os
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from typing import List, Dict, Tuple
from google.colab import files

def upload_files():
    """Prompt for the three input files and open the Colab upload dialog.

    Returns:
        Whatever ``files.upload()`` returns for the files the user picked.
    """
    prompt_lines = (
        "Please upload the following files:",
        "1. test.jsonl",
        "2. entity_lexicon.json",
        "3. answers.txt",
        "",  # trailing blank line before the upload widget
    )
    for text in prompt_lines:
        print(text)
    return files.upload()

def load_answers_from_file(file_path: str) -> List[int]:
    """Parse Yes/No answers from a text file, one answer per line.

    A line counts as an answer when it ends with "Yes" or "No". Matching is
    also tolerant of letter case and of trailing punctuation/markup, so
    "3. yes", "4. No." and "5. **No**" are all recognised. Words that merely
    end in those letters (e.g. "know") are not treated as answers.

    Args:
        file_path: Path to the answers file.

    Returns:
        A list with 1 for every "Yes" line and 0 for every "No" line, in file
        order. Lines without a recognisable answer are skipped, so the list
        can be shorter than the number of lines in the file.
    """
    import re

    # Fallback matcher: a standalone yes/no (not preceded by a letter or digit)
    # followed only by punctuation/whitespace up to the end of the line.
    answer_re = re.compile(r'(?<![A-Za-z0-9])(yes|no)[\W_]*$', re.IGNORECASE)

    answers = []
    # errors='replace': only the trailing yes/no matters, so a stray undecodable
    # byte elsewhere in the line should not abort the whole load.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            line = line.strip()
            # Exact suffix checks first, so every line that used to be
            # accepted is still accepted with the same label.
            if line.endswith('Yes'):
                answers.append(1)
            elif line.endswith('No'):
                answers.append(0)
            else:
                match = answer_re.search(line)
                if match:
                    answers.append(1 if match.group(1).lower() == 'yes' else 0)
    return answers

def generate_ground_truth(entity_lexicon_path: str, test_jsonl_path: str, max_questions: int = 500) -> Tuple[List[str], List[int]]:
    """Rebuild the shuffled question list and its ground-truth labels.

    For every test example whose child and parent are both in the lexicon,
    one positive question (child -> parent, label 1) is produced, followed by
    one negative question (label 0) for each of the first 10 entries of
    ``random_negatives`` that is present in the lexicon. The resulting pairs
    are shuffled with a fixed seed of 42 and optionally truncated.

    Args:
        entity_lexicon_path: JSON file mapping entity id -> dict with a "name" key.
        test_jsonl_path: JSON-lines file; each record has "child" and "parent"
            ids and may have a "random_negatives" list of ids.
        max_questions: Keep at most this many pairs after shuffling. A falsy
            value (0 or None) disables the limit.

    Returns:
        ``(questions, labels)`` as two lists of equal length. Both are empty
        when no example could be turned into a question.
    """
    # Load data
    with open(entity_lexicon_path, 'r', encoding='utf-8') as f:
        entity_lexicon = json.load(f)

    test_examples = []
    with open(test_jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if line.strip():
                test_examples.append(json.loads(line))

    pairs = []  # (question, label) tuples in generation order

    for example in test_examples:
        child_id = example["child"]
        parent_id = example["parent"]

        # Skip if entities not in lexicon
        if child_id not in entity_lexicon or parent_id not in entity_lexicon:
            continue

        child_name = entity_lexicon[child_id]["name"]
        parent_name = entity_lexicon[parent_id]["name"]

        # Positive example (child -> parent)
        pairs.append((f'Is "{child_name}" a subtype/subclass of "{parent_name}"?', 1))

        # Negative examples: at most the first 10 random negatives
        for neg_id in example.get("random_negatives", [])[:10]:
            if neg_id in entity_lexicon:
                neg_name = entity_lexicon[neg_id]["name"]
                pairs.append((f'Is "{child_name}" a subtype/subclass of "{neg_name}"?', 0))

    # Shuffle to avoid patterns. A private Random(42) yields the same order as
    # random.seed(42) + random.shuffle() without resetting the global RNG.
    random.Random(42).shuffle(pairs)

    # Limit questions if specified
    if max_questions and len(pairs) > max_questions:
        pairs = pairs[:max_questions]

    # Built with comprehensions rather than zip(*pairs), which cannot be
    # unpacked into two names when pairs is empty.
    questions = [question for question, _ in pairs]
    labels = [label for _, label in pairs]
    return questions, labels

def evaluate_llm_answers():
    """Score the LLM's Yes/No answers against the regenerated ground truth.

    Steps:
      1. Make sure test.jsonl, entity_lexicon.json and answers.txt exist in
         the current directory, prompting for an upload if any is missing.
      2. Load predictions from answers.txt and regenerate up to 500 shuffled
         questions with their labels.
      3. Compare predictions and labels position by position over the
         overlapping prefix and print metrics, sample errors and a
         confusion matrix.

    Returns:
        dict with keys ``total_questions`` (questions generated),
        ``answered_questions`` (questions that have a matching answer),
        ``f1_score``, ``precision``, ``recall``, ``accuracy``,
        ``positive_examples``, ``negative_examples``, ``predicted_positive``
        and ``predicted_negative``. The metric and distribution values are
        computed over the compared pairs only.

    Raises:
        FileNotFoundError: a required file is still missing after the upload step.
        ValueError: no predictions were loaded, or no ground truth was generated.
    """

    # Check if files exist, if not upload them
    required_files = ["test.jsonl", "entity_lexicon.json", "answers.txt"]
    missing_files = [f for f in required_files if not os.path.exists(f)]

    if missing_files:
        print(f"Missing files: {missing_files}")
        print("Please upload the required files:")
        upload_files()

    # File paths (in current directory)
    entity_lexicon_path = "entity_lexicon.json"
    test_jsonl_path = "test.jsonl"
    answers_file = "answers.txt"

    # Verify files exist (the upload may have been cancelled or incomplete)
    for file_path in (entity_lexicon_path, test_jsonl_path, answers_file):
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}. Please upload it first.")

    # Load your answers
    print("Loading LLM answers...")
    llm_predictions = load_answers_from_file(answers_file)
    print(f"Loaded {len(llm_predictions)} answers")

    # Generate ground truth (same order as original questions)
    print("Generating ground truth...")
    questions, ground_truth = generate_ground_truth(entity_lexicon_path, test_jsonl_path, max_questions=500)
    print(f"Generated {len(ground_truth)} ground truth labels")

    # Only the overlapping prefix can be compared pairwise.
    min_len = min(len(llm_predictions), len(ground_truth))
    predictions = llm_predictions[:min_len]
    truth = ground_truth[:min_len]

    if len(llm_predictions) != len(ground_truth):
        # Surface the mismatch instead of hiding it behind the truncation.
        print(f"WARNING: loaded {len(llm_predictions)} answers for {len(ground_truth)} questions; "
              f"only the first {min_len} pairs are compared.")

    print(f"Evaluating on {min_len} questions...")

    if not llm_predictions:
        raise ValueError("No valid predictions found. Check your answers.txt format.")
    if not ground_truth:
        raise ValueError("No ground truth questions were generated. Check test.jsonl and entity_lexicon.json.")

    # Calculate metrics. total/answered are taken before truncation-induced
    # equality so the answer rate reflects how many questions got an answer.
    metrics = {
        "total_questions": len(ground_truth),
        "answered_questions": min_len,
        # zero_division=0 on all three so a degenerate split (no positive
        # labels or no positive predictions) yields 0 without a warning.
        "f1_score": f1_score(truth, predictions, zero_division=0),
        "precision": precision_score(truth, predictions, zero_division=0),
        "recall": recall_score(truth, predictions, zero_division=0),
        "accuracy": accuracy_score(truth, predictions),
    }

    # Additional analysis (over the compared pairs)
    positive_examples = sum(truth)
    negative_examples = min_len - positive_examples
    predicted_positive = sum(predictions)

    metrics.update({
        "positive_examples": positive_examples,
        "negative_examples": negative_examples,
        "predicted_positive": predicted_positive,
        "predicted_negative": min_len - predicted_positive,
    })

    # Print results
    print("="*60)
    print("LLM EVALUATION RESULTS - DOID-MIXED DATASET")
    print("="*60)
    print(f"Total Questions: {metrics['total_questions']}")
    print(f"Answered Questions: {metrics['answered_questions']}")
    print(f"Answer Rate: {metrics['answered_questions']/metrics['total_questions']*100:.1f}%")
    print()
    print("PERFORMANCE METRICS:")
    print(f"F1 Score:    {metrics['f1_score']:.4f}")
    print(f"Precision:   {metrics['precision']:.4f}")
    print(f"Recall:      {metrics['recall']:.4f}")
    print(f"Accuracy:    {metrics['accuracy']:.4f}")
    print()
    print("DATA DISTRIBUTION:")
    # Percentages are relative to the compared pairs, not to all questions.
    print(f"Positive Examples: {metrics['positive_examples']} ({metrics['positive_examples']/min_len*100:.1f}%)")
    print(f"Negative Examples: {metrics['negative_examples']} ({metrics['negative_examples']/min_len*100:.1f}%)")
    print()
    print("PREDICTION DISTRIBUTION:")
    print(f"Predicted Positive: {metrics['predicted_positive']}")
    print(f"Predicted Negative: {metrics['predicted_negative']}")
    print("="*60)

    # Show some example misclassifications
    print("\nSAMPLE ANALYSIS:")
    print("-" * 40)

    # Collect the first five misclassified questions, then stop scanning.
    misclassified = []
    for question, pred, true in zip(questions, predictions, truth):
        if pred != true:
            misclassified.append((question, true, pred))
            if len(misclassified) == 5:
                break

    if misclassified:
        print("Sample Misclassified Examples:")
        for question, true_label, pred_label in misclassified:
            true_str = "Yes" if true_label == 1 else "No"
            pred_str = "Yes" if pred_label == 1 else "No"
            print(f"Q: {question}")
            print(f"   Ground Truth: {true_str}, Predicted: {pred_str}")
            print()

    # Confusion matrix in a single pass over the compared pairs
    tp = fp = tn = fn = 0
    for p, t in zip(predictions, truth):
        if p == 1 and t == 1:
            tp += 1
        elif p == 1 and t == 0:
            fp += 1
        elif p == 0 and t == 0:
            tn += 1
        elif p == 0 and t == 1:
            fn += 1

    print("CONFUSION MATRIX:")
    print(f"True Positives (TP):  {tp}")
    print(f"False Positives (FP): {fp}")
    print(f"True Negatives (TN):  {tn}")
    print(f"False Negatives (FN): {fn}")
    print("="*60)

    return metrics

def show_data_info():
    """Print a short summary of the lexicon and test set in the current directory.

    Does nothing except print a hint when any of test.jsonl,
    entity_lexicon.json or answers.txt is missing. Otherwise prints the
    entity and example counts, the names of the first five lexicon entries,
    and details of the first test example (if there is one).
    """
    for needed_file in ("test.jsonl", "entity_lexicon.json", "answers.txt"):
        if not os.path.exists(needed_file):
            print("Please run evaluate_llm_answers() first to load the data.")
            return

    # Read the lexicon (id -> attributes) and every JSONL record.
    with open("entity_lexicon.json", 'r') as lexicon_file:
        lexicon = json.load(lexicon_file)

    with open("test.jsonl", 'r') as test_file:
        records = [json.loads(raw_line) for raw_line in test_file]

    print("DATASET INFORMATION:")
    print("="*50)
    print(f"Total entities in lexicon: {len(lexicon)}")
    print(f"Total test examples: {len(records)}")

    # Names of the first five lexicon entries, in file order
    print("\nSample entities:")
    for _, info in list(lexicon.items())[:5]:
        print(f"  {info.get('name', 'Unknown')}")

    # Details of the first test example, when one exists
    if not records:
        return

    first = records[0]
    child_label = lexicon.get(first['child'], {}).get('name', 'Unknown')
    parent_label = lexicon.get(first['parent'], {}).get('name', 'Unknown')
    random_neg_count = len(first.get('random_negatives', []))
    hard_neg_count = len(first.get('hard_negatives', []))
    print("\nSample relationship:")
    print(f"  Child: {child_label}")
    print(f"  Parent: {parent_label}")
    print(f"  Random negatives: {random_neg_count}")
    print(f"  Hard negatives: {hard_neg_count}")

# Main execution: print a usage banner when the cell is run.
# The microscope emoji is written as a \U escape so the source stays ASCII
# and cannot be mangled by a wrong file encoding.
print("\U0001F52C LLM Medical Ontology Evaluation Tool")
print("=" * 50)
print("This notebook evaluates LLM performance on hierarchical medical ontology relationships.")
print("\nTo use:")
print("1. Upload your files: test.jsonl, entity_lexicon.json, answers.txt")
print("2. Run evaluate_llm_answers() to get evaluation results")
print("3. Run show_data_info() to see dataset information")
print("\nReady to start! Run the functions below:")

# Uncomment the lines below to run automatically:
# evaluate_llm_answers()
# show_data_info()

🔬 LLM Medical Ontology Evaluation Tool
This notebook evaluates LLM performance on hierarchical medical ontology relationships.

To use:
1. Upload your files: test.jsonl, entity_lexicon.json, answers.txt
2. Run evaluate_llm_answers() to get evaluation results
3. Run show_data_info() to see dataset information

Ready to start! Run the functions below:


In [None]:
# Run the evaluation (prints metrics; the returned metrics dict is discarded),
# then print a summary of the dataset files.
evaluate_llm_answers()
show_data_info()

Loading LLM answers...
Loaded 500 answers
Generating ground truth...
Generated 500 ground truth labels
Evaluating on 500 questions...
LLM EVALUATION RESULTS - DOID-MIXED DATASET
Total Questions: 500
Answered Questions: 500
Answer Rate: 100.0%

PERFORMANCE METRICS:
F1 Score:    0.7500
Precision:   0.6964
Recall:      0.8125
Accuracy:    0.9480

DATA DISTRIBUTION:
Positive Examples: 48 (9.6%)
Negative Examples: 452 (90.4%)

PREDICTION DISTRIBUTION:
Predicted Positive: 56
Predicted Negative: 444

SAMPLE ANALYSIS:
----------------------------------------
Sample Misclassified Examples:
Q: Is "shopping" a subtype/subclass of "score"?
   Ground Truth: No, Predicted: Yes

Q: Is "mat" a subtype/subclass of "branchiostegidae"?
   Ground Truth: No, Predicted: Yes

Q: Is "horsemeat" a subtype/subclass of "solid"?
   Ground Truth: Yes, Predicted: No

Q: Is "corer" a subtype/subclass of "war party"?
   Ground Truth: No, Predicted: Yes

Q: Is "shagginess" a subtype/subclass of "pledge"?
   Ground Tru