In [9]:
import os
from typing import Any, Mapping
import asyncio
from dataclasses import dataclass
from pathlib import Path
import subprocess
import shutil

from aviary.env import TaskDataset
from ldp.agent import SimpleAgent
from ldp.alg.callbacks import MeanMetricsCallback
from ldp.alg.runners import Evaluator, EvaluatorConfig
from paperqa import Settings
from paperqa.agents.task import TASK_DATASET_NAME, LitQAv2TaskSplit
from paperqa.settings import AgentSettings, IndexSettings
from paperqa.litqa import (
    read_litqa_v2_from_hub,
    DEFAULT_LABBENCH_HF_HUB_NAME,
    DEFAULT_AVIARY_PAPER_HF_HUB_NAME,
)
import pandas as pd

In [18]:
from datasets import load_dataset


# Load train/eval dataset
train_eval_data = load_dataset("futurehouse/lab-bench", "LitQA2")["train"]

# Look at first question from test set
print("Sample question:")
print(f"Question: {train_eval_data[0]['question']}")
print(f"Correct Answer: {train_eval_data[0]['ideal']}")
print(f"Distractors: {train_eval_data[0]['distractors']}")
print(f"Sources: {train_eval_data[0]['sources']}")

Generating train split: 100%|██████████| 199/199 [00:00<00:00, 3764.44 examples/s]

Sample question:
Question: Acinetobacter lwoffii has been evolved in the lab to be resistant to which of these antibiotics?
Correct Answer: ciproflaxin
Distractors: ['meropenem', 'gentamicin', 'ampicillin']
Sources: ['https://doi.org/10.1128/msphere.00109-24']





In [10]:
@dataclass
class ModelConfig:
    name: str
    llm_model: str  # For generate answer
    summary_llm_model: str | None = None  # For RCS, None means no RCS
    search_count: int = 12
    top_k: int = 30
    agent_evidence_n: int = 15

In [11]:
async def setup_evaluation_resources(
    paper_directory: Path,
    cache_dir: Path | None = None,
) -> tuple[Path, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Setup resources needed for evaluation.
    
    Args:
        paper_directory: Directory where papers will be stored
        cache_dir: Optional cache directory for dataset downloads
    
    Returns:
        Tuple of (paper directory path, train DataFrame, eval DataFrame, test DataFrame)
    """
    # Create directories if they don't exist
    paper_directory.mkdir(parents=True, exist_ok=True)
    if cache_dir:
        cache_dir.mkdir(parents=True, exist_ok=True)

    # Load datasets
    print("Loading LitQA2 datasets...")
    train_df, eval_df, test_df = read_litqa_v2_from_hub(
        train_eval_dataset=DEFAULT_LABBENCH_HF_HUB_NAME,
        test_dataset=DEFAULT_AVIARY_PAPER_HF_HUB_NAME,
        cache_dir=cache_dir
    )
    
    # Download papers if not already present
    print("Checking/downloading required papers...")
    
    # Get all unique DOIs from datasets
    all_sources = set()
    for df in [train_df, eval_df, test_df]:
        for sources in df['sources']:
            all_sources.update(sources)
    
    # Here you would implement paper downloading logic
    # This is a placeholder - you'll need to implement actual paper downloading
    # based on your institution's access and legal requirements
    for source in all_sources:
        target_file = paper_directory / f"{source}.pdf"
        if not target_file.exists():
            print(f"Need to acquire paper: {source}")
    
    return paper_directory, train_df, eval_df, test_df


In [12]:
async def evaluate_model(
    config: ModelConfig,
    paper_directory: str | os.PathLike,
    split: str = LitQAv2TaskSplit.EVAL,
) -> dict[str, float]:
    """Run evaluation for a specific model configuration."""
    
    # Configure settings
    agent_settings = AgentSettings(
        search_count=config.search_count,
        top_k=config.top_k,
        agent_evidence_n=config.agent_evidence_n,
        index=IndexSettings(paper_directory=paper_directory),
    )
    
    settings = Settings(
        agent=agent_settings,
        llm_model=config.llm_model,
        summary_llm_model=config.summary_llm_model if config.summary_llm_model else config.llm_model,
    )

    # Create dataset and evaluation setup
    dataset = TaskDataset.from_name(
        TASK_DATASET_NAME,
        settings=settings,
        split=split,
    )
    metrics_callback = MeanMetricsCallback(eval_dataset=dataset)
    
    evaluator = Evaluator(
        config=EvaluatorConfig(batch_size=3),
        agent=SimpleAgent(),
        dataset=dataset,
        callbacks=[metrics_callback],
    )
    
    # Run evaluation
    await evaluator.evaluate()
    return metrics_callback.eval_means


In [13]:
async def run_evaluations(
    paper_directory: str | os.PathLike,
    cache_dir: Path | None = None
) -> None:
    """Run evaluations for different model configurations."""
    
    # Setup resources first
    paper_dir, train_df, eval_df, test_df = await setup_evaluation_resources(
        Path(paper_directory),
        cache_dir
    )
    
    print(f"Loaded dataset splits:")
    print(f"Train: {len(train_df)} questions")
    print(f"Eval: {len(eval_df)} questions")
    print(f"Test: {len(test_df)} questions")
    
    configs = [
        # Base models without RCS
        ModelConfig(
            name="No RCS",
            llm_model="gpt-4-0125-preview",
            summary_llm_model=None,
        ),
        
        # Different models with RCS
        ModelConfig(
            name="GPT-4 Turbo",
            llm_model="gpt-4-0125-preview",
            summary_llm_model="gpt-4-0125-preview",
        ),
        ModelConfig(
            name="Claude-3-Opus",
            llm_model="claude-3-opus-20240229",
            summary_llm_model="claude-3-opus-20240229",
        ),
        ModelConfig(
            name="Gemini-1.5-Pro",
            llm_model="gemini-1.5-pro",
            summary_llm_model="gemini-1.5-pro",
        ),
        
        # Ablation studies
        ModelConfig(
            name="Evidence@5",
            llm_model="gpt-4-0125-preview",
            summary_llm_model="gpt-4-0125-preview",
            agent_evidence_n=5,
        ),
        ModelConfig(
            name="Top-k@10",
            llm_model="gpt-4-0125-preview",
            summary_llm_model="gpt-4-0125-preview",
            top_k=10,
        ),
    ]
    
    results = {}
    for config in configs:
        print(f"Evaluating {config.name}...")
        metrics = await evaluate_model(config, paper_directory, cache_dir=cache_dir)
        results[config.name] = metrics
        
        # Print key metrics
        print(f"\nResults for {config.name}:")
        print(f"Accuracy: {metrics['correct']:.3f}")
        print(f"Precision: {metrics['correct'] / (1 - metrics['unsure']):.3f}")
        print(f"Average Evidence Count: {metrics['evidence_count']:.1f}")
        print("----------------------------------------")
    
    return results

In [14]:
paper_dir = Path("papers")
cache_dir = Path("cache")
results = asyncio.run(run_evaluations(paper_dir, cache_dir))

RuntimeError: asyncio.run() cannot be called from a running event loop