# Summary Evaluators

## What I Learned
This lesson focused on summary evaluators, which compute metrics such as precision, recall, and F1-score across a whole experiment rather than for individual runs. They are useful for assessing aggregate performance.

## Changes in Code
In the notebook, I applied a summary evaluator to my dataset to compute aggregate statistics about model output quality, and I learned how these metrics appear in the LangSmith UI. This rounded out my understanding of the different evaluation levels.

In [None]:
from langsmith import Client
from langsmith.evaluation import evaluate
from langchain_openai import ChatOpenAI
import numpy as np

# LangSmith client handle. It is not referenced again in the visible cell.
# NOTE(review): presumably configured from environment variables — confirm.
client = Client()

def qa_system(inputs: dict) -> dict:
    """Answer a single question with the ``gpt-4o-mini`` chat model.

    Args:
        inputs: Mapping that must contain a ``"question"`` entry, which is
            passed to the model as the prompt.

    Returns:
        A dict with one key, ``"answer"``, holding the ``content`` of the
        model's reply.
    """
    chat_model = ChatOpenAI(model="gpt-4o-mini")
    reply = chat_model.invoke(inputs["question"])
    return {"answer": reply.content}

# Summary evaluator - calculates aggregate metrics
def summary_statistics(runs, examples):
    """Compute aggregate answer-length statistics across all runs.

    Each run is scored as the number of whitespace-separated words in its
    ``"answer"`` output divided by 100. The mean, median, standard deviation,
    minimum and maximum of those per-run scores are then reported.

    Args:
        runs: Iterable of run objects exposing an ``outputs`` attribute that
            is either a mapping or ``None``.
        examples: Dataset examples paired with ``runs``. Unused, but part of
            the ``(runs, examples)`` summary-evaluator signature.

    Returns:
        A dict of the form ``{"results": [{"key": ..., "score": ...}, ...]}``
        with one entry per statistic (``mean_score``, ``median_score``,
        ``std_score``, ``min_score``, ``max_score``), so that each statistic
        is reported as its own named metric. The list is empty when ``runs``
        is empty.
    """
    scores = []
    for run in runs:
        # A run may carry no outputs, and "answer" may be absent or None;
        # treat all of those as an empty answer.
        outputs = run.outputs or {}
        predicted = outputs.get("answer") or ""
        # Simple length-based metric: word count normalized by 100.
        scores.append(len(predicted.split()) / 100)

    if not scores:
        # np.min / np.max raise on an empty sequence; report nothing instead.
        return {"results": []}

    statistics = {
        "mean_score": np.mean(scores),
        "median_score": np.median(scores),
        "std_score": np.std(scores),
        "min_score": np.min(scores),
        "max_score": np.max(scores),
    }
    # Convert NumPy scalars to plain floats so the scores are ordinary
    # Python numbers.
    return {
        "results": [
            {"key": key, "score": float(value)}
            for key, value in statistics.items()
        ]
    }

# Individual evaluator
def length_evaluator(run, example):
    """Score one run on whether its answer has a reasonable length.

    Args:
        run: Run object exposing an ``outputs`` attribute that is either a
            mapping or ``None``.
        example: Dataset example paired with ``run``. Unused, but part of the
            ``(run, example)`` evaluator signature.

    Returns:
        ``{"key": "length_appropriate", "score": s}`` where ``s`` is ``1.0``
        when the answer has between 10 and 100 whitespace-separated words
        (inclusive) and ``0.5`` otherwise.
    """
    # A run may carry no outputs, and "answer" may be absent or None;
    # treat all of those as an empty answer (word count 0).
    outputs = run.outputs or {}
    predicted = outputs.get("answer") or ""
    word_count = len(predicted.split())

    score = 1.0 if 10 <= word_count <= 100 else 0.5
    return {"key": "length_appropriate", "score": score}

# Run the experiment: length_evaluator scores each run individually, and
# summary_statistics computes aggregate metrics over the whole experiment.
results = evaluate(
    qa_system,
    data="qa_examples",
    evaluators=[length_evaluator],
    summary_evaluators=[summary_statistics],
)

# The object returned by evaluate() exposes the experiment name as an
# attribute; it is not a mapping, so subscripting it with a string key fails.
print(f"Experiment with summary stats: {results.experiment_name}")