
# 6.3.7 - Evaluation Metrics for QA Performance

In this notebook, we explore common evaluation metrics used to assess QA model performance on both extractive and generative tasks.

Metrics Covered:
- Exact Match (EM)
- F1 Score
- Semantic F1
- Latency & Throughput
- ROUGE-L for generative QA
- Energy per Query (CPU)


In [None]:

!pip install evaluate transformers datasets sentence-transformers rouge-score pyRAPL


In [None]:

import time

import pyRAPL
import torch
from evaluate import load
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util
from transformers import AutoModelForQuestionAnswering, AutoTokenizer


In [None]:

# Toy evaluation set: each tuple is (model prediction, gold reference answer).
# Kept as pairs so a prediction can never drift out of alignment with its reference.
qa_pairs = [
    ("Paris", "Paris"),
    ("Isaac Newton", "Newton"),
    ("299,792,458 m/s", "299792458 meters per second"),
]
preds, refs = map(list, zip(*qa_pairs))


## Exact Match (EM) and F1 Score

In [None]:

metric = load("squad")

# The SQuAD metric does not accept bare strings: every prediction and reference must be a
# dict carrying a shared "id" so the two lists can be matched up record by record.
squad_predictions = [
    {"id": str(idx), "prediction_text": pred}
    for idx, pred in enumerate(preds)
]
squad_references = [
    # "answer_start" is required by the metric's input schema; 0 is a placeholder here
    # because this toy data has no source passage to index into.
    {"id": str(idx), "answers": {"text": [ref], "answer_start": [0]}}
    for idx, ref in enumerate(refs)
]

results = metric.compute(predictions=squad_predictions, references=squad_references)
# Both scores are reported on a 0-100 scale.
print(f"EM: {results['exact_match']:.2f}, F1: {results['f1']:.2f}")


## Semantic F1 with Sentence Embeddings

In [None]:

# Use a dedicated name for the sentence encoder: `model` is rebound to the QA model in the
# latency section, and sharing one name for two different kinds of object invites
# hidden-state bugs when cells are re-run out of order.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Encode each list in a single batched call instead of one forward pass per string.
pred_embeddings = embedder.encode(preds, convert_to_tensor=True)
ref_embeddings = embedder.encode(refs, convert_to_tensor=True)

# Row-wise cosine similarity between each prediction and its own reference.
semantic_scores = torch.nn.functional.cosine_similarity(
    pred_embeddings, ref_embeddings, dim=1
).tolist()

# NOTE: this "Semantic F1" is the mean cosine similarity over the pairs, not a
# precision/recall-based F1.
semantic_f1 = sum(semantic_scores) / len(semantic_scores)
print(f"Semantic F1: {semantic_f1:.2f}")


## ROUGE-L for Generative QA

In [None]:

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# RougeScorer.score takes (target, prediction) in that order. The F-measure is symmetric,
# but precision and recall are not, so the reference must be passed first for the
# per-pair scores to mean what they say.
rouge_scores = [
    scorer.score(ref, pred)["rougeL"].fmeasure
    for pred, ref in zip(preds, refs)
]
avg_rouge = sum(rouge_scores) / len(rouge_scores)
print(f"Average ROUGE-L: {avg_rouge:.2f}")


## Latency and Throughput Measurement

In [None]:

# Load the tokenizer and QA model once; `model` and `inputs` are reused by the energy cell.
# NOTE(review): "distilbert-base-uncased" is a base checkpoint, so the QA head is presumably
# freshly initialised — fine for timing a forward pass, but its answers are not meaningful.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
model.eval()  # inference mode: make sure dropout is disabled

question = "Who discovered gravity?"
context = "Isaac Newton formulated the law of universal gravitation."

inputs = tokenizer(question, context, return_tensors="pt")

N_WARMUP = 3   # untimed passes so one-off initialisation cost is not counted
N_RUNS = 20    # timed passes averaged to smooth out run-to-run noise

# no_grad avoids building the autograd graph, which would inflate inference time.
with torch.no_grad():
    for _ in range(N_WARMUP):
        model(**inputs)

    # perf_counter is a monotonic, high-resolution clock intended for timing intervals.
    start = time.perf_counter()
    for _ in range(N_RUNS):
        model(**inputs)
    end = time.perf_counter()

latency = (end - start) / N_RUNS  # mean seconds per query
throughput = 1 / latency

print(f"Latency: {latency:.4f}s, Throughput: {throughput:.2f} queries/sec")


## Energy per Query (CPU Energy Estimation)

In [None]:

# pyRAPL reads the CPU's RAPL energy counters; setup() must be called once before measuring.
pyRAPL.setup()


def run_query():
    """Run one forward pass of the QA model on the prepared `inputs` and return its output."""
    with torch.no_grad():
        return model(**inputs)


# Use an explicit Measurement rather than the @measureit decorator: the decorator returns
# the wrapped function's own return value (the model output), so the energy reading
# cannot be obtained from it.
meter = pyRAPL.Measurement("run_query")
meter.begin()
result = run_query()
meter.end()

# `pkg` holds one CPU-package reading per socket in microjoules, or None if nothing was recorded.
pkg_energy = meter.result.pkg
if pkg_energy is None:
    print("Energy consumption: not recorded (no CPU package counters available)")
else:
    print(f"Energy consumption: {sum(pkg_energy)} µJ")
