In [None]:
import evaluate

# Evaluate使用指南

## 查看支持的评估函数

In [None]:
# TODO: the task pages at https://huggingface.co/tasks list the supported tasks; the bottom-right corner of each task page shows the evaluation metrics used for that task.

# As tested on 2024-01-11, list_evaluation_modules does not show every supported evaluation module, but this does not affect usage.
# The complete list of evaluation modules can be browsed at https://huggingface.co/evaluate-metric
# List the evaluation modules that are available
evaluate.list_evaluation_modules(with_details=True) # with_details=True yields a metadata dict per module (name, type, community, likes in the recorded output)

[{'name': 'lvwerra/test', 'type': 'metric', 'community': True, 'likes': 0},
 {'name': 'angelina-wang/directional_bias_amplification',
  'type': 'metric',
  'community': True,
  'likes': 6},
 {'name': 'cpllab/syntaxgym', 'type': 'metric', 'community': True, 'likes': 1},
 {'name': 'lvwerra/bary_score',
  'type': 'metric',
  'community': True,
  'likes': 1},]

In [None]:
# Narrow the listing to non-community modules whose type is "comparison".
evaluate.list_evaluation_modules(
    module_type="comparison",
    include_community=False,
    with_details=True,
)

## 加载评估函数

In [None]:
# Load the "accuracy" module by name; its repr in the next cell reports module_type "metric".
accuracy = evaluate.load("accuracy")

In [None]:
# TODO: read the documentation — displaying the module shows its features and usage text
accuracy

EvaluationModule(name: "accuracy", module_type: "metric", features: {'predictions': Value('int32'), 'references': Value('int32')}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.        # 预测标签
    references (`list` of `int`): Ground truth labels.      # 真实标签
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.      # 是否进行归1化
    sample_weight (`list` of `float`): Sample weights Defaults to None.

Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.

Examples:

    Example 1-A simple example
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
        >>> print(results)
        {'accuracy': 0.5}

    Example 2-The same as Example 1, except with `normalize` set to `False`.
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], normalize=False)  # 不归一化，返回正确个数
        >>> print(results)
        {'accuracy': 3.0}

    Example 3-The same as Example 1, except with `sample_weight` set.
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], sample_weight=[0.5, 2, 0.7, 0.5, 9, 0.4])
        >>> print(results)
        {'accuracy': 0.8778625954198473}
""", stored examples: 0)

## 查看函数说明

In [None]:
# TODO: short introduction — print the module's description text
print(accuracy.description)

In [None]:
# TODO: check the input format — print the module's inputs_description text
print(accuracy.inputs_description)

In [None]:
# Display the loaded accuracy module once more.
accuracy

## 评估指标计算——全局计算

In [None]:
# Global computation: hand over every prediction and every ground-truth
# label in one compute() call.
accuracy = evaluate.load("accuracy")
results = accuracy.compute(
    predictions=[0, 1, 1, 2, 1, 0],
    references=[0, 1, 2, 0, 1, 2],
)
results

## 评估指标计算——迭代计算

In [None]:
# Iterative computation, one sample at a time: register each
# (reference, prediction) pair with add(), then call compute() once.
accuracy = evaluate.load("accuracy")
sample_pairs = [(0, 1), (1, 0), (0, 0), (1, 1)]  # (reference, prediction)
for ref, pred in sample_pairs:
    accuracy.add(references=ref, predictions=pred)
accuracy.compute()

In [None]:
# Iterative computation, one batch at a time: feed each batch to the metric
# with add_batch(), then call compute() once at the very end.
# TODO: for batched data, add one whole batch per call.
accuracy = evaluate.load("accuracy")
reference_batches = [[0, 1], [0, 1]]
prediction_batches = [[1, 0], [0, 1]]
for refs, preds in zip(reference_batches, prediction_batches):
    accuracy.add_batch(references=refs, predictions=preds)
accuracy.compute()

## 多个评估指标计算

In [None]:
# Use several evaluation modules together through one combined object.
clf_metrics = evaluate.combine(
    ["accuracy", "f1", "recall", "precision"]
)
clf_metrics

In [None]:
# Display the combined metrics object once more.
clf_metrics

In [None]:
# A single compute() call on the combined object, using one shared set of predictions and references.
clf_metrics.compute(predictions=[0, 1, 0], references=[0, 1, 1])

## 评估结果对比可视化

In [None]:
from evaluate.visualization import radar_plot   # author's note: only radar plots were supported at the time of writing

In [None]:
# One result dict per model, in the same order as model_names; every dict
# carries the same four metric keys.
data = [
    {"accuracy": 0.99, "precision": 0.8, "f1": 0.95, "latency_in_seconds": 33.6},
    {"accuracy": 0.98, "precision": 0.87, "f1": 0.91, "latency_in_seconds": 11.2},
    {"accuracy": 0.98, "precision": 0.78, "f1": 0.88, "latency_in_seconds": 87.6},
    {"accuracy": 0.88, "precision": 0.78, "f1": 0.81, "latency_in_seconds": 101.6},
]
# "Model 1" .. "Model 4": one display label per entry in data.
model_names = [f"Model {position}" for position in range(1, len(data) + 1)]

In [None]:
# Build the radar chart from the per-model result dicts in data, labelled with model_names; the return value is kept in `plot`.
plot = radar_plot(data=data, model_names=model_names)