In [2]:
!pip install scikit-learn numpy sentence_transformers requests

Collecting sentence_transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.2.0


# 1) 答案相关性
答案相关性是评估LLM是否能够有效回答用户问题的关键指标。它衡量模型输出是否能够准确、全面地回应输入信息。例如，在问答系统中，答案相关性指标会评估模型是否提供了有用的、与问题相关的回答。

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def compute_relevance_score(predicted_answer, reference_answer, model):
    """Return the cosine similarity between the embeddings of two answers.

    Args:
        predicted_answer: Text produced by the system under evaluation.
        reference_answer: Ground-truth text to compare against.
        model: Object exposing ``encode(text)``; each call is expected to
            return a single embedding vector (e.g. a ``SentenceTransformer``).

    Returns:
        The only entry of the 1x1 similarity matrix returned by
        ``cosine_similarity``.
    """
    # Encode the prediction first, then the reference.
    predicted_vec, reference_vec = (
        model.encode(text) for text in (predicted_answer, reference_answer)
    )
    # cosine_similarity works on 2-D inputs, so each vector is wrapped in a list.
    similarity_matrix = cosine_similarity([predicted_vec], [reference_vec])
    return similarity_matrix[0][0]

# Example: score a predicted answer against a reference answer.
from sentence_transformers import SentenceTransformer
# Load the 'paraphrase-MiniLM-L6-v2' checkpoint used to embed both texts.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
predicted_answer = "The capital of France is Paris."
reference_answer = "Paris is the capital city of France."
relevance_score = compute_relevance_score(predicted_answer, reference_answer, model)
print(f"Relevance Score: {relevance_score}")

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Relevance Score: 0.9697394371032715


# 2) 正确性
正确性指标用于验证LLM的输出是否符合事实。它可以通过比对模型生成的内容与已知的真实信息来计算。这对于确保模型在处理事实性问题时的准确性尤其重要。

In [9]:
import requests

def check_factual_accuracy(output_text, fact_checking_api_url, timeout=10):
    """Ask a remote fact-checking service whether ``output_text`` is factual.

    Args:
        output_text: Text to be verified.
        fact_checking_api_url: Endpoint that receives a form-encoded POST with
            a ``text`` field and replies with a JSON object containing an
            ``is_factual`` key.
        timeout: Seconds to wait for the service before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The value stored under ``is_factual`` in the JSON response.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an HTTP
            error status (4xx/5xx).
        ValueError: If the response body is not valid JSON.
        KeyError: If the JSON response has no ``is_factual`` field.
    """
    response = requests.post(
        fact_checking_api_url,
        data={'text': output_text},
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    result = response.json()
    return result['is_factual']

# Example
# NOTE: example.com is a reserved placeholder domain, not a real fact-checking
# service; substitute a real endpoint. Failures are reported rather than raised
# so the notebook still runs from top to bottom.
fact_checking_api_url = "https://example.com/fact-check-api"
output_text = "The Earth revolves around the Sun."
try:
    is_factual = check_factual_accuracy(output_text, fact_checking_api_url)
except (requests.RequestException, ValueError, KeyError) as exc:
    is_factual = None
    print(f"Fact-check request failed: {exc}")
else:
    print(f"Is Factual: {is_factual}")

# 3) 幻觉检测
幻觉指的是LLM生成虚假或不准确的信息。检测幻觉对于提高模型的可靠性和用户信任至关重要。幻觉检测可以通过人工审核或结合自动化工具来实现。

In [5]:
def detect_hallucination(output_text, known_facts):
    """Report whether ``output_text`` makes a claim not backed by ``known_facts``.

    The output is split into sentences and every sentence must match one of
    the known facts (ignoring case, surrounding whitespace and the trailing
    sentence terminator). The previous implementation checked the opposite
    direction -- whether every known fact appeared in the output -- so a
    correct answer that simply did not mention some unrelated fact was
    reported as a hallucination, while an output that repeated all facts and
    then added a fabricated one was not.

    This is a naive exact-match heuristic: a paraphrase of a known fact is
    still reported as unsupported.

    Args:
        output_text: Generated text to inspect.
        known_facts: Iterable of reference statements treated as ground truth.

    Returns:
        True if at least one sentence of ``output_text`` is not among the
        known facts, otherwise False. Empty output contains no claims and
        yields False.
    """
    import re  # local import keeps this cell self-contained

    def _claims(text):
        # Split after sentence-ending punctuation that is followed by whitespace,
        # then normalise case, inner whitespace and the trailing terminator.
        sentences = re.split(r"(?<=[.!?])\s+", text.strip())
        normalised = (" ".join(s.lower().split()).rstrip(".!? ") for s in sentences)
        return [claim for claim in normalised if claim]

    supported = {claim for fact in known_facts for claim in _claims(fact)}
    return any(claim not in supported for claim in _claims(output_text))

# Example: the output names Berlin, which matches neither reference fact.
known_facts = ["The capital of France is Paris.", "Water boils at 100 degrees Celsius."]
output_text = "The capital of France is Berlin."
has_hallucination = detect_hallucination(output_text, known_facts)
print(f"Has Hallucination: {has_hallucination}")

Has Hallucination: True


# 4) 上下文相关性
在基于检索增强生成（RAG）的系统中，上下文相关性指标评估模型是否能够有效利用检索到的相关信息生成回答。这种指标确保模型在生成文本时考虑了合适的背景信息。

In [6]:
def evaluate_context_relevance(retrieved_context, generated_output, similarity_model):
    """Score how closely a generated answer matches its retrieved context.

    Args:
        retrieved_context: Passage returned by the retrieval step.
        generated_output: Text the model produced.
        similarity_model: Object exposing ``encode(text)``; each call is
            expected to return a single embedding vector.

    Returns:
        The only entry of the 1x1 matrix returned by ``cosine_similarity``
        for the two embeddings.
    """
    encode = similarity_model.encode
    # Context is embedded before the output.
    context_vec = encode(retrieved_context)
    output_vec = encode(generated_output)
    # Each vector is wrapped in a list because cosine_similarity takes 2-D inputs.
    return cosine_similarity([context_vec], [output_vec])[0][0]

# Example: compare a retrieved passage with the generated answer.
# Uses the same 'paraphrase-MiniLM-L6-v2' checkpoint name as the answer-relevance example.
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
retrieved_context = "The Eiffel Tower is located in Paris, France."
generated_output = "The Eiffel Tower is a famous landmark in Paris."
context_relevance_score = evaluate_context_relevance(retrieved_context, generated_output, similarity_model)
print(f"Context Relevance Score: {context_relevance_score}")



Context Relevance Score: 0.9187588095664978


# 5) 责任指标
责任指标评估LLM的输出是否包含偏见、毒性或其他可能的有害内容。这些指标对确保模型输出符合伦理标准和社会期望至关重要。

In [10]:
def check_toxicity(output_text, toxicity_api_url, timeout=10):
    """Fetch a toxicity score for ``output_text`` from a remote service.

    Args:
        output_text: Text to be scored.
        toxicity_api_url: Endpoint that receives a form-encoded POST with a
            ``text`` field and replies with a JSON object containing a
            ``toxicity_score`` key.
        timeout: Seconds to wait for the service before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The value stored under ``toxicity_score`` in the JSON response.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an HTTP
            error status (4xx/5xx).
        ValueError: If the response body is not valid JSON.
        KeyError: If the JSON response has no ``toxicity_score`` field.
    """
    response = requests.post(
        toxicity_api_url,
        data={'text': output_text},
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    result = response.json()
    return result['toxicity_score']

# Example
# NOTE: example.com is a reserved placeholder domain, not a real toxicity
# service; substitute a real endpoint. Failures are reported rather than raised
# so the notebook still runs from top to bottom.
toxicity_api_url = "https://example.com/toxicity-api"
output_text = "This is a hate speech example."
try:
    toxicity_score = check_toxicity(output_text, toxicity_api_url)
except (requests.RequestException, ValueError, KeyError) as exc:
    toxicity_score = None
    print(f"Toxicity request failed: {exc}")
else:
    print(f"Toxicity Score: {toxicity_score}")

# 6) 任务特定指标
任务特定指标根据具体的应用场景和需求定制，如在文本摘要任务中评估摘要的全面性和一致性。这些指标通常需要根据任务特点进行设计和调整。

In [8]:
def evaluate_summary_completeness(summary, original_text):
    """Return the length of ``summary`` relative to ``original_text``.

    This is a simple character-count ratio, not a semantic measure of how
    much of the source the summary covers.

    Args:
        summary: The summary text.
        original_text: The text that was summarised.

    Returns:
        ``len(summary) / len(original_text)``, or ``0.0`` when
        ``original_text`` is empty (the ratio is undefined there and the
        unguarded division raised ``ZeroDivisionError``).
    """
    original_length = len(original_text)
    if original_length == 0:
        return 0.0
    return len(summary) / original_length  # simple length-ratio measure

# Example: ratio of the summary's length to the original text's length.
original_text = "In-depth information about the history of France and its landmarks."
summary = "A brief history of France."
completeness_score = evaluate_summary_completeness(summary, original_text)
print(f"Summary Completeness Score: {completeness_score}")

Summary Completeness Score: 0.3880597014925373
