# 检索效果评估
## 检索相关性评估

# In [1]:
def evaluate_retrieval_effectiveness(questions):
    """
    Compare BM25, vector and ensemble retrieval on each question.

    For every question the query type and its dynamic weights are looked up,
    all three retrievers are queried, and the number of returned documents
    accepted by ``is_relevant`` is counted per retriever.

    Args:
        questions: Iterable of question strings.

    Returns:
        A list with one dict per question containing the keys ``question``,
        ``query_type``, ``bm25_relevant``, ``vector_relevant``,
        ``ensemble_relevant``, ``bm25_weight`` and ``vector_weight``.
        An empty input yields an empty list without touching any backend.
    """
    questions = list(questions)
    if not questions:
        # Nothing to evaluate; avoid fetching the corpus or building indexes.
        return []

    # The corpus and both base retrievers are independent of the question,
    # so build them once instead of re-indexing the corpus per iteration.
    vectordb = get_vectordb()
    documents = get_text_list_from_milvus(collection_name="Vmaxs")
    bm25_retriever = BM25Retriever.from_texts(documents)
    vector_retriever = vectordb.as_retriever(search_kwargs={"k": 10})
    # NOTE(review): no k is passed to the BM25 retriever, so it uses the
    # library default, which may differ from the k=10 used for the vector
    # retriever -- confirm before comparing the raw counts side by side.

    def count_relevant(docs, question):
        # Simplified relevance check; real evaluation should use human labels.
        return sum(1 for doc in docs if is_relevant(doc.page_content, question))

    results = []
    for question in questions:
        query_type = determine_query_type(question)
        bm25_weight, vector_weight = get_dynamic_weights(query_type)

        bm25_docs = bm25_retriever.get_relevant_documents(question)
        vector_docs = vector_retriever.get_relevant_documents(question)

        # Only the ensemble depends on the per-question weights.
        ensemble_retriever = EnsembleRetriever(
            retrievers=[bm25_retriever, vector_retriever],
            weights=[bm25_weight, vector_weight]
        )
        ensemble_docs = ensemble_retriever.get_relevant_documents(question)

        results.append({
            "question": question,
            "query_type": query_type,
            "bm25_relevant": count_relevant(bm25_docs, question),
            "vector_relevant": count_relevant(vector_docs, question),
            "ensemble_relevant": count_relevant(ensemble_docs, question),
            "bm25_weight": bm25_weight,
            "vector_weight": vector_weight
        })

    return results

# 示例评估函数(实际使用时需要完善)
def is_relevant(document_content, question):
    """Simplified relevance check: True if any of the first three
    whitespace-separated tokens of the question occurs in the document.

    A placeholder for human labelling or a more elaborate relevance model.
    """
    leading_terms = question.split()[:3]
    for term in leading_terms:
        if term in document_content:
            return True
    return False

## 问答质量评估

# In [2]:
def evaluate_answer_quality(questions, reference_answers):
    """
    Score generated answers against their reference answers.

    Questions and references are paired positionally with ``zip``; for each
    pair the QA chain is invoked and the generated answer is compared with
    the reference via ``calculate_similarity``.

    Returns:
        A list of dicts with the keys ``question``, ``generated_answer``,
        ``reference_answer``, ``similarity_score`` and ``query_type``.
    """
    def score_pair(query, expected):
        generated = get_qa_chain_with_memory(query)['answer']
        # Simplified lexical similarity; a semantic measure could replace it.
        score = calculate_similarity(generated, expected)
        return {
            "question": query,
            "generated_answer": generated,
            "reference_answer": expected,
            "similarity_score": score,
            "query_type": determine_query_type(query)
        }

    return [
        score_pair(query, expected)
        for query, expected in zip(questions, reference_answers)
    ]

def calculate_similarity(answer1, answer2):
    """Simplified lexical similarity between two answers.

    Both answers are lower-cased and split on whitespace; the score is the
    number of shared distinct tokens divided by the size of the larger
    token set.

    Args:
        answer1: First answer text.
        answer2: Second answer text.

    Returns:
        A float in [0, 1]. Returns 0.0 when neither answer contains any
        token, instead of raising ``ZeroDivisionError``.
    """
    words1 = set(answer1.lower().split())
    words2 = set(answer2.lower().split())
    larger_size = max(len(words1), len(words2))
    if larger_size == 0:
        # Both answers are empty or whitespace-only: no overlap to measure.
        return 0.0
    return len(words1 & words2) / larger_size

## 权重分配合理性评估

# In [3]:
def evaluate_weight_allocation(questions):
    """
    Check whether the dynamic weight strategy picks a best-performing weighting.

    For each question, three fixed BM25/vector weight presets and the
    dynamically assigned weights are each used to build an ensemble
    retriever; the weighting that returns the most documents accepted by
    ``is_relevant`` is recorded as the best one.

    Ties are resolved in favour of the dynamic weights: if they retrieve as
    many relevant documents as the best preset (and at least one), they are
    reported as the best weights. Without this, a tie with any preset would
    be counted as a mismatch. If no weighting retrieves a relevant document,
    ``best_weights`` stays ``(0, 0)`` and ``match`` is False.

    Args:
        questions: Iterable of question strings.

    Returns:
        A list with one dict per question containing the keys ``question``,
        ``query_type``, ``dynamic_weights``, ``best_weights`` and ``match``.
        An empty input yields an empty list without touching any backend.
    """
    questions = list(questions)
    if not questions:
        # Nothing to evaluate; avoid fetching the corpus or building indexes.
        return []

    # The corpus and both base retrievers depend on neither the question nor
    # the weights, so build them once rather than once per weight combination.
    vectordb = get_vectordb()
    documents = get_text_list_from_milvus(collection_name="Vmaxs")
    bm25_retriever = BM25Retriever.from_texts(documents)
    vector_retriever = vectordb.as_retriever(search_kwargs={"k": 10})

    preset_weights = [
        (0.7, 0.3),  # BM25-leaning
        (0.3, 0.7),  # vector-leaning
        (0.5, 0.5),  # balanced
    ]

    def count_relevant(question, weights):
        # Number of ensemble results the simplified relevance check accepts.
        ensemble_retriever = EnsembleRetriever(
            retrievers=[bm25_retriever, vector_retriever],
            weights=weights
        )
        docs = ensemble_retriever.get_relevant_documents(question)
        return sum(1 for doc in docs if is_relevant(doc.page_content, question))

    weight_results = []
    for question in questions:
        query_type = determine_query_type(question)
        bm25_weight, vector_weight = get_dynamic_weights(query_type)
        dynamic_weights = (bm25_weight, vector_weight)

        best_relevant = 0
        best_weights = (0, 0)
        for weights in preset_weights:
            relevant = count_relevant(question, weights)
            if relevant > best_relevant:
                best_relevant = relevant
                best_weights = weights

        # Dynamic weights win ties, but only when they found something.
        dynamic_relevant = count_relevant(question, dynamic_weights)
        if dynamic_relevant > 0 and dynamic_relevant >= best_relevant:
            best_relevant = dynamic_relevant
            best_weights = dynamic_weights

        weight_results.append({
            "question": question,
            "query_type": query_type,
            "dynamic_weights": dynamic_weights,
            "best_weights": best_weights,
            "match": dynamic_weights == best_weights
        })

    return weight_results

## 综合评估报告生成

# In [4]:
def generate_evaluation_report(questions, reference_answers=None):
    """
    Print a full evaluation report to stdout.

    The report has up to three sections: retrieval effectiveness, answer
    quality (only when ``reference_answers`` is truthy) and weight-allocation
    accuracy. Each section calls the corresponding ``evaluate_*`` function
    and prints its per-question results.

    Args:
        questions: Question strings to evaluate.
        reference_answers: Optional reference answers, paired positionally
            with ``questions``.

    Returns:
        None. With no evaluated items, the averages are printed as zero
        instead of raising ``ZeroDivisionError``.
    """
    print("="*50)
    print("混合检索问答系统评估报告")
    print("="*50)

    # 1. Retrieval effectiveness
    print("\n1. 检索效果评估:")
    retrieval_results = evaluate_retrieval_effectiveness(questions)
    for res in retrieval_results:
        print(f"\n问题: {res['question']}")
        print(f"类型: {res['query_type']}")
        print(f"权重分配: BM25={res['bm25_weight']}, Vector={res['vector_weight']}")
        # NOTE(review): the "/10" denominators are hard-coded; the evaluation
        # results carry only relevant-document counts, not how many documents
        # each retriever actually returned -- confirm they really return 10.
        print(f"BM25相关文档数: {res['bm25_relevant']}/10")
        print(f"向量检索相关文档数: {res['vector_relevant']}/10")
        print(f"混合检索相关文档数: {res['ensemble_relevant']}/10")

    # 2. Answer quality (only when reference answers are supplied)
    if reference_answers:
        print("\n2. 问答质量评估:")
        answer_results = evaluate_answer_quality(questions, reference_answers)
        # Guard the mean: no pairs are evaluated when questions is empty.
        if answer_results:
            avg_similarity = (
                sum(res['similarity_score'] for res in answer_results)
                / len(answer_results)
            )
        else:
            avg_similarity = 0.0

        for res in answer_results:
            print(f"\n问题: {res['question']}")
            print(f"类型: {res['query_type']}")
            print(f"相似度得分: {res['similarity_score']:.2f}")

        print(f"\n平均相似度得分: {avg_similarity:.2f}")

    # 3. Weight-allocation accuracy
    print("\n3. 权重分配合理性评估:")
    weight_results = evaluate_weight_allocation(questions)
    correct_weights = sum(res['match'] for res in weight_results)

    for res in weight_results:
        print(f"\n问题: {res['question']}")
        print(f"类型: {res['query_type']}")
        print(f"动态分配权重: BM25={res['dynamic_weights'][0]}, Vector={res['dynamic_weights'][1]}")
        print(f"最佳权重: BM25={res['best_weights'][0]}, Vector={res['best_weights'][1]}")
        print(f"匹配: {'是' if res['match'] else '否'}")

    # Guard the ratio: an empty question list would otherwise divide by zero.
    accuracy = correct_weights / len(weight_results) if weight_results else 0.0
    print(f"\n权重分配准确率: {accuracy:.2%}")

    print("\n评估完成!")

## 使用示例

# In [None]:
# Sample test questions and reference answers (illustrative only).
# The two lists are paired by position: reference_answers[i] is the
# expected answer for questions[i].
questions = [
    "什么是VMAX的上网日志业务？",
    "上网日志业务包含哪些功能？",
    "为什么我的VMAX设备会出现日志丢失问题？",
    "如何解决VMAX日志存储空间不足的问题？",
    "VMAX-S与其他型号的主要区别是什么？"
]

reference_answers = [
    "VMAX的上网日志业务是指...",  # placeholder; supply the full reference answer in real use
    "上网日志业务主要功能包括...",
    "VMAX设备日志丢失可能由...原因造成",
    "解决日志存储空间不足的方法有...",
    "VMAX-S与其他型号的区别主要体现在..."
]

# Generate and print the evaluation report.
generate_evaluation_report(questions, reference_answers)