In [1]:
# AutoDL official academic-resource acceleration
import os
import subprocess

# Source /etc/network_turbo inside a bash subshell, then print the environment
# entries whose line contains "proxy"; only stdout is used below.
result = subprocess.run(
    'bash -c "source /etc/network_turbo && env | grep proxy"',
    shell=True,
    capture_output=True,
    text=True,
)
output = result.stdout

# Copy every NAME=value line into this process's environment.
# Lines without '=' are ignored; the value keeps any further '=' characters.
for var, value in (
    entry.split('=', 1) for entry in output.splitlines() if '=' in entry
):
    os.environ[var] = value

In [2]:
import sys
import os
from typing import List, Dict
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from rouge_chinese import Rouge # type: ignore
from nltk.translate.bleu_score import sentence_bleu # type: ignore
import jieba # type: ignore

import sys
import os

# Add the project root to the Python path so the `src` package imported below
# can be resolved. The membership check keeps this cell idempotent: re-running
# it does not pile up duplicate entries in sys.path.
project_root = "/home/cuipeng/Gemma"
if project_root not in sys.path:
    sys.path.append(project_root)

# 导入必要模块
from src.core.model.model_initializer import initialize_model_and_tokenizer
from src.core.utils.model_utils import generate_response, apply_chat_template

In [3]:
class ModelEvaluator:
    """Compare a base model with its LoRA fine-tuned variant on Chinese datasets.

    The evaluator loads both models once, builds a list of evaluation samples
    (each a dict with ``instruction``, ``target`` and ``type``), generates a
    response from each model for every sample and scores the responses against
    the target with ROUGE-1/2/L F-scores and sentence-level BLEU.
    """

    def __init__(self):
        """Load the base model, the LoRA fine-tuned model and the ROUGE scorer."""
        # Initialise the base model and the fine-tuned model
        print("正在加载模型...")
        self.base_model, self.base_tokenizer = initialize_model_and_tokenizer(
            model_path="google/gemma-2-9b",
            cache_dir="/root/autodl-tmp/gemma",
            use_quantization=True
        )

        self.ft_model, self.ft_tokenizer = initialize_model_and_tokenizer(
            model_path="google/gemma-2-9b",
            cache_dir="/root/autodl-tmp/gemma",
            lora_path="/root/autodl-tmp/models/stage1/checkpoints/gemma-base-zh/checkpoint-43500",
            use_quantization=True
        )

        self.rouge = Rouge()
        print("模型加载完成！")

    def load_evaluation_datasets(self, test_size=100):
        """Load the evaluation datasets and flatten them into one sample list.

        Args:
            test_size: Maximum number of rows read from each dataset.

        Returns:
            A list of sample dicts with keys ``instruction``, ``target``, ``type``.
        """
        evaluation_data = []

        # 1. LCCC dialogue dataset
        print("加载LCCC数据集...")
        lccc = load_dataset("silver/lccc", "base", split="test")
        lccc_samples = self._process_lccc_data(lccc, test_size)
        evaluation_data.extend(lccc_samples)

        # # 2. MKQA multilingual QA dataset (currently disabled)
        # print("加载MKQA数据集...")
        # mkqa = load_dataset("mkqa", split="test")
        # mkqa_samples = self._process_mkqa_data(mkqa, test_size)
        # evaluation_data.extend(mkqa_samples)

        # 3. CMRC2018 reading-comprehension dataset
        print("加载CMRC2018数据集...")
        cmrc = load_dataset("hfl/cmrc2018", split="test")
        cmrc_samples = self._process_cmrc_data(cmrc, test_size)
        evaluation_data.extend(cmrc_samples)

        return evaluation_data

    @staticmethod
    def _iter_rows(dataset, size):
        """Yield at most ``size`` rows of ``dataset`` by plain iteration.

        Slicing a Hugging Face ``Dataset`` (``dataset[:size]``) returns a dict
        mapping column names to lists, so looping over the slice yields column
        names instead of rows. Iterating the dataset itself yields one row dict
        at a time and also works for ordinary lists.
        """
        for index, row in enumerate(dataset):
            if index >= size:
                break
            yield row

    @staticmethod
    def _first_answer_text(answers):
        """Return the first answer string from ``answers``, or None if absent.

        Accepts a ``{'text': [...]}`` dict, a list of strings, a list of
        ``{'text': ...}`` dicts, or a bare string.
        """
        if isinstance(answers, dict):
            answers = answers.get('text') or []
        if isinstance(answers, str):
            return answers or None
        if not answers:
            return None
        first = answers[0]
        if isinstance(first, dict):
            first = first.get('text')
        return first or None

    def _process_lccc_data(self, dataset, size):
        """Turn LCCC dialogues into samples using the last two utterances.

        The second-to-last utterance becomes the instruction and the last one
        the target; dialogues with fewer than two utterances are skipped.
        """
        samples = []
        for item in self._iter_rows(dataset, size):
            # NOTE(review): the Hub loader appears to expose the utterances as
            # 'dialog' rather than 'conversation' — confirm against the dataset
            # card. Both keys are accepted so existing data keeps working.
            turns = item.get('conversation') or item.get('dialog') or []
            if len(turns) >= 2:
                samples.append({
                    'instruction': turns[-2],
                    'target': turns[-1],
                    'type': 'dialogue'
                })
        return samples

    # def _process_mkqa_data(self, dataset, size):
    #     """处理MKQA问答数据集"""
    #     samples = []
    #     for item in dataset[:size]:
    #         if item['queries']['zh_cn'] and item['answers']['zh_cn']:
    #             samples.append({
    #                 'instruction': item['queries']['zh_cn'],
    #                 'target': item['answers']['zh_cn'][0]['text'],
    #                 'type': 'qa'
    #             })
    #     return samples

    def _process_cmrc_data(self, dataset, size):
        """Turn CMRC2018 rows into reading-comprehension samples.

        Two row layouts are supported: the nested one the original code
        expected (``context`` plus a ``qas`` list of question/answers entries)
        and a flat one with ``context``, ``question`` and ``answers`` on the
        row itself. Questions without any answer text are skipped.
        """
        samples = []
        for item in self._iter_rows(dataset, size):
            context = item['context']
            if 'qas' in item:
                qa_pairs = [(qa['question'], qa['answers']) for qa in item['qas']]
            else:
                qa_pairs = [(item['question'], item['answers'])]

            for question, answers in qa_pairs:
                target = self._first_answer_text(answers)
                if target is None:
                    continue
                samples.append({
                    'instruction': f"请根据以下文章回答问题：\n\n{context}\n\n问题：{question}",
                    'target': target,
                    'type': 'reading_comprehension'
                })
        return samples

    def generate_responses(self, prompt: str, model, tokenizer) -> str:
        """Generate a response for ``prompt`` with the given model and tokenizer."""
        return generate_response(
            model=model,
            tokenizer=tokenizer,
            prompt=prompt,
            max_new_tokens=1024,
            temperature=0.7
        )

    def calculate_metrics(self, reference: str, hypothesis: str) -> Dict:
        """Score ``hypothesis`` against ``reference``.

        Returns:
            A dict with the ROUGE-1, ROUGE-2 and ROUGE-L F-scores and the
            sentence-level BLEU score. If either text is empty or whitespace
            only, all scores are 0.0.
        """
        # The ROUGE scorer rejects an empty hypothesis with an exception, which
        # would abort the whole evaluation loop; an empty answer scores zero.
        if not reference or not reference.strip() or not hypothesis or not hypothesis.strip():
            return {'rouge-1': 0.0, 'rouge-2': 0.0, 'rouge-l': 0.0, 'bleu': 0.0}

        # Segment each text once; ROUGE gets the space-joined form, BLEU the lists.
        ref_tokens = list(jieba.cut(reference))
        hyp_tokens = list(jieba.cut(hypothesis))

        # ROUGE scores
        rouge_scores = self.rouge.get_scores(' '.join(hyp_tokens), ' '.join(ref_tokens))[0]

        # BLEU score
        bleu_score = sentence_bleu([ref_tokens], hyp_tokens)

        return {
            'rouge-1': rouge_scores['rouge-1']['f'],
            'rouge-2': rouge_scores['rouge-2']['f'],
            'rouge-l': rouge_scores['rouge-l']['f'],
            'bleu': bleu_score
        }

    def evaluate_models(self, evaluation_data):
        """Generate and score responses from both models for every sample.

        Args:
            evaluation_data: Samples as produced by ``load_evaluation_datasets``.

        Returns:
            A dict keyed by task type; each value has ``base`` and ``ft`` lists
            holding one metrics dict per evaluated sample.
        """
        results = {
            'dialogue': {'base': [], 'ft': []},
            'qa': {'base': [], 'ft': []},
            'reading_comprehension': {'base': [], 'ft': []}
        }

        for item in tqdm(evaluation_data, desc="评估进度"):
            # Build the conversation
            dialogue = [
                {"role": "system", "content": "你是一个专业、友好的AI助手。"},
                {"role": "user", "content": item['instruction']}
            ]
            prompt = apply_chat_template(dialogue)

            # Generate the responses
            base_response = self.generate_responses(prompt, self.base_model, self.base_tokenizer)
            ft_response = self.generate_responses(prompt, self.ft_model, self.ft_tokenizer)

            # Compute the metrics
            base_metric = self.calculate_metrics(item['target'], base_response)
            ft_metric = self.calculate_metrics(item['target'], ft_response)

            # Store by task type; a type not listed above gets its own bucket
            # instead of raising KeyError after the expensive generation.
            bucket = results.setdefault(item['type'], {'base': [], 'ft': []})
            bucket['base'].append(base_metric)
            bucket['ft'].append(ft_metric)

        return results

In [None]:
def calculate_average_metrics(metrics_list):
    """Average each metric over a list of per-sample metric dicts.

    The metric names are taken from the first dict in the list.

    Returns:
        A dict mapping each metric name to its mean, or None when
        ``metrics_list`` is empty.
    """
    if not metrics_list:
        return None

    averages = {}
    for name in metrics_list[0].keys():
        per_sample = [entry[name] for entry in metrics_list]
        averages[name] = np.mean(per_sample)
    return averages

In [4]:
def main():
    """Run the evaluation end to end and print the per-task results.

    For every task type this prints the averaged metrics of the base model,
    the averaged metrics of the fine-tuned model, and the relative change of
    each metric in percent. Tasks without any scored sample only get their
    header line.
    """
    # Initialise the evaluator (loads both models)
    evaluator = ModelEvaluator()

    # Load the evaluation datasets
    print("\n加载评估数据集...")
    evaluation_data = evaluator.load_evaluation_datasets(test_size=50)  # 50 samples per dataset

    # Evaluate the models
    print("\n开始评估模型...")
    results = evaluator.evaluate_models(evaluation_data)

    # Print the results
    task_types = ['dialogue', 'qa', 'reading_comprehension']

    for task in task_types:
        print(f"\n=== {task} 任务评估结果 ===")

        base_avg = calculate_average_metrics(results[task]['base'])
        ft_avg = calculate_average_metrics(results[task]['ft'])

        if base_avg and ft_avg:
            print("\n基础模型评估结果:")
            for metric, score in base_avg.items():
                print(f"{metric}: {score:.4f}")

            print("\n微调后模型评估结果:")
            for metric, score in ft_avg.items():
                print(f"{metric}: {score:.4f}")

            print("\n性能提升:")
            for metric in base_avg.keys():
                baseline = base_avg[metric]
                # A relative change is undefined for a zero baseline; dividing
                # anyway would print inf/nan and emit a numpy RuntimeWarning.
                if baseline == 0:
                    print(f"{metric}: N/A")
                    continue
                improvement = ((ft_avg[metric] - baseline) / baseline) * 100
                print(f"{metric}: {improvement:+.2f}%")

In [5]:
# Run the evaluation only when this code executes as the main program
# (which includes running the cell in a notebook kernel).
if __name__ == "__main__":
    main()

正在加载模型...


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]



模型加载完成！

加载评估数据集...
加载LCCC数据集...


DatasetNotFoundError: Dataset 'thu-coai/lccc-base' doesn't exist on the Hub or cannot be accessed.