In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import json
from tqdm import tqdm
import numpy as np

In [2]:
import sys
import os

# Add the project root to the Python path so the `src` package imported below
# can be resolved. Must run before the `from src...` imports.
# NOTE(review): the path is machine-specific, and re-running this cell appends
# it to sys.path again.
project_root = "/home/cuipeng/Gemma"
sys.path.append(project_root)

# Import the project-local helpers.
# NOTE(review): generate_response and apply_chat_template are not used by any
# of the cells visible here — confirm whether they are still needed.
from src.core.model.model_initializer import initialize_model_and_tokenizer
from src.core.utils.model_utils import generate_response, apply_chat_template

In [3]:
def calculate_ppl(model, tokenizer, dataset_path, device="cuda"):
    """
    Compute the token-level perplexity (PPL) of a causal LM over a JSON dataset.

    Each record's ``"text"`` field is tokenized on its own (one sequence per
    forward pass, truncated to 2048 tokens) and scored with a next-token
    cross-entropy. The summed loss over all records is divided by the total
    number of predicted tokens before exponentiating, so every token carries
    the same weight regardless of which record it came from.

    Args:
        model: Causal LM. ``model(**inputs)`` must return an object exposing a
            ``.logits`` tensor shaped (batch, seq, vocab). It is switched to
            eval mode by this function.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt",
            truncation=True, max_length=2048)``; the result must provide
            ``.items()`` and contain an ``"input_ids"`` tensor.
        dataset_path: Path to a UTF-8 JSON file containing a list of objects,
            each with a ``"text"`` key.
        device: Device the tokenized tensors are moved to before the forward
            pass.

    Returns:
        ``np.exp(total_loss / total_tokens)`` as a NumPy float.

    Raises:
        ValueError: If no token could be scored (empty dataset, or every text
            tokenizes to fewer than two tokens).
    """
    # Load the evaluation records.
    with open(dataset_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    # Built once: 'sum' reduction lets us average over tokens (not over
    # samples) at the very end.
    loss_fct = torch.nn.CrossEntropyLoss(reduction='sum')

    total_loss = 0.0
    total_length = 0

    model.eval()
    with torch.no_grad():
        for item in tqdm(dataset, desc="计算PPL"):
            # Text field of the record; adjust the key to your dataset format.
            input_text = item["text"]

            inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=2048)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Next-token prediction needs at least two tokens; shorter inputs
            # contribute nothing, so skip the forward pass entirely.
            if inputs["input_ids"].size(-1) < 2:
                continue

            logits = model(**inputs).logits

            # Position t predicts token t+1. Cast to float32 first: summing a
            # cross-entropy over thousands of tokens in fp16/bf16 loses
            # precision and skews the reported perplexity.
            shift_logits = logits[..., :-1, :].float().contiguous()
            shift_labels = inputs["input_ids"][..., 1:].contiguous()

            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                            shift_labels.view(-1))

            # Accumulate summed loss and number of scored tokens.
            total_loss += loss.item()
            total_length += shift_labels.numel()

    if total_length == 0:
        raise ValueError(
            f"No tokens to score in {dataset_path!r}: the dataset is empty or "
            "every text is shorter than two tokens."
        )

    # Perplexity is the exponential of the mean per-token negative log-likelihood.
    avg_loss = total_loss / total_length
    ppl = np.exp(avg_loss)

    return ppl

In [4]:
def eval_model_ppl(
    base_model_path="google/gemma-2-9b",
    cache_dir="/root/autodl-tmp/gemma",
    lora_path="/root/autodl-tmp/models/stage1/checkpoints/gemma-base-zh/checkpoint-43500",
    eval_data_path="../data_processing/stage1/data_final/valid.json",
    device=None,
):
    """
    Compare the perplexity of the base model against its LoRA fine-tuned version.

    The base model is loaded and scored first; the adapter is then attached to
    that same model object and the evaluation is repeated on the same data.
    Both perplexities and the relative improvement are printed and returned.

    Args:
        base_model_path: Model identifier or path passed to
            ``initialize_model_and_tokenizer``.
        cache_dir: Cache directory passed to ``initialize_model_and_tokenizer``.
        lora_path: Checkpoint directory passed to ``PeftModel.from_pretrained``.
        eval_data_path: JSON evaluation file passed to ``calculate_ppl``. The
            default is relative, so it is resolved against the current working
            directory.
        device: Device for the tokenized inputs. ``None`` selects ``"cuda"``
            when available, otherwise ``"cpu"``.

    Returns:
        dict with keys ``"base_ppl"``, ``"trained_ppl"`` and
        ``"improvement_pct"`` (relative PPL reduction in percent; positive
        means the fine-tuned model is better).
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load the base model.
    print("加载基础模型...")
    base_model, tokenizer = initialize_model_and_tokenizer(
        model_path=base_model_path,
        cache_dir=cache_dir,
        use_quantization=True
    )
    base_model.eval()

    # Score the base model BEFORE attaching the adapter: the adapter is
    # attached to this same object below, so the order matters.
    print("计算基础模型PPL...")
    base_ppl = calculate_ppl(base_model, tokenizer, eval_data_path, device)
    print(f"基础模型PPL: {base_ppl:.4f}")

    # Attach the fine-tuned LoRA weights for inference only.
    print("加载微调后的模型...")
    model = PeftModel.from_pretrained(
        base_model,
        lora_path,
        is_trainable=False  # inference only, adapter weights stay frozen
    )
    model.eval()

    # Score the fine-tuned model on the same data.
    print("计算微调后模型PPL...")
    trained_ppl = calculate_ppl(model, tokenizer, eval_data_path, device)
    print(f"微调后模型PPL: {trained_ppl:.4f}")

    # Relative reduction in perplexity, in percent.
    improvement = ((base_ppl - trained_ppl) / base_ppl) * 100
    print(f"PPL改进比例: {improvement:.2f}%")

    return {
        "base_ppl": base_ppl,
        "trained_ppl": trained_ppl,
        "improvement_pct": improvement,
    }

    

In [5]:
# Entry point: run the base-vs-fine-tuned perplexity comparison. The guard is
# true both when this code runs as a script and inside a notebook kernel,
# where __name__ is "__main__".
if __name__ == "__main__":
    eval_model_ppl()

加载基础模型...


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

计算基础模型PPL...


计算PPL:   0%|          | 0/1000 [00:00<?, ?it/s]The 'batch_size' argument of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'max_batch_size' argument instead.
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.
计算PPL: 100%|██████████| 1000/1000 [02:17<00:00,  7.26it/s]


基础模型PPL: 16.5727
加载微调后的模型...
计算微调后模型PPL...


计算PPL: 100%|██████████| 1000/1000 [02:28<00:00,  6.71it/s]


微调后模型PPL: 3.8577
PPL改进比例: 76.72%
