In [1]:
import warnings

import torch

# Drop cached CUDA allocator blocks that an earlier run in this kernel may have left behind.
torch.cuda.empty_cache()

# GPU index this experiment was run on. The index is only honoured when the host
# actually exposes that many devices; otherwise fall back so that later
# `.to(device)` calls do not fail with an invalid device ordinal.
_preferred_gpu = 5
if torch.cuda.is_available() and torch.cuda.device_count() > _preferred_gpu:
    device = f'cuda:{_preferred_gpu}'
elif torch.cuda.is_available():
    warnings.warn(f'cuda:{_preferred_gpu} is not available on this host; falling back to cuda:0')
    device = 'cuda:0'
else:
    warnings.warn('CUDA is not available on this host; falling back to cpu (expect very slow inference)')
    device = 'cpu'

# GAIRMath-Abel-13b

In [2]:
# Local checkpoint directory of the model to evaluate.
# Exactly one `model_path` line should be active; the commented-out lines are the
# other checkpoints under /data1/ckpts that can be swapped in.
# model_path = "/data1/ckpts/Aquila2-34B/"
# model_path = "/data1/ckpts/Baichuan-13B-Base/"
# model_path = "/data1/ckpts/Baichuan-7B/"
# model_path = "/data1/ckpts/Baichuan2-7B-Base/"
# model_path = "/data1/ckpts/chatglm-6b/"
model_path = "/data1/ckpts/GAIRMath-Abel-13b/"
# model_path = "/data1/ckpts/GAIRMath-Abel-7b/"
# model_path = "/data1/ckpts/phi-1_5/"
# model_path = "/data1/ckpts/Qwen-7B/"

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import torch
from tqdm import tqdm
import os

# Load the tokenizer and the model from the local checkpoint directory.
# NOTE(review): trust_remote_code=True runs Python code shipped inside the
# checkpoint directory; only point model_path at checkpoints you trust.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)
# Load directly in float16, the dtype the model is actually run in. The previous
# form loaded the weights as bfloat16 and then called .half(), which rounds every
# weight to bf16 precision first and only then converts it to fp16.
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True).to(device)
# Inference only: switch off dropout and other train-time behaviour.
model = model.eval()

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [4]:
from torch.nn import CrossEntropyLoss
import numpy as np

def calculate_ppl(dataset, model, tokenizer, device):
    """Compute the perplexity of every question/answer pair and their mean.

    Each example is scored on the text formed by the question, a blank line, and
    the answer. The perplexity of one example is exp(mean next-token negative
    log-likelihood) over all tokens after the first one.

    Args:
        dataset: Mapping-like object whose 'question' and 'answer' entries are
            equally long sequences of strings.
        model: Callable taking a tensor of token ids and returning an object
            with a `.logits` attribute indexed as (batch, position, vocab).
        tokenizer: Callable taking (text, return_tensors='pt') and returning an
            object with an `.input_ids` tensor.
        device: Device the token ids are moved to before the forward pass.

    Returns:
        dict with
          "perplexities": one float per example, in dataset order. Examples that
              tokenize to fewer than two tokens have no next-token target and
              are recorded as NaN so the list stays aligned with the dataset.
          "mean_perplexity": mean over the non-NaN entries, or NaN when there
              are none (including an empty dataset).
    """
    ppls = []
    # Read each column once; column access may materialise the whole column.
    questions, answers = dataset['question'], dataset['answer']

    for question, answer in tqdm(zip(questions, answers), total=len(questions)):
        combined_text = question + '\n\n' + answer
        input_ids = tokenizer(combined_text, return_tensors='pt').input_ids.to(device)

        if input_ids.size(1) < 2:
            # Nothing to predict: keep a placeholder instead of producing 0/0.
            ppls.append(float('nan'))
            continue

        # No `labels=` here: the loss is computed explicitly below, so asking the
        # model for its own loss would only be wasted work.
        with torch.no_grad():
            logits = model(input_ids).logits

        # Position i predicts token i+1. Upcast so the log-softmax and the
        # averaging are done in float32 even when the model runs in half precision.
        shift_logits = logits[..., :-1, :].float()
        shift_labels = input_ids[..., 1:]

        mean_nll = torch.nn.functional.cross_entropy(
            shift_logits.reshape(-1, shift_logits.size(-1)),
            shift_labels.reshape(-1),
        )
        ppls.append(torch.exp(mean_nll).item())

    scored = [p for p in ppls if not np.isnan(p)]
    mean_ppl = np.mean(scored) if scored else float('nan')
    return {"perplexities": ppls, "mean_perplexity": mean_ppl}

# Load the local GSM8K JSONL files. Both are requested with split='train', the
# test file included, because each file is loaded on its own here.
dataset_train, dataset_test = [
    load_dataset('json', data_files=jsonl_path, split='train')
    for jsonl_path in ('/data/rjxu/gsm8k_train.jsonl', '/data/rjxu/gsm8k_test.jsonl')
]

# Score the train file and report its mean perplexity.
average_train_ppl = calculate_ppl(dataset_train, model, tokenizer, device)
print(f'Average Perplexity of the dataset_train: {average_train_ppl["mean_perplexity"]}')

# Score the test file and report its mean perplexity.
average_test_ppl = calculate_ppl(dataset_test, model, tokenizer, device)
print(f'Average Perplexity of the dataset_test: {average_test_ppl["mean_perplexity"]}')

# Gap between the two means (test minus train).
print(f'Difference(test - train): {average_test_ppl["mean_perplexity"]-average_train_ppl["mean_perplexity"]}')

100%|██████████| 7473/7473 [14:00<00:00,  8.89it/s]


Average Perplexity of the dataset_train: 4.093652076340257


100%|██████████| 1319/1319 [02:31<00:00,  8.72it/s]

Average Perplexity of the dataset_test: 12.659963563321845
Difference(test - train): 8.566311486981588



