In [None]:
# 将 llm_toy/src 加入 sys.path（稳健多候选）
import sys as _sys
from pathlib import Path as _Path
def _add_src_path():
    """Locate the project's ``src`` directory and make it importable.

    Candidate directories are probed relative to the current working
    directory (``./llm_toy/src``, ``./src``, the same two under the parent
    directory, and the same two under each of the first three ancestors).
    The first candidate that contains both ``model.py`` and ``utils.py`` is
    resolved to an absolute path and put on ``sys.path``.

    The path is appended only if it is not already present, so re-running
    this cell does not pile up duplicate ``sys.path`` entries.

    Returns:
        None. Prints the chosen path, or a warning when no candidate matches.
    """
    cwd = _Path.cwd()
    cands = [
        cwd/'llm_toy'/'src',
        cwd/'src',
        cwd.parent/'llm_toy'/'src',
        cwd.parent/'src',
    ]
    # Also probe the nearest three ancestor directories of the cwd.
    for base in list(cwd.parents)[:3]:
        cands += [base/'llm_toy'/'src', base/'src']
    for p in cands:
        # A directory qualifies only when both marker modules are present.
        if (p/'model.py').exists() and (p/'utils.py').exists():
            resolved = p.resolve()
            # Keep the cell idempotent: never add the same entry twice.
            if str(resolved) not in _sys.path:
                _sys.path.append(str(resolved))
            print('已添加src路径:', resolved)
            return
    print('警告：未找到 llm_toy/src，请手动添加路径或调整工作目录。')
_add_src_path()


# 08 评测指标基础：Perplexity、BLEU-1、ROUGE-1/2

本Notebook提供生成式任务常见的轻量评测示例：Perplexity、BLEU-1、ROUGE-1/2。

In [None]:
import math
from collections import Counter
import torch
import pandas as pd
from torch.utils.data import DataLoader
from model import SimpleGPTModel
from trainer import SimpleDataset

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare last expression so the notebook displays the selected device.
device


## Perplexity 计算

对一组文本，使用语言模型计算平均loss（按有效token加权），Perplexity = exp(mean_loss)。

In [None]:
def compute_perplexity(model, tokenizer, texts, max_length=128, batch_size=4):
    """Compute the token-weighted mean language-model loss and perplexity.

    Padding positions (``attention_mask == 0``) are excluded from the loss by
    setting their labels to ``-100``, and every batch loss is weighted by the
    number of target tokens that actually contributed to it, so the result is
    a per-token average over real (non-padding) targets.

    NOTE(review): this assumes ``model`` follows the Hugging Face causal-LM
    convention (labels are shifted by one position inside the model,
    ``-100`` labels are ignored, and ``outputs.loss`` is the mean over the
    remaining targets) -- confirm for the model wrapped by ``SimpleGPTModel``.

    Args:
        model: Language model called as
            ``model(input_ids=..., attention_mask=..., labels=...)`` and
            returning an object with a ``loss`` attribute.
        tokenizer: Tokenizer forwarded to ``SimpleDataset``.
        texts: Texts to evaluate, forwarded to ``SimpleDataset``.
        max_length: Maximum sequence length forwarded to ``SimpleDataset``.
        batch_size: Batch size used by the ``DataLoader``.

    Returns:
        Tuple ``(mean_loss, ppl)``. ``ppl`` is ``exp(mean_loss)``, or
        ``float('inf')`` when ``mean_loss`` is 100 or larger. When no target
        token is scored, ``mean_loss`` is ``0.0``.
    """
    ds = SimpleDataset(texts=texts, tokenizer=tokenizer, max_length=max_length)
    dl = DataLoader(ds, batch_size=batch_size)
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    with torch.no_grad():
        for batch in dl:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Mask padding so it does not contribute to the loss.
            labels = input_ids.masked_fill(attention_mask == 0, -100)
            # The first position of each sequence is never a prediction
            # target, so count valid labels after the one-step shift.
            tokens = int((labels[..., 1:] != -100).sum().item())
            if tokens == 0:
                # Nothing to score in this batch (its mean loss would be NaN).
                continue
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # Undo the per-batch mean so batches are weighted by token count.
            total_loss += outputs.loss.item() * tokens
            total_tokens += tokens
    mean_loss = total_loss / max(total_tokens, 1)
    # Guard against overflow in exp() for very large losses.
    ppl = math.exp(mean_loss) if mean_loss < 100 else float('inf')
    return mean_loss, ppl

# Instantiate the project's wrapper for the 'gpt2' model name, then pull out
# its tokenizer and move the underlying model to the selected device.
simple = SimpleGPTModel(model_name='gpt2')
tok = simple.tokenizer
lm = simple.model.to(device)
# A few short sample sentences to score.
sample_texts = [
    '自然语言处理让计算机理解文本。',
    'Transformer 使用 Self-Attention 建模序列。',
    'Fine-tuning 让预训练模型适配特定任务。',
]
# Bare last expression: the notebook displays the (mean_loss, ppl) tuple.
compute_perplexity(lm, tok, sample_texts, max_length=64, batch_size=2)


## BLEU-1（简化）

只考虑（按参考计数截断的）unigram 精确率，并乘以长度惩罚 brevity penalty（BP）：当候选长度 $c$ 小于参考长度 $r$ 时 $BP = e^{1 - r/c}$，否则 $BP = 1$。

In [None]:
def bleu1(hyp: str, ref: str):
    """Simplified BLEU-1: clipped unigram precision times a brevity penalty.

    Both strings are tokenized on whitespace.

    Args:
        hyp: Hypothesis (candidate) text.
        ref: Reference text.

    Returns:
        ``BP * precision`` as a float; ``0.0`` when the hypothesis has no
        tokens. ``BP`` is ``exp(1 - ref_len / hyp_len)`` when the hypothesis
        is shorter than the reference, and ``1.0`` otherwise.
    """
    hyp_tokens = hyp.split()
    ref_tokens = ref.split()
    if not hyp_tokens:
        return 0.0

    hyp_len, ref_len = len(hyp_tokens), len(ref_tokens)

    # Clipped matches: a word counts at most as often as the reference has it.
    ref_counts = Counter(ref_tokens)
    matched = 0
    for word, count in Counter(hyp_tokens).items():
        matched += min(count, ref_counts[word])
    precision = matched / hyp_len

    # Penalize hypotheses that are shorter than the reference.
    if hyp_len < ref_len:
        penalty = math.exp(1 - ref_len / hyp_len)
    else:
        penalty = 1.0
    return penalty * precision

# Demo: all 5 hypothesis tokens match (precision 1.0), but the hypothesis is
# one token shorter than the reference, so BP = exp(1 - 6/5) and the score is ~0.819.
bleu1('the cat is on mat', 'the cat is on the mat')


## ROUGE-1 / ROUGE-2（F1）

基于n-gram重叠计算Precision、Recall与F1。此处实现简单版本（空格分词）。

In [None]:
def ngrams(tokens, n):
    """Return every contiguous n-gram of ``tokens`` as a space-joined string.

    Args:
        tokens: Sequence of string tokens.
        n: Size of each n-gram.

    Returns:
        List of n-gram strings in order of appearance; empty when ``tokens``
        has fewer than ``n`` items.
    """
    grams = []
    for start in range(len(tokens) - n + 1):
        grams.append(' '.join(tokens[start:start + n]))
    return grams

def rouge_n(hyp: str, ref: str, n: int = 1):
    """Compute ROUGE-N precision, recall and F1 from n-gram overlap.

    Both strings are tokenized on whitespace.

    Args:
        hyp: Hypothesis (candidate) text.
        ref: Reference text.
        n: N-gram size (defaults to 1).

    Returns:
        Tuple ``(precision, recall, f1)``; all ``0.0`` when either text has
        fewer than ``n`` tokens.
    """
    hyp_tokens, ref_tokens = hyp.split(), ref.split()
    if min(len(hyp_tokens), len(ref_tokens)) < n:
        return 0.0, 0.0, 0.0

    hyp_counts = Counter(ngrams(hyp_tokens, n))
    ref_counts = Counter(ngrams(ref_tokens, n))

    # Counter intersection keeps the per-n-gram minimum, i.e. the clipped overlap.
    overlap = sum((hyp_counts & ref_counts).values())

    hyp_total = sum(hyp_counts.values()) or 1
    ref_total = sum(ref_counts.values()) or 1
    prec = overlap / hyp_total
    rec = overlap / ref_total

    if (prec + rec) > 0:
        f1 = 2*prec*rec/(prec+rec)
    else:
        f1 = 0.0
    return prec, rec, f1

# Demo (ROUGE-1): 5 of 5 hypothesis unigrams match and 5 of 6 reference
# unigrams are covered, so precision = 1.0, recall ~0.833, F1 ~0.909.
rouge_n('the cat is on mat', 'the cat is on the mat', 1)
