In [1]:
import json
import os
import re
import sys
from collections import defaultdict
from types import SimpleNamespace

import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# The local `model` package lives one directory up; make it importable
# before the project imports below.
sys.path.append('..')

from model.model_lora import apply_lora, load_lora
from model.model_minimind import MiniMindConfig, MiniMindForCausalLM

In [2]:
# Directories holding the evaluation CSVs and the prompt/instruction files.
# The defaults are the original machine-specific locations; set the
# MINIMIND_DATA_LOC / MINIMIND_INSTR_LOC environment variables to point the
# notebook elsewhere without editing it.
DATA_LOC = os.environ.get('MINIMIND_DATA_LOC', '/root/xlcoder/MiniMind2-Small/dataset')
INSTR_LOC = os.environ.get('MINIMIND_INSTR_LOC', '/root/xlcoder/MiniMind2-Small/hands-on')

In [3]:
# Test split used by every evaluation below; rows are later read through
# their `title`, `content` and `category` attributes.
test_data = pd.read_csv(filepath_or_buffer=os.path.join(DATA_LOC, 'bbc_test_std.csv'))

In [4]:
# Plain class names; this order is also the row/column order of the printed
# confusion matrix.
LABELS = "business entertainment politics sport tech".split()
# The same names as a set, for exact-match membership checks.
LABEL_SET = {*LABELS}


In [5]:
def parse_label(text: str, labels=None):
    """Extract a class label from free-form generated text.

    The text is lower-cased and stripped. If it is exactly one of the
    candidate labels, that label is returned. Otherwise the candidate that
    appears *earliest* in the text as a whole word is returned, so an answer
    such as "tech, not sport" resolves to "tech" regardless of the order in
    which the candidates are listed.

    Args:
        text: Raw text to inspect.
        labels: Optional iterable of lower-case candidate labels. Defaults to
            the module-level ``LABELS``.

    Returns:
        The matched label, or ``None`` when no candidate is found.
    """
    candidates = LABELS if labels is None else list(labels)
    if not candidates:
        return None

    text = text.lower().strip()

    # Exact hit
    if text in candidates:
        return text

    # Search inside the text. A single alternation makes `re.search` return
    # the leftmost whole-word mention; `re.escape` keeps labels containing
    # regex metacharacters literal.
    pattern = r"\b(" + "|".join(re.escape(label) for label in candidates) + r")\b"
    match = re.search(pattern, text)

    # None signals a parse failure to the caller.
    return match.group(1) if match else None


In [6]:
class ClassificationEvalDataset:
    """Wraps an evaluation DataFrame and renders each row as a text prompt.

    Args:
        df: Evaluation rows. ``format_prompt`` reads ``title`` and
            ``content`` from each row.
        use_instruction: Whether the task instruction is included at all.
        use_title: Whether a non-empty title is rendered in a "Title:" section.
        instruction_position: ``"head"`` puts the instruction before the
            title/text, ``"middle"`` puts it between the text and the final
            "Label:" line. Any other value leaves the instruction out.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        use_instruction=True,
        use_title=True,
        instruction_position="head",
    ):
        self.samples = df

        # The instruction is fixed here; an earlier variant read it from
        # 'few_shot1_pre.txt' under INSTR_LOC instead.
        self.instruction = 'Classify the following passage into one of the categories: business, entertainment, politics, sport, tech.'
        self.use_instruction = use_instruction
        self.use_title = use_title
        self.instruction_position = instruction_position

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.samples)

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a string, mapping None/NaN/NA to ``""``."""
        if value is None:
            return ""
        if isinstance(value, str):
            return value
        # pandas represents empty CSV cells as NaN, which is truthy and not a
        # string; treat it as "no text" instead of letting str.join fail.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return str(value)

    def format_prompt(self, ex):
        """Build the prompt for one example.

        Args:
            ex: Object exposing ``title`` and ``content`` attributes (for
                example a DataFrame row).

        Returns:
            Newline-joined prompt whose last line is "Label:".
        """
        title = self._as_text(ex.title)
        content = self._as_text(ex.content)

        parts = []

        if self.use_instruction and self.instruction_position == "head":
            parts += [self.instruction, ""]

        # Missing or empty titles drop the whole "Title:" section.
        if self.use_title and title:
            parts += ["Title:", title, ""]

        parts += ["Text:", content, ""]

        if self.use_instruction and self.instruction_position == "middle":
            parts += [self.instruction, ""]

        parts.append("Label:")

        return "\n".join(parts)


In [7]:
@torch.no_grad()
def evaluate_classification(
    model,
    tokenizer,
    eval_dataset,
    device,
    max_new_tokens=8,
):
    """Evaluate by greedy generation followed by label parsing.

    For every row of ``eval_dataset.samples`` the prompt is generated with
    ``eval_dataset.format_prompt``, the model continues it greedily for up to
    ``max_new_tokens`` tokens, and ``parse_label`` extracts a label from the
    newly generated text.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``.
        tokenizer: Tokenizer used for encoding and decoding.
        eval_dataset: Object with a ``samples`` DataFrame (rows carry a
            ``category`` attribute) and a ``format_prompt`` method.
        device: Device the encoded inputs are moved to.
        max_new_tokens: Generation budget per example.

    Returns:
        dict with ``accuracy`` (computed over parsed predictions only),
        ``total`` (number of parsed predictions), ``failed`` (number of
        outputs from which no label could be parsed) and ``confusion``
        (nested ``gold -> pred -> count`` mapping).
    """
    model.eval()

    correct = 0
    total = 0

    confusion = defaultdict(lambda: defaultdict(int))
    failed = 0

    # The gold column may hold either the plain label or its class token
    # (e.g. "<CLS_P>"); predictions are always plain labels, so map tokens
    # back to label names before comparing. Plain labels pass through as-is.
    token_to_label = {tok: label for label, tok in CLS_TOKENS.items()}

    # Some tokenizers define no BOS token; avoid `None + str`.
    bos = tokenizer.bos_token or ""

    for _, ex in eval_dataset.samples.iterrows():
        prompt = eval_dataset.format_prompt(ex)
        gold = token_to_label.get(ex.category, ex.category)

        inputs = tokenizer(
            bos + prompt,
            return_tensors="pt",
            truncation=True
        ).to(device)

        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

        # Decode only the continuation, not the echoed prompt.
        prompt_len = inputs["input_ids"].size(1)
        gen_text = tokenizer.decode(
            outputs[0][prompt_len:],
            skip_special_tokens=True
        )

        pred = parse_label(gen_text)

        # Unparseable outputs are counted separately and excluded from the
        # accuracy denominator.
        if pred is None:
            failed += 1
            continue

        confusion[gold][pred] += 1
        correct += int(pred == gold)
        total += 1

    accuracy = correct / total if total > 0 else 0.0

    return {
        "accuracy": accuracy,
        "total": total,
        "failed": failed,
        "confusion": confusion,
    }


In [8]:
# Class-marker token for each plain label name.
CLS_TOKENS = dict(
    business="<CLS_B>",
    entertainment="<CLS_E>",
    politics="<CLS_P>",
    sport="<CLS_S>",
    tech="<CLS_T>",
)

def build_cls_id_map(tokenizer):
    """Return a ``label -> token id`` dict for the entries of ``CLS_TOKENS``.

    Each class token is encoded without special tokens and must come back as
    exactly one id; otherwise an ``AssertionError`` is raised.
    """
    def single_id(tok):
        ids = tokenizer(tok, add_special_tokens=False).input_ids
        assert len(ids) == 1, f"{tok} is not single-token!"
        return ids[0]

    return {label: single_id(tok) for label, tok in CLS_TOKENS.items()}
    
@torch.no_grad()
def evaluate_logits_classification(
    model,
    tokenizer,
    eval_dataset,
    device,
):
    """Evaluate by comparing class-token logits at the last prompt position.

    For every row of ``eval_dataset.samples`` the prompt is run through the
    model once; the predicted label is the one whose class token (see
    ``CLS_TOKENS``) has the highest logit at the last attended position.

    Args:
        model: Model whose call returns an object with ``.logits``.
        tokenizer: Tokenizer in which every class token is a single id.
        eval_dataset: Object with a ``samples`` DataFrame (rows carry a
            ``category`` attribute) and a ``format_prompt`` method.
        device: Device the encoded inputs are moved to.

    Returns:
        dict with ``accuracy`` (0.0 for an empty dataset), ``total``,
        ``failed`` and ``confusion`` (nested ``gold -> pred -> count``
        mapping). Note that ``failed`` here is the number of misclassified
        examples (``total - correct``), not a count of parse failures.
    """
    model.eval()
    cls_id_map = build_cls_id_map(tokenizer)

    # Reverse lookup built once instead of scanning CLS_TOKENS per example.
    token_to_label = {tok: label for label, tok in CLS_TOKENS.items()}

    # Some tokenizers define no BOS token; avoid `None + str`.
    bos = tokenizer.bos_token or ""

    correct = 0
    total = 0
    confusion = defaultdict(lambda: defaultdict(int))

    for _, ex in eval_dataset.samples.iterrows():
        prompt = eval_dataset.format_prompt(ex)
        gold = ex.category          # class token such as "<CLS_P>", or a plain label

        if gold in token_to_label:
            gold_label = token_to_label[gold]
        elif gold in CLS_TOKENS:
            gold_label = gold
        else:
            # Unknown gold values are kept under '' and always count as wrong.
            gold_label = ''

        inputs = tokenizer(
            bos + prompt,
            return_tensors="pt",
            truncation=True
        ).to(device)

        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"]
        )

        logits = outputs.logits  # indexed below as [batch, position, vocab id]
        # Index of the last attended token of the single sequence in the batch.
        last_pos = inputs["attention_mask"].sum(dim=1).item() - 1

        scores = {
            label: logits[0, last_pos, cls_id].item()
            for label, cls_id in cls_id_map.items()
        }
        pred = max(scores, key=scores.get)

        confusion[gold_label][pred] += 1
        correct += int(pred == gold_label)
        total += 1

    # Guard against an empty dataset, matching evaluate_classification.
    acc = correct / total if total > 0 else 0.0
    return {
        "accuracy": acc,
        "total": total,
        "failed": total - correct,
        "confusion": confusion,
    }



In [9]:
def print_confusion_matrix(confusion, labels=None):
    """Print a gold-by-prediction count table.

    Args:
        confusion: Nested mapping ``gold -> pred -> count``. Missing rows or
            cells are printed as 0; the mapping itself is never modified.
        labels: Optional iterable giving the row/column order. Defaults to
            the module-level ``LABELS``. Entries of ``confusion`` whose keys
            are not in ``labels`` are not shown.
    """
    labels = LABELS if labels is None else list(labels)
    width = 15  # fixed column width of the printed table

    print("\nConfusion Matrix:")
    print("gold\\pred".ljust(width) + "".join(name.ljust(width) for name in labels))

    for gold in labels:
        # `.get` avoids KeyError on plain dicts and avoids inserting empty
        # rows into a defaultdict.
        row_counts = confusion.get(gold, {})
        cells = "".join(str(row_counts.get(pred, 0)).ljust(width) for pred in labels)
        print(gold.ljust(width) + cells)


In [10]:
# Class tokens registered with the tokenizer as additional special tokens.
# Derived from CLS_TOKENS so the two definitions cannot drift apart; dict
# insertion order yields <CLS_B>, <CLS_E>, <CLS_P>, <CLS_S>, <CLS_T>.
SPECIAL_LABELS = list(CLS_TOKENS.values())

def init_model(args):
    """Load the tokenizer and model described by ``args``.

    The tokenizer is loaded from ``args.load_from`` and extended with
    ``SPECIAL_LABELS``. If ``args.load_from`` contains the substring
    ``'model'``, a ``MiniMindForCausalLM`` is built from ``args`` and its
    weights are read from
    ``<save_dir>/<weight>_<hidden_size>[_moe].pth``; optional LoRA weights
    come from ``<save_dir>/lora/<lora_weight>_<hidden_size>.pth``. Otherwise
    the model is loaded with ``AutoModelForCausalLM.from_pretrained``. In
    both cases the embedding matrix is resized to the extended vocabulary.

    Args:
        args: Namespace providing ``load_from``, ``device`` and, for the
            native branch, ``hidden_size``, ``num_hidden_layers``,
            ``use_moe``, ``inference_rope_scaling``, ``save_dir``, ``weight``
            and ``lora_weight`` (``None`` or the string ``'None'`` disables
            LoRA).

    Returns:
        ``(model, tokenizer)`` with the model in eval mode on ``args.device``.
    """
    tokenizer = AutoTokenizer.from_pretrained(args.load_from)
    tokenizer.add_special_tokens({
        "additional_special_tokens": SPECIAL_LABELS
    })
    if 'model' in args.load_from:
        model = MiniMindForCausalLM(MiniMindConfig(
            hidden_size=args.hidden_size,
            num_hidden_layers=args.num_hidden_layers,
            use_moe=bool(args.use_moe),
            inference_rope_scaling=args.inference_rope_scaling
        ))
        moe_suffix = '_moe' if args.use_moe else ''
        # os.path.join keeps relative and absolute `save_dir` values working.
        ckp = os.path.join(args.save_dir, f'{args.weight}_{args.hidden_size}{moe_suffix}.pth')
        # Resize before loading: strict loading requires matching shapes, so
        # the checkpoint is expected to already include the added tokens.
        model.resize_token_embeddings(len(tokenizer))
        # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
        model.load_state_dict(torch.load(ckp, map_location=args.device), strict=True)
        if args.lora_weight not in (None, 'None'):
            # NOTE(review): the LoRA rank is fixed at 1024 here - confirm it
            # matches the rank the adapter was trained with.
            apply_lora(model, rank=1024)
            load_lora(model, os.path.join(args.save_dir, 'lora', f'{args.lora_weight}_{args.hidden_size}.pth'))
    else:
        model = AutoModelForCausalLM.from_pretrained(args.load_from, trust_remote_code=True)
        model.resize_token_embeddings(len(tokenizer))
    print(f'MiniMind模型参数: {sum(p.numel() for p in model.parameters()) / 1e6:.2f} M(illion)')
    return model.eval().to(args.device), tokenizer

In [11]:
# Run configuration exposed as attributes (args.hidden_size, args.device, ...).
# init_model reads load_from, save_dir, weight, lora_weight, hidden_size,
# num_hidden_layers, use_moe, inference_rope_scaling and device; the
# remaining fields are not read by any cell shown in this notebook.
args = SimpleNamespace(
    load_from='../model',
    save_dir='../out',
    weight='en_text_cls_logits',
    lora_weight='None',
    hidden_size=512,
    num_hidden_layers=8,
    use_moe=0,
    inference_rope_scaling=False,
    max_new_tokens=8192,
    temperature=0.85,
    top_p=0.85,
    historys=0,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    times=1,
    count_only=False,
)

In [12]:
# Build the model/tokenizer pair described by `args`.
model, tokenizer = init_model(args)

# Prompt layout: instruction first, then the title and the text.
eval_ds = ClassificationEvalDataset(
    df=test_data,
    use_instruction=True,
    use_title=True,
    instruction_position="head",
)

# Score with class-token logits. The generation-based alternative is
# evaluate_classification(model, tokenizer, eval_ds, device=args.device).
res = evaluate_logits_classification(
    model=model,
    tokenizer=tokenizer,
    eval_dataset=eval_ds,
    device=args.device,
)

print(
    f"Accuracy: {res['accuracy']}",
    f"Total: {res['total']}",
    f"Failed: {res['failed']}",
    sep="\n",
)
print_confusion_matrix(res["confusion"])


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


MiniMind模型参数: 25.83 M(illion)
Accuracy: 0.39414414414414417
Total: 444
Failed: 269

Confusion Matrix:
gold\pred      business       entertainment  politics       sport          tech           
business       0              0              0              4              98             
entertainment  0              0              0              72             5              
politics       0              0              0              2              81             
sport          0              0              0              102            0              
tech           0              0              0              7              73             
