In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import json
import os
from tqdm import tqdm # For progress bar

import config
from optimized_prompt_template import OPTIMIZED_PROMPT_SYSTEM_MESSAGE, OPTIMIZED_PROMPT_CORE_INSTRUCTIONS, FEW_SHOT_EXAMPLES_TEXT

# --- 修改 generate_prediction_for_test 为 generate_prediction_for_batch ---
# 接受一个文本列表，返回一个预测结果列表
def generate_prediction_for_batch(model, tokenizer, comment_texts):
    # 构造用户消息列表
    user_message_contents = []
    for text in comment_texts:
        user_message_contents.append(f"{OPTIMIZED_PROMPT_CORE_INSTRUCTIONS}\n\n{FEW_SHOT_EXAMPLES_TEXT}\n\n[待处理文本]\n{text}")

    # 应用聊天模板，对整个批次进行分词
    # padding=True 会将批次中的所有序列填充到最长序列的长度
    # truncation=True 确保不会超过模型的最大长度
    prompt_for_models = [
        tokenizer.apply_chat_template(
            [
                {"role": "system", "content": OPTIMIZED_PROMPT_SYSTEM_MESSAGE},
                {"role": "user", "content": content_text}
            ],
            tokenize=False,
            add_generation_prompt=True
        ) for content_text in user_message_contents
    ]

    # 对整个批次进行分词
    # padding=True 是批量推理的关键，它会将所有序列填充到批次中最长的序列长度
    # truncation=True 是为了避免过长的输入导致OOM，需要结合 config.MAX_SEQ_LENGTH
    inputs = tokenizer(
        prompt_for_models,
        return_tensors="pt",
        padding=True, # <-- 关键改变：启用填充
        truncation=True, # <-- 建议添加：启用截断
        max_length=getattr(config, 'MAX_SEQ_LENGTH', 2048) # <-- 建议在 config 中定义，否则使用默认值
    ).to(model.device)

    # 记录原始输入的长度，用于后续解码时截断模型生成的响应
    # 注意：对于批量，所有输入的长度都因为 padding=True 而变得一样
    input_ids_len = inputs.input_ids.shape[1]

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=512,
            pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
        )

    # 处理批次生成的输出
    all_assistant_responses = []
    # outputs 的形状是 [batch_size, generated_sequence_length]
    for i in range(len(comment_texts)):
        # 解码时从原始输入长度之后开始，以获取助手的回复部分
        generated_ids_for_sample = outputs[i][input_ids_len:]
        assistant_response = tokenizer.decode(generated_ids_for_sample, skip_special_tokens=True).strip()

        # 后处理
        if not assistant_response:
            assistant_response = "NULL | NULL | non-hate | non-hate [END]"
        elif not assistant_response.strip().endswith("[END]"):
            assistant_response = assistant_response.strip() + " [END]" # Basic cleanup

        all_assistant_responses.append(assistant_response)

    return all_assistant_responses


if __name__ == "__main__":
    adapter_path = os.path.join(config.OUTPUT_DIR, "final_lora_adapter")
    if not os.path.exists(adapter_path):
        raise FileNotFoundError(f"Fine-tuned adapter not found at {adapter_path}. Please run train.py first.")

    tokenizer = AutoTokenizer.from_pretrained(adapter_path, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        # tokenizer.pad_token_id = tokenizer.eos_token_id # 确保 pad_token_id 在此设置后正确

    compute_dtype_inf = getattr(torch, config.BNB_4BIT_COMPUTE_DTYPE)
    bnb_config_inf = None
    if config.USE_4BIT_QUANTIZATION: # Optional: Quantize for inference too
        bnb_config_inf = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type=config.BNB_4BIT_QUANT_TYPE,
            bnb_4bit_compute_dtype=compute_dtype_inf,
        )

    base_model = AutoModelForCausalLM.from_pretrained(
        config.BASE_MODEL_ID,
        quantization_config=bnb_config_inf,
        torch_dtype=compute_dtype_inf if bnb_config_inf else torch.bfloat16,
        device_map={"": 0}, # 强制加载到 GPU 0，确保显存够用
        trust_remote_code=True,
    )

    model = PeftModel.from_pretrained(base_model, adapter_path)
    # model = model.merge_and_unload() # 可选：如果显存足够且不再训练，可以取消注释以提高微小速度
    model.eval() # Set to evaluation mode
    print(f"Loaded fine-tuned model from {adapter_path}")

    # --- Load Test Data (e.g., test1.json) ---
    if not os.path.exists(config.TEST_INPUT_FILE):
        print(f"Test input file {config.TEST_INPUT_FILE} not found. Exiting.")
        exit()

    with open(config.TEST_INPUT_FILE, 'r', encoding='utf-8') as f:
        test_data = json.load(f) # Expecting list of {"id": ..., "content": ...}

    # --- 设置批次大小 ---
    # 这是关键参数，根据你的 GPU 显存和模型大小进行调整
    # 比如 24GB GPU + 7B 4bit 模型，可以尝试 4-8，甚至更大如果显存允许
    BATCH_SIZE = 32 # 示例值，请务必根据你的硬件情况调整！

    all_predictions_output = []
    print(f"Generating predictions for {len(test_data)} items from {config.TEST_INPUT_FILE} with batch_size={BATCH_SIZE}...")

    # --- 批量生成预测 ---
    # 使用 tqdm 包装批次循环，显示整体进度
    for i in tqdm(range(0, len(test_data), BATCH_SIZE), desc="Generating Predictions"):
        # 获取当前批次的样本
        batch_items = test_data[i:i + BATCH_SIZE]
        batch_comment_texts = [item["content"] for item in batch_items]
        batch_ids = [item.get("id", "N/A") for item in batch_items]

        # 调用批量生成函数
        batch_prediction_strings = generate_prediction_for_batch(model, tokenizer, batch_comment_texts)

        # 收集每个样本的预测结果
        for j, pred_str in enumerate(batch_prediction_strings):
            all_predictions_output.append({
                "id": batch_ids[j],
                "predicted_output": pred_str,
                "original_content": batch_comment_texts[j]
            })

    # --- 保存预测结果 ---
    os.makedirs(os.path.dirname(config.TEST_PREDICTIONS_FILE), exist_ok=True)

    # 保存为 JSONL 文件 (每行一个 JSON 对象)
    with open(config.TEST_PREDICTIONS_FILE, 'w', encoding='utf-8') as f:
        for pred_obj in all_predictions_output:
            f.write(json.dumps(pred_obj, ensure_ascii=False) + '\n')

    print(f"\nAll predictions for {config.TEST_INPUT_FILE} saved to {config.TEST_PREDICTIONS_FILE}")
    print(f"Each line in the output file is a JSON object containing 'id', 'original_content', and 'predicted_output'.")

  warn(


Loaded fine-tuned model from ./results_quad_extraction/final_lora_adapter
Generating predictions for 2000 items from test1.json with batch_size=32...


Generating Predictions: 100%|██████████| 63/63 [36:06<00:00, 34.39s/it]


All predictions for test1.json saved to ./results_quad_extraction/test1_predictions.txt
Each line in the output file is a JSON object containing 'id', 'original_content', and 'predicted_output'.





In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import json
import os

import config
from optimized_prompt_template import OPTIMIZED_PROMPT_CORE_INSTRUCTIONS

def generate_prediction_for_batch(model, tokenizer, comment_texts):
    # 构造用户提示，去掉 few-shot，仅保留核心 instructions
    prompts = [
        tokenizer.apply_chat_template(
            [
                {"role": "system", "content": OPTIMIZED_PROMPT_CORE_INSTRUCTIONS},
                {"role": "user", "content": f'\n\n输入: "{text}"\n输出:'}
            ],
            tokenize=False,
            add_generation_prompt=True
        ) for text in comment_texts
    ]

    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=getattr(config, 'MAX_SEQ_LENGTH', 2048)
    ).to(model.device)

    input_ids_len = inputs.input_ids.shape[1]

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=256,
            pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            do_sample=False,
            temperature=0.7,
        )

    results = []
    for i in range(len(comment_texts)):
        generated_ids = outputs[i][input_ids_len:]
        decoded = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

        # 只保留四元组部分，去掉任何模型思考过程
        if "输出:" in decoded:
            prediction = decoded.split("输出:")[-1].strip()
        else:
            prediction = decoded.strip()

        # 补充 [END]，防止未生成
        if not prediction.endswith("[END]"):
            prediction += " [END]"

        results.append(prediction)

    return results

if __name__ == "__main__":
    adapter_path = os.path.join(config.OUTPUT_DIR, "final_lora_adapter")
    if not os.path.exists(adapter_path):
        raise FileNotFoundError(f"Fine-tuned adapter not found at {adapter_path}. Please run train.py first.")

    tokenizer = AutoTokenizer.from_pretrained(adapter_path, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    compute_dtype_inf = getattr(torch, config.BNB_4BIT_COMPUTE_DTYPE)
    bnb_config_inf = None
    if config.USE_4BIT_QUANTIZATION:
        bnb_config_inf = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type=config.BNB_4BIT_QUANT_TYPE,
            bnb_4bit_compute_dtype=compute_dtype_inf,
        )

    base_model = AutoModelForCausalLM.from_pretrained(
        config.BASE_MODEL_ID,
        quantization_config=bnb_config_inf,
        torch_dtype=compute_dtype_inf if bnb_config_inf else torch.bfloat16,
        device_map={"": 0},
        trust_remote_code=True,
    )

    model = PeftModel.from_pretrained(base_model, adapter_path)
    model.eval()
    print(f"✅ 模型已加载：{adapter_path}")

    # 加载测试数据，仅取前5条
    if not os.path.exists(config.TEST_INPUT_FILE):
        raise FileNotFoundError(f"Test input file {config.TEST_INPUT_FILE} not found.")

    with open(config.TEST_INPUT_FILE, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)
    test_data = raw_data[:5]

    comment_texts = [item["content"] for item in test_data]
    predictions = generate_prediction_for_batch(model, tokenizer, comment_texts)

    print("\n=== 模型输出结果（仅四元组） ===")
    for result in predictions:
        print(result)


  warn(
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend


RuntimeError: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend