### 1. DPO

In [1]:
import sys
import os

# 添加项目根目录到Python路径
project_root = "/home/cuipeng/Gemma"
sys.path.append(project_root)

# 导入必要模块
from src.core.model.model_initializer import initialize_model_and_tokenizer
from src.core.utils.model_utils import generate_response, apply_chat_template

In [2]:
import torch
from datasets import load_dataset
from trl import DPOTrainer, DPOConfig # type: ignore
from transformers import TrainingArguments
from typing import Dict, List

In [3]:
def prepare_dpo_dataset(
    train_file: str = "../data_preparation/ceval-exam/train_data.json",
    val_file: str = "../data_preparation/ceval-exam/val_data.json"
) -> Dict:
    """
    准备DPO训练所需的数据集
    
    Args:
        train_file: 训练集文件路径
        val_file: 验证集文件路径
    Returns:
        包含处理后数据集的字典
    """
    # 直接读取JSON文件
    import json
    
    # 加载训练集
    with open(train_file, 'r', encoding='utf-8') as f:
        train_data = json.load(f)
    
    # 加载验证集
    with open(val_file, 'r', encoding='utf-8') as f:
        val_data = json.load(f)
    
    def process_function(examples):
        return {
            "prompt": examples["prompt"],
            "chosen": examples["extracted"], 
            "rejected": examples["model_response"],  # 模型的实际回答作为rejected response
        }
    
    # print(type(train_dataset), train_dataset) # <class 'datasets.arrow_dataset.Dataset'> Dataset({features: ['data'], num_rows: 1107})

    # 处理数据集
    # 转换为Dataset对象
    from datasets import Dataset
    train_dataset = Dataset.from_list(train_data["data"]).map(process_function)
    eval_dataset = Dataset.from_list(val_data["data"]).map(process_function)
    # train_dataset = train_dataset.map(process_function)
    # eval_dataset = eval_dataset.map(process_function)
    
    return {
        "train_dataset": train_dataset,
        "eval_dataset": eval_dataset
    }

In [4]:
def train_with_dpo(
    model,
    tokenizer,
    train_file: str = "../data_preparation/ceval-exam/train_data.json",
    val_file: str = "../data_preparation/ceval-exam/val_data.json",
    output_dir: str = "../../../../../../../../../../root/autodl-tmp/models/dpo_finetuned",
    batch_size: int = 2,
    gradient_accumulation_steps: int = 4,
    num_train_epochs: int = 5,
    learning_rate: float = 5e-5,
) -> None:
    """
    使用DPO方法微调模型
    """
    import torch
    
    # 确保模型完全加载到GPU
    device = torch.device("cuda")
    model = model.to(device)
    
    # 准备数据集
    datasets = prepare_dpo_dataset(train_file, val_file)
    
    # 设置DPO训练参数
    training_args = DPOConfig(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        remove_unused_columns=False, # remove_unused_columns：是否删除未使用的列 - 对于DPO训练很重要，因为它需要同时访问"chosen"和"rejected"两列数据
        logging_steps=10,

        # 添加保存检查点
        save_strategy="steps",
        save_steps=69,
        # 修改评估策略
        evaluation_strategy="steps",  # 改为按步数评估
        eval_steps=10,  # 每69步评估一次

        # 混合精度训练设置
        fp16=False,
        bf16=False,
        tf32=True,
        # 模型参数
        max_prompt_length=512,
        max_length=1024, # 限制了整个序列（prompt+response）的最大长度
        # 训练优化
        gradient_checkpointing=True, # 梯度检查点
        optim="adamw_torch", # 优化器
        max_grad_norm=1.0, # 最大梯度范数
        # 显存优化
        deepspeed=None, # 深度学习加速
        local_rank=-1, # 本地排名

        # 重要：添加评估输出
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        
        # 确保评估正确进行
        do_eval=True,
        include_inputs_for_metrics=True,
    )
    
    print("初始化DPO Trainer...")
    # 在代码开头添加导入
    from transformers import TrainerCallback
    class EvalLoggingCallback(TrainerCallback):
        def on_evaluate(self, args, state, control, metrics, **kwargs):
            print("\n评估指标:")
            for key, value in metrics.items():
                print(f"{key}: {value}")
    # 初始化DPO Trainer        
    dpo_trainer = DPOTrainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        train_dataset=datasets["train_dataset"],
        eval_dataset=datasets["eval_dataset"],
        callbacks=[EvalLoggingCallback()]
    )
    # dpo_trainer = DPOTrainer(
    #     model=model,
    #     args=training_args,
    #     tokenizer=tokenizer,
    #     train_dataset=datasets["train_dataset"],
    #     eval_dataset=datasets["eval_dataset"],
    # )
    
    # 开始训练
    print("开始DPO训练...")
    dpo_trainer.train()
    
    # 保存最终模型
    dpo_trainer.save_model(output_dir)
    print(f"训练完成! 模型已保存到 {output_dir}")

In [5]:
# 使用示例
if __name__ == "__main__":
    from transformers import AutoModelForCausalLM, AutoTokenizer
    
    # 加载模型和分词器
    model_path = "google/gemma-2-9b"
    cache_dir = "/root/autodl-tmp/gemma"
    lora_path = "../../../../../../../../../../../../../../root/autodl-tmp/models/stage1/checkpoints/gemma-base-zh/checkpoint-43500"
    model, tokenizer = initialize_model_and_tokenizer(
        model_path=model_path,
        cache_dir=cache_dir,
        lora_path=lora_path,
        use_quantization=False, # 开启量化会导致：RuntimeError: value cannot be converted to type at::Half without overflow"
        device_map = "cuda:0"
    )
    
    # 开始DPO训练
    train_with_dpo(model, tokenizer)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]



Map:   0%|          | 0/1107 [00:00<?, ? examples/s]

Map:   0%|          | 0/238 [00:00<?, ? examples/s]

Using `include_inputs_for_metrics` is deprecated and will be removed in version 5 of 🤗 Transformers. Please use `include_for_metrics` list argument instead.


初始化DPO Trainer...


  dpo_trainer = DPOTrainer(


Extracting prompt from train dataset:   0%|          | 0/1107 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/1107 [00:00<?, ? examples/s]

Extracting prompt from eval dataset:   0%|          | 0/238 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/238 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1107 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/238 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


开始DPO训练...




Step,Training Loss,Validation Loss


The 'batch_size' argument of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'max_batch_size' argument instead.
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


KeyError: 'input_ids'

### 2. 评估 - 在val数据集上遵守指令的程度

In [None]:
import sys
import os

# 添加项目根目录到Python路径
project_root = "/home/cuipeng/Gemma"
sys.path.append(project_root)

# 导入必要模块
from src.core.model.model_initializer import initialize_model_and_tokenizer
from src.core.utils.model_utils import generate_response, apply_chat_template

In [None]:
def evaluate_model_improvement(
    base_model_path: str,
    dpo_model_path: str,
    val_file: str,
    cache_dir: str,
    batch_size: int = 4
) -> None:
    """
    评估DPO微调前后模型的表现
    
    Args:
        base_model_path: 原始模型路径
        dpo_model_path: DPO微调后的模型路径
        val_file: 验证集文件路径
        cache_dir: 缓存目录
    """
    import json
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from tqdm import tqdm
    
    # 加载验证集数据
    with open(val_file, 'r', encoding='utf-8') as f:
        val_data = json.load(f)["data"]
    
    results = []
    
    # 加载原始模型和微调后的模型
    print("加载原始模型...")
    base_model, tokenizer = initialize_model_and_tokenizer(
        model_path=base_model_path,
        cache_dir=cache_dir,
        use_quantization=False
    )
    
    print("加载DPO微调后的模型...")
    dpo_model, _ = initialize_model_and_tokenizer(
        model_path=base_model_path,
        cache_dir=cache_dir,
        lora_path=dpo_model_path,
        use_quantization=False
    )
    
    # 生成函数
    def generate_answer(model, prompt):
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.95,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # 在验证集上测试两个模型
    print("开始评估...")
    for idx, item in enumerate(tqdm(val_data[:batch_size])):  # 为了快速测试，只取batch_size个样本
        prompt = item["prompt"]
        golden_answer = item["answer"]
        
        # 生成两个模型的回答
        base_answer = generate_answer(base_model, prompt)
        dpo_answer = generate_answer(dpo_model, prompt)
        
        results.append({
            "prompt": prompt,
            "golden_answer": golden_answer,
            "base_model_answer": base_answer,
            "dpo_model_answer": dpo_answer
        })
        
        print(f"\n样本 {idx + 1}:")
        print(f"Prompt: {prompt}")
        print(f"标准答案: {golden_answer}")
        print(f"原始模型答案: {base_answer}")
        print(f"DPO模型答案: {dpo_answer}")
        print("-" * 50)
    
    # 保存结果
    output_file = "model_comparison_results.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    
    print(f"评估结果已保存到 {output_file}")

In [None]:
# 使用示例：
if __name__ == "__main__":
    base_model_path = "google/gemma-2-9b"
    cache_dir = "/root/autodl-tmp/gemma"
    dpo_model_path = "../../../../../../../../../../../../root/autodl-tmp/models/dpo_finetuned/checkpoint-138"
    val_file = "ceval-exam/val_data.json"
    
    evaluate_model_improvement(
        base_model_path=base_model_path,
        dpo_model_path=dpo_model_path,
        val_file=val_file,
        cache_dir=cache_dir,
        batch_size=4  # 可以调整测试样本数量
    )