In [1]:
import subprocess
import os

# Source /etc/network_turbo in a bash subshell, list the resulting environment
# variables whose line contains "proxy", and copy them into this process.
result = subprocess.run(
    'bash -c "source /etc/network_turbo && env | grep proxy"',
    shell=True,
    capture_output=True,
    text=True,
)
output = result.stdout
# Each usable line looks like NAME=value; split on the first "=" only so that
# values containing "=" stay intact.
os.environ.update(
    line.split('=', 1) for line in output.splitlines() if '=' in line
)

In [2]:
import sys
import os

# Add the project root to the Python import path so that `src.*` resolves.
project_root = "/home/cuipeng/Gemma"
# Only append once: re-running this cell must not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

# 导入必要模块
from src.core.model.model_initializer import initialize_model_and_tokenizer
from src.core.utils.model_utils import generate_response, apply_chat_template

In [3]:
# config.py
from dotenv import load_dotenv # type: ignore

# Load variables from the .env file into the process environment.
load_dotenv()

# Read the ZetaTechs API settings; each is None when the variable is not set.
ZetaTechs_api_key = os.environ.get('ZETATECHS_API_KEY')
ZetaTechs_api_base = os.environ.get('ZETATECHS_API_BASE')

### 1. 构建base数据集 - 所有val数据集的集合

In [2]:
import os
import pandas as pd
import json
from pathlib import Path

In [3]:
def process_ceval_val_datasets(base_path="ceval-exam/val") -> None:
    """
    Merge every C-Eval validation CSV into a single JSON file.

    Each ``*.csv`` file directly under ``base_path`` is read with pandas. The
    file stem becomes the ``category`` of its rows, and the ``question``,
    ``A``, ``B``, ``C``, ``D`` and ``answer`` columns are copied into one
    record per row. All records are written as ``{"data": [...]}`` to
    ``merged_val_data.json`` in the parent directory of ``base_path``, and a
    per-category row count is printed.

    Files are processed in sorted order so the merged file is identical from
    run to run instead of depending on directory listing order.

    Args:
        base_path: Directory containing the validation CSV files. Defaults to
            ``ceval-exam/val`` relative to the current working directory.

    Raises:
        KeyError: If a CSV file lacks one of the required columns.
    """
    base_path = Path(base_path)
    fields = ["question", "A", "B", "C", "D", "answer"]

    # Collected records from all files, in processing order.
    all_data = []

    for csv_file in sorted(base_path.glob("*.csv")):
        df = pd.read_csv(csv_file)

        # The category is the file name without its extension.
        category = csv_file.stem

        # Selecting `fields` up front keeps only the columns we export and
        # fails fast (KeyError) when one of them is missing.
        for record in df[fields].to_dict("records"):
            all_data.append({"category": category, **record})

    # Write the merged data next to the input directory.
    output_path = base_path.parent / "merged_val_data.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump({
            "data": all_data
        }, f, ensure_ascii=False, indent=2)

    print(f"处理完成,共整合了{len(all_data)}条数据")
    print(f"数据已保存至: {output_path}")

    # Print basic per-category statistics.
    categories = {}
    for item in all_data:
        categories[item["category"]] = categories.get(item["category"], 0) + 1

    print("\n各类别数据统计:")
    for category, count in categories.items():
        print(f"{category}: {count}条")

In [None]:
# Step 1 driver: merge all validation CSV files into merged_val_data.json.
if __name__ == "__main__":
    process_ceval_val_datasets()

In [None]:
import json
# Reload the merged file and display its first record as a quick sanity check.
with open("ceval-exam/merged_val_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)
dataset = data["data"]
dataset[0]

### 2. 为 merged_val_data.json 中的每条数据添加整合后的 prompt

In [8]:
import os
from typing import List, Dict
from tqdm import tqdm # type: ignore
import json

In [5]:
def add_prompts_to_dataset(dataset: List[Dict]) -> List[Dict]:
    """
    Attach a chat-formatted ``prompt`` string to every question record.

    For each record, the question and its four options are rendered into a
    user message, paired with a fixed system message, and passed through
    ``apply_chat_template``. The result is stored under the record's
    ``"prompt"`` key.

    Args:
        dataset: Question records; each must provide the keys ``question``,
            ``A``, ``B``, ``C`` and ``D``.

    Returns:
        List[Dict]: The same list that was passed in; its records are
        modified in place.
    """
    system_context = """你是一位逻辑推理专家。请仅用一个字母(A/B/C/D)回答问题,不需要解释。"""

    for record in dataset:
        # Question, a blank line, the four options, a blank line, then the
        # closing instruction.
        question_text = (
            f"题目：{record['question']}\n"
            "\n"
            f"A. {record['A']}\n"
            f"B. {record['B']}\n"
            f"C. {record['C']}\n"
            f"D. {record['D']}\n"
            "\n"
            "请直接回答选项字母。"
        )

        # Fresh message dicts per record: system turn first, then the user turn.
        messages = [
            {"role": "system", "content": system_context},
            {"role": "user", "content": question_text},
        ]

        record["prompt"] = apply_chat_template(messages)

    return dataset

In [6]:
def update_merged_dataset():
    """
    Read merged_val_data.json, add a prompt to every record, and save the
    result as merged_val_data_with_prompts.json.
    """
    source_path = "ceval-exam/merged_val_data.json"
    target_path = "ceval-exam/merged_val_data_with_prompts.json"

    # Load the merged validation data.
    with open(source_path, "r", encoding="utf-8") as src:
        payload = json.load(src)

    # Add prompts and put the returned list back under the "data" key.
    records = payload["data"]
    payload["data"] = add_prompts_to_dataset(records)

    # Write the enriched data to a new file; the input file is left untouched.
    with open(target_path, "w", encoding="utf-8") as dst:
        json.dump(payload, dst, ensure_ascii=False, indent=2)

    print("数据更新完成,已保存到merged_val_data_with_prompts.json")
    print(f"共处理{len(records)}条数据")

In [9]:
# Step 2 driver: write merged_val_data_with_prompts.json.
if __name__ == "__main__":
    update_merged_dataset()

数据更新完成,已保存到merged_val_data_with_prompts.json
共处理1346条数据


### 3. 让大模型逐个回答问题

In [10]:
import os
from typing import List, Dict
from tqdm import tqdm # type: ignore
import json

In [11]:
def _extract_choice(response: str) -> str:
    """Return the first A/B/C/D option letter in ``response``, or "" if none."""
    import re  # local import so this cell does not depend on another cell

    # Prefer a letter that is not part of a Latin word, so the "A" in
    # "Answer: B" or the "D" in "DNA" is not mistaken for the chosen option.
    standalone = re.search(r"(?<![A-Za-z])[ABCD](?![A-Za-z])", response)
    if standalone:
        return standalone.group(0)

    # Otherwise fall back to the first option letter anywhere in the text.
    embedded = re.search(r"[ABCD]", response)
    return embedded.group(0) if embedded else ""


def process_dataset_with_model_responses(
    model, 
    tokenizer, 
    input_file: str = "ceval-exam/merged_val_data_with_prompts.json",
    output_file: str = "ceval-exam/merged_val_data_with_prompts_and_responses.json"
) -> None:
    """
    Answer every question in the dataset with the model and record the results.

    For each record, ``generate_response`` is called on the record's
    ``prompt``. Three keys are then added to the record: ``model_response``
    (the raw generated text), ``extracted`` (the option letter pulled out of
    that text, or ``""``) and ``is_correct`` (whether ``extracted`` equals the
    record's ``answer``). If generation or extraction raises, the error is
    printed and the record gets empty strings and ``is_correct=False`` so the
    rest of the run continues. The updated data is written to ``output_file``
    and the overall accuracy is printed.

    Args:
        model: Loaded model instance, passed through to ``generate_response``.
        tokenizer: Loaded tokenizer instance, passed through likewise.
        input_file: Path of the JSON file holding ``{"data": [...]}`` records
            with ``prompt`` and ``answer`` keys.
        output_file: Path of the JSON file to write.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    dataset = data["data"]

    for idx, item in enumerate(tqdm(dataset, desc="处理问题")):
        prompt = item["prompt"]

        try:
            full_response = generate_response(
                model=model,
                tokenizer=tokenizer,
                prompt=prompt,
                temperature=0.2,  # low temperature to reduce randomness
                max_new_tokens=32  # only an option letter is expected
            )

            extracted_answer = _extract_choice(full_response)
            is_correct = extracted_answer == item["answer"]

            item.update({
                "model_response": full_response,
                "extracted": extracted_answer,
                "is_correct": is_correct
            })

        except Exception as e:
            # Best effort: one failing question must not abort the whole run.
            print(f"处理第{idx}个问题时发生错误: {str(e)}")
            item.update({
                "model_response": "",
                "extracted": "",
                "is_correct": False
            })

    data["data"] = dataset

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    # Guard the division so an empty dataset reports 0% instead of crashing.
    correct_count = sum(1 for item in dataset if item["is_correct"])
    accuracy = correct_count / len(dataset) if dataset else 0.0

    print("\n评估完成!")
    print(f"总问题数: {len(dataset)}")
    print(f"正确数量: {correct_count}")
    print(f"准确率: {accuracy:.2%}")

In [12]:
# Usage example: load google/gemma-2-9b together with the stage-1 LoRA
# checkpoint (use_quantization=True), then answer every prompt in the dataset.
model_path = "google/gemma-2-9b"
cache_dir = "/root/autodl-tmp/gemma"
lora_path = "/root/autodl-tmp/models/stage1/checkpoints/gemma-base-zh/checkpoint-43500"
model, tokenizer = initialize_model_and_tokenizer(
    model_path=model_path,
    cache_dir=cache_dir,
    lora_path=lora_path,
    use_quantization=True
)

process_dataset_with_model_responses(model, tokenizer)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.
处理问题: 100%|██████████| 1346/1346 [23:03<00:00,  1.03s/it]


评估完成!
总问题数: 1346
正确数量: 708
准确率: 52.60%





### 4. 判断大模型是否遵从了我们的指令，只输出ABCD

In [13]:
def add_strict_compliance_check(
    input_file: str = "ceval-exam/merged_val_data_with_prompts_and_responses.json",
    output_file: str = "ceval-exam/merged_val_data_final.json"
) -> None:
    """
    Add a ``strict_compliance`` flag to every record in the dataset.

    A record is strictly compliant when its ``model_response``, with
    surrounding whitespace removed, is exactly one of ``A``, ``B``, ``C`` or
    ``D``. The flagged data is written to ``output_file``; the compliance
    rate and up to five non-compliant responses are printed.

    Args:
        input_file: Path of the JSON file holding ``{"data": [...]}`` records
            with a ``model_response`` key.
        output_file: Path of the JSON file to write.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    dataset = data["data"]
    total_count = len(dataset)
    allowed_answers = {"A", "B", "C", "D"}

    strict_compliance_count = 0
    for item in tqdm(dataset, desc="检查答案合规性"):
        is_strict = item["model_response"].strip() in allowed_answers
        item["strict_compliance"] = is_strict
        strict_compliance_count += is_strict

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    # Guard the division so an empty dataset reports 0% instead of crashing.
    compliance_rate = strict_compliance_count / total_count if total_count else 0.0
    print("\n合规性检查完成!")
    print(f"总样本数: {total_count}")
    print(f"严格合规样本数: {strict_compliance_count}")
    print(f"合规率: {compliance_rate:.2%}")

    # Show a few non-compliant responses to make manual analysis easier.
    print("\n不合规示例:")
    non_compliant_samples = [item for item in dataset if not item["strict_compliance"]]
    for i, sample in enumerate(non_compliant_samples[:5], 1):  # first 5 only
        print(f"\n示例{i}:")
        print(f"模型回答: {sample['model_response']}")

In [14]:
# Step 4 driver: flag strictly compliant responses using the default file paths.
add_strict_compliance_check()

检查答案合规性: 100%|██████████| 1346/1346 [00:00<00:00, 731950.37it/s]


合规性检查完成!
总样本数: 1346
严格合规样本数: 1
合规率: 0.07%

不合规示例:

示例1:
模型回答: 答案是 D。

示例2:
模型回答: 答案为 C. 紧缩与集中战略。

示例3:
模型回答: D. 合同的当事人

示例4:
模型回答: 答案是 B. 30

示例5:
模型回答: 答案是 C，因为战略管理不是一次性的工作，而是需要持续进行的动态过程。





### 5. 过滤并划分数据集 

In [17]:
def split_dataset(
    input_file: str = "ceval-exam/merged_val_data_final.json",
    train_file: str = "ceval-exam/train_data.json",
    val_file: str = "ceval-exam/val_data.json",
    val_ratio: float = 0.2,
    random_seed: int = 42
) -> None:
    """
    Split the dataset into a training set and a validation set.

    - Records whose ``strict_compliance`` is true are dropped first.
    - The split is done per category so each category keeps roughly the
      same train/validation proportion; every category contributes at least
      one validation sample.
    - Detailed per-category distribution statistics are printed.

    Shuffling uses a private ``random.Random(random_seed)`` instance, so the
    split is reproducible and the global ``random`` state is left untouched.

    Args:
        input_file: Path of the JSON file holding ``{"data": [...]}`` records
            with ``category`` and ``strict_compliance`` keys.
        train_file: Path of the training-set JSON file to write.
        val_file: Path of the validation-set JSON file to write.
        val_ratio: Fraction of each category assigned to the validation set.
        random_seed: Seed for the private random generator.
    """
    import random
    from collections import Counter

    rng = random.Random(random_seed)

    def _share(part: int, whole: int) -> float:
        # Ratio that tolerates an empty denominator (e.g. an empty train set).
        return part / whole if whole else 0.0

    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Drop the records that already comply strictly.
    dataset = [item for item in data["data"] if not item["strict_compliance"]]
    print(f"过滤前数据量: {len(data['data'])}")
    print(f"过滤后数据量: {len(dataset)}")

    # Group records by category, keeping first-seen category order.
    category_data = {}
    for item in dataset:
        category_data.setdefault(item["category"], []).append(item)

    train_data = []
    val_data = []

    for category, items in category_data.items():
        rng.shuffle(items)

        # At least one validation sample per category.
        val_size = max(1, int(len(items) * val_ratio))

        val_data.extend(items[:val_size])
        train_data.extend(items[val_size:])

    # Shuffle both sets once more so categories are interleaved.
    rng.shuffle(train_data)
    rng.shuffle(val_data)

    with open(train_file, "w", encoding="utf-8") as f:
        json.dump({"data": train_data}, f, ensure_ascii=False, indent=2)
    with open(val_file, "w", encoding="utf-8") as f:
        json.dump({"data": val_data}, f, ensure_ascii=False, indent=2)

    print("\n数据集划分完成!")
    print(f"总样本数: {len(dataset)}")
    print(f"训练集样本数: {len(train_data)}")
    print(f"验证集样本数: {len(val_data)}")

    # Count each split once instead of rescanning it for every category.
    train_counts = Counter(x["category"] for x in train_data)
    val_counts = Counter(x["category"] for x in val_data)

    print("\n类别分布统计:")
    for category in category_data.keys():
        train_count = train_counts[category]
        val_count = val_counts[category]
        total_category_count = train_count + val_count

        train_category_ratio = _share(train_count, len(train_data))
        val_category_ratio = _share(val_count, len(val_data))

        print(f"\n{category}:")
        print(f"  训练集: {train_count}条")
        print(f"    - 占该类别总数的 {_share(train_count, total_category_count):.1%}")
        print(f"    - 占训练集总数的 {train_category_ratio:.1%}")
        print(f"  验证集: {val_count}条")
        print(f"    - 占该类别总数的 {_share(val_count, total_category_count):.1%}")
        print(f"    - 占验证集总数的 {val_category_ratio:.1%}")

In [18]:
# Step 5 driver: filter and split the data using the default paths, ratio and seed.
if __name__ == "__main__":
    split_dataset()

过滤前数据量: 1346
过滤后数据量: 1345

数据集划分完成!
总样本数: 1345
训练集样本数: 1107
验证集样本数: 238

类别分布统计:

accountant_val:
  训练集: 40条
    - 占该类别总数的 81.6%
    - 占训练集总数的 3.6%
  验证集: 9条
    - 占该类别总数的 18.4%
    - 占验证集总数的 3.8%

advanced_mathematics_val:
  训练集: 16条
    - 占该类别总数的 84.2%
    - 占训练集总数的 1.4%
  验证集: 3条
    - 占该类别总数的 15.8%
    - 占验证集总数的 1.3%

art_studies_val:
  训练集: 27条
    - 占该类别总数的 81.8%
    - 占训练集总数的 2.4%
  验证集: 6条
    - 占该类别总数的 18.2%
    - 占验证集总数的 2.5%

basic_medicine_val:
  训练集: 16条
    - 占该类别总数的 84.2%
    - 占训练集总数的 1.4%
  验证集: 3条
    - 占该类别总数的 15.8%
    - 占验证集总数的 1.3%

business_administration_val:
  训练集: 27条
    - 占该类别总数的 81.8%
    - 占训练集总数的 2.4%
  验证集: 6条
    - 占该类别总数的 18.2%
    - 占验证集总数的 2.5%

chinese_language_and_literature_val:
  训练集: 19条
    - 占该类别总数的 82.6%
    - 占训练集总数的 1.7%
  验证集: 4条
    - 占该类别总数的 17.4%
    - 占验证集总数的 1.7%

civil_servant_val:
  训练集: 38条
    - 占该类别总数的 80.9%
    - 占训练集总数的 3.4%
  验证集: 9条
    - 占该类别总数的 19.1%
    - 占验证集总数的 3.8%

clinical_medicine_val:
  训练集: 18条
    - 占该类别总数的 81.8%


### 6. DPO

In [1]:
import sys
import os

# Add the project root to the Python import path so that `src.*` resolves.
project_root = "/home/cuipeng/Gemma"
# Only append once: re-running this cell must not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

# 导入必要模块
from src.core.model.model_initializer import initialize_model_and_tokenizer
from src.core.utils.model_utils import generate_response, apply_chat_template

In [2]:
import torch
from datasets import load_dataset
from trl import DPOTrainer, DPOConfig # type: ignore
from transformers import TrainingArguments
from typing import Dict, List

In [3]:
def prepare_dpo_dataset(
    train_file: str = "ceval-exam/train_data.json",
    val_file: str = "ceval-exam/val_data.json"
) -> Dict:
    """
    Build the training and evaluation datasets used for DPO.

    Both JSON files are read, their ``"data"`` lists are turned into
    ``datasets.Dataset`` objects, and every example gets the three
    preference columns: ``prompt`` (unchanged), ``chosen`` (the example's
    ``extracted`` value) and ``rejected`` (the example's ``model_response``).

    Args:
        train_file: Path of the training-set JSON file.
        val_file: Path of the validation-set JSON file.
    Returns:
        Dict with the keys ``train_dataset`` and ``eval_dataset``.
    """
    import json

    def _read_records(path):
        # Each file is shaped like {"data": [record, ...]}.
        with open(path, 'r', encoding='utf-8') as handle:
            return json.load(handle)["data"]

    train_records = _read_records(train_file)
    val_records = _read_records(val_file)

    def to_preference_pair(example):
        return {
            "prompt": example["prompt"],
            "chosen": example["extracted"],
            # The model's actual answer serves as the rejected response.
            "rejected": example["model_response"],
        }

    from datasets import Dataset

    return {
        "train_dataset": Dataset.from_list(train_records).map(to_preference_pair),
        "eval_dataset": Dataset.from_list(val_records).map(to_preference_pair),
    }


In [4]:
def train_with_dpo(
    model,
    tokenizer,
    train_file: str = "ceval-exam/train_data.json",
    val_file: str = "ceval-exam/val_data.json",
    output_dir: str = "../../../../../../../../../root/autodl-tmp/models/dpo_finetuned",
    batch_size: int = 2,
    gradient_accumulation_steps: int = 4,
    num_train_epochs: int = 5,
    learning_rate: float = 5e-5,
) -> None:
    """
    Fine-tune the model with DPO.

    The model is moved to the CUDA device, the datasets are built with
    ``prepare_dpo_dataset``, and a ``DPOTrainer`` is run with a ``DPOConfig``
    that checkpoints and evaluates every 69 steps. The final model is saved
    to ``output_dir``.

    Args:
        model: Model to fine-tune.
        tokenizer: Tokenizer handed to the trainer.
        train_file: Path of the training-set JSON file.
        val_file: Path of the validation-set JSON file.
        output_dir: Directory for checkpoints and the final model.
        batch_size: Per-device batch size for both training and evaluation.
        gradient_accumulation_steps: Number of gradient accumulation steps.
        num_train_epochs: Number of training epochs.
        learning_rate: Learning rate.
    """
    import torch

    # Make sure the whole model sits on the GPU.
    model = model.to(torch.device("cuda"))

    splits = prepare_dpo_dataset(train_file, val_file)

    config_kwargs = dict(
        # Run-level settings taken from the function arguments.
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        remove_unused_columns=False,
        logging_steps=10,
        # Checkpointing and evaluation share the same step interval.
        save_strategy="steps",
        save_steps=69,
        evaluation_strategy="steps",
        eval_steps=69,
        # Precision: fp16/bf16 mixed precision off, TF32 on.
        fp16=False,
        bf16=False,
        tf32=True,
        # Sequence length limits.
        max_prompt_length=512,
        max_length=1024,
        # Optimization.
        gradient_checkpointing=True,
        optim="adamw_torch",
        max_grad_norm=1.0,
        # No DeepSpeed, no distributed rank.
        deepspeed=None,
        local_rank=-1,
    )
    training_args = DPOConfig(**config_kwargs)

    print("初始化DPO Trainer...")
    dpo_trainer = DPOTrainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        train_dataset=splits["train_dataset"],
        eval_dataset=splits["eval_dataset"],
    )

    print("开始DPO训练...")
    dpo_trainer.train()

    # Persist the final model.
    dpo_trainer.save_model(output_dir)
    print(f"训练完成! 模型已保存到 {output_dir}")

In [5]:
# Usage example: load the model without quantization and run DPO training.
if __name__ == "__main__":
    from transformers import AutoModelForCausalLM, AutoTokenizer
    
    # Load the model and tokenizer.
    model_path = "google/gemma-2-9b"
    cache_dir = "/root/autodl-tmp/gemma"
    lora_path = "/root/autodl-tmp/models/stage1/checkpoints/gemma-base-zh/checkpoint-43500"
    model, tokenizer = initialize_model_and_tokenizer(
        model_path=model_path,
        cache_dir=cache_dir,
        lora_path=lora_path,
        use_quantization=False, # enabling quantization triggers: RuntimeError: value cannot be converted to type at::Half without overflow
        device_map = "cuda:0"
    )
    
    # Start DPO training with the default files and hyperparameters.
    train_with_dpo(model, tokenizer)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]



Map:   0%|          | 0/1107 [00:00<?, ? examples/s]

Map:   0%|          | 0/238 [00:00<?, ? examples/s]

初始化DPO Trainer...


  dpo_trainer = DPOTrainer(


Extracting prompt from train dataset:   0%|          | 0/1107 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/1107 [00:00<?, ? examples/s]

Extracting prompt from eval dataset:   0%|          | 0/238 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/238 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1107 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/238 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


开始DPO训练...




Step,Training Loss,Validation Loss


### 7. 评估 - 在val数据集上遵守指令的程度

In [None]:
def evaluate_model_improvement(
    base_model_path: str,
    dpo_model_path: str,
    val_file: str,
    cache_dir: str,
    batch_size: int = 4
) -> None:
    """
    Compare the answers of the model before and after DPO fine-tuning.

    The base model and the DPO model (the base model loaded with
    ``dpo_model_path`` as its LoRA path) answer the first ``batch_size``
    validation prompts. Each pair of answers is printed, and all results are
    saved to ``model_comparison_results.json`` in the working directory.

    Decoding is greedy, so repeated runs give comparable output, and only the
    newly generated tokens are decoded, so an answer never repeats its prompt.

    Args:
        base_model_path: Path or hub id of the original model.
        dpo_model_path: Path of the DPO fine-tuned adapter.
        val_file: Path of the validation-set JSON file.
        cache_dir: Cache directory passed to the model loader.
        batch_size: Number of validation samples to evaluate. Despite the
            name, samples are generated one at a time.
    """
    import json
    from tqdm import tqdm

    with open(val_file, 'r', encoding='utf-8') as f:
        val_data = json.load(f)["data"]

    results = []

    print("加载原始模型...")
    base_model, tokenizer = initialize_model_and_tokenizer(
        model_path=base_model_path,
        cache_dir=cache_dir,
        use_quantization=False
    )

    print("加载DPO微调后的模型...")
    dpo_model, _ = initialize_model_and_tokenizer(
        model_path=base_model_path,
        cache_dir=cache_dir,
        lora_path=dpo_model_path,
        use_quantization=False
    )

    def generate_answer(model, prompt):
        # Put the inputs on the model's own device; fall back to "cuda" when
        # the model object does not expose one.
        device = getattr(model, "device", "cuda")
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_length = inputs["input_ids"].shape[1]
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            # Explicit greedy decoding: sampling parameters such as
            # temperature/top_p have no effect unless do_sample=True.
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
        # generate() returns prompt + continuation; keep the continuation only.
        return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    print("开始评估...")
    # Only the first `batch_size` samples are used, to keep the test quick.
    for idx, item in enumerate(tqdm(val_data[:batch_size])):
        prompt = item["prompt"]
        golden_answer = item["answer"]

        base_answer = generate_answer(base_model, prompt)
        dpo_answer = generate_answer(dpo_model, prompt)

        results.append({
            "prompt": prompt,
            "golden_answer": golden_answer,
            "base_model_answer": base_answer,
            "dpo_model_answer": dpo_answer
        })

        print(f"\n样本 {idx + 1}:")
        print(f"Prompt: {prompt}")
        print(f"标准答案: {golden_answer}")
        print(f"原始模型答案: {base_answer}")
        print(f"DPO模型答案: {dpo_answer}")
        print("-" * 50)

    output_file = "model_comparison_results.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"评估结果已保存到 {output_file}")

In [None]:
# Usage example: compare the base model with the DPO fine-tuned adapter.
if __name__ == "__main__":
    # The adapter was trained on google/gemma-2-9b, so it must be loaded onto
    # that same base model; a different base cannot take the adapter weights.
    base_model_path = "google/gemma-2-9b"
    cache_dir = "/root/autodl-tmp/gemma"
    # Absolute form of the training output directory, so the lookup does not
    # depend on how deep the current working directory is.
    dpo_model_path = "/root/autodl-tmp/models/dpo_finetuned"
    val_file = "ceval-exam/val_data.json"
    
    evaluate_model_improvement(
        base_model_path=base_model_path,
        dpo_model_path=dpo_model_path,
        val_file=val_file,
        cache_dir=cache_dir,
        batch_size=4  # number of validation samples to test; adjust as needed
    )