In [1]:
from pathlib import Path
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from peft import LoraConfig
from trl import DPOTrainer

BASE_MODEL_PATH = "autodl-tmp/Qwen2.5-7B-Instruct"  # 或你 merge 了 SFT 的版本
DPO_DATA_PATH = "autodl-tmp/data/prof_student_cold_email_dpo.jsonl"
OUTPUT_DIR = "autodl-tmp/qwen25-mail-dpo-lora"

In [2]:
# 1. 加载 tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL_PATH,
    use_fast=False,
    local_files_only=True,
)
tokenizer.padding_side = "right"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 2. 加载 base 模型（做 DPO 的 policy & ref 都从这里来）
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    local_files_only=True,
)
model.config.use_cache = False


lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)


raw_ds = load_dataset(
    "json",
    data_files=DPO_DATA_PATH,
    split="train",
)


def build_text(prompt, response):
    messages = [
        {
            "role": "system",
            "content": (
                "You are an AI assistant that writes polite, well-structured, and field-aware cold emails "
                "from students to professors. Always refer accurately to the professor's research interests "
                "and the student's background."
            ),
        },
        {
            "role": "user",
            "content": prompt,
        },
        {
            "role": "assistant",
            "content": response,
        },
    ]
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )

def map_for_dpo(example):
    prompt = example["prompt"]
    chosen = example["chosen"]
    rejected = example["rejected"]
    return {
        "prompt": prompt,
        "chosen": build_text(prompt, chosen),
        "rejected": build_text(prompt, rejected),
    }

dpo_ds = raw_ds.map(
    map_for_dpo,
    remove_columns=raw_ds.column_names,
)


from trl import DPOTrainer, DPOConfig

# 1. 用 DPOConfig 代替 TrainingArguments
dpo_config = DPOConfig(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    learning_rate=1e-5,
    bf16=True,
    logging_steps=20,
    save_steps=200,
    save_total_limit=2,
    report_to="none",
)

# 2. 用 DPOConfig 创建 DPOTrainer
dpo_trainer = DPOTrainer(
    model=model,          # 已经 from_pretrained 载好的 Qwen
    ref_model=None,       # None = 自动 clone 一个 frozen ref model
    args=dpo_config,
    train_dataset=dpo_ds, # 里边有 "prompt", "chosen", "rejected"
    peft_config=lora_config
)


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


In [3]:
dpo_trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss
20,0.2898
40,0.001
60,0.0001
80,0.0001
100,0.0001


TrainOutput(global_step=100, training_loss=0.0582102489261888, metrics={'train_runtime': 633.9129, 'train_samples_per_second': 1.262, 'train_steps_per_second': 0.158, 'total_flos': 0.0, 'train_loss': 0.0582102489261888, 'epoch': 2.0})

In [4]:
adapter_dir = Path(OUTPUT_DIR) / "adapter"
adapter_dir.mkdir(parents=True, exist_ok=True)

dpo_trainer.model.save_pretrained(str(adapter_dir))
tokenizer.save_pretrained(OUTPUT_DIR)

print("DPO LoRA adapter saved to:", adapter_dir)

DPO LoRA adapter saved to: autodl-tmp/qwen25-mail-dpo-lora/adapter
