# Chapter 12 - Fine-tuning Generation Models
探索两步走的方法来微调生成模型


In [1]:
%%capture
!pip install -q accelerate peft bitsandbytes transformers trl sentencepiece

In [None]:
# Optional: uncomment the two statements below to set the HF_HOME environment
# variable (presumably to relocate the Hugging Face cache — adjust the path to
# your own machine).
# import os

# os.environ["HF_HOME"] = "/openbayes/home/huggingface"

## 12.1 Supervised Fine-Tuning (SFT)

### 12.1.1 数据处理

In [6]:
from transformers import AutoTokenizer
from datasets import load_dataset


# Load a tokenizer so that its chat template can be used to format the training text
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

def format_prompt(example):
    """Render one dataset row as a single chat-formatted training string.

    A fixed system message, the row's "input" (user turn) and the row's
    "target" (assistant turn) are passed through the notebook-level
    tokenizer's chat template without tokenizing.

    Args:
        example: mapping that provides the "input" and "target" fields.

    Returns:
        dict with one key, "text", holding the rendered conversation.
    """
    system_turn = {"role": "system", "content": "你是一个非常棒的人工智能助手，猪猪开发的"}
    user_turn = {"role": "user", "content": example["input"]}
    assistant_turn = {"role": "assistant", "content": example["target"]}

    rendered = tokenizer.apply_chat_template(
        [system_turn, user_turn, assistant_turn],
        tokenize=False,
    )
    return {"text": rendered}

# Take the first 500 training rows and render each one with the tokenizer's
# chat template; format_prompt contributes the "text" column used for training.
dataset = load_dataset("YeungNLP/firefly-train-1.1M", split="train[:500]").map(format_prompt)

Repo card metadata block was not found. Setting CardData to empty.


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [15]:
# Show the formatted text of row 99 as a sample
print(dataset[99]["text"])

<|im_start|>system
你是一个非常棒的人工智能助手，猪猪开发的<|im_end|>
<|im_start|>user
讲一个童话故事，题目为哲理故事：老师讲的故事<|im_end|>
<|im_start|>assistant
老师已退休，现居乡下。
一天，几个同学聚到一起，讲起了他们的老师，在校时，他们都曾受到老师的器重，现在，他们大小都成了点气候。有人提议，教师节那天，去看看老师吧，要带上自己的车，给老师壮壮脸儿。提议得到了几位同学的赞同。他们都是有车的人，虽然都是公车。
教师节这天，他们坐着自己的车，先后到了老师的家，唯独小喜还没到。小喜是老师这个村所在乡的乡长，上任还不到半个月。一个戴眼镜的同学不满地说，小喜最近来得最慢。眼镜说着，就拨打小喜的手机，但总占线，眼镜就一遍遍重拨。他终于与小喜联系上了。打过手机，眼镜说，小喜马上到。
说话间，就有车声传来。他们迎到门口时，小喜的桑塔纳已停院门口了。小喜下了车说，对不起，对不起，路上遇到了点儿麻烦。
老师关切地问，啥麻烦？
小喜入坐后说，到这个乡报到后，秘书建议我换台车，要不就换换车牌子。我想，换了好像要与前任乡长势不两立一样，影响不好，就没换。今天，我坐这车来到刘村村口，有台拖拉机迎面开来，路虽不宽，但完全可以会车。可拖拉机硬是占着路中间不靠边。拖拉机手瞪着眼，凶凶地看我们。司机让他往一边靠靠，他脖子一拧说，好人不给坏人让路。我听了这话感到好没道理，就下车问他，你认识我吗？他摇头。我说，你不认识我怎么说我是坏人？那人看一下车牌说，坐这车的没好人。司机说，这是新来的乡长。乡长有急事，你这位大哥行行好，让我们过去吧。那人看看我，很不情愿地让了路。路上，司机给我吐露了真情，原乡长看上了这个村的两个女人，乡长夜间经常自己驾车在村头轮换着与这两个女人约会，这车成了那乡长的活动别墅。我恍然大悟：难怪秘书建议我换车换牌号呢。
小喜的话引起了同学们的兴趣。眼镜说，那开拖拉机的恐怕是乡长情人的丈夫，他没误你一顿就不错了。
众人哄一下笑起来。
老师摆了家宴，请学生入座。酒过三巡，话过情谊，话题不知怎么就又拐到了车上。同学又讲了几个有关车子的奇话、趣话。老师只是听。一个学生怕冷落了老师，就打断话题，给老师让酒。老师喝了酒说，我也讲个车的故事吧。学生们连声说好。
老师问。知道农科所的老黄吗？
学生们说知道，知道，有名的玉

### 12.1.2 Models - Quantization

In [8]:
# Quantize the model to save GPU memory
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Checkpoint that is fine-tuned in the SFT section
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# The "Q" in QLoRA: settings for loading the base weights in 4-bit
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,  # nested quantization
    bnb_4bit_compute_dtype="float16",  # dtype used for computation
    bnb_4bit_quant_type="nf4",  # quantization type
    load_in_4bit=True,  # load the model in 4-bit precision
)

# Load the model that will be trained; device placement is delegated to "auto"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,  # leave this out for regular SFT
    device_map="auto",
)

# Per the original author's note, these two settings are only needed for
# k-bit quantized training
model.config.pretraining_tp = 1
model.config.use_cache = False

# Load the tokenizer that belongs to model_name (a Qwen2.5 checkpoint).
# trust_remote_code is deliberately not passed: the data-processing cell
# already loads this same tokenizer without it, so there is no reason to opt in
# to executing code shipped with the repository.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The "<PAD>" override stays commented out: the original author notes that
# Qwen2's pad token is not <pad> (they mention <im_end> being used instead).
# tokenizer.pad_token = "<PAD>"
tokenizer.padding_side = "left"  # matters little for training, more for inference

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

### 12.1.3 配置

#### 12.1.3.1 LoRA 配置

In [9]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# LoRA adapter hyper-parameters
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,  # rank; the original note suggests a larger value when more training data is available
    lora_alpha=32,  # LoRA scaling
    lora_dropout=0.1,  # dropout for the LoRA layers
    bias="none",
    # Layers to target. Only the q/k/v projection modules are used here; the
    # longer list kept for reference is:
    #  ['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']
    target_modules=["k_proj", "v_proj", "q_proj"],
)

# Prepare the quantized model for k-bit training, then attach the LoRA adapters.
#
# Note from the original author: if prepare_model_for_kbit_training is skipped
# while the training args set gradient_checkpointing=True (a memory saver that
# is not essential), the following call is needed instead:
# model.enable_input_require_grads()
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

#### 12.1.3.2 训练配置

In [13]:
from transformers import TrainingArguments

output_dir = "./results"

# Hyper-parameters for the supervised fine-tuning run.
# NOTE(review): the loss logged for this run (54.98 at step 10, 109.06 at
# step 60) does not go down — these settings are worth re-checking.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    optim="adamw_torch",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    logging_steps=10,
    report_to="none",
)

### 12.1.4 Training!

In [14]:
from trl import SFTTrainer

# Build the supervised fine-tuning trainer
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",  # the "text" column of the dataset
    max_seq_length=512,
    peft_config=peft_config,  # leave this out for regular SFT
)

# Run training
trainer.train()

# Write the QLoRA adapter weights to disk
trainer.model.save_pretrained("qwen2.5-0.5b-instruct-chaofa")


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Step,Training Loss
10,54.9764
20,11.3287
30,21.2197
40,15.8116
50,37.4525
60,109.0644


#### 12.1.4.1 Merge Adapter (LoRA 和 base model 合并)

In [16]:
from peft import AutoPeftModelForCausalLM

# Reload the adapter saved by the SFT run ...
model = AutoPeftModelForCausalLM.from_pretrained(
    "qwen2.5-0.5b-instruct-chaofa",
    device_map="auto",
    low_cpu_mem_usage=True,
)

# ... and fold the LoRA weights into the base model
merged_model = model.merge_and_unload()

#### 12.1.4.2 Inference

In [18]:
from transformers import pipeline

# Text-generation pipeline around the merged model
pipe = pipeline(task="text-generation", model=merged_model, tokenizer=tokenizer)

# Build the prompt with the tokenizer's chat template — the same call used to
# format the training text — instead of a hand-typed copy of the markup, so the
# inference prompt cannot drift from the training format.
# add_generation_prompt=True appends the opening of the assistant turn, which
# the model is then asked to complete.
prompt_example = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": "你是一个非常棒的人工智能助手，是UP主 “用代码打点酱油的chaofa” 开发的。"},
        {"role": "user", "content": "今天我太馋了，所以我今天没有学习一点。\n翻译成文言文："},
    ],
    tokenize=False,
    add_generation_prompt=True,
)

# Generate at most 50 new tokens and print the prompt followed by the completion
print(pipe(prompt_example, max_new_tokens=50)[0]["generated_text"])

<|im_start|>system
你是一个非常棒的人工智能助手，是UP主 “用代码打点酱油的chaofa” 开发的。<|im_end|>
<|im_start|>user
今天我太馋了，所以我今天没有学习一点。
翻译成文言文：<|im_end|>
<|im_start|>assistant
此日馋心过甚，故今未加学业。


## 12.2 Preference Tuning (PPO/DPO)

### 12.2.1 Data Preprocessing

In [None]:
from datasets import load_dataset

def format_prompt(example):
    """Build the prompt / chosen / rejected strings for one preference row.

    The prompt is a ``<|system|>`` turn followed by a ``<|user|>`` turn and an
    opening ``<|assistant|>`` tag; every turn, including both candidate
    answers, is terminated with ``</s>`` and a newline.

    Args:
        example: mapping with "system", "input", "chosen" and "rejected"
            string fields.

    Returns:
        dict with the keys "prompt", "chosen" and "rejected".
    """
    end_of_turn = "</s>\n"

    # str.join (like +) refuses non-string parts instead of silently
    # stringifying them.
    full_prompt = "".join((
        "<|system|>\n", example["system"], end_of_turn,
        "<|user|>\n", example["input"], end_of_turn,
        "<|assistant|>\n",
    ))

    return {
        "prompt": full_prompt,
        "chosen": example["chosen"] + end_of_turn,
        "rejected": example["rejected"] + end_of_turn,
    }

# Load the preference pairs and keep only rows whose status is not "tie", whose
# chosen_score is at least 8 and whose in_gsm8k_train flag is not set
dpo_dataset = load_dataset("argilla/distilabel-intel-orca-dpo-pairs", split="train")
dpo_dataset = dpo_dataset.filter(
    lambda row: not (
        row["status"] == "tie"
        or not row["chosen_score"] >= 8
        or row["in_gsm8k_train"]
    )
)

# Replace the raw columns with the prompt / chosen / rejected strings
dpo_dataset = dpo_dataset.map(format_prompt, remove_columns=dpo_dataset.column_names)
dpo_dataset

Downloading readme:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/79.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12859 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12859 [00:00<?, ? examples/s]

Map:   0%|          | 0/5922 [00:00<?, ? examples/s]

Dataset({
    features: ['chosen', 'rejected', 'prompt'],
    num_rows: 5922
})

### 12.2.2 Models - Quantization

In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import BitsAndBytesConfig, AutoTokenizer

# The "Q" in QLoRA: settings for loading the weights in 4-bit
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,  # nested quantization
    bnb_4bit_compute_dtype="float16",  # dtype used for computation
    bnb_4bit_quant_type="nf4",  # quantization type
    load_in_4bit=True,  # load the model in 4-bit precision
)

# Load the SFT adapter with the 4-bit config, then fold its LoRA weights in.
# NOTE(review): no cell visible here writes "TinyLlama-1.1B-qlora" (the SFT
# section saves "qwen2.5-0.5b-instruct-chaofa"), so this directory has to exist
# already — confirm where it comes from.
model = AutoPeftModelForCausalLM.from_pretrained(
    "TinyLlama-1.1B-qlora",
    quantization_config=bnb_config,
    device_map="auto",
    low_cpu_mem_usage=True,
)
merged_model = model.merge_and_unload()

# Tokenizer of the TinyLlama base checkpoint
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Pad on the left and use "<PAD>" as the padding token
tokenizer.padding_side = "left"
tokenizer.pad_token = "<PAD>"



### 12.2.3 Configuration

In [None]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# LoRA adapter hyper-parameters for the DPO stage
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,  # rank
    lora_alpha=32,  # LoRA scaling
    lora_dropout=0.1,  # dropout for the LoRA layers
    bias="none",
    # Layers to target
    target_modules=["k_proj", "gate_proj", "v_proj", "up_proj", "q_proj", "o_proj", "down_proj"],
)

# Prepare the model for k-bit training and attach the LoRA adapters for DPO.
# NOTE(review): this wraps `model` (the adapter model loaded earlier), not
# `merged_model` — confirm that this is the intended object.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

In [None]:
from trl import DPOConfig

output_dir = "./results"

# Hyper-parameters for the DPO run
training_arguments = DPOConfig(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    max_steps=200,
    logging_steps=10,
    fp16=True,
    gradient_checkpointing=True,
    # Same choice as the SFT arguments: do not report to external experiment
    # trackers, so training never depends on (or prompts for) a tracker login.
    report_to="none",
)

In [None]:
from trl import DPOTrainer

# Assemble the DPO trainer
dpo_trainer = DPOTrainer(
    model,
    args=training_arguments,
    tokenizer=tokenizer,
    train_dataset=dpo_dataset,
    peft_config=peft_config,
    max_length=512,
    max_prompt_length=512,
    beta=0.1,
)

# Run preference tuning
dpo_trainer.train()

# Write the DPO adapter weights to disk
dpo_trainer.model.save_pretrained("TinyLlama-1.1B-dpo-qlora")


Deprecated positional argument(s) used in DPOTrainer, please use the DPOConfig to set these arguments instead.


Map:   0%|          | 0/5922 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
10,0.6924
20,0.6782
30,0.646
40,0.6063
50,0.5956
60,0.6168
70,0.5937
80,0.5319
90,0.5592
100,0.639


In [None]:
from peft import PeftModel

# Rebuild the SFT model: load the SFT adapter and fold it into its base model
model = AutoPeftModelForCausalLM.from_pretrained(
    "TinyLlama-1.1B-qlora",
    device_map="auto",
    low_cpu_mem_usage=True,
)
sft_model = model.merge_and_unload()

# Put the DPO adapter on top of the merged SFT model and fold it in as well
dpo_model = PeftModel.from_pretrained(
    sft_model,
    "TinyLlama-1.1B-dpo-qlora",
    device_map="auto",
).merge_and_unload()

In [None]:
from transformers import pipeline

# Text-generation pipeline around the DPO-tuned model
pipe = pipeline(task="text-generation", model=dpo_model, tokenizer=tokenizer)

# Prompt written in the predefined <|user|> / <|assistant|> template
prompt = """<|user|>
Tell me something about Large Language Models.</s>
<|assistant|>
"""

# Generate and print the text of the first result
print(pipe(prompt)[0]["generated_text"])

<|user|>
Tell me something about Large Language Models.</s>
<|assistant|>
Large Language Models (LLMs) are a type of artificial intelligence (AI) that can generate human-like language. They are trained on large amounts of data, including text, audio, and video, and are capable of generating complex and nuanced language.

LLMs are used in a variety of applications, including natural language processing (NLP), machine translation, and chatbots. They can be used to generate text, speech, or images, and can be trained to understand different languages and dialects.

One of the most significant applications of LLMs is in the field of natural language generation (NLG). LLMs can be used to generate text in a variety of languages, including English, French, and German. They can also be used to generate speech, such as in chatbots or voice assistants.

LLMs have the potential to revolutionize the way we communicate and interact with each other. They can help us create more engaging and personal