1.数据处理

In [2]:
from datasets import load_dataset
from huggingface_hub import login
import tqdm
# Load only the first 500 rows of the train split to keep this demo small.
dataset = load_dataset("YeungNLP/firefly-train-1.1M",split="train[:500]")


Using the latest cached version of the dataset since YeungNLP/firefly-train-1.1M couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\wangx\.cache\huggingface\datasets\YeungNLP___firefly-train-1.1_m\default\0.0.0\92947564f0b6bac44c405272df8cd7247937fc2d (last modified on Tue Feb 25 16:44:54 2025).


In [3]:
# Display the dataset summary together with the record at index 100.
dataset,dataset[100]

(Dataset({
     features: ['kind', 'input', 'target'],
     num_rows: 500
 }),
 {'kind': 'ClassicalChinese',
  'input': '我当时在三司，访求太祖、仁宗的手书敕令没有见到，然而人人能传诵那些话，禁止私盐的建议也最终被搁置。\n翻译成文言文：',
  'target': '余时在三司，求访两朝墨敕不获，然人人能诵其言，议亦竟寝。'})

需要把每条样本转换成模型所需的对话（chat template）格式。


#### dataset.map()
<small>`dataset.map()` 用于对数据集中的样本做批量处理：第一个参数是一个函数，该函数只需定义对每条样本要执行的操作即可。  
`map` 还支持并行加速，例如 `new_dataset = dataset.map(preprocess_function, num_proc=4)` 会使用 4 个进程同时处理。  
如果要使用 GPU 进行处理，需要另外编写相应的处理函数。</small>

#### 量化 Models-Quantization

In [6]:
import torch
from transformers import AutoModelForCausalLM,BitsAndBytesConfig,AutoTokenizer

# Tokenizer of the Qwen2.5-0.5B-Instruct checkpoint; format_prompt below uses its chat template.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
def format_prompt(example):
    """Render one dataset record as a single chat-formatted training string.

    Args:
        example: Mapping with an "input" entry (the user message) and a
            "target" entry (the assistant reply).

    Returns:
        A dict with one key, "text", holding the conversation rendered by the
        module-level tokenizer's chat template.
    """
    system_turn = {"role": "system", "content": "你是一个非常棒的人工智能助手，是Axel开发的"}
    user_turn = {"role": "user", "content": example["input"]}
    assistant_turn = {"role": "assistant", "content": example["target"]}

    # tokenize=False makes the template return a plain string instead of token ids.
    rendered = tokenizer.apply_chat_template(
        [system_turn, user_turn, assistant_turn],
        tokenize=False,
    )
    return {"text": rendered}
# Add a "text" column holding the chat-formatted prompt for every record.
dataset = dataset.map(format_prompt)


# Pad on the left side of each sequence.
tokenizer.padding_side = "left"
# 4-bit quantization settings; passed to from_pretrained when the model is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",  # nf4 (Normalized Float 4): non-uniform 4-bit quantization suited to LLMs.
    bnb_4bit_compute_dtype="float16",  # float16 compute dtype: works on most GPUs.
    bnb_4bit_use_double_quant=True
    # Double quantization:
    #   step 1: quantize the FP16 weights to 4-bit (nf4);
    #   step 2: quantize the quantization constants from step 1 again to save more memory.
    # Intended for tight GPU memory budgets (e.g. 8 GB or 12 GB cards);
    # recommended whenever GPU memory is limited.
)




Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# Load the base model with the 4-bit quantization settings from bnb_config.
# NOTE: 4-bit loading through bitsandbytes needs a CUDA-capable GPU.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B-Instruct",
    device_map="auto",
    quantization_config=bnb_config,
)

# Disable the KV cache for training: it only speeds up generation and does not
# work together with gradient checkpointing.
# (The attribute is `use_cache`; the previous `model.config.use.cache` raised
# AttributeError because the config has no `use` attribute.)
model.config.use_cache = False
# Carried over from the original notebook, which states these two settings are
# only needed for k-bit quantized training.
# NOTE(review): pretraining_tp is a Llama-style option; confirm it has any effect for Qwen2.
model.config.pretraining_tp = 1

CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend


RuntimeError: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend

#### LoRA 配置

<small>LoRA 的缩放系数计算公式为 lora_alpha / r（即 alpha / rank），下方配置中为 32 / 64 = 0.5。</small>

In [None]:
from peft import LoraConfig,prepare_model_for_kbit_training,get_peft_model


# LoRA adapter configuration.
# The LoRA update is scaled by lora_alpha / r, i.e. 32 / 64 = 0.5 here.
# (Fix: the commas after `bias` and `task_type` were missing, which made this
# cell a SyntaxError.)
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    # LoRA is applied only to the Q, K and V projection matrices, so only the
    # attention layers are adapted and the MLP layers stay untouched.
    target_modules=["k_proj", "v_proj", "q_proj"],
)
# Prepare the quantized model for training.
# NOTE: this cell rebinds `model`; re-running it without reloading the base
# model would apply both steps a second time.
model = prepare_model_for_kbit_training(model)

# If prepare_model_for_kbit_training is NOT used and the training arguments set
# gradient_checkpointing=True (a memory-saving option, not essential),
# then model.enable_input_require_grads() has to be called instead:
# model.enable_input_require_grads()

# Wrap the model with the LoRA adapters described by peft_config.
model = get_peft_model(model, peft_config)


训练配置

In [None]:
from transformers import TrainingArguments

# Directory handed to TrainingArguments as its output location.
output_dir = "./results"

# Hyper-parameters for the fine-tuning run, grouped by concern.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    # --- optimisation schedule ---
    num_train_epochs=1,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    optim="adamw_torch",
    # --- batching: 2 samples x 4 accumulation steps = 8 samples per optimizer step per device ---
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # --- precision / memory ---
    fp16=True,
    gradient_checkpointing=True,
    # --- reporting ---
    logging_steps=10,
)

In [None]:
from trl import SFTTrainer

# Build the supervised fine-tuning trainer.
# `model` has already been wrapped with LoRA adapters by get_peft_model(), so
# peft_config is deliberately NOT passed again. Handing an existing PeftModel
# plus a peft_config to SFTTrainer is redundant and, depending on the TRL
# release, may be ignored, trigger a merge/re-wrap, or be rejected.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field="text",   # the "text" column produced by format_prompt
    tokenizer=tokenizer,
    args=training_arguments,
    max_seq_length=512,
)

# Train model
trainer.train()

# Save QLoRA weights
trainer.model.save_pretrained("qwen2.5-0.5b-instruct-test")