1. 安装依赖并设置环境

In [1]:
!pip install transformers peft datasets bitsandbytes accelerate -U

from google.colab import drive
drive.mount('/content/drive')


Collecting peft
  Downloading peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-non

2. 准备训练数据

In [2]:
# 解压包含狗狗介绍文本的压缩包到指定文件夹
!unzip -q /content/wiki_dogs.zip -d /content/dogs

import os
import json

folder_path = '/content/dogs'
jsonl_path = '/content/dog_lora_train.jsonl'
train_data = []

# 遍历文件夹中所有狗狗文本文件，构建 instruction-output 格式的数据
for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        dog_name = os.path.splitext(filename)[0]  # 文件名去掉后缀作为狗狗名称
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            desc = f.read().strip()
        # 构建包含指令和回答的字典条目
        item = {
            "instruction": f"介绍一下{dog_name}。",
            "input": "",  # 无额外输入信息
            "output": desc
        }
        train_data.append(item)

# 将数据保存为 JSONL 文件
with open(jsonl_path, 'w', encoding='utf-8') as f:
    for item in train_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"数据集构建完成，共{len(train_data)}条示例，已保存到 {jsonl_path}")


数据集构建完成，共773条示例，已保存到 /content/dog_lora_train.jsonl


3. 加载预训练模型和分词器

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_DIR = "/content/drive/MyDrive/硕士第二学期/先进软件技术/DeepSeek_R1_Distill_Qwen_1_5B"

bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    quantization_config=bnb_cfg,
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

4. 配置 LoRA 参数

In [9]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

lora_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()  # 验证 LoRA 参数数目

trainable params: 1,089,536 || all params: 1,778,177,536 || trainable%: 0.0613


5. 加载并预处理数据集

In [12]:
from datasets import load_dataset

# ① 定义 JSONL 数据集路径
out_path = '/content/dog_lora_train.jsonl'

# ② 正确加载 JSONL 数据（builder 用 "json"）
ds = load_dataset(
    "json",
    data_files={'train': out_path},
    split="train"
)

def preprocess_fn(example):
    instr = example["instruction"]
    resp  = example["output"]
    full  = instr + resp

    # 对 full 文本做 分词+截断+固定长度填充
    tok_full = tokenizer(
        full,
        max_length=512,
        truncation=True,
        padding="max_length"
    )
    input_ids    = tok_full["input_ids"]
    attention_mask = tok_full["attention_mask"]

    # 单独分词 instruction 以获得长度
    tok_ins = tokenizer(instr, truncation=True, padding=False)["input_ids"]
    ins_len = len(tok_ins)

    # 构造 labels，instruction 部分设为 -100（不计入 loss）
    labels = [-100] * ins_len + input_ids[ins_len:]
    # 保证 labels 长度 = 512
    labels = labels[:512] + [-100] * max(0, 512 - len(labels))

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# ③ 应用预处理
tok_ds = ds.map(preprocess_fn, batched=False, remove_columns=ds.column_names)
print("✔ 示例预处理后：", tok_ds[0])


Map:   0%|          | 0/773 [00:00<?, ? examples/s]

✔ 示例预处理后： {'input_ids': [151646, 109432, 3889, 67, 21945, 8698, 78232, 3346, 1773, 785, 3616, 67, 21945, 8698, 78232, 3346, 374, 264, 8848, 27775, 315, 7445, 504, 279, 38193, 3942, 315, 10620, 24024, 304, 10200, 37602, 932, 17689, 13, 1084, 374, 825, 315, 3040, 8606, 57145, 315, 279, 5537, 11, 279, 3800, 1660, 279, 2980, 409, 393, 5054, 8698, 11, 279, 4570, 77115, 476, 3406, 285, 446, 20172, 11, 323, 279, 16821, 64653, 78232, 3346, 382, 13424, 198, 785, 3616, 67, 21945, 8698, 78232, 3346, 11, 3156, 448, 264, 1372, 315, 15130, 27454, 1741, 438, 279, 10621, 15154, 21635, 11, 279, 42188, 21635, 11, 279, 393, 610, 3165, 11966, 81201, 11, 279, 25453, 591, 138861, 11, 6560, 1412, 504, 12590, 315, 3240, 19500, 14811, 78, 943, 24928, 1119, 10200, 37602, 932, 358, 652, 685, 323, 9806, 37602, 932, 9625, 3807, 23631, 4134, 11, 678, 11220, 1119, 12460, 4494, 4092, 311, 279, 19322, 323, 8502, 315, 2205, 9833, 5676, 624, 785, 3616, 67, 21945, 8698, 78232, 3346, 572, 1429, 16626, 1730, 304, 279, 5848

6. 设置数据整理器和训练参数

In [17]:
from transformers import TrainingArguments, default_data_collator

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/硕士第二学期/先进软件技术/lora_outputs",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    num_train_epochs=7,
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    bf16=True,
    optim="paged_adamw_8bit",
    report_to="none"
)

# 使用默认 collator 即可，因为我们已静态 pad 到 max_length
data_collator = default_data_collator

7. 执行模型微调

In [18]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_ds,
    data_collator=data_collator
)

trainer.train()
# 把 LoRA 适配器权重也保存一份
model.save_pretrained(training_args.output_dir)
print("✔ 微调完成，LoRA 权重保存在", training_args.output_dir)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Step,Training Loss
10,2.8663
20,2.8081
30,2.7844
40,2.7911
50,2.769
60,2.7716
70,2.7376
80,2.6904
90,2.8133
100,2.7852


  return fn(*args, **kwargs)


✔ 微调完成，LoRA 权重保存在 /content/drive/MyDrive/硕士第二学期/先进软件技术/lora_outputs


8. 合并 LoRA 权重并进行推理测试

In [16]:
import os
from peft import PeftModel

# 重新加载基础模型
base = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    quantization_config=bnb_cfg,
    device_map="auto",
    trust_remote_code=True
)

# 自动选择最后一个 checkpoint
ckpts = [d for d in os.listdir(training_args.output_dir) if d.startswith("checkpoint")]
ckpts.sort(key=lambda x: int(x.split("-")[-1]))
lora_dir = os.path.join(training_args.output_dir, ckpts[-1]) \
           if ckpts else training_args.output_dir

print("加载 LoRA 权重：", lora_dir)
peft_m = PeftModel.from_pretrained(base, lora_dir)
merged = peft_m.merge_and_unload()

# 分词器
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# 推理：以“萨摩耶犬”为例
dog = "萨摩耶犬"
inp = tokenizer(f"介绍一下{dog}。", return_tensors="pt")
inp = {k: v.to(merged.device) for k, v in inp.items()}
out = merged.generate(**inp, max_new_tokens=256)
print("【推理结果】", tokenizer.decode(out[0], skip_special_tokens=True))

加载 LoRA 权重： /content/drive/MyDrive/硕士第二学期/先进软件技术/lora_outputs/checkpoint-144


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


【推理结果】 介绍一下萨摩耶犬。包括它们的特征，繁殖方式，以及它们的习性。
</think>

萨摩耶犬（Sphynx）是一种大型犬，属于犬科的萨摩耶犬属。萨摩耶犬是一种广受欢迎的宠物犬，因其独特的外观、灵活的繁殖方式以及丰富的习性而受到喜爱。以下是关于萨摩耶犬的详细介绍：

### 1. 特征
- **体型**：萨摩耶犬的体形呈椭圆形，通常以长耳和发黑皮为特征。它的体长通常在16-22厘米之间。
- **颜色**：萨摩耶犬的皮色多样，通常以发黑皮为主，但也有部分发白皮和黑色皮。
- **特征**：萨摩耶犬通常拥有长耳，耳后有黑色皮，皮上有一道凹槽，两侧有细长的指纹。它的皮质较为光滑，皮质通常为发黑皮，但也有部分发白皮。
- **毛发**：萨摩耶犬的毛发通常是发黑皮，部分发白皮，但通常较薄。它的毛发部分主要分布在头部和面部，而发白皮通常集中在耳


保存融合后的模型

In [19]:
# 指定保存路径
save_dir = "/content/drive/MyDrive/硕士第二学期/先进软件技术/Lora+_DeepSeek_R1_Distill_Qwen_1_5B"

# 保存融合后的模型权重
merged.save_pretrained(save_dir)

# 同步保存分词器设置（可选，但推荐保留）
tokenizer.save_pretrained(save_dir)

print(f"✔ 融合后模型已保存到: {save_dir}")


✔ 融合后模型已保存到: /content/drive/MyDrive/硕士第二学期/先进软件技术/Lora+_DeepSeek_R1_Distill_Qwen_1_5B
