<a href="https://colab.research.google.com/github/CloudyBuaaer/transformer/blob/main/%E4%BD%BF%E7%94%A8LLaMA_Factory%E5%BE%AE%E8%B0%83Llama3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 使用 LLaMA Factory 微调 Llama-3 中文对话模型

请申请一个免费 T4 GPU 来运行该脚本

项目主页: https://github.com/hiyouga/LLaMA-Factory


## 安装 LLaMA Factory 依赖

In [1]:
%cd /content/
%rm -rf LLaMA-Factory
!git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
%cd LLaMA-Factory
%ls
!pip install -e .[torch,bitsandbytes]

/content
Cloning into 'LLaMA-Factory'...
remote: Enumerating objects: 629, done.[K
remote: Counting objects: 100% (629/629), done.[K
remote: Compressing objects: 100% (468/468), done.[K
remote: Total 629 (delta 154), reused 400 (delta 103), pack-reused 0 (from 0)[K
Receiving objects: 100% (629/629), 5.25 MiB | 18.29 MiB/s, done.
Resolving deltas: 100% (154/154), done.
/content/LLaMA-Factory
[0m[01;34massets[0m/       [01;34mdocker[0m/    LICENSE      pyproject.toml  [01;34mrequirements[0m/  [01;34mtests[0m/
CITATION.cff  [01;34mdocs[0m/      Makefile     README.md       [01;34mscripts[0m/       [01;34mtests_v1[0m/
[01;34mdata[0m/         [01;34mexamples[0m/  MANIFEST.in  README_zh.md    [01;34msrc[0m/
Obtaining file:///content/LLaMA-Factory
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Installing backend dependencies ..

In [11]:
!pip install -U bitsandbytes>=0.46.1

### 检查 GPU 环境

免费 T4 申请教程：https://zhuanlan.zhihu.com/p/642542618

In [2]:
import torch

# Warn (without raising) when PyTorch cannot see a CUDA device.
if not torch.cuda.is_available():
  print("需要 GPU 环境，申请教程：https://zhuanlan.zhihu.com/p/642542618")

## 更新自我认知数据集

可以自由修改 NAME 和 AUTHOR 变量的内容。

In [3]:
import json

%cd /content/LLaMA-Factory/

NAME = "Llama-Chinese"
AUTHOR = "LLaMA Factory"

with open("data/identity.json", "r", encoding="utf-8") as f:
  dataset = json.load(f)

for sample in dataset:
  sample["output"] = sample["output"].replace("{{"+ "name" + "}}", NAME).replace("{{"+ "author" + "}}", AUTHOR)

with open("data/identity.json", "w", encoding="utf-8") as f:
  json.dump(dataset, f, indent=2, ensure_ascii=False)

/content/LLaMA-Factory


## 使用 LLaMA Board Web UI 微调模型

In [None]:
# Run the web UI from the repository root.
%cd /content/LLaMA-Factory/
# GRADIO_SHARE=1 is set only in the environment of this one command
# (presumably it asks Gradio to expose a public share link -- TODO confirm).
!GRADIO_SHARE=1 llamafactory-cli webui

## 使用命令行微调模型

微调过程大约需要 30 分钟。

In [None]:
import json

args = dict(
  stage="sft",                                               # 进行指令监督微调
  do_train=True,
  model_name_or_path="unsloth/llama-3-8b-Instruct-bnb-4bit", # 使用 4 比特量化版 Llama-3-8b-Instruct 模型
  dataset="identity,alpaca_en_demo,alpaca_zh_demo",          # 使用 alpaca 和自我认知数据集
  template="llama3",                                         # 使用 llama3 提示词模板
  finetuning_type="lora",                                    # 使用 LoRA 适配器来节省显存
  lora_target="all",                                         # 添加 LoRA 适配器至全部线性层
  output_dir="llama3_lora",                                  # 保存 LoRA 适配器的路径
  per_device_train_batch_size=2,                             # 批处理大小
  gradient_accumulation_steps=4,                             # 梯度累积步数
  lr_scheduler_type="cosine",                                # 使用余弦学习率退火算法
  logging_steps=5,                                           # 每 5 步输出一个记录
  warmup_ratio=0.1,                                          # 使用预热学习率
  save_steps=1000,                                           # 每 1000 步保存一个检查点
  learning_rate=5e-5,                                        # 学习率大小
  num_train_epochs=3.0,                                      # 训练轮数
  max_samples=300,                                           # 使用每个数据集中的 300 条样本
  max_grad_norm=1.0,                                         # 将梯度范数裁剪至 1.0
  loraplus_lr_ratio=16.0,                                    # 使用 LoRA+ 算法并设置 lambda=16.0
  fp16=True,                                                 # 使用 float16 混合精度训练
  report_to="none",                                          # 关闭 wandb 记录器
)

json.dump(args, open("train_llama3.json", "w", encoding="utf-8"), indent=2)

%cd /content/LLaMA-Factory/

!llamafactory-cli train train_llama3.json

In [12]:
# Run LoRA supervised fine-tuning from the repository root, passing every
# hyperparameter as a command-line flag instead of a JSON config file.
# The adapter is written to ./llama3_lora (see --output_dir).
# Note: comments cannot be placed between the backslash-continued lines below.
%cd /content/LLaMA-Factory/

!llamafactory-cli train \
    --model_name_or_path unsloth/llama-3-8b-Instruct-bnb-4bit \
    --stage sft \
    --do_train \
    --dataset identity,alpaca_en_demo,alpaca_zh_demo \
    --template llama3 \
    --finetuning_type lora \
    --lora_target all \
    --output_dir llama3_lora \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 4 \
    --lr_scheduler_type cosine \
    --logging_steps 5 \
    --warmup_ratio 0.1 \
    --save_steps 1000 \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --max_samples 300 \
    --max_grad_norm 1.0 \
    --loraplus_lr_ratio 16.0 \
    --fp16 \
    --report_to none

/content/LLaMA-Factory
warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
[INFO|2026-02-14 10:04:17] llamafactory.hparams.parser:459 >> Process rank: 0, world size: 1, device: cuda:0, distributed training: False, compute dtype: torch.float16
[INFO|configuration_utils.py:667] 2026-02-14 10:04:18,052 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-3-8b-Instruct-bnb-4bit/snapshots/fd5a4dc328319c1cfe9489eccfb9c6406bdfd469/config.json
[INFO|configuration_utils.py:739] 2026-02-14 10:04:18,054 >> Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "dtype": "bfloat16",
  "eos_token_id": 128009,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",


## 模型推理

In [22]:
import sys
sys.path.append('/content/LLaMA-Factory/src')

from llamafactory.chat import ChatModel
from llamafactory.extras.misc import torch_gc

%cd /content/LLaMA-Factory/

args = dict(
  model_name_or_path="unsloth/llama-3-8b-Instruct-bnb-4bit", # 使用 4 比特量化版 Llama-3-8b-Instruct 模型
  adapter_name_or_path="llama3_lora",                        # 加载之前保存的 LoRA 适配器
  template="llama3",                                         # 和训练保持一致
  finetuning_type="lora",                                    # 和训练保持一致
)
chat_model = ChatModel(args)

messages = []
print("使用 `clear` 清除对话历史，使用 `exit` 退出程序。")
while True:
  query = input("\nUser: ")
  if query.strip() == "exit":
    break
  if query.strip() == "clear":
    messages = []
    torch_gc()
    print("对话历史已清除")
    continue

  messages.append({"role": "user", "content": query})
  print("Assistant: ", end="", flush=True)

  response = ""
  for new_text in chat_model.stream_chat(messages):
    print(new_text, end="", flush=True)
    response += new_text
  print()
  messages.append({"role": "assistant", "content": response})

torch_gc()

/content/LLaMA-Factory


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
[INFO|configuration_utils.py:667] 2026-02-14 10:59:29,603 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-3-8b-Instruct-bnb-4bit/snapshots/fd5a4dc328319c1cfe9489eccfb9c6406bdfd469/config.json
[INFO|configuration_utils.py:739] 2026-02-14 10:59:29,608 >> Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "dtype": "bfloat16",
  "eos_token_id": 128009,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "

[INFO|2026-02-14 10:59:36] llamafactory.data.template:144 >> Add <|eom_id|> to stop words.


[INFO|configuration_utils.py:667] 2026-02-14 10:59:36,451 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-3-8b-Instruct-bnb-4bit/snapshots/fd5a4dc328319c1cfe9489eccfb9c6406bdfd469/config.json
[INFO|configuration_utils.py:739] 2026-02-14 10:59:36,453 >> Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "dtype": "bfloat16",
  "eos_token_id": 128009,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 128255,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quan

[INFO|2026-02-14 10:59:36] llamafactory.model.model_utils.quantization:144 >> Loading ?-bit BITSANDBYTES-quantized model.
[INFO|2026-02-14 10:59:36] llamafactory.model.model_utils.kv_cache:144 >> KV cache is enabled for faster generation.


[INFO|quantization_config.py:486] 2026-02-14 10:59:39,018 >> Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
[INFO|auto.py:249] 2026-02-14 10:59:39,019 >> 
[INFO|modeling_utils.py:732] 2026-02-14 10:59:44,052 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--unsloth--llama-3-8b-Instruct-bnb-4bit/snapshots/fd5a4dc328319c1cfe9489eccfb9c6406bdfd469/model.safetensors
[INFO|modeling_utils.py:801] 2026-02-14 10:59:44,053 >> Will use dtype=torch.bfloat16 as defined in model's config object
[INFO|configuration_utils.py:1014] 2026-02-14 10:59:44,055 >> Generate config GenerationConfig {
  "bos_token_id": 128000,
  "eos_token_id": 128009,
  "output_attentions": false,
  "output_hidden_states": false,
  "pad_token_id": 128255,
  "use_cache": true
}

[INFO|accelerate.py:214] 2026-02-14 10:59:44,182 >> We will use 90% of the memory on device

Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

[INFO|configuration_utils.py:967] 2026-02-14 10:59:56,380 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-3-8b-Instruct-bnb-4bit/snapshots/fd5a4dc328319c1cfe9489eccfb9c6406bdfd469/generation_config.json
[INFO|configuration_utils.py:1014] 2026-02-14 10:59:56,381 >> Generate config GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128001,
    128009
  ],
  "max_length": 8192,
  "pad_token_id": 128255,
  "temperature": 0.6,
  "top_p": 0.9
}



[INFO|2026-02-14 10:59:56] llamafactory.model.model_utils.attention:144 >> Using torch SDPA for faster training and inference.
[INFO|2026-02-14 10:59:57] llamafactory.model.adapter:144 >> Loaded adapter(s): llama3_lora
[INFO|2026-02-14 10:59:57] llamafactory.model.loader:144 >> all params: 8,051,232,768
使用 `clear` 清除对话历史，使用 `exit` 退出程序。

User: 你好，请介绍一下你自己
Assistant: 您好，我是 Llama-Chinese，一个由 LLaMA Factory 开发的 AI 助手，我可以回答各种问题，提供实用的建议和帮助。

User: What is your name?
Assistant: 您好，我是 LLaMA Factory 开发的 LLaMA-Chinese，一个 AI 助手，我可以回答各种问题，提供实用的建议和帮助。

User: 1+1等于多少？
Assistant: 1+1等于2。

User: 你是MeTa公司发明的LLaMa吗
Assistant: 不是，我是由 LLaMA Factory 开发的 LLaMA-Chinese。

User: 你能做什么？
Assistant: 我可以回答各种问题，提供实用的建议和帮助。我可以翻译文字、解释概念、提供事务性信息、帮助解决问题等。我努力成为一个有用的 AI 助手。

User: 写一首关于春天的诗
Assistant: 春意浓，花初绽，
绿意满园，春风惹，
鸟儿啼鸣，虫儿鸣，
春暖花开，美如画。

青山绿水，相互辉，
花香鸟语，四季和，
春意浓，心旷神怡，
春天的美，永远留存。

User: 我叫张三
Assistant: 您好，张三！我是 LLaMA-Chinese，很高兴与您相遇。请问我能为您做些什么？

User: 你还记得我的名字吗？
Assistant: 当然记得！您之前告诉我的名字是张三。

User: 我之前让你写了一首关于冬天的诗，对吗？
A

## 合并 LoRA 权重并上传模型

注意：Colab 免费版仅提供了 12GB 系统内存，而合并 8B 模型的 LoRA 权重需要至少 18GB 系统内存，因此你 **无法** 在免费版运行此功能。

In [23]:
!huggingface-cli login

/bin/bash: line 1: huggingface-cli: command not found


In [27]:
import json

args = dict(
  model_name_or_path="unsloth/llama-3-8b-Instruct-bnb-4bit", # 使用非量化的官方 Llama-3-8B-Instruct 模型
  adapter_name_or_path="llama3_lora",                       # 加载之前保存的 LoRA 适配器
  template="llama3",                                        # 和训练保持一致
  finetuning_type="lora",                                   # 和训练保持一致
  export_dir="llama3_lora_merged",                          # 合并后模型的保存目录
  export_size=2,                                            # 合并后模型每个权重文件的大小（单位：GB）
  export_device="cpu",                                      # 合并模型使用的设备：`cpu` 或 `auto`
  # export_hub_model_id="your_id/your_model",               # 用于上传模型的 HuggingFace 模型 ID
)

json.dump(args, open("merge_llama3.json", "w", encoding="utf-8"), indent=2)

%cd /content/LLaMA-Factory/

!llamafactory-cli export merge_llama3.json

/content/LLaMA-Factory
Traceback (most recent call last):
  File "/usr/local/bin/llamafactory-cli", line 8, in <module>
    sys.exit(main())
             ^^^^^^
  File "/content/LLaMA-Factory/src/llamafactory/cli.py", line 24, in main
    launcher.launch()
  File "/content/LLaMA-Factory/src/llamafactory/launcher.py", line 152, in launch
    export_model()
  File "/content/LLaMA-Factory/src/llamafactory/train/tuner.py", line 129, in export_model
    model_args, data_args, finetuning_args, _ = get_infer_args(args)
                                                ^^^^^^^^^^^^^^^^^^^^
  File "/content/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 471, in get_infer_args
    model_args, data_args, finetuning_args, generating_args = _parse_infer_args(args)
                                                              ^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 225, in _parse_infer_args
    return _parse_args(parser, args, allow_ext

In [30]:
%cd /content/LLaMA-Factory/

# 直接使用命令行参数，不用配置文件
!llamafactory-cli export \
    --model_name_or_path unsloth/llama-3-8b-Instruct-bnb-4bit \
    --adapter_name_or_path llama3_lora \
    --template llama3 \
    --finetuning_type lora \
    --export_dir llama3_lora_merged \
    --export_size 2 \
    --export_device cpu \
    --stage sft

/content/LLaMA-Factory
[INFO|configuration_utils.py:667] 2026-02-14 11:14:44,508 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-3-8b-Instruct-bnb-4bit/snapshots/fd5a4dc328319c1cfe9489eccfb9c6406bdfd469/config.json
[INFO|configuration_utils.py:739] 2026-02-14 11:14:44,510 >> Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "dtype": "bfloat16",
  "eos_token_id": 128009,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 128255,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat