# 模型微调流程

## 导入模型

In [None]:
from unsloth import FastLanguageModel
import torch
import os
from datasets import load_dataset
from config import BASE_MODEL_PATH, NON_REASON_DATASET_PATH, REASON_DATASET_PATH, LORA_SAVE_PATH, MERGED_MODEL_PATH, COMBINED_DATASET_PATH
#初始释放显存
torch.cuda.empty_cache()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
max_seq_length = 8192
dtype = None
load_in_4bit = True
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = BASE_MODEL_PATH,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"{DEVICE_TYPE}:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.6.5: Fast Qwen3 patching. Transformers: 4.52.4.
   \\   /|    NVIDIA GeForce RTX 3070 Ti Laptop GPU. Num GPUs = 1. Max memory: 8.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 3/3 [00:15<00:00,  5.29s/it]


## 导入数据集
此处应为数据集处理脚本

### 无思考数据集在此处

### AIpacaHandler
此为AIpaca格式数据集处理方法
自动合并多个无思考数据集

In [3]:
import json
from datasets import load_dataset
from config import BASE_MODEL_PATH, NON_REASON_DATASET_PATH, REASON_DATASET_PATH, LORA_SAVE_PATH, MERGED_MODEL_PATH
all_conversations = []
for path in NON_REASON_DATASET_PATH:
    ds = load_dataset(data_files=path, split="train", path="json")

    for sample in ds:
        user_msg = {
            "role": "user",
            "content": sample["instruction"] + (("\n" + sample["input"]) if sample.get("input") else "")
        }
        assistant_msg = {
            "role": "assistant",
            "content": sample["output"]
        }
        all_conversations.append([user_msg, assistant_msg])

non_reasoning_dataset = {"conversations": all_conversations}
print(f"共加载无思考对话数: {len(non_reasoning_dataset['conversations'])}")

共加载无思考对话数: 4337


In [4]:
non_reasoning_dataset['conversations'][2000]

[{'role': 'user', 'content': '咽喉痉挛是什么'},
 {'role': 'assistant',
  'content': '咽喉痉挛是咽喉部的肌肉痉挛，正确的名称应该是咽肌痉挛和喉痉挛。咽肌痉挛就是软腭和咽肌发生规律的或不规律的收缩运动，甚者每分钟可达60～100次以上，与脉搏、呼吸无关。\n\n单纯的咽肌痉挛，大多原因不明。慢性咽炎病人、烟酒过度者、鼻分泌物长期刺激咽部及外部物理化学因素的影响均有可能导致咽肌痉挛的发生。咽肌的阵发性强直性痉挛较少见，癌肿的疼痛可引起，狂犬病、破伤风和脑膜炎以及颅内疾患皆可能发生咽肌强直性阵挛。阵挛发作时，病人及旁人常可听到明显的肌肉收缩声。病人自诉可听见自己有耳鸣声，即所谓他觉性耳鸣；耳鸣声与脉搏不一致，压迫颈动脉时不消失，故为肌性他觉性耳鸣，病人常有自听过显之感，常有吞咽障碍，咽喉不适，反复作呕和局部痛感，常因精神恐惧和紧张而发作。\n\n喉痉挛分为成人喉痉挛，喉晕厥和蝉鸣性喉痉挛（小儿）。'}]

In [5]:
non_reasoning_conversations = tokenizer.apply_chat_template(
    non_reasoning_dataset["conversations"],
    tokenize = False,
)

### 有思考数据集在此处

### AIpacaHandler
此为AIpaca处理方法（有思考数据集）

In [6]:
reasoning_dataset = load_dataset(data_files=REASON_DATASET_PATH, split = "train", path="json")

In [7]:
reasoning_dataset[0]

{'instruction': 'Please answer the following multiple-choice question:\nA factory worker presents with excessive salivation, blue lines on gums, tremors, disturbed personality, insomnia, and loss of appetite. The most likely poisoning is -?\nA. Mercury\nB. Lead\nC. Arsenic\nD. Phosphorus',
 'input': '',
 'output': "<think>\nThe patient is a factory worker presenting with excessive salivation, blue lines on the gums, tremors, disturbed personality, insomnia, and loss of appetite. These symptoms collectively suggest a form of heavy metal poisoning. To determine the most likely poisoning, it is essential to meticulously analyze each symptom in the context of the provided options: Mercury, Lead, Arsenic, and Phosphorus.\n\nExcessive salivation, also known as ptyalism, is a significant symptom to consider. This symptom is notably associated with mercury poisoning, particularly with elemental mercury exposure, which can disrupt autonomic functions leading to increased salivation. Tremors, es

In [8]:
import json

dataset = {"conversations": []}

for sample in reasoning_dataset:
    user_msg = {
        "role": "user",
        "content": sample["instruction"] + (("\n" + sample["input"]) if sample.get("input") else "")
    }
    assistant_msg = {
        "role": "assistant",
        "content": sample["output"]
    }
    dataset["conversations"].append([user_msg, assistant_msg])

print(type(dataset), len(dataset["conversations"]))
print(dataset["conversations"][1])
reasoning_dataset = dataset

<class 'dict'> 3000
[{'role': 'user', 'content': "Please answer the following multiple-choice question:\nRett's syndrome occurs due to deficiency of ?\nA. Niacin\nB. Biotin\nC. Carotene\nD. Vit D"}, {'role': 'assistant', 'content': "<think>\nRett's syndrome is a severe neurodevelopmental disorder that predominantly affects females and is characterized by a regression in cognitive and motor skills after a period of apparently normal development. It typically manifests in early childhood, with symptoms becoming noticeable between the ages of 6 to 18 months. The condition involves a range of neurological symptoms, including loss of purposeful hand movements, slowed growth, seizures, and difficulties with communication and social interaction. Importantly, Rett's syndrome is a genetic disorder caused by mutations in the MECP2 gene, which is located on the X chromosome. This gene plays a critical role in brain development and function by regulating the expression of other genes essential for

In [9]:
reasoning_conversations = tokenizer.apply_chat_template(
    reasoning_dataset["conversations"],
    tokenize = False,
)

### 数据集合并
有思考数据为reasoning_dataset  
无思考数据为non_reasoning_dataset  
现将二者合并为combined_dataset  

In [10]:
print(len(reasoning_conversations))
print(len(non_reasoning_conversations))

3000
4337


根据有思考和无思考数据比例合并

In [11]:
#这里就是调整比例
chat_percentage = 0.75
import pandas as pd
reasoning_subset = pd.Series(reasoning_conversations)
# reasoning_subset = reasoning_subset.sample(
#     int(len(non_reasoning_conversations) / (1.0 - chat_percentage)),
#     random_state = 2407,
# )

data = pd.concat([
    pd.Series(non_reasoning_conversations),
    pd.Series(reasoning_subset)
])
data.name = "text"

from datasets import Dataset
combined_dataset = Dataset.from_pandas(pd.DataFrame(data))
combined_dataset = combined_dataset.shuffle(seed = 3407)

In [27]:
len(combined_dataset)

7337

In [28]:
combined_dataset

Dataset({
    features: ['text', '__index_level_0__'],
    num_rows: 7337
})

可以将合并后的数据集导出，下次直接用此导入后训练

In [None]:
combined_dataset.to_json(COMBINED_DATASET_PATH, force_ascii=False)

Creating json from Arrow format: 100%|██████████| 8/8 [00:01<00:00,  5.54ba/s]


37263409

直接导入完全处理好的数据集

In [None]:
from datasets import load_dataset

# 直接用load_dataset加载json文件，得到的就是Dataset对象
combined_dataset = load_dataset("json", data_files=COMBINED_DATASET_PATH, split="train")
print(type(combined_dataset))
print(len(combined_dataset))
combined_dataset[0]

<class 'datasets.arrow_dataset.Dataset'>
7337


{'text': '<|im_start|>user\n罗格列酮片的执行标准<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n国家食品药品监督管理局国家药品标准WS1-XG-019-2014。<|im_end|>\n',
 '__index_level_0__': 3177}

## 模型训练

In [14]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = combined_dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_num_proc = 1,
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 30,
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = None, # Use this for WandB etc
    ),
)

Unsloth 2025.6.5 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.
average_tokens_across_devices is set to True but it is invalid when world size is1. Turn it to False automatically.
Unsloth: Tokenizing ["text"]: 100%|██████████| 7337/7337 [00:04<00:00, 1515.58 examples/s]


In [16]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,337 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 66,060,288/4,000,000,000 (1.65% trained)


Step,Training Loss
1,1.5268
2,1.5841
3,1.5839
4,1.3631
5,1.2288
6,1.1844
7,1.0633
8,1.2087
9,1.162
10,1.1539


## 模型保存
注意修改路径，保存为符合HF规范的文件夹

In [17]:
# 保存 LoRA adapter 权重
model.save_pretrained(LORA_SAVE_PATH)

# 保存 tokenizer（可以跳过，如果没有改动过）
tokenizer.save_pretrained(LORA_SAVE_PATH)

('C:/Users/23756/demo/Qwen3-4b-v0.3\\tokenizer_config.json',
 'C:/Users/23756/demo/Qwen3-4b-v0.3\\special_tokens_map.json',
 'C:/Users/23756/demo/Qwen3-4b-v0.3\\chat_template.jinja',
 'C:/Users/23756/demo/Qwen3-4b-v0.3\\vocab.json',
 'C:/Users/23756/demo/Qwen3-4b-v0.3\\merges.txt',
 'C:/Users/23756/demo/Qwen3-4b-v0.3\\added_tokens.json',
 'C:/Users/23756/demo/Qwen3-4b-v0.3\\tokenizer.json')

In [18]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer


base_model_path = BASE_MODEL_PATH
lora_model_path = LORA_SAVE_PATH  # 你训练后保存的路径
merged_model_path = MERGED_MODEL_PATH  # 合并后保存路径
device = "cuda" if torch.cuda.is_available() else "cpu"
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,       # 根据实际显存可选 float16/float32
    low_cpu_mem_usage=True
).to(device)

tokenizer = AutoTokenizer.from_pretrained(base_model_path)

# 2. 从 LoRA adapter 加载增量权重
print("Loading LoRA adapter...")
peft_model = PeftModel.from_pretrained(
    base_model,
    lora_model_path,
    torch_dtype=torch.float16
)

# 3. 合并 LoRA 到基础模型
print("Merging LoRA weights into the base model...")
merged_model = peft_model.merge_and_unload()

# 4. 保存合并后的模型和分词器
print(f"Saving merged model to {merged_model_path} ...")
merged_model.save_pretrained(merged_model_path)
tokenizer.save_pretrained(merged_model_path)

Loading checkpoint shards: 100%|██████████| 3/3 [00:12<00:00,  4.30s/it]


Loading LoRA adapter...
Merging LoRA weights into the base model...
Saving merged model to C:/Users/23756/demo/merged-Qwen3-4b-v0.3 ...


('C:/Users/23756/demo/merged-Qwen3-4b-v0.3\\tokenizer_config.json',
 'C:/Users/23756/demo/merged-Qwen3-4b-v0.3\\special_tokens_map.json',
 'C:/Users/23756/demo/merged-Qwen3-4b-v0.3\\chat_template.jinja',
 'C:/Users/23756/demo/merged-Qwen3-4b-v0.3\\vocab.json',
 'C:/Users/23756/demo/merged-Qwen3-4b-v0.3\\merges.txt',
 'C:/Users/23756/demo/merged-Qwen3-4b-v0.3\\added_tokens.json',
 'C:/Users/23756/demo/merged-Qwen3-4b-v0.3\\tokenizer.json')

## 模型量化
将HF格式的模型转换为GGUF格式的模型，便于vllm或ollama部署

执行命令将合并后HF模型转换为bin文件  
cd llama.cpp  
python convert_hf_to_gguf.py 你的合并模型路径 --outtype f16 --outfile bin文件路径  
  
再将bin文件量化为gguf文件(参数自选)  
f16相当于没量化，如果后续基模型用未量化的则可在服务器微调后选则q4_k等量化后在主机部署  
llama-quantize 输入bin文件 输出gguf文件 量化类型  
  
在根目录下输入以下指令以将你的模型导入ollama中  
ollama create 你的模型名称

## 对HF格式的模型进行简单测试

In [8]:
import requests
url = "http://localhost:11434/api/generate"
system_prompt = (
    "你是一个AI助手"
)
user_input = (
    "你好，你有思维链吗"
)
payload = {
    "model": "mymodel_v0.3:latest",  # 或者你在 Ollama 中导入时设置的模型名
    "system": system_prompt,
    "prompt": user_input,
    "stream": False,
    "thinking": True,  # 如果不需要思考过程，可以设置为 False
}
# 发送请求
response = requests.post(url, json=payload)
print(response.json())
text = response.json()["response"]
print(text)

{'model': 'mymodel_v0.3:latest', 'created_at': '2025-06-28T15:11:12.2114899Z', 'response': '<think>\n\n</think>\n\n我理解您对我的能力感兴趣。作为一款大型语言模型，我可以进行深度推理和复杂的逻辑分析，但我不具备自主的意识或情感。我的功能主要基于算法和数据分析，能够理解和生成自然语言文本、回答问题、创作内容等。\n\n如果您有任何具体的问题或需要帮助的地方，请随时告诉我！', 'done': True, 'done_reason': 'stop', 'context': [151644, 8948, 319, 56568, 101909, 15469, 110498, 151645, 319, 151644, 872, 319, 108386, 3837, 56568, 18830, 102141, 63314, 101037, 151645, 319, 151644, 77091, 319, 151667, 271, 151668, 271, 35946, 101128, 87026, 32664, 97611, 99788, 103198, 1773, 100622, 104794, 101951, 102064, 104949, 3837, 109944, 71817, 102217, 113272, 33108, 106888, 104913, 101042, 3837, 77288, 101553, 102094, 100842, 9370, 100708, 57191, 104934, 1773, 97611, 98380, 99558, 104210, 107018, 33108, 111540, 3837, 100006, 115167, 43959, 99795, 102064, 108704, 5373, 102104, 86119, 5373, 104223, 43815, 49567, 3407, 106870, 110117, 100398, 103936, 57191, 85106, 100364, 103958, 37945, 102422, 106525, 6313], 'total_duration': 638641410