<a href="https://colab.research.google.com/github/Chaos-woo/qwen3-ft-template/blob/main/%E5%BE%AE%E8%B0%83Qwen3_0_6b%E5%AE%9E%E6%88%98.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

环境安装

In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install transformers>=4.33.0
    !pip install bitsandbytes accelerate xformers==0.0.29.post3 peft==0.14.0 trl==0.15.2 triton cut_cross_entropy
    !pip install sentencepiece protobuf==3.20.3 datasets huggingface_hub hf_transfer
    !pip install unsloth_zoo>=2025.3.7
    !pip install --upgrade --no-cache-dir unsloth

引入Unsloth框架，导入模型和分词器

In [None]:
from unsloth import FastLanguageModel
import torch

# Reference list of other checkpoints published under the "unsloth" namespace.
# It is not referenced by the rest of this notebook as shown; the call below
# loads "unsloth/Qwen3-0.6B" instead.
fourbit_models = [
    "unsloth/Qwen3-1.7B-unsloth-bnb-4bit", # Qwen3 checkpoints, 1.7B up to 32B
    "unsloth/Qwen3-4B-unsloth-bnb-4bit",
    "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    "unsloth/Qwen3-14B-unsloth-bnb-4bit",
    "unsloth/Qwen3-32B-unsloth-bnb-4bit",

    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/Phi-4",
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit" # [NEW] We support TTS models!
] # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer; both names are reused by later cells.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-0.6B",
    max_seq_length = 2048,   # Context length - can be longer, but uses more memory
    load_in_4bit = True,     # 4bit uses much less memory
    load_in_8bit = False,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # We have full finetuning now!
    # token = "hf_...",      # use one if using gated models
)

设置模型参数

In [None]:
# Configure LoRA on the loaded model; the returned object replaces `model`
# and is what the trainer receives later.
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    # Layers to adapt (attention q/k/v/o and MLP gate/up/down, judging by the names)
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,  # fixed seed for reproducibility
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

从Hugging Face拉取数据集

In [None]:
from datasets import load_dataset
# Download the "ch1so/ruozhiba_qa" dataset, restricted by the split expression.
non_reasoning_dataset = load_dataset("ch1so/ruozhiba_qa", split = "train[:100]") # take the first 100 rows

查看数据集结构

In [None]:
# Display the dataset object as the cell output to inspect its structure.
non_reasoning_dataset

查看数据集第一行数据的结构

In [None]:
# Display the first record to see which fields each row carries.
non_reasoning_dataset[0]

将数据集修改为ShareGPT格式

In [None]:
from datasets import Dataset
from unsloth.chat_templates import standardize_sharegpt

# Build one two-turn chat per record: the record's "instruction" becomes the
# user turn and its "output" becomes the assistant turn.
conversations = [
    [
        {"role": "user", "content": record["instruction"]},
        {"role": "assistant", "content": record["output"]},
    ]
    for record in non_reasoning_dataset
]

# Wrap the plain list in a Dataset and pass it through standardize_sharegpt.
raw_conversations_ds = Dataset.from_dict({"conversations": conversations})
dataset = standardize_sharegpt(raw_conversations_ds)

# Render every conversation with the tokenizer's chat template;
# tokenize=False asks for text rather than token ids.
chat_column = dataset["conversations"]
non_reasoning_conversations = tokenizer.apply_chat_template(chat_column, tokenize=False)

查看转换后的数据结构

In [None]:
# Display the first chat-templated conversation as the cell output.
non_reasoning_conversations[0]

数据的合并与混合：可以加载多个目标任务不同的数据集，并按比例混合后一起训练。这样既能同时学习多种目标任务，又因为是按比例混合，可以减轻先后训练导致的对后训练任务的过拟合。这里仅作测试，所以只使用一种数据集来构造“混合”数据集。

加载CoT（思维链）数据集的一种方法
```
reasoning_dataset = load_dataset("unsloth/OpenMathReasoning-mini", split = "cot")
def generate_conversation(examples):
    problems  = examples["problem"]
    solutions = examples["generated_solution"]
    conversations = []
    for problem, solution in zip(problems, solutions):
        conversations.append([
            {"role" : "user",      "content" : problem},
            {"role" : "assistant", "content" : solution},
        ])
    return { "conversations": conversations, }
reasoning_conversations = tokenizer.apply_chat_template(
    reasoning_dataset.map(generate_conversation, batched = True)["conversations"],
    tokenize = False,
)
```
混合两种数据集的一种方法
```
import pandas as pd
chat_percentage = 0.75  # 比例参数需事先定义；此处取值仅为示例，请按需调整
non_reasoning_subset = pd.Series(non_reasoning_conversations)
non_reasoning_subset = non_reasoning_subset.sample(
    int(len(reasoning_conversations) * (1.0 - chat_percentage)),
    random_state = 2407,
)
data = pd.concat([
    pd.Series(reasoning_conversations),
    pd.Series(non_reasoning_subset)
])
data.name = "text"

from datasets import Dataset
combined_dataset = Dataset.from_pandas(pd.DataFrame(data))
combined_dataset = combined_dataset.shuffle(seed = 3407)
```

In [None]:
import pandas as pd
from datasets import Dataset
# Only one source is used here, so the "combined" dataset is just the chat-templated
# conversations. The Series is left unnamed, so its single column is exposed as "0",
# which is the key the formatting function reads.
conversation_series = pd.Series(non_reasoning_conversations)
conversation_frame = pd.DataFrame(conversation_series)
combined_dataset = Dataset.from_pandas(conversation_frame).shuffle(seed=3407)

配置训练配置

In [None]:
from trl import SFTTrainer, SFTConfig

def formatting_func(example):
    """Return the training text(s) of one dataset record as a list of strings.

    The text is read from the record's ``'0'`` field. A string is wrapped in a
    one-element list; a list is returned unchanged (the same object).

    Raises:
        ValueError: if the field is neither a string nor a list.
    """
    value = example['0']
    if isinstance(value, str):
        return [value]
    if isinstance(value, list):
        return value
    raise ValueError(f"Unexpected type for example['0']: {type(value)}")

# Build the supervised fine-tuning trainer from the LoRA model, the tokenizer and
# the shuffled dataset. The text of each record is supplied by formatting_func.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = combined_dataset,
    eval_dataset = None, # Can set up evaluation!
    formatting_func = formatting_func,
    args = SFTConfig(
        # NOTE(review): formatting_func reads column '0'; confirm how this field
        # name interacts with formatting_func in the installed trl/unsloth versions.
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 30,       # short demo run; see num_train_epochs above for a full pass
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use this for WandB etc
    ),
)

查看下当前的资源

In [None]:
# Snapshot the GPU's total memory and the peak memory reserved so far; later cells
# use start_gpu_memory and max_memory as the baseline for the training report.
bytes_per_gib = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

开始训练

In [None]:
# Run training; the result is kept because its metrics are printed afterwards.
trainer_stats = trainer.train()

训练完成后再查看一下资源

In [None]:
# Report the training time and the peak reserved GPU memory, both in GB and as a
# share of total memory. start_gpu_memory and max_memory come from the snapshot
# taken before training.
bytes_per_gib = 1024 ** 3
train_seconds = trainer_stats.metrics['train_runtime']

used_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"{train_seconds} seconds used for training.")
print(f"{round(train_seconds/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

模型对话效果测试

In [None]:
from transformers import TextStreamer

# Single-turn prompt used to spot-check the fine-tuned model.
messages = [{"role": "user", "content": "爸爸再婚，我是不是就有了个新娘？"}]

# Render the prompt as text with the chat template.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # Must add for generation
    enable_thinking=False,       # Disable thinking
)

# Tokenize on the GPU and stream the reply as it is generated.
model_inputs = tokenizer(text, return_tensors="pt").to("cuda")
reply_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    **model_inputs,
    max_new_tokens=256,  # Increase for longer outputs!
    temperature=0.7,     # Sampling settings for non-thinking mode
    top_p=0.8,
    top_k=20,
    streamer=reply_streamer,
)

推送到Hugging Face

In [None]:
import os

# Upload the adapter and the tokenizer to the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable instead of being
# written into the notebook, where it would leak through saved copies or version
# control. Create one at https://huggingface.co/settings/tokens and export it first.
# NOTE(review): when HF_TOKEN is unset, None is passed; huggingface_hub is expected
# to fall back to a cached login -- confirm for the installed version.
hf_token = os.environ.get("HF_TOKEN")
hub_repo_id = "ch1so/qwen3-0.6b-ruozhiba-100"  # change to <your-username>/<repo-name>

#model.save_pretrained("lora_model")  # Local saving
#tokenizer.save_pretrained("lora_model")
model.push_to_hub(hub_repo_id, token = hf_token) # Online saving
tokenizer.push_to_hub(hub_repo_id, token = hf_token) # Online saving

加载LoRA适配器，把我们刚才微调好的模型加载到环境中

In [None]:
# Reload model and tokenizer from the Hub repository the fine-tuned model was
# pushed to, replacing the in-memory objects.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="ch1so/qwen3-0.6b-ruozhiba-100",  # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length=2048,
    load_in_4bit=True,
)

将我们微调好的模型量化为gguf格式，再推送到Hugging Face上

In [None]:
import os

# Export the fine-tuned model to GGUF. The `if False:` sections are disabled
# templates for individual export variants; only the last section runs, and it
# uploads several quantizations in one call.
#
# The Hugging Face token is read from the HF_TOKEN environment variable rather than
# being written into the notebook. Get one at https://huggingface.co/settings/tokens.
# NOTE(review): when HF_TOKEN is unset, None is passed -- confirm that the installed
# unsloth/huggingface_hub versions fall back to a cached login.
hf_token = os.environ.get("HF_TOKEN")

# Save to 8bit Q8_0
if False:
    model.save_pretrained_gguf("model", tokenizer,)
# Change "hf" to your username in the templates below!
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, token = hf_token)

# Save to 16bit GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: # Pushing to HF Hub
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = hf_token)

# Save to q4_k_m GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: # Pushing to HF Hub
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = hf_token)

# Save to multiple GGUF options - much faster if you want multiple!
if True:
    model.push_to_hub_gguf(
        "ch1so/qwen3-0.6b-ruozhiba-100-gguf", # target repo: <username>/<repo-name>
        tokenizer,
        quantization_method = ["f16", "q4_k_m", "q8_0", "q5_k_m"],
        token = hf_token,
    )