In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, PeftModel
from trl import SFTConfig, SFTTrainer

In [2]:
model_path = "./Qwen3-4B-Instruct-2507"

dataset_path = "./data/input/sft_dataset_4000.json"

# 训练后 LoRA 适配器的临时保存路径
lora_adapter_path = "./qwen3-4b-sft-lora-adapter-8bit"
# 最终合并后完整模型的保存路径
merged_model_path = "./qwen3-4b-sft-merged-final"

In [3]:
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

compute_dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8 else torch.float16

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    torch_dtype=compute_dtype,
    device_map="auto",
    trust_remote_code=True,
)
model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# dataset = load_dataset("json", data_files=dataset_path, split="train")
#
# def format_prompt(example):
#     messages = [
#         {"role": "system", "content": "You are a helpful assistant specialized in cybersecurity and the MITRE ATT&CK framework."},
#         {"role": "user", "content": f"{example['instruction']}\n\n{example['input']}"},
#         {"role": "assistant", "content": example['output']}
#     ]
#     prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
#     return {"text": prompt}
#
# formatted_dataset = dataset.map(format_prompt)
#
# print("数据集格式化完成。示例如下：")
# print(formatted_dataset[0]['text'])

In [5]:

dataset = load_dataset("json", data_files=str(dataset_path), split="train")

def format_prompt_function(example):

    return {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant specialized in cybersecurity and the MITRE ATT&CK framework."},
            {"role": "user", "content": f"{example['instruction']}\n\n{example['input']}"},
            {"role": "assistant", "content": example['output']}
        ]
    }


formatted_dataset = dataset.map(format_prompt_function, remove_columns=list(dataset.features))

print("\n示例如下：")
print(formatted_dataset[0]['messages'])


示例如下：
[{'content': 'You are a helpful assistant specialized in cybersecurity and the MITRE ATT&CK framework.', 'role': 'system'}, {'content': 'Find the techniques and ID from MITRE ATT&CK framework.\n\nTrickBot has used macros in Excel documents to download and deploy the malware on the user’s machine.', 'role': 'user'}, {'content': 'T1059: Command and Scripting Interpreter', 'role': 'assistant'}]


In [6]:
lora_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

training_args = SFTConfig(
    dataset_text_field="text",
    max_length=1024,
    packing=True,                    # 启用 packing？
    #assistant_only_loss=True,        # 只在 assistant 的回复上计算损失    似乎qwen3不兼容


    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    output_dir="./results_8bit_new",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    optim="paged_adamw_8bit",
    save_steps=500,
    logging_steps=1,
    learning_rate=2e-4,
    weight_decay=0.001,
    bf16=True if compute_dtype == torch.bfloat16 else False,
    fp16=False,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=formatted_dataset,
    peft_config=lora_config,
)


Padding-free training is enabled, but the attention implementation is not set to 'flash_attention_2'. Padding-free training flattens batches into a single sequence, and 'flash_attention_2' is the only known attention mechanism that reliably supports this. Using other implementations may lead to unexpected behavior. To ensure compatibility, set `attn_implementation='flash_attention_2'` in the model configuration, or verify that your attention mechanism can handle flattened sequences.
You are using packing, but the attention implementation is not set to 'flash_attention_2' or 'kernels-community/vllm-flash-attn3'. Packing flattens batches into a single sequence, and Flash Attention is the only known attention mechanisms that reliably support this. Using other implementations may lead to cross-contamination between batches. To avoid this, either disable packing by setting `packing=False`, or set `attn_implementation='flash_attention_2'` or `attn_implementation='kernels-community/vllm-flash

In [7]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss
10,1.8469


KeyboardInterrupt: 

In [None]:
trainer.save_model(lora_adapter_path)

In [None]:
# 释放 GPU 显存
del model
del trainer
torch.cuda.empty_cache()

# 1. 以 16-bit 精度重新加载基础模型
print(f"\n正在以 {compute_dtype} 精度重新加载基础模型...")
base_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=compute_dtype,
    device_map="auto",
    trust_remote_code=True,
)

# 2. 加载 LoRA 适配器
print(f"正在从 '{lora_adapter_path}' 加载 LoRA 适配器...")
model_with_lora = PeftModel.from_pretrained(base_model, lora_adapter_path)

# 3. 合并权重
print("正在合并 LoRA 权重...")
merged_model = model_with_lora.merge_and_unload()
print("权重合并完成。")

# 4. 保存完整模型和分词器
print(f"正在将合并后的完整模型保存到 '{merged_model_path}'...")
merged_model.save_pretrained(merged_model_path)
tokenizer.save_pretrained(merged_model_path)

print("\n🎉 恭喜！完整的微调后模型已成功保存！")
print(f"您现在可以在 '{merged_model_path}' 目录下找到一个可以直接使用的模型。")