In [2]:
import os
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer, LlamaForCausalLM
from peft import LoraConfig, get_peft_model

# ==========================================================
# 1. 加载模型
# ==========================================================
model_name = "Qwen/Qwen2.5-0.5B"


# 设置模型保存路径
llama_model_path = "/root/autodl-tmp/model/Qwen2.5-0.5B"
 
# 如果路径不存在，创建路径
os.makedirs(llama_model_path, exist_ok=True)
 
# 下载并保存 tokenizer 和模型
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(llama_model_path)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.save_pretrained(llama_model_path)


# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)


# ==========================================================
# 2. LoRA 配置
# ==========================================================
lora_config = LoraConfig(
    r=8, lora_alpha=16, target_modules=["q_proj","v_proj"],
    lora_dropout=0.1, bias="none", task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
print("模型参数量（含LoRA）:", model.get_memory_footprint())
# 模型参数量（含LoRA）: 1978293888
# 模型参数量（含LoRA）: 990228352

模型参数量（含LoRA）: 1978293888


In [6]:
from transformers import DataCollatorForLanguageModeling

# ==========================================================
# 3. 加载数据集（JSONL）
# ==========================================================
dataset = load_dataset("json", data_files={"train": "/root/autodl-tmp/data/train.jsonl", "test": "/root/autodl-tmp/data/test.jsonl"})

# ==========================================================
# 4. 数据预处理函数
# ==========================================================
def format_example(example):
    # 拼接 instruction + input + output 为单一文本（SFT格式）
    text = f"Instruction: {example['instruction']}\nInput: {example['input']}\nAnswer: {example['output']}"
    return {"text": text}

dataset = dataset.map(format_example)

def tokenize_fn(example):
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=1024,
        padding="max_length"
    )

tokenized_datasets = dataset.map(tokenize_fn, batched=True, remove_columns=dataset["train"].column_names)

# ==========================================================
# 5. 数据整理器
# ==========================================================
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/11675 [00:00<?, ? examples/s]

Map:   0%|          | 0/2919 [00:00<?, ? examples/s]

Map:   0%|          | 0/11675 [00:00<?, ? examples/s]

Map:   0%|          | 0/2919 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments, Trainer
# 6. 训练参数
# ==========================================================
training_args = TrainingArguments(
    output_dir="/root/autodl-tmp/rag_lora",
    per_device_train_batch_size=1,
    learning_rate=5e-4,
    gradient_accumulation_steps=4,  # 累积梯度模拟更大 batch
    num_train_epochs=3,
    save_strategy="epoch",
    logging_steps=50,
    optim="adamw_torch",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss
50,0.4721
100,0.3229
150,0.3107
200,0.3113
250,0.3101
300,0.3032
350,0.3056
400,0.3005
450,0.3009
500,0.2988


TrainOutput(global_step=8757, training_loss=0.2863214177683471, metrics={'train_runtime': 3247.1485, 'train_samples_per_second': 10.786, 'train_steps_per_second': 2.697, 'total_flos': 7.71337323085824e+16, 'train_loss': 0.2863214177683471, 'epoch': 3.0})

In [8]:
# ==========================================================
# 6. 保存LoRA权重
# ==========================================================
model.save_pretrained(f"/root/autodl-tmp/lora_adapter")
tokenizer.save_pretrained(f"/root/autodl-tmp/lora_adapter")

print("✅ LoRA fine-tuning finished. Adapters saved to:", f"/root/autodl-tmp/lora_adapter")

✅ LoRA fine-tuning finished. Adapters saved to: /root/autodl-tmp/lora_adapter
