In [1]:
import json
import os

import pandas as pd
import torch
from datasets import Dataset
from modelscope import snapshot_download, AutoTokenizer
from transformers import (
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
)
import swanlab


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.9
CUDA SETUP: Detected CUDA version 120
CUDA SETUP: Loading binary /home/lick/tools/anaconda3/envs/TCMLLM/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda120.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


In [2]:
# ----------------- 数据集转换 -----------------
def dataset_jsonl_transfer(origin_path: str, new_path: str):
    """
    Rewrite a raw JSONL dataset into the instruction-tuning JSONL layout.

    Each non-blank input line is expected to be a JSON object of the form:
    {
        "question": "...",
        "think": "...",
        "answer": "..."
    }
    Each output line is a JSON object of the form:
    {
        "instruction": PROMPT,
        "input": question,
        "output": "<think>...</think>\\n..."
    }

    Args:
        origin_path: Path of the raw JSONL file to read.
        new_path: Path of the converted JSONL file to write (overwritten).

    Raises:
        KeyError: If a record lacks "question", "think" or "answer".
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """

    def _to_sft_record(raw_line: str) -> dict:
        # Parse one raw line and reshape it; the reasoning trace is wrapped
        # in <think> tags and placed in front of the final answer.
        record = json.loads(raw_line)
        question = record["question"]
        completion = f"<think>{record['think']}</think>\n{record['answer']}"
        return {
            "instruction": PROMPT,
            "input": question,
            "output": completion,
        }

    # Read and convert everything first, so the destination file is only
    # opened once the whole source has been processed successfully.
    with open(origin_path, "r", encoding="utf-8") as source:
        converted = [
            _to_sft_record(raw_line) for raw_line in source if raw_line.strip()
        ]

    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    with open(new_path, "w", encoding="utf-8") as target:
        target.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in converted
        )


# ----------------- 预处理函数 -----------------
def process_func(example):
    """
    Tokenize one formatted sample into model-ready training features.

    The prompt is laid out as system + user + assistant-header using the
    <|im_start|>/<|im_end|> markers, then the target text from
    example["output"] is appended, followed by one terminator token.

    Args:
        example: Mapping with at least the keys "input" and "output".

    Returns:
        dict with "input_ids", "attention_mask" and "labels", each cut to at
        most MAX_LENGTH entries. Prompt positions in "labels" are -100 so
        that only the assistant part contributes to the loss.
    """
    prompt_text = (
        f"<|im_start|>system\n{PROMPT}<|im_end|>\n"
        f"<|im_start|>user\n{example['input']}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
    prompt_enc = tokenizer(prompt_text, add_special_tokens=False)
    answer_enc = tokenizer(example["output"], add_special_tokens=False)

    # NOTE(review): the sequence terminator is pad_token_id; confirm this is
    # the token the model is expected to emit at the end of an assistant turn.
    terminator = [tokenizer.pad_token_id]

    full_ids = prompt_enc["input_ids"] + answer_enc["input_ids"] + terminator
    full_mask = prompt_enc["attention_mask"] + answer_enc["attention_mask"] + [1]
    # Mask out the prompt so only assistant tokens (and the terminator) are trained.
    full_labels = (
        [-100] * len(prompt_enc["input_ids"])
        + answer_enc["input_ids"]
        + terminator
    )

    # Slicing is a no-op for sequences already within MAX_LENGTH.
    return {
        "input_ids": full_ids[:MAX_LENGTH],
        "attention_mask": full_mask[:MAX_LENGTH],
        "labels": full_labels[:MAX_LENGTH],
    }


# ----------------- 推理函数 -----------------
def predict(messages, model, tokenizer):
    """
    Generate a single reply for a chat-style message list.

    Example of `messages`:
    [
        {"role": "system", "content": PROMPT},
        {"role": "user", "content": "..." }
    ]

    Args:
        messages: List of {"role", "content"} dicts, rendered through the
            tokenizer's chat template with the generation prompt appended.
        model: Causal LM exposing `.device`, `.generate`, `.eval()` and
            `.train(mode)`.
        tokenizer: Tokenizer matching `model`.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        stripped and special tokens are skipped).
    """
    # Use the model's own device so this also works with device_map="auto".
    device = model.device

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    # Run generation in eval mode (this function may be called right after
    # training, when the model is still in train mode), and put the previous
    # mode back afterwards so callers see no change in model state.
    was_training = model.training
    model.eval()
    try:
        generated_ids = model.generate(
            input_ids=model_inputs.input_ids,
            # Pass the mask explicitly instead of letting generate() guess it
            # from the pad token.
            attention_mask=model_inputs.attention_mask,
            max_new_tokens=MAX_LENGTH,
        )
    finally:
        model.train(was_training)

    # Keep only the newly generated part of each sequence.
    generated_ids = [
        output_ids[len(input_ids) :]
        for input_ids, output_ids in zip(
            model_inputs.input_ids, generated_ids
        )
    ]
    response = tokenizer.batch_decode(
        generated_ids, skip_special_tokens=True
    )[0]
    return response



In [3]:
# ----------------- Basic configuration -----------------
# Environment variables for the SwanLab experiment tracker.
# NOTE(review): "SWANLAB_API" is set to a model id rather than a key or URL;
# confirm this variable name/value is what the SwanLab integration expects.
os.environ.update(
    {
        "SWANLAB_PROJECT": "qwen3-sft-medical",
        "SWANLAB_API": "Qwen/Qwen3-1.7B",
    }
)

# System prompt shared by data conversion, preprocessing and inference.
PROMPT = "你是一个医学专家，你需要根据用户的问题，给出带有思考的回答。"
# Token cap used both for training-sample truncation and for max_new_tokens.
MAX_LENGTH = 2048

# Record the run's key settings in the SwanLab config.
swanlab.config.update(
    dict(
        model="Qwen/Qwen3-1.7B",
        prompt=PROMPT,
        data_max_length=MAX_LENGTH,
    )
)

In [4]:
# ----------------- Model loading -----------------
# Download the Qwen model from ModelScope into a local directory.
# model_dir is the value returned by snapshot_download and is passed to the
# from_pretrained calls below.
model_dir = snapshot_download(
    "Qwen/Qwen3-1.7B",
    cache_dir="./../BaseModels/",
    revision="master",
)



Downloading Model to directory: ./../BaseModels/Qwen/Qwen3-1.7B


2025-12-01 16:39:37,063 - modelscope - INFO - Target directory already exists, skipping creation.


In [5]:
# Load the tokenizer from the downloaded model directory
# (the model weights are loaded in the next cell).
tokenizer = AutoTokenizer.from_pretrained(
    model_dir,
    use_fast=False,
    trust_remote_code=True,
)

# If no pad token is defined, fall back to the EOS token so that padding
# does not raise an error.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token



In [None]:
# Load the causal-LM weights in bfloat16 from the downloaded model directory,
# with device_map="auto" handling device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    device_map="auto",
    dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Recommended settings when gradient checkpointing is enabled
model.enable_input_require_grads()
if hasattr(model, "config"):
    model.config.use_cache = False  # keep compatible with gradient_checkpointing


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


NameError: name 'x' is not defined

In [None]:
# ----------------- Load and preprocess the datasets -----------------
train_dataset_path = "./../datas/delicate_medical_r1_data/train.jsonl"
test_dataset_path = "./../datas/delicate_medical_r1_data/val.jsonl"

train_jsonl_new_path = "train_format.jsonl"
test_jsonl_new_path = "val_format.jsonl"

# Convert the raw files only when the formatted copies do not exist yet
# (i.e. on the first run).
for _raw_path, _formatted_path in (
    (train_dataset_path, train_jsonl_new_path),
    (test_dataset_path, test_jsonl_new_path),
):
    if os.path.exists(_formatted_path):
        continue
    dataset_jsonl_transfer(_raw_path, _formatted_path)

# Training split: JSONL -> DataFrame -> Dataset -> tokenized features.
train_df = pd.read_json(train_jsonl_new_path, lines=True)
train_ds = Dataset.from_pandas(train_df)
train_dataset = train_ds.map(process_func, remove_columns=train_ds.column_names)

# Validation split, processed the same way.
eval_df = pd.read_json(test_jsonl_new_path, lines=True)
eval_ds = Dataset.from_pandas(eval_df)
eval_dataset = eval_ds.map(process_func, remove_columns=eval_ds.column_names)



In [None]:
# ----------------- Training configuration -----------------
# NOTE(review): output_dir is an absolute path; confirm it exists and is
# writable on the machine running this notebook.
args = TrainingArguments(
    output_dir="/root/autodl-tmp/output/Qwen3-1.7B",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    # `eval_strategy` is the current argument name; `evaluation_strategy` is
    # the deprecated spelling and is rejected by recent Transformers releases.
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=10,
    num_train_epochs=2,
    save_steps=400,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True,
    report_to="swanlab",
    run_name="qwen3-1.7B",
)

# Pads input_ids / attention_mask / labels to a common length per batch.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# ----------------- Start training -----------------
trainer.train()

# ----------------- Quick qualitative check -----------------
# Generate answers for the first three validation samples and log them.
test_df = pd.read_json(test_jsonl_new_path, lines=True).head(3)

test_text_list = []
for _, row in test_df.iterrows():
    instruction = row["instruction"]
    input_value = row["input"]

    messages = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": input_value},
    ]

    response = predict(messages, model, tokenizer)

    response_text = (
        f"Question: {input_value}\n"
        f"LLM: {response}\n"
    )

    test_text_list.append(swanlab.Text(response_text))
    print(response_text)

swanlab.log({"Prediction": test_text_list})
swanlab.finish()