In [1]:
# !nvcc --version
# import torch
# print(torch.__version__)

In [2]:
# Install Unsloth with the %pip magic rather than a `!pip` shell escape, so the
# package is installed into the environment of the running kernel.
%pip install unsloth



In [3]:
import torch
from unsloth import FastLanguageModel
# Maximum sequence length (in tokens) the model is set up for; reused by the trainer.
max_seq_length = 2048
# None lets Unsloth auto-detect the dtype for the current GPU (float16 on cards
# without bfloat16 support, bfloat16 otherwise). This keeps the loaded model
# consistent with the fp16/bf16 flags that the training cell derives from
# is_bfloat16_supported(), instead of forcing float16 on every device.
dtype = None
# Load the weights 4-bit quantized to reduce GPU memory use.
load_in_4bit = True

# Download the base checkpoint and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/DeepSeek-R1-Distill-Llama-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.14: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 6.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [4]:
# Inference prompt template with two positional placeholders: the question and
# the (normally empty) start of the model's reasoning right after "<think>".
# Each piece below is one line of the prompt, written out with its explicit
# newline so the exact whitespace is visible.
# NOTE(review): "step-by" / "step" are split across two lines in the prompt
# text itself; this is reproduced as-is.
prompt_style = (
    "Below is an instruction that describes a task, paired with\n"
    "an input that provides further context.\n"
    "Write a response that appropriately completes the request.\n"
    "Before answering, think carefully about the question and create a step-by\n"
    "step chain of thoughts to ensure a logical and accurate response.\n"
    "### Instruction:\n"
    "You are a medical expert with advanced knowledge in clinical reasoning,\n"
    "diagnostics, and treatment planning.\n"
    "Please answer the following medical question.\n"
    "### Question:\n"
    "{}\n"
    "### Response:\n"
    "<think>{}"
)

In [5]:
# Baseline check: ask the model a sample question before any fine-tuning.
# (Fixed a missing space in the question text: "lossduring" -> "loss during".)
question = "A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, what would cystometry most likely reveal about her residual volume and detrusor contractions?"

# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# str.format() fills the two "{}" placeholders of the prompt: the question, and
# an empty string so the model continues right after "<think>".
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)

# Convert the generated token ids back to readable text and print only what
# follows the "### Response:" marker.
response = tokenizer.batch_decode(outputs)
print(response[0].split("### Response:")[1])


<think>
Okay, so I have this medical question to answer. Let me try to break it down step by step. The patient is a 61-year-old woman with a history of involuntary urine loss during activities like coughing or sneezing, but she doesn't leak at night. She undergoes a gynecological exam and a Q-tip test. The question is asking what cystometry would most likely reveal about her residual volume and detrusor contractions.

First, I need to understand the context. Involuntary urine loss during activities like coughing or sneezing suggests a possible issue with the lower urinary tract, particularly the bladder. Since it's not happening at night, it's less likely to be related to incontinence due to aging or other night-time factors.

The gynecological exam and Q-tip test are mentioned. I recall that the Q-tip test is often used to assess urethral function. It involves inserting a Q-tip catheter into the urethra and measuring the residual volume in the bladder and the pressure generated durin

In [6]:
# Training prompt template with three positional placeholders: the question,
# the chain-of-thought placed between <think> and </think>, and the final
# answer. The trailing spaces before most newlines are part of the template
# and are written out explicitly here so they are not lost by accident.
# NOTE(review): "step-by" / "step" are split across two lines in the prompt
# text itself; this is reproduced as-is.
train_prompt_style = (
    "Below is an instruction that describes a task, paired \n"
    "with an input that provides further context.  \n"
    "Write a response that appropriately completes the request.  \n"
    "Before answering, think carefully about the question and create a step-by\n"
    "step chain of thoughts to ensure a logical and accurate response. \n"
    "### Instruction: \n"
    "You are a medical expert with advanced knowledge in clinical reasoning, \n"
    "diagnostics, and treatment planning.  \n"
    "Please answer the following medical question.  \n"
    "### Question: \n"
    "{} \n"
    "### Response: \n"
    "<think> \n"
    "{} \n"
    "</think> \n"
    "{}"
)


In [7]:
# End-of-sequence token taken from the tokenizer; it must be appended to every
# training text (presumably so the fine-tuned model learns where to stop).
EOS_TOKEN = tokenizer.eos_token  
 
def formatting_prompts_func(examples):
    """Render a batch of dataset rows into complete training prompts.

    Args:
        examples: Batched mapping holding parallel sequences under the keys
            "Question", "Complex_CoT" and "Response".

    Returns:
        A dict with the single key "text": one string per row, built by
        filling train_prompt_style with (question, chain-of-thought, response)
        and appending EOS_TOKEN.
    """
    rows = zip(examples["Question"], examples["Complex_CoT"], examples["Response"])
    return {
        "text": [
            train_prompt_style.format(question, reasoning, answer) + EOS_TOKEN
            for question, reasoning, answer in rows
        ],
    }

In [8]:
from datasets import load_dataset 
# Load the "en" configuration of the medical reasoning SFT dataset, keeping
# only the first 500 training rows as fine-tuning data.
dataset = load_dataset(
    "FreedomIntelligence/medical-o1-reasoning-SFT",
    "en",
    split="train[0:500]",
    trust_remote_code=True,
)

# Add a "text" column containing the fully rendered training prompt per row.
dataset = dataset.map(formatting_prompts_func, batched=True)

# Display the resulting schema together with the first rendered prompt.
dataset.features, dataset["text"][0]

README.md:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

medical_o1_sft.json:   0%|          | 0.00/74.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25371 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

({'Question': Value(dtype='string', id=None),
  'Complex_CoT': Value(dtype='string', id=None),
  'Response': Value(dtype='string', id=None),
  'text': Value(dtype='string', id=None)},
 "Below is an instruction that describes a task, paired \nwith an input that provides further context.  \nWrite a response that appropriately completes the request.  \nBefore answering, think carefully about the question and create a step-by\nstep chain of thoughts to ensure a logical and accurate response. \n### Instruction: \nYou are a medical expert with advanced knowledge in clinical reasoning, \ndiagnostics, and treatment planning.  \nPlease answer the following medical question.  \n### Question: \nA 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, what would cystometry most likely reveal about her residual volume and detrusor contractions? \n### R

In [9]:
# Wrap the base model with LoRA adapters on the attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
        "gate_proj", "up_proj", "down_proj",      # MLP projections
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    # True or "unsloth" for very long context
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

Unsloth 2025.3.14 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [10]:
from trl import SFTTrainer 
from transformers import TrainingArguments 
from unsloth import is_bfloat16_supported 
# Hyper-parameters for the supervised fine-tuning run.
# This is a full one-epoch run with a warmup ratio. For a short smoke test,
# swap num_train_epochs / warmup_ratio for max_steps=60 and warmup_steps=5.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    warmup_ratio=0.1,
    learning_rate=2e-4,
    # Mixed precision: bf16 when the GPU supports bfloat16, fp16 otherwise.
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=10,
    # 8-bit AdamW: lower optimizer memory footprint for large models.
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    # Disable wandb reporting.
    report_to="none",
)

# Trainer over the rendered "text" column of the prepared dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=training_args,
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

In [11]:
# Run the fine-tuning loop; whatever train() returns is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 1 | Total steps = 62
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.0014
20,1.4647
30,1.3905
40,1.2948
50,1.3275
60,1.2962


In [12]:
# Same sample question as the baseline check, asked again after fine-tuning.
# The text contains hard line breaks (each preceded by a space), written out
# explicitly here.
question = (
    "A 61-year-old woman with a long history of involuntary urine loss \n"
    "during activities like coughing or sneezing but no leakage at night undergoes \n"
    "a gynecological exam and Q-tip test. Based on these findings, what would \n"
    "cystometry most likely reveal about her residual volume and detrusor \n"
    "contractions?"
)

# Unsloth has 2x faster inference!
FastLanguageModel.for_inference(model)

# Render the prompt (reasoning slot left empty) and move the tensors to the GPU.
prompt_text = prompt_style.format(question, "")
inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)

# Decode the generated ids and print only what follows "### Response:".
response = tokenizer.batch_decode(outputs)
print(response[0].split("### Response:")[1])


<think> 
Okay, let's see. We have a 61-year-old woman who has been dealing with involuntary urine loss whenever she coughs or sneezes. That sounds like a classic case of urgency incontinence. This kind of situation usually happens when there's an issue with the bladder's capacity, like it's not holding up against the pressure from coughing.

Now, when they did the Q-tip test, I'm guessing it came back negative. That means the urethra wasn't obstructed, so the problem isn't in the neck of the bladder or urethra. If the Q-tip test had been positive, we'd know it was about the urethral obstruction, but since it's negative, we can rule that out. So, we're dealing with something else here.

Given that she's experiencing urgency without any leakage at night, it makes me think about the bladder's capacity. It sounds like her bladder can't hold the usual amount of urine under pressure, which is typical for urgency incontinence. So, I'm pretty sure her bladder can't hold much volume before it 

In [13]:
# Local directory that receives the fine-tuned artifacts.
new_model_local = "DeepSeek-R1-Medical-COT"

# Save the (PEFT-wrapped) model and the tokenizer files.
model.save_pretrained(new_model_local)
tokenizer.save_pretrained(new_model_local)

# Additionally write a merged 16-bit checkpoint.
# NOTE(review): this targets the same directory as the save_pretrained() call
# above, so both sets of files end up side by side; confirm that is intended.
model.save_pretrained_merged(
    new_model_local,
    tokenizer,
    save_method="merged_16bit",
)

Unsloth: You have 2 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 6.0G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 18.6 out of 31.35 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 38%|███▊      | 12/32 [00:00<00:00, 26.60it/s]
We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:18<00:00,  1.69it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving DeepSeek-R1-Medical-COT/pytorch_model-00001-of-00004.bin...
Unsloth: Saving DeepSeek-R1-Medical-COT/pytorch_model-00002-of-00004.bin...
Unsloth: Saving DeepSeek-R1-Medical-COT/pytorch_model-00003-of-00004.bin...
Unsloth: Saving DeepSeek-R1-Medical-COT/pytorch_model-00004-of-00004.bin...
Done.


用 Kaggle Secrets 或 环境变量 来存储 Hugging Face Token，而不是直接写在代码里

In [14]:
from huggingface_hub import login 
from kaggle_secrets import UserSecretsClient 
# Fetch the Hugging Face access token from Kaggle Secrets (instead of
# hard-coding it in the notebook) and authenticate with the Hub.
user_secrets = UserSecretsClient() 
hf_token = user_secrets.get_secret("HF_TOKEN")  # "HF_TOKEN" is the name the secret was stored under
login(hf_token) 

In [15]:
# Target repository on the Hugging Face Hub.
new_model_online = "MelodyOfTears/DeepSeek-R1-Medical-COT"

# Upload the (PEFT-wrapped) model and the tokenizer.
model.push_to_hub(new_model_online)
tokenizer.push_to_hub(new_model_online)

# Then upload a merged 16-bit checkpoint to the same repository.
model.push_to_hub_merged(
    new_model_online,
    tokenizer,
    save_method="merged_16bit",
)

README.md:   0%|          | 0.00/632 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/MelodyOfTears/DeepSeek-R1-Medical-COT


No files have been modified since last commit. Skipping to prevent empty commit.
Unsloth: You are pushing to hub in Kaggle environment.
To save memory, we shall move MelodyOfTears/DeepSeek-R1-Medical-COT to /tmp/DeepSeek-R1-Medical-COT
Unsloth: Will remove a cached repo with size 1.6K


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 17.85 out of 31.35 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [00:18<00:00,  1.71it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving /tmp/DeepSeek-R1-Medical-COT/pytorch_model-00001-of-00004.bin...
Unsloth: Saving /tmp/DeepSeek-R1-Medical-COT/pytorch_model-00002-of-00004.bin...
Unsloth: Saving /tmp/DeepSeek-R1-Medical-COT/pytorch_model-00003-of-00004.bin...
Unsloth: Saving /tmp/DeepSeek-R1-Medical-COT/pytorch_model-00004-of-00004.bin...


  0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/MelodyOfTears/DeepSeek-R1-Medical-COT
