### Installation

In [14]:
%%capture
# Normally using pip install unsloth is enough

# Temporarily as of Jan 31st 2025, Colab has some issues with Pytorch
# Using pip install unsloth will take 3 minutes, whilst the below takes <1 minute:
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
!pip install --no-deps cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer

In [2]:
%%capture
!pip install unsloth
# 安装最新版本的Unsloth
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

### Unsloth

In [3]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = True

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
from huggingface_hub import login
# HT_KEY
# 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
hf_token = "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";
login(hf_token)

In [5]:
import wandb
# WB_KEY
# c4c6d76f96d159c331bf07fc4728de4ecfdde8e1
wb_token = "c4c6d76f96d159c331bf07fc4728de4ecfdde8e1";

wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune-DeepSeek-R1-Distill-Llama-8B on BelleGroup train_1M_CN Dataset',
    job_type="training",
    anonymous="allow"
)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mchord-0322[0m ([33mchord-0322-cc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


## Loading the model and tokenizer

In [6]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/DeepSeek-R1-Distill-Llama-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token,
)

==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

## Model inference before fine-tuning

In [7]:
prompt_style = """下面是一条描述任务的指令。
输出一个适当的完成任务的响应。
在回答之前，仔细思考问题，并创建一个循序渐进的思路链，以确保逻辑和准确的回答。

### instruction:
你是一名博览群书的学者，在各种学科方面都有一定的建树。
请回答下面的问题

### input:
{}

### output:
<think>{}"""

In [8]:
import torch

input = "解释一下量子力学的原理";

# 基于前面加载的模型进行推理
FastLanguageModel.for_inference(model)

model.config.torch_dtype = torch.float16

# token化前序定义的prompt_style
inputs = tokenizer([prompt_style.format(input, "")], return_tensors="pt").to("cuda")

# 执行模型
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)
# 对模型输出进行token解码
response = tokenizer.batch_decode(outputs)
# 输出解码结果
print(response[0])

<｜begin▁of▁sentence｜>下面是一条描述任务的指令。
输出一个适当的完成任务的响应。
在回答之前，仔细思考问题，并创建一个循序渐进的思路链，以确保逻辑和准确的回答。

### instruction:
你是一名博览群书的学者，在各种学科方面都有一定的建树。
请回答下面的问题

### input:
解释一下量子力学的原理

### output:
<think>
嗯，我现在要解释一下量子力学的原理。首先，我得回忆一下量子力学的基本概念。量子力学是研究微观粒子行为的科学，它结合了经典物理和概率论。最著名的公式之一是舒尔公式，用来描述粒子波动性。

接下来，我想到了量子叠加。传统的经典物理认为粒子有确定的位置和动量，但在量子力学中，粒子的状态是一个叠加态，包含多个可能的位置和动量的概率波动。这有点像波浪在海边的叠加，粒子的位置可能在多个地方同时存在，但概率告诉我们在哪些地方有更高的可能性。

然后是测量的问题。量子力学中，测量会改变系统的状态，这就是所谓的“测量问题”。如果你试图观测一个粒子的位置和动量，测量时它们不再同时具有确定的值，而是会因为测量的干扰而改变状态。这让人联想到纠缠现象，两个粒子无论相距多远，测量其中一个就会立即影响另一个。

还有纠缠态，这是两个或多个粒子之间的纠缠关系。它们的状态在被测量时会同步，无论距离多远。这有点像超乎寻常的瞬间沟通，但科学上认为这只是概率波的相互关联，而不是真正的信息传递。

最后，我需要把这些概念整理成一个连贯的解释。从基本原理、叠加态、测量问题到纠缠态，逐步引导读者理解量子力学的独特之处。可能还需要提到量子力学在科技中的应用，比如计算机、通信和材料科学，这样读者可以理解其重要性。

总之，量子力学颠覆了我们对世界的传统认知，展示了微观世界的神奇和复杂。通过这些原理，科学家们不断探索未知，推动技术进步。
</think>

量子力学是研究微观粒子行为的科学，它结合了经典物理和概率论。其核心原理包括：

1. **叠加态**：粒子的位置和动量可以同时存在多种可能性，形成概率波动。
2. **测量问题**：测量会改变粒子的状态，导致观测值的不确定性。
3. **纠缠态**：粒子间存在相互关联，无论距离多远，测量会同步。

这些原理展示了微观世界的神奇和复杂，推动了科技的进步。<｜end▁of▁sentence｜>

## Loading and processing the dataset

In [9]:
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN


def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = prompt_style.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return {
        "text": texts,
    }


In [10]:
from datasets import load_dataset
# dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT","en", split = "train[0:500]",trust_remote_code=True)
dataset = load_dataset("BelleGroup/train_1M_CN","default", split = "train[0:500]",trust_remote_code=True)
dataset = dataset.map(formatting_prompts_func, batched = True,)
dataset["text"][0]

README.md:   0%|          | 0.00/941 [00:00<?, ?B/s]

Belle_open_source_1M.json:   0%|          | 0.00/458M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/917424 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

'下面是一条描述任务的指令。\n输出一个适当的完成任务的响应。\n在回答之前，仔细思考问题，并创建一个循序渐进的思路链，以确保逻辑和准确的回答。\n\n### instruction:\n你是一名博览群书的学者，在各种学科方面都有一定的建树。\n请回答下面的问题\n\n### input:\n判断给定的文章是否符合语法规则。如果不符合，请提供修改建议。\n下面是一篇文章的开头: "为了探讨这个主题，本文将提供一系列数据和实例，以证明这一观点。"\n\n\n### output:\n<think><｜end▁of▁sentence｜>'

## Setting up the model

In [11]:
# 对模型进行低秩自适应（LoRA）微调
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # 设置低秩矩阵的秩（rank），值越小参数量越少
    # 目标模块列表，指定哪些层的权重需要被 LoRA 微调
    target_modules=[
        "q_proj",
         "k_proj",
         "v_proj",
         "o_proj",
         "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,  # LoRA 的缩放因子，控制低秩更新的幅度
    lora_dropout=0,  # LoRA 层的 dropout 概率，用于防止过拟合
    bias="none",  # 指定是否对 LoRA 层添加偏置项
    use_gradient_checkpointing="unsloth",  # 是否使用梯度检查点技术以节省显存
    random_state=3407,  # 随机种子，确保实验的可复现性
    use_rslora=False,  # 是否使用 RSLoRA（一种改进的 LoRA 变体），
    loftq_config=None,  # 配置 LoftQ（一种量化方法）
)


Unsloth 2025.2.12 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [15]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

AttributeError: 'PeftModelForCausalLM' object has no attribute '_unwrapped_old_generate'

## Model training

In [None]:
trainer_stats = trainer.train()

In [None]:
# Save the fine-tuned model
wandb.finish()

## Model inference after fine-tuning

In [None]:
input = "解释一下量子力学的原理";
FastLanguageModel.for_inference(model)  # Unsloth has 2x faster inference!
inputs = tokenizer([prompt_style.format(input, "")], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)
response = tokenizer.batch_decode(outputs)
print(response[0])


In [None]:
input = "写一首描述春夏秋冬的诗";
inputs = tokenizer([prompt_style.format(input, "")], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)
response = tokenizer.batch_decode(outputs)
print(response[0])

## Saving the model locally

In [None]:
new_model_online = "FastHappySharp/DeepSeek-R1-BellGroup"
new_model_local = "DeepSeek-R1-BellGroup"
model.save_pretrained(new_model_local) # Local saving
tokenizer.save_pretrained(new_model_local)

## Pushing the model to Hugging Face hub

In [None]:
model.push_to_hub(new_model_online) # Online saving
tokenizer.push_to_hub(new_model_online) # Online saving

In [None]:
model.save_pretrained_merged(new_model_local, tokenizer, save_method = "merged_16bit",)
model.push_to_hub_merged(new_model_online, tokenizer, save_method = "merged_16bit")