In [1]:
import os

from peft import LoraConfig, TaskType
from peft import get_peft_model
from transformers import AutoTokenizer
from transformers.models.qwen3 import Qwen3ForCausalLM

In [2]:
# Workaround: when transformers leaves ALL_PARALLEL_STYLES unset (None), give it an
# explicit list of style names so it can be used as a collection.
# NOTE(review): presumably this prevents a failure while loading the model in a later
# cell — confirm against the installed transformers version and drop it once unneeded.
from transformers import modeling_utils
if modeling_utils.ALL_PARALLEL_STYLES is None:
    modeling_utils.ALL_PARALLEL_STYLES = ["tp", "none", "colwise", 'rowwise']

In [3]:
# Directory of the local Qwen3-0.6B checkpoint.
# Set the QWEN3_MODEL_PATH environment variable to load the model from another
# location; when it is unset, the original hard-coded path is used unchanged.
qwen3 = os.environ.get(
    "QWEN3_MODEL_PATH",
    '/Users/ethanliu/Documents/models/Qwen/Qwen3-0.6B',
)

In [4]:
# Load the causal-LM weights from the local checkpoint as float16 on the 'mps' device.
# No tokenizer is loaded here; AutoTokenizer.from_pretrained(qwen3) would provide one.
qwen3_model = Qwen3ForCausalLM.from_pretrained(
    qwen3,
    device_map='mps',
    torch_dtype='float16',
)
# Evaluate `qwen3_model` on its own line to display the module structure.

In [5]:
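# Note: compute total_param and origin_train_param below BEFORE calling get_peft_model. The LoRA wrapper shares (and modifies) qwen3_model, so counts taken afterwards also include the LoRA weights.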

In [6]:
# Total number of parameters in the base model, expressed in units of 1024**3.
total_param = sum(weight.numel() for weight in qwen3_model.parameters()) / 1024 ** 3
total_param

0.55511474609375

In [7]:
# Number of parameters that require gradients before LoRA is applied,
# expressed in units of 1024**3.
origin_train_param = sum(
    weight.numel() for weight in qwen3_model.parameters() if weight.requires_grad
) / 1024 ** 3
origin_train_param

0.55511474609375

In [23]:
# LoRA settings for causal-LM fine-tuning. Adapters target the four attention
# projections; every option not listed here keeps the library default.
lora_config = LoraConfig(
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM,
)

In [9]:
# Wrap the base model with the adapters described by lora_config.
# qwen3_model is modified in the process: the recorded outputs show its parameter
# listing containing the LoRA weights after this call.
lora_model = get_peft_model(qwen3_model, lora_config)

In [10]:
# lora_model

In [11]:
# Total number of parameters in the LoRA-wrapped model (base weights plus adapters),
# expressed in units of 1024**3.
lora_param = sum(weight.numel() for weight in lora_model.parameters()) / 1024 ** 3
lora_param

0.5572509765625

In [12]:
# Re-count the parameters reachable from the original qwen3_model object after
# wrapping, in units of 1024**3, to compare against the pre-LoRA total.
new_total_param = sum(weight.numel() for weight in qwen3_model.parameters()) / 1024 ** 3
new_total_param

0.5572509765625

In [14]:
# Number of parameters of the LoRA-wrapped model that still require gradients,
# expressed in units of 1024**3.
train_lora_param = sum(
    weight.numel() for weight in lora_model.parameters() if weight.requires_grad
) / 1024 ** 3
train_lora_param

0.00213623046875

In [19]:
# Trainable parameters with LoRA as a percentage of those trainable before wrapping.
train_lora_param / origin_train_param * 100

0.38482682792743267

In [31]:
# Print the name and dtype of every parameter up to the end of decoder layer 0.
# For the layer-0 q_proj base weight and its LoRA A matrix, also print the largest
# value and whether the parameter requires gradients.
for name, param in qwen3_model.named_parameters():
    # Check the stop condition before printing so nothing from layer 1 is emitted.
    # The trailing dot keeps 'layers.10', 'layers.11', ... from matching.
    if 'layers.1.' in name:
        break
    print(name, '---', param.dtype)
    if name in (
        'model.layers.0.self_attn.q_proj.lora_A.default.weight',
        'model.layers.0.self_attn.q_proj.base_layer.weight',
    ):
        print(param.max(), '   grad_is:', param.requires_grad)

model.embed_tokens.weight --- torch.float16
model.layers.0.self_attn.q_proj.base_layer.weight --- torch.float16
tensor(0.6445, device='mps:0', dtype=torch.float16)    grad_is: False
model.layers.0.self_attn.q_proj.lora_A.default.weight --- torch.float32
tensor(0.0312, device='mps:0', grad_fn=<MaxBackward1>)    grad_is: True
model.layers.0.self_attn.q_proj.lora_B.default.weight --- torch.float32
model.layers.0.self_attn.k_proj.base_layer.weight --- torch.float16
model.layers.0.self_attn.k_proj.lora_A.default.weight --- torch.float32
model.layers.0.self_attn.k_proj.lora_B.default.weight --- torch.float32
model.layers.0.self_attn.v_proj.base_layer.weight --- torch.float16
model.layers.0.self_attn.v_proj.lora_A.default.weight --- torch.float32
model.layers.0.self_attn.v_proj.lora_B.default.weight --- torch.float32
model.layers.0.self_attn.o_proj.base_layer.weight --- torch.float16
model.layers.0.self_attn.o_proj.lora_A.default.weight --- torch.float32
model.layers.0.self_attn.o_proj.lora_B