## LoRA

视频一：https://www.bilibili.com/video/BV1fHmkYyE2w/?spm_id_from=333.1007.top_right_bar_window_default_collection.content.click&vd_source=071b23b9c7175dbaf674c65294124341  
视频二：https://www.bilibili.com/video/BV1YVpte7EFL/?spm_id_from=333.1007.top_right_bar_window_history.content.click&vd_source=071b23b9c7175dbaf674c65294124341  
视频三：https://www.bilibili.com/video/BV1dr421w7J5/?spm_id_from=333.337.search-card.all.click&vd_source=071b23b9c7175dbaf674c65294124341

博客一：https://zhuanlan.zhihu.com/p/658007966


In [1]:
# 视频一

import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class LinearLoRALayer(nn.Module):
    """Linear layer with a trainable low-rank (LoRA) update on a frozen weight.

    The effective weight is ``W + scale * (lora_a @ lora_b)`` where ``W`` is
    the weight of the wrapped ``nn.Linear`` (shape ``[out_features,
    in_features]``), ``lora_a`` has shape ``[out_features, r]`` and ``lora_b``
    has shape ``[r, in_features]``. ``lora_b`` starts at zero, so a freshly
    built layer produces exactly the output of the base linear layer.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        r: Integer rank of the update; ``r <= 0`` disables LoRA entirely.
        lora_alpha: Numerator of the scaling factor ``lora_alpha / r``.
        lora_dropout: Dropout probability applied to the input of the LoRA
            branch (the frozen base projection is never dropped).
        merge: If true, fold the update into ``linear.weight`` right after
            construction.
        **kwargs: Accepted and ignored.

    Raises:
        ValueError: If ``r`` is not an integer value.
    """

    def __init__(self, in_features, out_features, r=8, lora_alpha=1, lora_dropout=0.0, merge=False, **kwargs):
        super().__init__()
        # A fractional rank cannot be used as a tensor dimension; fail early
        # with a clear message instead of deep inside the tensor constructor.
        if r != int(r):
            raise ValueError(f"LoRA rank r must be an integer, got {r!r}")
        r = int(r)

        self.in_features = in_features
        self.out_features = out_features
        self.r = r
        self.lora_alpha = lora_alpha
        self.merge = merge
        self.lora_dropout = lora_dropout
        # Tracks whether the low-rank update is currently folded into
        # ``linear.weight``; forward() must not add it a second time.
        self.merged = False

        # nn.Linear stores its weight as [out_features, in_features] and
        # computes x @ weight.T for an input of shape [..., in_features].
        self.linear = nn.Linear(in_features, out_features)

        if r > 0:
            self.lora_a = nn.Parameter(torch.empty((out_features, r)))
            # Kaiming *uniform* initialisation (not Gaussian).
            nn.init.kaiming_uniform_(self.lora_a, a=math.sqrt(5))

            # Zero-initialised so that lora_a @ lora_b == 0 at the start.
            self.lora_b = nn.Parameter(torch.zeros((r, in_features)))
            self.scale = lora_alpha / r

            # Only the low-rank factors are trained.
            self.linear.weight.requires_grad = False

        self.dropout = nn.Dropout(lora_dropout) if lora_dropout > 0 else nn.Identity()

        if merge:
            self.merge_weight()

    def _delta_weight(self):
        """Return the scaled low-rank update, shape ``[out_features, in_features]``."""
        return self.scale * (self.lora_a @ self.lora_b)

    def merge_weight(self):
        """Fold the LoRA update into ``linear.weight`` (no-op if already merged)."""
        if self.r > 0 and not self.merged:
            with torch.no_grad():
                self.linear.weight += self._delta_weight()
            self.merged = True

    def unmerge_weight(self):
        """Remove a previously merged LoRA update from ``linear.weight``."""
        if self.r > 0 and self.merged:
            with torch.no_grad():
                self.linear.weight -= self._delta_weight()
            self.merged = False

    def forward(self, x):
        """Apply the layer to ``x`` of shape ``[..., in_features]``."""
        output = self.linear(x)
        if self.r > 0 and not self.merged:
            # x @ (lora_a @ lora_b).T == (x @ lora_b.T) @ lora_a.T, which
            # avoids materialising the full [out, in] update per call.
            low_rank = self.dropout(x) @ self.lora_b.T
            output = output + self.scale * (low_rank @ self.lora_a.T)
        return output
    

# Video 1: smoke test for LinearLoRALayer.

batch_size = 32
seq_len = 128
in_features = 768
out_features = 512
rank = 8
lora_alpha = 16
dropout = 0.1

# Create a test input
x = torch.randn(batch_size, seq_len, in_features)

# Test regular mode (no merge).
# The constructor keyword is ``lora_dropout``; a ``dropout=`` keyword would be
# swallowed by ``**kwargs`` and silently ignored.
lora_layer = LinearLoRALayer(
    in_features=in_features,
    out_features=out_features,
    r=rank,
    lora_alpha=lora_alpha,
    lora_dropout=dropout,
    merge=False,
)
# Eval mode switches dropout off so the forward passes below are
# deterministic and can be compared with each other.
lora_layer.eval()

# Forward pass
output = lora_layer(x)
print(f"Output shape (no merge): {output.shape}")

# Test merged mode
lora_layer_merged = LinearLoRALayer(
    in_features=in_features,
    out_features=out_features,
    r=rank,
    lora_alpha=lora_alpha,
    lora_dropout=dropout,
    merge=True,
)
lora_layer_merged.eval()

# Forward pass with merged weights
output_merged = lora_layer_merged(x)
print(f"Output shape (merged): {output_merged.shape}")

# Test weight merging/unmerging: the output after a full merge/unmerge cycle
# should match the output from before the cycle.
lora_layer.merge_weight()
output_after_merge = lora_layer(x)
lora_layer.unmerge_weight()
output_after_unmerge = lora_layer(x)

print("Max difference after merge/unmerge cycle:",
      torch.max(torch.abs(output - output_after_unmerge)).item())



Output shape (no merge): torch.Size([32, 128, 512])
Output shape (merged): torch.Size([32, 128, 512])
Max difference after merge/unmerge cycle: 0.0


In [2]:
# Video 2: load the base model from a local checkpoint.
from transformers import AutoModelForCausalLM

# Local checkpoint directory; replace with your own model path.
model_path = "/home/hpclp/disk/q/models/Qwen2.5-7B-Instruct"

# Use CUDA device index 2 when CUDA is available, otherwise fall back to CPU.
device = torch.device("cuda:2") if torch.cuda.is_available() else torch.device("cpu")

model = AutoModelForCausalLM.from_pretrained(model_path)
model = model.to(device)
model

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((3584,), eps=1e-06)
    (rotary_emb):

In [3]:
# Freeze every parameter of the loaded model.
for weight in model.parameters():
    weight.requires_grad_(False)

In [None]:
# 视频二

class LoRALayer(nn.Module):
    """Low-rank adapter computing ``(x @ W_a @ W_b) * alpha``.

    ``W_a`` has shape ``[in_dim, rank]`` and is drawn from a standard normal
    scaled by ``1 / sqrt(rank)``; ``W_b`` has shape ``[rank, out_dim]`` and
    starts at zero, so a freshly built adapter outputs zeros.
    """

    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        self.alpha = alpha
        # 1 / sqrt(rank), kept as a float32 tensor.
        init_scale = torch.tensor(rank).float().sqrt().reciprocal()
        down_proj = torch.randn(in_dim, rank) * init_scale
        self.W_a = nn.Parameter(down_proj)
        self.W_b = nn.Parameter(torch.zeros(rank, out_dim))

    def forward(self, x):
        """Project ``x`` down to ``rank`` dims, back up to ``out_dim``, then scale."""
        hidden = x @ self.W_a
        return (hidden @ self.W_b) * self.alpha

class LinearWithLoRA(nn.Module):
    """Wrap an existing linear layer and add a LoRA adapter's output to it.

    Args:
        linear: The layer to wrap; its ``in_features``, ``out_features`` and
            ``weight`` attributes are read.
        rank: Rank passed to ``LoRALayer``.
        alpha: Scaling value passed to ``LoRALayer``.
    """

    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(linear.in_features, linear.out_features, rank, alpha)
        # The adapter is built with default device/dtype; move it next to the
        # wrapped weight so wrapping a layer that already lives on a GPU (or
        # uses half precision) does not fail in forward() with a
        # device/dtype mismatch.
        self.lora.to(device=linear.weight.device, dtype=linear.weight.dtype)

    def forward(self, x):
        """Return the wrapped layer's output plus the adapter's output."""
        return self.linear(x) + self.lora(x)

In [5]:
# Video 2: wrap the selected projections of every decoder layer with LoRA.

lora_rank = 8
lora_alpha = 16
lora_q = True
lora_k = True
lora_v = True
lora_o = True
lora_mlp = True

# Debugging aid: iterate model.named_modules() and print each (name, module)
# pair to list the layers that can be wrapped.

# Attention projections paired with their on/off switch, in replacement order.
attention_targets = (
    ("q_proj", lora_q),
    ("k_proj", lora_k),
    ("v_proj", lora_v),
    ("o_proj", lora_o),
)
# Linear layers inside the MLP, all controlled by ``lora_mlp``.
mlp_targets = ("gate_proj", "up_proj", "down_proj")

for layer in model.model.layers:
    attention = layer.self_attn
    for proj_name, enabled in attention_targets:
        if enabled:
            wrapped = LinearWithLoRA(getattr(attention, proj_name), lora_rank, lora_alpha)
            setattr(attention, proj_name, wrapped)
    if lora_mlp:
        # Replace the linear layers of the MLP.
        feed_forward = layer.mlp
        for proj_name in mlp_targets:
            wrapped = LinearWithLoRA(getattr(feed_forward, proj_name), lora_rank, lora_alpha)
            setattr(feed_forward, proj_name, wrapped)


In [None]:
# Train only the LoRA parameters: freeze every parameter whose name does not
# contain "lora".
for name, param in model.named_parameters():
    if 'lora' in name:
        continue
    param.requires_grad = False

In [6]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): LinearWithLoRA(
            (linear): Linear(in_features=3584, out_features=3584, bias=True)
            (lora): LoRALayer()
          )
          (k_proj): LinearWithLoRA(
            (linear): Linear(in_features=3584, out_features=512, bias=True)
            (lora): LoRALayer()
          )
          (v_proj): LinearWithLoRA(
            (linear): Linear(in_features=3584, out_features=512, bias=True)
            (lora): LoRALayer()
          )
          (o_proj): LinearWithLoRA(
            (linear): Linear(in_features=3584, out_features=3584, bias=False)
            (lora): LoRALayer()
          )
        )
        (mlp): Qwen2MLP(
          (gate_proj): LinearWithLoRA(
            (linear): Linear(in_features=3584, out_features=18944, bias=False)
            (lora): LoRALayer()
  

## 视频三

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class LinearLoRALayer(nn.Module):
    """Wrap an existing linear layer with a trainable low-rank (LoRA) update.

    The wrapped weight has shape ``[out_features, in_features]`` and is
    frozen when ``r > 0``. The update is ``scale * (lora_b @ lora_a)`` with
    ``lora_a`` of shape ``[r, in_features]`` and ``lora_b`` of shape
    ``[out_features, r]``; ``lora_b`` starts at zero, so the wrapper initially
    reproduces the wrapped layer.

    Args:
        linear: The layer to wrap; its ``in_features``, ``out_features``,
            ``weight`` and ``bias`` are used.
        r: Rank of the update; ``r <= 0`` disables LoRA.
        lora_alpha: Numerator of the scaling factor ``lora_alpha / r``.
        lora_dropout: Dropout probability applied to the input of the LoRA
            branch.
        merge: When true (and ``r > 0``) the LoRA update is added in
            ``forward``; otherwise only the wrapped layer is applied.
        **kwargs: Accepted and ignored.
    """

    def __init__(self, linear, r=8, lora_alpha=16, lora_dropout=0.1, merge=True, **kwargs):
        super().__init__()
        self.r = r
        self.lora_alpha = lora_alpha
        self.merge = merge
        self.lora_dropout = lora_dropout
        self.linear1 = linear

        if r > 0:
            # Create the factors on the wrapped weight's device and dtype so
            # wrapping a layer that already sits on a GPU or uses reduced
            # precision does not break forward().
            factor_kwargs = {"device": linear.weight.device, "dtype": linear.weight.dtype}
            self.lora_a = nn.Parameter(torch.zeros((r, linear.in_features), **factor_kwargs))
            self.lora_b = nn.Parameter(torch.zeros((linear.out_features, r), **factor_kwargs))
            self.scale = lora_alpha / r
            self.linear1.weight.requires_grad = False

        self.dropout = nn.Dropout(lora_dropout) if lora_dropout > 0 else nn.Identity()

        self.initial_weights()

    def initial_weights(self):
        """Initialise ``lora_a`` with Kaiming normal and ``lora_b`` with zeros."""
        # The factors only exist when LoRA is enabled.
        if self.r > 0:
            nn.init.kaiming_normal_(self.lora_a, a=math.sqrt(5))
            nn.init.zeros_(self.lora_b)

    def forward(self, x):
        """Apply the layer to ``x`` of shape ``[..., in_features]``."""
        output = self.linear1(x)
        if self.r > 0 and self.merge:
            # Dropout regularises only the LoRA branch; the frozen base
            # projection is left intact. Going through the two small factors
            # avoids building the full [out, in] update on every call.
            low_rank = F.linear(self.dropout(x), self.lora_a)
            output = output + self.scale * F.linear(low_rank, self.lora_b)
        return output



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class LoRALayer(nn.Module):
    """Low-rank adapter computing ``scale * (x @ (lora_b @ lora_a).T)``.

    ``lora_a`` has shape ``[rank, in_features]`` and ``lora_b`` has shape
    ``[out_features, rank]``, so ``lora_b @ lora_a`` has the same
    ``[out_features, in_features]`` layout as an ``nn.Linear`` weight.
    ``lora_b`` starts at zero, so a freshly built adapter outputs zeros.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        rank: Rank of the update.
        alpha: Numerator of the scaling factor ``alpha / rank``.
    """

    def __init__(self, in_features, out_features, rank, alpha):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.rank = rank
        self.alpha = alpha

        self.lora_a = nn.Parameter(torch.zeros((rank, in_features)))
        self.lora_b = nn.Parameter(torch.zeros((out_features, rank)))
        # Use the constructor argument, not a module-level ``lora_alpha``.
        self.scale = alpha / rank

        self.initial_weights()

    def initial_weights(self):
        """Initialise ``lora_a`` with Kaiming normal and ``lora_b`` with zeros."""
        nn.init.kaiming_normal_(self.lora_a, a=math.sqrt(5))
        nn.init.zeros_(self.lora_b)

    def forward(self, x):
        """Return the scaled low-rank projection of ``x`` (``[..., in_features]``)."""
        # (x @ lora_a.T) @ lora_b.T equals x @ (lora_b @ lora_a).T without
        # building the full [out_features, in_features] matrix on every call.
        return self.scale * F.linear(F.linear(x, self.lora_a), self.lora_b)


class LinearLoRALayer(nn.Module):
    """Wrap an existing linear layer and add the output of a ``LoRALayer``.

    Args:
        linear: The layer to wrap; its ``in_features``, ``out_features`` and
            ``weight`` attributes are read. Its weight is frozen when
            ``r > 0``.
        r: Rank of the adapter; ``r <= 0`` disables LoRA.
        lora_alpha: Scaling numerator passed to ``LoRALayer``.
        lora_dropout: Dropout probability applied to the input of the LoRA
            branch.
        merge: When true (and ``r > 0``) the adapter output is added in
            ``forward``; otherwise only the wrapped layer is applied.
        **kwargs: Accepted and ignored.
    """

    def __init__(self, linear, r=8, lora_alpha=16, lora_dropout=0.1, merge=True, **kwargs):
        super().__init__()
        self.linear = linear
        self.merge = merge
        self.r = r

        if self.r > 0:
            self.lora_layer = LoRALayer(linear.in_features, linear.out_features, r, lora_alpha)
            # The adapter is built with default device/dtype; move it next to
            # the wrapped weight so wrapping a layer that already lives on a
            # GPU (or uses reduced precision) does not fail in forward().
            self.lora_layer.to(device=linear.weight.device, dtype=linear.weight.dtype)
            self.linear.weight.requires_grad = False

        self.dropout = nn.Dropout(lora_dropout) if lora_dropout > 0 else nn.Identity()

    def forward(self, x):
        """Apply the layer to ``x`` of shape ``[..., in_features]``."""
        # The wrapped weight is frozen, so this part receives no weight update.
        output = self.linear(x)
        if self.r > 0 and self.merge:
            # Dropout regularises only the LoRA branch; the frozen base
            # projection is left intact.
            output = output + self.lora_layer(self.dropout(x))
        return output



In [2]:
# Load the base model from a local checkpoint.
from transformers import AutoModelForCausalLM

# Local checkpoint directory; replace with your own model path.
model_path = "/home/hpclp/disk/q/models/Qwen2.5-0.5B-Instruct"

# Use CUDA device index 2 when CUDA is available, otherwise fall back to CPU.
device = torch.device("cuda:2") if torch.cuda.is_available() else torch.device("cpu")

model = AutoModelForCausalLM.from_pretrained(model_path)
model = model.to(device)
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

In [None]:
# Wrap the selected projections of every decoder layer with LinearLoRALayer.
lora_rank = 8
lora_alpha = 16
lora_q = True
lora_k = False
lora_v = True
lora_o = True
lora_mlp = True

# Debugging aid: iterate model.named_modules() and print each (name, module)
# pair to list the layers that can be wrapped.

# Attention projections paired with their on/off switch, in replacement order.
attention_switches = {
    "q_proj": lora_q,
    "k_proj": lora_k,
    "v_proj": lora_v,
    "o_proj": lora_o,
}
# Linear layers inside the MLP, all controlled by ``lora_mlp``.
mlp_projections = ("gate_proj", "up_proj", "down_proj")

for layer in model.model.layers:
    attention = layer.self_attn
    for proj_name, enabled in attention_switches.items():
        if not enabled:
            continue
        original = getattr(attention, proj_name)
        setattr(attention, proj_name, LinearLoRALayer(original, lora_rank, lora_alpha))
    if lora_mlp:
        # Replace the linear layers of the MLP.
        feed_forward = layer.mlp
        for proj_name in mlp_projections:
            original = getattr(feed_forward, proj_name)
            setattr(feed_forward, proj_name, LinearLoRALayer(original, lora_rank, lora_alpha))

In [4]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): LinearLoRALayer(
            (linear): Linear(in_features=896, out_features=896, bias=True)
            (lora_layer): LoRALayer()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): LinearLoRALayer(
            (linear): Linear(in_features=896, out_features=128, bias=True)
            (lora_layer): LoRALayer()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (o_proj): LinearLoRALayer(
            (linear): Linear(in_features=896, out_features=896, bias=False)
            (lora_layer): LoRALayer()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (mlp): Qwen2MLP(
          (gate_proj): LinearLoRALayer(
            (linear):

In [5]:
# Freeze every parameter whose name does not contain "lora".
for name, param in model.named_parameters():
    if 'lora' in name:
        continue
    param.requires_grad = False

In [8]:
# List the names of all parameters that will receive gradient updates.
for name, param in model.named_parameters():
    # ``requires_grad`` is already a bool; no comparison against True needed.
    if param.requires_grad:
        print(name)

model.layers.0.self_attn.q_proj.lora_layer.lora_a
model.layers.0.self_attn.q_proj.lora_layer.lora_b
model.layers.0.self_attn.v_proj.lora_layer.lora_a
model.layers.0.self_attn.v_proj.lora_layer.lora_b
model.layers.0.self_attn.o_proj.lora_layer.lora_a
model.layers.0.self_attn.o_proj.lora_layer.lora_b
model.layers.0.mlp.gate_proj.lora_layer.lora_a
model.layers.0.mlp.gate_proj.lora_layer.lora_b
model.layers.0.mlp.up_proj.lora_layer.lora_a
model.layers.0.mlp.up_proj.lora_layer.lora_b
model.layers.0.mlp.down_proj.lora_layer.lora_a
model.layers.0.mlp.down_proj.lora_layer.lora_b
model.layers.1.self_attn.q_proj.lora_layer.lora_a
model.layers.1.self_attn.q_proj.lora_layer.lora_b
model.layers.1.self_attn.v_proj.lora_layer.lora_a
model.layers.1.self_attn.v_proj.lora_layer.lora_b
model.layers.1.self_attn.o_proj.lora_layer.lora_a
model.layers.1.self_attn.o_proj.lora_layer.lora_b
model.layers.1.mlp.gate_proj.lora_layer.lora_a
model.layers.1.mlp.gate_proj.lora_layer.lora_b
model.layers.1.mlp.up_proj.l