### Lora微调的线性实现方式
lora微调有两种东西要更新
1. X自身的值
2. W权重

$$
Y = X W^{T} + \text{scale} \cdot X (A B)^{T}
$$

需要判断X和W是否更新

#### 为什么要用 nn.Parameter?
因为 A、B 矩阵的值需要通过反向传播来更新；用 torch.rand / torch.zeros 直接创建的张量默认 requires_grad=False，也不会被注册为模块的参数，因此不会被优化器更新。

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import math

class LinearLoralayer(nn.Module):
    """Linear layer with an additive low-rank (LoRA) branch.

    The layer wraps an ``nn.Linear(in_features, out_features)`` and, when
    ``rank > 0``, adds ``scale * X @ (lora_a @ lora_b).T`` to its output,
    where ``lora_a`` has shape ``(out_features, rank)``, ``lora_b`` has shape
    ``(rank, in_features)`` and ``scale = alpha / rank``. Dropout (if any) is
    applied to the final output.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        rank: Rank of the low-rank update; ``rank <= 0`` disables LoRA and the
            layer behaves as a plain linear layer followed by dropout.
        alpha: LoRA scaling numerator; the update is scaled by ``alpha / rank``.
        drop_out: Dropout probability; values ``<= 0`` use ``nn.Identity``.
        merge: If True (and ``rank > 0``), the scaled low-rank product is
            folded into ``linear.weight`` at construction time and ``forward``
            skips the separate LoRA branch.
    """

    def __init__(self,
                 in_features,
                 out_features,
                 rank,
                 alpha,
                 drop_out,
                 merge=False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.rank = rank
        self.alpha = alpha
        self.merge = merge
        # Tracks whether the low-rank update is currently folded into
        # linear.weight, so it is never applied twice.
        self._merged = False

        # nn.Linear stores its weight as (out_features, in_features)
        # because it computes y = X @ W.T + b.
        self.linear = nn.Linear(in_features, out_features)

        if drop_out > 0:
            self.drop_out = nn.Dropout(drop_out)
        else:
            # Identity leaves the output unchanged.
            self.drop_out = nn.Identity()

        if rank > 0:
            # nn.Parameter registers the tensors with the module so they are
            # trained; plain tensors would not be.
            self.lora_a = nn.Parameter(torch.zeros(out_features, rank))
            # Normal (Gaussian) Kaiming init. `a` is the negative slope of the
            # leaky ReLU assumed by the initializer, hence a small value.
            nn.init.kaiming_normal_(self.lora_a, a=0.01)

            # lora_b starts at zero, so lora_a @ lora_b is zero and the layer
            # initially matches the wrapped linear layer.
            self.lora_b = nn.Parameter(torch.zeros(rank, in_features))
            # LoRA scales the low-rank update by alpha / rank.
            self.scale = alpha / rank
            # The wrapped weight is frozen; only the LoRA matrices
            # (and the linear bias) keep requires_grad=True.
            self.linear.weight.requires_grad = False

        if merge:
            self.merge_weight()

    def merge_weight(self):
        """Fold ``scale * (lora_a @ lora_b)`` into ``linear.weight`` in place.

        Does nothing unless ``self.merge`` is true and ``rank > 0``. Calling it
        again after a successful merge is a no-op.
        """
        if not self.merge or self.rank <= 0 or self._merged:
            return
        # No transpose needed: lora_a @ lora_b is already
        # (out_features, in_features), the same layout as linear.weight.
        with torch.no_grad():
            self.linear.weight += self.scale * (self.lora_a @ self.lora_b)
        self._merged = True

    def forward(self, X):
        """Apply the linear layer plus the LoRA branch, then dropout.

        Args:
            X: Input tensor whose last dimension is ``in_features``.

        Returns:
            Tensor with the same leading dimensions as ``X`` and last
            dimension ``out_features``.
        """
        output = self.linear(X)
        # Add the low-rank branch only when LoRA is enabled and the update has
        # not already been merged into linear.weight.
        if self.rank > 0 and not self._merged:
            delta_w = self.lora_a @ self.lora_b  # (out_features, in_features)
            output = output + self.scale * (X @ delta_w.T)
        return self.drop_out(output)

# Input dimensions for the demo tensor.
batch_size, seq_len = 128, 64

# Layer hyperparameters.
in_features, out_features = 512, 576
dropout = 0.1
rank = 16  # typically 4-32
alpha = 32

# A tensor created with torch.rand has requires_grad=False by default,
# so it is not updated during training.
X = torch.rand(batch_size, seq_len, in_features)

model = LinearLoralayer(
    in_features,
    out_features,
    rank,
    alpha,
    dropout,
    merge=True,
)
output = model(X)
output.shape

torch.Size([128, 64, 576])