# 手搓
有点艰难，主要还是pytorch维度问题

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [43]:
class LinearLoRALayer(nn.Module):
    def __init__(self, in_features, out_features, rank, lora_alpha, dropout_rate, merge=False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.rank = rank
        self.lora_alpha = lora_alpha
        self.dropout_rate = dropout_rate
        self.merge = merge

        self.linear = nn.Linear(in_features, out_features)
        # self.linear.weight 的 shape 是 (out_features, in_features)

        if rank > 0:
            # A and B is not linear layers, but parameters only.
            # self.lora_A = nn.Linear(in_features, rank, bias=False)
            # # 高斯分布
            # nn.init.normal_(self.lora_A.weight, mean=0.0, std=1.0)
            # self.lora_B = nn.Linear(rank, out_features, bias=False)

            self.lora_A = nn.Parameter(torch.empty(rank, in_features))
            self.lora_B = nn.Parameter(torch.empty(out_features, rank))
            nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
            nn.init.zeros_(self.lora_B)
            self.scaling = self.lora_alpha / self.rank

        self.linear.weight.requires_grad = False
        self.dropout = nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity()

        # merge
        if merge:
            self.linear.weight.data += self.lora_B @ self.lora_A * self.scaling
    
    def forward(self, x):
        if self.rank > 0:
            print(x.shape, x.T.shape)
            output_part1 = self.linear(x)
            output_part2 = (x @ (self.lora_B @ self.lora_A).T) * self.scaling
            print(output_part1.shape, output_part2.shape)
            output = output_part1 + output_part2
        else:
            output = self.linear(x)

        output = self.dropout(output)
        return output

In [52]:
# 写一段测试代码
# Test the LoRALinear layer
batch_size = 32
seq_len = 128
in_features = 768
out_features = 512
rank = 8
lora_alpha = 16
dropout = 0.1

# Create a test input
x = torch.randn(batch_size, seq_len, in_features)

# Test regular mode (no merge)
lora_layer = LinearLoRALayer(
    in_features=in_features,
    out_features=out_features,
    rank=rank,
    lora_alpha=lora_alpha,
    dropout_rate=dropout,
    merge=False
)

# Forward pass
output = lora_layer(x)
print(f"Output shape (no merge): {output.shape}")  # Should be [batch_size, seq_len, out_features]

# Test regular mode (no merge)
lora_layer = LinearLoRALayer(
    in_features=in_features,
    out_features=out_features,
    rank=rank,
    lora_alpha=lora_alpha,
    dropout_rate=dropout,
    merge=True
)

# Forward pass
output = lora_layer(x)
print(f"Output shape (no merge): {output.shape}") 

Output shape (no merge): torch.Size([32, 128, 512])
Output shape (no merge): torch.Size([32, 128, 512])


In [13]:
# torch.tensor() 只创建torch.FloatTensor类型的张量，是torch.empty() 的特例，empty（）返回一个包含未初始化数据的张量
torch.empty(in_features, rank)

tensor([[3.7793e-07, 0.0000e+00, 1.4013e-45,  ..., 0.0000e+00, 3.7793e-07,
         0.0000e+00],
        [3.3631e-44, 6.5711e+05, 1.1502e-07,  ..., 1.2125e+25, 4.7426e+30,
         1.7237e+25],
        [3.6434e-44, 5.6052e-45, 3.6434e-44,  ..., 1.8058e+28, 3.7793e-07,
         0.0000e+00],
        ...,
        [1.5975e-43, 0.0000e+00, 3.7862e-07,  ..., 0.0000e+00, 3.7863e-07,
         0.0000e+00],
        [1.5975e-43, 0.0000e+00, 3.7862e-07,  ..., 0.0000e+00, 3.7863e-07,
         0.0000e+00],
        [1.5975e-43, 0.0000e+00, 3.7862e-07,  ..., 0.0000e+00, 3.7863e-07,
         0.0000e+00]])

In [20]:
linear = nn.Linear(3, 4)
linear.weight.shape

torch.Size([4, 3])

In [22]:
x = torch.randn(3, 4)
y = torch.randn(3)
print(x.T @ y)
print(y @ x)

tensor([-0.7755, -4.0443,  1.3888,  3.9942])
tensor([-0.7755, -4.0443,  1.3888,  3.9942])


# github 官方代码

In [59]:
class LoRALayer():
    def __init__(
        self, 
        r: int, 
        lora_alpha: int, 
        lora_dropout: float,
        merge_weights: bool,
    ):
        self.r = r
        self.lora_alpha = lora_alpha
        # Optional dropout
        if lora_dropout > 0.:
            self.lora_dropout = nn.Dropout(p=lora_dropout)
        else:
            self.lora_dropout = lambda x: x
        # Mark the weight as unmerged
        self.merged = True
        self.merge_weights = merge_weights

class Linear(nn.Linear, LoRALayer):
    # LoRA implemented in a dense layer
    def __init__(
        self, 
        in_features: int, 
        out_features: int, 
        r: int = 0, 
        lora_alpha: int = 1, 
        lora_dropout: float = 0.,
        fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
        merge_weights: bool = True,
        **kwargs
    ):
        nn.Linear.__init__(self, in_features, out_features, **kwargs)
        LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout,
                           merge_weights=merge_weights)

        self.fan_in_fan_out = fan_in_fan_out
        # Actual trainable parameters
        if r > 0:
            self.lora_A = nn.Parameter(self.weight.new_zeros((r, in_features)))
            self.lora_B = nn.Parameter(self.weight.new_zeros((out_features, r)))
            self.scaling = self.lora_alpha / self.r
            # Freezing the pre-trained weight matrix
            self.weight.requires_grad = False
        self.reset_parameters()
        if fan_in_fan_out:
            self.weight.data = self.weight.data.transpose(0, 1)

    def reset_parameters(self):
        nn.Linear.reset_parameters(self)
        if hasattr(self, 'lora_A'):
            # initialize B the same way as the default for nn.Linear and A to zero
            # this is different than what is described in the paper but should not affect performance
            nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
            nn.init.zeros_(self.lora_B)

    def train(self, mode: bool = True):
        def T(w):
            return w.transpose(0, 1) if self.fan_in_fan_out else w
        nn.Linear.train(self, mode)
        if mode:
            if self.merge_weights and self.merged:
                # Make sure that the weights are not merged
                if self.r > 0:
                    self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling
                    print(self.weight.data.shape)
                    print(self.lora_B.shape)
                    print(self.lora_A.shape)
                    print((self.lora_B @ self.lora_A).shape)
                    print((T(self.lora_B @ self.lora_A)).shape)
                self.merged = False
        else:
            if self.merge_weights and not self.merged:
                # Merge the weights and mark it
                if self.r > 0:
                    self.weight.data += T(self.lora_B @ self.lora_A) * self.scaling
                self.merged = True       

    def forward(self, x: torch.Tensor):
        # oh, 原来如此
        def T(w):
            return w.transpose(0, 1) if self.fan_in_fan_out else w
        
        if self.r > 0 and not self.merged:
            result = F.linear(x, T(self.weight), bias=self.bias)            
            result += (self.lora_dropout(x) @ self.lora_A.transpose(0, 1) @ self.lora_B.transpose(0, 1)) * self.scaling
            return result
        else:
            return F.linear(x, T(self.weight), bias=self.bias)

In [60]:
# 写一段测试代码
# Test the LoRALinear layer
batch_size = 32
seq_len = 128
in_features = 768
out_features = 512
rank = 8
lora_alpha = 16
dropout = 0.1

# Create a test input
x = torch.randn(batch_size, seq_len, in_features)

# Test regular mode (no merge)
lora_layer = Linear(
    in_features=in_features,
    out_features=out_features,
    r=rank,
    lora_alpha=lora_alpha,
    merge_weights=False
)

lora_layer.train()

# Forward pass
output = lora_layer(x)
print(f"Output shape (no merge): {output.shape}")  # Should be [batch_size, seq_len, out_features]

# Test regular mode (no merge)
lora_layer = Linear(
    in_features=in_features,
    out_features=out_features,
    r=rank,
    lora_alpha=lora_alpha,
    merge_weights=True
)

lora_layer.train()

# Forward pass
output = lora_layer(x)
print(f"Output shape (no merge): {output.shape}") 

Output shape (no merge): torch.Size([32, 128, 512])
torch.Size([512, 768])
torch.Size([512, 8])
torch.Size([8, 768])
torch.Size([512, 768])
torch.Size([512, 768])
Output shape (no merge): torch.Size([32, 128, 512])


In [63]:
x = torch.randn(2, 3)
print(x.shape, x.T.shape, T(x).shape)

NameError: name 'T' is not defined