In [None]:
import torch.nn as nn

class LoRALayer():
    """Holder for the LoRA hyper-parameters shared by LoRA-enabled layers.

    Args:
        r: Rank of the low-rank update.
        lora_alpha: Numerator of the LoRA scaling factor.
        lora_dropout: Dropout probability for the LoRA path; dropout is only
            instantiated when this value is strictly positive.
        merge_weights: Stored flag indicating whether the low-rank update
            should be merged into the base weight.
    """

    def __init__(
        self,
        r: int,
        lora_alpha: int,
        lora_dropout: float,
        merge_weights: bool,
    ):
        self.r, self.lora_alpha = r, lora_alpha
        # Real dropout only for a positive probability; otherwise a pass-through callable.
        self.lora_dropout = (
            nn.Dropout(p=lora_dropout) if lora_dropout > 0. else (lambda x: x)
        )
        # A freshly built layer always starts in the un-merged state.
        self.merged = False
        self.merge_weights = merge_weights

`r` : the rank of the low-rank matrices `A` and `B`

$$
\begin{align*}
    h=W_{0} x+\Delta W x=W_{0} x+B A x \tag{3}
\end{align*}
$$ 

`lora_alpha` : the constant $\alpha$; the update $\Delta W x$ is scaled by $\frac{\alpha}{r}$, where $\alpha$ is a fixed constant that does not change with $r$

`lora_dropout` : dropout probability applied to the input of the LoRA path

`merge_weights` : whether the LoRA updates should be merged into the original weights during evaluation


In [None]:
class Linear(nn.Linear, LoRALayer):
    """Dense layer carrying an optional low-rank (LoRA) update.

    Args:
        in_features: Input dimension, forwarded to ``nn.Linear``.
        out_features: Output dimension, forwarded to ``nn.Linear``.
        r: LoRA rank. The LoRA matrices are only created when ``r > 0``.
        lora_alpha: Numerator of the scaling factor ``lora_alpha / r``.
        lora_dropout: Dropout probability handed to ``LoRALayer``.
        fan_in_fan_out: Set this to True if the layer to replace stores its
            weight like (fan_in, fan_out); the weight data is then transposed.
        merge_weights: Handed to ``LoRALayer`` unchanged.
        **kwargs: Any further keyword arguments for ``nn.Linear``.

    Note:
        Only the constructor is defined in this cell, so the
        ``self.reset_parameters()`` call below resolves to
        ``nn.Linear.reset_parameters`` unless the class also defines its own
        override.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        r: int = 0,
        lora_alpha: int = 1,
        lora_dropout: float = 0.,
        fan_in_fan_out: bool = False,
        merge_weights: bool = True,
        **kwargs
    ):
        # Both bases are initialised explicitly: the dense layer first, then the LoRA settings.
        nn.Linear.__init__(self, in_features, out_features, **kwargs)
        lora_settings = dict(
            r=r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            merge_weights=merge_weights,
        )
        LoRALayer.__init__(self, **lora_settings)

        self.fan_in_fan_out = fan_in_fan_out

        if r > 0:
            pretrained = self.weight
            # Trainable low-rank factors: A is (r, in_features), B is (out_features, r).
            self.lora_A = nn.Parameter(pretrained.new_zeros((r, in_features)))
            self.lora_B = nn.Parameter(pretrained.new_zeros((out_features, r)))
            self.scaling = self.lora_alpha / self.r
            # The pre-trained weight matrix is kept frozen.
            pretrained.requires_grad = False

        self.reset_parameters()

        if fan_in_fan_out:
            # Store the weight in (fan_in, fan_out) layout.
            self.weight.data = self.weight.data.transpose(0, 1)

### Constructor (`__init__`)
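`fan_in_fan_out` : set this to `True` when the layer being replaced stores its weight as (in, out). PyTorch's `nn.Linear` expects the weight in shape (out, in), whereas some transformer implementations follow the (in, out) convention; when the flag is `True`, the weight is transposed to match.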

#### Fully Connected Layer
![Ex fully connect layer](./figures/fully_connected_layer.png)

$$
\begin{align*}
    y = Wx
\end{align*}
$$

The input has 7 features and the output has 5 features. Hence the weight matrix $W$ has shape (5, 7), and each row holds the weights of one output neuron.


- Defined the dimensions of two trainable matrices `lora_A` and `lora_B`. 
- Defined the `scaling` and froze the original weight $W$ by `self.weight.requires_grad = False`. 
- Initialized the matrices $A$ and $B$ by `reset_parameters(self)`

    ```python
    def reset_parameters(self):
        nn.Linear.reset_parameters(self)
        if hasattr(self, 'lora_A'):
            # initialize A the same way as the default for nn.Linear and B to zero
            nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
            nn.init.zeros_(self.lora_B)
    ```
    Only if LoRA is active ($r>0$), then
    - Initialize `lora_A` using [Kaiming uniform initialization](https://arxiv.org/pdf/1502.01852)
      - Motivation for Kaiming uniform initialization: when random weights are drawn from Gaussian distributions with a fixed standard deviation, deep networks have difficulty converging.
    - $B$ is initialized to zeros 


In [None]:
def train(self, mode: bool = True):
    """Set training/evaluation mode and merge or un-merge the LoRA update.

    Overrides the base ``train()`` behaviour: in evaluation mode the scaled
    low-rank product ``lora_B @ lora_A`` is added to ``self.weight``; when
    switching back to training mode it is subtracted again.

    Args:
        mode: ``True`` for training mode, ``False`` for evaluation mode.

    Returns:
        ``self``, matching the contract of ``nn.Module.train``. This keeps
        ``layer.eval()`` and chained calls such as ``layer.train().to(...)``
        working; without it they would receive ``None``.
    """
    def T(w):
        # Handles the transposition if the base weight uses fan-in/fan-out layout.
        return w.transpose(0, 1) if self.fan_in_fan_out else w

    # Set the mode flag on this module and propagate it to submodules.
    nn.Linear.train(self, mode)

    if mode:
        # Train mode.
        if self.merge_weights and self.merged:
            # Make sure that the weights are not merged, since we merge for inference.
            if self.r > 0:
                # Subtract the LoRA update from the weights.
                self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling
            self.merged = False
    else:
        # Eval mode.
        if self.merge_weights and not self.merged:
            # Merge the weights and mark the layer as merged.
            if self.r > 0:
                self.weight.data += T(self.lora_B @ self.lora_A) * self.scaling
            self.merged = True

    return self

In [None]:
import torch 
import torch.nn.functional as F


def forward(self, x: torch.Tensor):
    """Apply the linear layer, adding the LoRA branch while it is un-merged.

    The base output is ``F.linear`` applied to ``x`` with ``self.weight``
    (transposed first when ``self.fan_in_fan_out`` is set) and ``self.bias``.
    If ``self.r > 0`` and the layer is not merged, the scaled low-rank term
    ``lora_dropout(x) @ lora_A^T @ lora_B^T`` is added on top.

    Args:
        x: Input tensor.

    Returns:
        The output tensor of the layer.
    """
    base_weight = self.weight.transpose(0, 1) if self.fan_in_fan_out else self.weight
    # Output of the base linear layer.
    output = F.linear(x, base_weight, bias=self.bias)

    # No separate LoRA branch when LoRA is disabled or already merged.
    if self.r <= 0 or self.merged:
        return output

    down_projection = self.lora_A.transpose(0, 1)
    up_projection = self.lora_B.transpose(0, 1)
    low_rank_term = self.lora_dropout(x) @ down_projection @ up_projection
    output += low_rank_term * self.scaling
    return output

## Inject LoRA into Attention Projections

In [None]:
import clip
import torch
from loralib.layers import Linear as LoRALinear

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the "ViT-B/32" CLIP model onto the chosen device; the call returns the
# model together with a second object bound to `preprocess`.
model, preprocess = clip.load("ViT-B/32", device=device)

def add_lora_2_clip(model, r=4, alpha=8, dropout=0.0):
    """Replace attention q/k/v projection layers with LoRA-enabled linear layers.

    Walks ``model.visual.transformer`` and, for every sub-module that has an
    ``attn`` attribute, swaps each of ``attn.q_proj``, ``attn.k_proj`` and
    ``attn.v_proj`` that is a plain ``torch.nn.Linear`` for a ``LoRALinear``
    of the same size. The pre-trained weight is copied and the bias reused.

    Args:
        model: Model exposing ``model.visual.transformer``.
        r: LoRA rank for the new layers.
        alpha: LoRA scaling numerator (``lora_alpha``).
        dropout: Dropout probability for the LoRA path.

    Returns:
        The number of projection layers that were replaced. A return value of
        0 means no matching ``nn.Linear`` projections were found.

    NOTE(review): this only matches attention modules that expose ``q_proj`` /
    ``k_proj`` / ``v_proj`` as ``nn.Linear`` sub-modules. Attention blocks that
    pack the projections into a single weight (``nn.MultiheadAttention`` does,
    and the CLIP ViT loaded above may use it) would be left untouched - confirm
    against the loaded model by checking the return value.
    """
    replaced = 0
    # Snapshot the module list first, because layers are swapped during the walk.
    for _, module in list(model.visual.transformer.named_modules()):
        # Most sub-modules (including the root) have no `attn`; skip them
        # instead of raising AttributeError.
        attn = getattr(module, "attn", None)
        if attn is None:
            continue
        for proj_name in ('q_proj', 'k_proj', 'v_proj'):
            orig = getattr(attn, proj_name, None)
            if not isinstance(orig, torch.nn.Linear):
                continue
            if isinstance(orig, LoRALinear):
                # Already converted (e.g. the cell was re-run); do not wrap twice.
                continue
            lora = LoRALinear(
                in_features=orig.in_features,
                out_features=orig.out_features,
                r=r,
                lora_alpha=alpha,
                lora_dropout=dropout,
                fan_in_fan_out=False,
            )
            # Put the new layer (including the LoRA matrices) on the same
            # device and dtype as the layer it replaces.
            lora.to(device=orig.weight.device, dtype=orig.weight.dtype)
            lora.weight.data = orig.weight.data.clone()
            lora.bias = orig.bias
            setattr(attn, proj_name, lora)
            replaced += 1
    return replaced

def freeze_clip(model):
    """Freeze every parameter of ``model`` except the LoRA parameters.

    A parameter is left trainable exactly when its name contains ``"lora_"``;
    all other parameters get ``requires_grad = False``.

    Args:
        model: Module whose ``named_parameters()`` are updated in place.
    """
    for name, param in model.named_parameters():
        # The attribute is `requires_grad`; assigning to a misspelled name only
        # creates an unused attribute and leaves the parameter frozen.
        param.requires_grad = "lora_" in name




