Source code (Hugging Face Transformers v4.3.3, RoBERTa modeling module):
https://huggingface.co/transformers/v4.3.3/_modules/transformers/models/roberta/modeling_roberta.html

![](images/LoRA.png)

In [None]:
class LoraRobertaSelfAttention(RobertaSelfAttention):
    """
    RoBERTa self-attention with LoRA adapters on the query and value projections.

    Inherits from the RobertaSelfAttention module and adds one pair of
    low-rank matrices (B, A) for the query and one for the value projection,
    so that ``Wx`` becomes ``Wx + BAx``. The key projection is left untouched.

    Parameters:
    - r (int): Rank for LoRA matrices. Pass it by keyword (``r=...``); because
      it is the first positional parameter, a config passed positionally
      would be taken as the rank.
    - config: Configuration of the Roberta Model (forwarded unchanged to
      RobertaSelfAttention through ``*args`` / ``**kwargs``).
    """
    def __init__(self, r=8, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # B maps rank -> output width, A maps input width -> rank, so that
        # B @ A has the (out_features, in_features) layout F.linear expects.
        d_out = self.all_head_size  # all_head_size = head dimension * number_attention_heads
        d_in = self.query.in_features  # input width of the frozen projections

        # Initialize LoRA matrices for query and value.
        # B starts at zero, so B @ A == 0 and the layer initially reproduces
        # the pre-trained projection exactly; A is drawn from a normal distribution.
        self.lora_query_matrix_B = nn.Parameter(torch.zeros(d_out, r))
        self.lora_query_matrix_A = nn.Parameter(torch.randn(r, d_in))
        self.lora_value_matrix_B = nn.Parameter(torch.zeros(d_out, r))
        self.lora_value_matrix_A = nn.Parameter(torch.randn(r, d_in))

    def lora_query(self, x):
        """
        Obtain the query projection: original matrix + LoRA matrix.
        The original matrix does not need to be updated.
        Wx --> Wx + BAx
        """
        lora_query_weights = torch.matmul(self.lora_query_matrix_B, self.lora_query_matrix_A)
        return self.query(x) + F.linear(x, lora_query_weights)

    def lora_value(self, x):
        """
        Obtain the value projection: original matrix + LoRA matrix.
        The original matrix does not need to be updated.
        Wx --> Wx + BAx
        """
        lora_value_weights = torch.matmul(self.lora_value_matrix_B, self.lora_value_matrix_A)
        return self.value(x) + F.linear(x, lora_value_weights)

    def forward(self, hidden_states, attention_mask=None, head_mask=None, *args, **kwargs):
        """
        Scaled dot-product self-attention using the LoRA query/value projections.

        Only the self.query() and self.value() calls of the parent
        implementation are replaced; the rest is the standard encoder
        self-attention computation.

        Parameters:
        - hidden_states: Input to the attention layer.
        - attention_mask: Optional mask that is *added* to the raw attention
          scores (assumes the additive mask built by the surrounding model --
          TODO confirm for the installed transformers version).
        - head_mask: Optional multiplicative mask applied to the attention
          probabilities.
        - *args / **kwargs: Remaining arguments of the parent ``forward``.
          ``encoder_hidden_states`` is read from the keyword or the first
          extra positional argument; ``output_attentions`` from the keyword
          or the last extra positional argument (it is last in the parent
          signature this code was written against -- verify when upgrading).

        Returns:
        - ``(context_layer,)`` or ``(context_layer, attention_probs)`` when
          ``output_attentions`` is truthy.

        Raises:
        - NotImplementedError: for cross-attention, decoder mode or
          non-absolute position embeddings, which this layer does not cover.
        """
        encoder_hidden_states = kwargs.get("encoder_hidden_states", args[0] if args else None)
        output_attentions = bool(kwargs.get("output_attentions", args[-1] if args else False))

        # Fail loudly instead of silently computing the wrong thing for
        # configurations whose extra logic is not reproduced here.
        if (
            encoder_hidden_states is not None
            or getattr(self, "is_decoder", False)
            or getattr(self, "position_embedding_type", "absolute") != "absolute"
        ):
            raise NotImplementedError(
                "LoraRobertaSelfAttention only supports encoder self-attention "
                "with absolute position embeddings."
            )

        # Query and value go through the LoRA-augmented projections;
        # the key has no LoRA, so its call is unchanged.
        query_layer = self.transpose_for_scores(self.lora_query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.lora_value(hidden_states))

        # Raw scores, scaled by sqrt(head size).
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / (self.attention_head_size ** 0.5)
        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        attention_probs = F.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # Weighted sum of the values, then merge the head axes back into one
        # feature axis of width all_head_size.
        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        context_layer = context_layer.view(*context_layer.size()[:-2], self.all_head_size)

        if output_attentions:
            return (context_layer, attention_probs)
        return (context_layer,)


class LoraWrapperRoberta(nn.Module):
    def __init__(self, task_type, num_classes=None, dropout_rate=0.1, model_id="roberta-large",
                 lora_rank=8, train_biases=True, train_embedding=False, train_layer_norms=True):
        """
        RoBERTa Wrapper, requires replacing all self-attention layers in the original model.
        - task_type: Type of NLP task ('glue', 'squad_v1', 'squad_v2').
            Stored on the instance; the code in this class does not branch on it.
        - num_classes: Number of classes for classification (varies with task).
            Required: it sizes the output of the classifier head.
        - dropout_rate: Dropout rate in the model.
        - model_id: Pre-trained RoBERTa model ID.
        - lora_rank: Rank for LoRA adaptation.
        - train_biases, train_embedding, train_layer_norms:
            Flags whether to keep certain parameters trainable
            after initializing LoRA.

        Raises:
            TypeError: if num_classes is not provided.

        Example:
            model = LoraWrapperRoberta(task_type='glue', num_classes=2)
        """
        super().__init__()
        # Fail before downloading the model instead of inside nn.Linear.
        if num_classes is None:
            raise TypeError("num_classes must be an int; it sizes the classifier head")

        # 0. Store the settings that the helper methods below read.
        self.task_type = task_type
        self.lora_rank = lora_rank
        self.train_biases = train_biases
        self.train_embeddings = train_embedding
        self.train_layer_norms = train_layer_norms

        # 1. Initialize the base model with parameters
        self.model_id = model_id
        self.tokenizer = RobertaTokenizer.from_pretrained(model_id)
        self.model = RobertaModel.from_pretrained(model_id)
        self.model_config = self.model.config

        # 2. Add the layer for the benchmark tasks
        d_model = self.model_config.hidden_size
        self.finetune_head_norm = nn.LayerNorm(d_model)
        self.finetune_head_dropout = nn.Dropout(dropout_rate)
        self.finetune_head_classifier = nn.Linear(d_model, num_classes)

        # 3. Set up the LoRA model for training
        self.replace_multihead_attention()
        self.freeze_parameters_except_lora_and_bias()

    def replace_multihead_attention(self):
        """
        Replace every RobertaSelfAttention inside the base model with a
        LoraRobertaSelfAttention carrying the same pre-trained weights.
        """
        self.replace_multihead_attention_recursion(self.model)

    def replace_multihead_attention_recursion(self, model):
        """
        Replace RobertaSelfAttention in the model with LoraRobertaSelfAttention.
        This method applies the replacement recursively to all sub-components.

        Parameters
        ----------
        model : nn.Module
            The PyTorch module or model to be modified.

        Raises
        ------
        RuntimeError
            If the pre-trained weights do not map cleanly onto the new layer.
        """
        # Snapshot the children: the loop reassigns attributes on `model`.
        for name, module in list(model.named_children()):
            if isinstance(module, RobertaSelfAttention):
                # Replace RobertaSelfAttention with LoraRobertaSelfAttention
                new_layer = LoraRobertaSelfAttention(r=self.lora_rank, config=self.model_config)
                # strict=False because the LoRA matrices have no pre-trained
                # counterpart; anything else going unmatched would mean the
                # attention weights were silently left at their random init.
                result = new_layer.load_state_dict(module.state_dict(), strict=False)
                missing = [key for key in result.missing_keys if "lora_" not in key]
                if missing or result.unexpected_keys:
                    raise RuntimeError(
                        f"Could not transfer attention weights for '{name}': "
                        f"missing={missing}, unexpected={list(result.unexpected_keys)}"
                    )
                setattr(model, name, new_layer)
            else:
                # Recursive call for child modules
                self.replace_multihead_attention_recursion(module)

    def freeze_parameters_except_lora_and_bias(self):
        """
        Freeze some parameters based on the predefined configuration,
        so they do not need to be trained during the fine-tuning phase.

        The parameters in the LoRA layers, the fine-tune head, bias parameters,
        embeddings, and layer norms can be set as trainable parameters.
        """
        # Iterate over the whole wrapper (not only self.model) so that the
        # fine-tune head parameters are covered by the "finetune_head_" rule.
        for name, param in self.named_parameters():
            is_trainable = (
                "lora_" in name or
                "finetune_head_" in name or
                (self.train_biases and "bias" in name) or
                (self.train_embeddings and "embeddings" in name) or
                (self.train_layer_norms and "LayerNorm" in name)
            )
            param.requires_grad = is_trainable


In [None]:
# LoraLinear is the smallest modifiable unit
class LoraLinear(nn.Linear):  # wx --> wx + BAx
    """
    A linear layer with a Low-Rank Adaptation (LoRA) term added to its output.

    On top of the regular ``nn.Linear`` weight, the layer owns two small
    matrices, ``lora_matrix_B`` (out_features x r) and ``lora_matrix_A``
    (r x in_features). The regular weight is frozen; the forward pass returns
    the normal linear output plus ``x`` projected through ``B @ A``.
    """

    def __init__(self, in_features, out_features, r=8, *args, **kwargs):
        super().__init__(in_features, out_features, *args, **kwargs)

        # B is all zeros, so the LoRA term is zero until B is updated;
        # A is drawn from a standard normal distribution.
        zeros_b = torch.zeros(out_features, r)
        self.lora_matrix_B = nn.Parameter(zeros_b)
        random_a = torch.randn(r, in_features)
        self.lora_matrix_A = nn.Parameter(random_a)

        # The original weight matrix is not trained.
        self.weight.requires_grad_(False)

    def forward(self, x: Tensor) -> Tensor:
        # Output of the underlying linear layer.
        base_output = super().forward(x)
        # Low-rank update, laid out as (out_features, in_features).
        delta_weight = torch.matmul(self.lora_matrix_B, self.lora_matrix_A)
        lora_output = F.linear(x, delta_weight)
        return base_output + lora_output


# LoRA fine-tuning on flan-T5-xxl
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# Load the pre-trained seq2seq model with the 8-bit loading flag set and
# automatic device placement.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "google/flan-t5-xxl",
    load_in_8bit=True,
    device_map="auto",
)

# LoRA settings handed to PEFT.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,  # rank of the low-rank update
    lora_alpha=32,  # W_hat = W + (lora_alpha / r) * delta_W
    lora_dropout=0.05,
    target_modules=["q", "v"],  # which of the Q, K, V matrices get LoRA adapters
    bias="none",  # do not train the bias terms
)

# Optional preparation step for 8-bit training, left disabled:
# model = prepare_model_for_int8_training(model)

# Wrap the base model so that the LoRA adapters are added to it.
model = get_peft_model(model, lora_config)

# The rest of the training process is the same as before!

# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()
# trainable params: 18874368 || all params: 11154206720 || trainable%: 0.16921300163961817
