#### LoRA

LoRA(Low-Rank Adaptation)는 Low-Rank Decomposition(저차원 분해) 방법을 사용하여, 기존의 모델 파라미터를 직접 업데이트하지 않고, **차이를 보정하는 작은 크기의 두 행렬**을 학습합니다.

- 기존 모델 파라미터 고정
    - LoRA는 기존의 큰 파라미터 행렬을 freeze하고, 새로운 파라미터인 A와 B만 학습합니다. LoRA는 모델의 기존 파라미터를 재활용하면서, 필요한 적은 수의 추가 파라미터만 학습하는 방식입니다.
- 저차원 근사화
    - LoRA는 **저차원 행렬(Low-Rank Matrix)**을 사용하여 가중치의 변화량을 근사화합니다. 즉, 고차원의 가중치 행렬 W 자체를 분해하는 것이 아니라, 가중치 업데이트 ΔW를 ΔW = BA 형태의 두 개의 작은 행렬로 표현하고(W' = W + BA), 이 두 작은 행렬만 학습합니다. 여기서 A(r × d_in)와 B(d_out × r)는 rank r이 작은 low-rank 행렬이기 때문에, 고차원 행렬 전체를 학습하는 것보다 훨씬 적은 메모리와 계산량을 소모하게 됩니다.

In [11]:
import torch
import torch.nn as nn

In [12]:
def count_trainable_params(model):
    """Return the number of trainable parameters of ``model`` as a string.

    Only parameters whose ``requires_grad`` flag is set are counted. The total
    is formatted with comma thousands separators (e.g. ``"10,512,656"``).
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return f"{trainable_total:,}"

In [13]:
# Baseline model (no LoRA)
class SimpleLM(nn.Module):
    """Toy language model: embedding -> linear + ReLU -> linear.

    Args:
        vocab_size: number of rows in the embedding table and width of the
            final linear layer's output.
        hidden_size: width of the embedding vectors and of the hidden layer.
    """

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.linear1 = nn.Linear(hidden_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Map token indices ``x`` to per-position scores of size ``vocab_size``."""
        embedded = self.embed(x)
        hidden = torch.relu(self.linear1(embedded))
        scores = self.linear2(hidden)
        return scores
    
    
# Dimensions passed to the toy model constructors in this notebook.
vocab_size = 10000  # size of the embedding table and of the output layer
hidden_size = 512  # width of the embedding vectors and the hidden layer

In [14]:
# Instantiate the baseline model. Nothing is frozen here, so the count below
# covers the embedding table and both linear layers (weights and biases).
model = SimpleLM(vocab_size, hidden_size)

print(f"학습 가능한 파라미터 수 (기본 모델): {count_trainable_params(model)}")

학습 가능한 파라미터 수 (기본 모델): 10,512,656


In [15]:
model

SimpleLM(
  (embed): Embedding(10000, 512)
  (linear1): Linear(in_features=512, out_features=512, bias=True)
  (linear2): Linear(in_features=512, out_features=10000, bias=True)
)

In [22]:
# LoRA linear layer
class LoRALinear(nn.Module):
    """Linear layer with a frozen base weight and a trainable low-rank update.

    The effective weight is ``W + (alpha / r) * (B @ A)``. ``W`` is frozen;
    only ``A``, ``B`` and the bias receive gradients.
    """

    def __init__(self, in_features, out_features, r=4, alpha=1.0):
        """
        Args:
            in_features: size of each input vector.
            out_features: size of each output vector.
            r: rank of the low-rank update (must be positive); small values
                such as 4 or 8 are typical.
            alpha: scaling numerator; the update ``B @ A`` is multiplied by
                ``alpha / r``.

        Raises:
            ValueError: if ``r`` is not positive.
        """
        super().__init__()
        if r <= 0:
            raise ValueError(f"r must be a positive integer, got {r}")

        self.in_features = in_features
        self.out_features = out_features
        self.r = r
        self.alpha = alpha
        # Controls how strongly the low-rank update contributes relative to
        # the frozen base weight.
        self.scaling = alpha / r

        # Stand-in for a pretrained weight: randomly initialised here and
        # excluded from training.
        self.weight = nn.Parameter(
            torch.randn(out_features, in_features), requires_grad=False
        )

        # Trainable low-rank factors, with delta_W = B @ A.
        # A: (r x in_features), small random values.
        self.A = nn.Parameter(torch.randn(r, in_features) * 0.01)
        # B: (out_features x r), zeros, so that delta_W == 0 at initialisation
        # and the layer starts out identical to the frozen base layer (the
        # initialisation scheme used by LoRA). Gradients still reach B through
        # the non-zero A, so training can move away from zero.
        self.B = nn.Parameter(torch.zeros(out_features, r))

        self.bias = nn.Parameter(torch.zeros(out_features))

    def forward(self, x):
        """Apply the adapted linear map.

        f(x) = (W + scaling * (B @ A)) @ x + b, with W frozen.
        """
        delta_weight = (self.B @ self.A) * self.scaling
        # Same computation as a regular linear layer, using the adapted weight.
        return nn.functional.linear(x, self.weight + delta_weight, self.bias)

    def extra_repr(self):
        """Extra detail that ``nn.Module.__repr__`` appends for this layer."""
        return f"in_features = {self.in_features}, out_features = {self.out_features}, r = {self.r}, alpha = {self.alpha}"
    
class SimpleLMWithLoRA(nn.Module):
    """Toy language model with a frozen embedding and two ``LoRALinear`` layers.

    Args:
        vocab_size: size of the embedding table and of the output layer.
        hidden_size: width of the embedding vectors and the hidden layer.
        r: LoRA rank forwarded to both ``LoRALinear`` layers.
        alpha: LoRA scaling numerator forwarded to both ``LoRALinear`` layers.
    """

    def __init__(self, vocab_size, hidden_size, r = 4, alpha = 1.0):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        # The embedding table is excluded from training.
        self.embed.weight.requires_grad = False
        self.lora_linear1 = LoRALinear(hidden_size, hidden_size, r, alpha)
        self.lora_linear2 = LoRALinear(hidden_size, vocab_size, r, alpha)

    def forward(self, x):
        """Map token indices ``x`` to per-position scores of size ``vocab_size``."""
        # Use the attributes registered in __init__ (lora_linear1/lora_linear2);
        # the module has no `linear1`/`linear2` attributes.
        hidden = torch.relu(self.lora_linear1(self.embed(x)))
        return self.lora_linear2(hidden)

In [23]:
# Build the LoRA variant. Its embedding and base weights are frozen, so only
# the A/B factors and the biases of the two LoRALinear layers are counted.
lora_model = SimpleLMWithLoRA(vocab_size, hidden_size, r = 4, alpha = 16)

print(f"학습 가능한 파라미터 수 (LoRA 모델델): {count_trainable_params(lora_model)}")

"""
Linear1
    - A: (r x in_features)  = (4 x 512)     = 2048
    - B: (out_features x r) = (512 x 4)     = 2048
    - bias: (out_features)                  = 512
    
Linear2:
    - A: (r x in_features)  = (4 x 512)     = 2048
    - B: (out_features x r) = (10000 x 4)   = 40000
    - bias: (out_features)                  = 10000
    
Trainable Parameters: {Linear1} (4 x 512) + (512 x 4) + 512 + \
                      {Linear2} (4 x 512) + (10000 x 4) + 10000
                      = 56,656
"""

학습 가능한 파라미터 수 (LoRA 모델델): 56,656


'\nLinear1\n    - A: (r x in_features)  = (4 x 512)     = 2048\n    - B: (out_features x r) = (512 x 4)     = 2048\n    - bias: (out_features)                  = 512\n\nLinear2:\n    - A: (r x in_features)  = (4 x 512)     = 2048\n    - B: (out_features x r) = (10000 x 4)   = 40000\n    - bias: (out_featrues)                  = 10000\n\nTrainable Parameters: {Linear1} (4 x 512) + (512 x 4) + 512 +                       {Linear2} (4 x 512) + (10000 x 4) + 1000\n                      = 56,656\n'

In [24]:
lora_model

SimpleLMWithLoRA(
  (embed): Embedding(10000, 512)
  (lora_linear1): LoRALinear(in_features = 512, out_features = 512, r = 4, alpha = 16)
  (lora_linear2): LoRALinear(in_features = 512, out_features = 10000, r = 4, alpha = 16)
)

### Peft - LoRA

In [37]:
from transformers import AutoModelForCausalLM
from peft import get_peft_model
from peft import LoraConfig, TaskType

# LoRA adapter configuration for a causal language model.
# NOTE(review): the recorded LoraConfig repr and the adapted-model printouts
# later in this notebook also list 'embed_tokens' among the target modules,
# which this cell does not -- presumably the cell was edited after it was last
# run. Re-run the notebook (or restore 'embed_tokens') so code and outputs agree.
lora_config = LoraConfig(
    r=16,  # rank of the low-rank update matrices
    target_modules=["q_proj", "v_proj"],  # names of the submodules to wrap with LoRA layers
    task_type=TaskType.CAUSAL_LM,
    lora_alpha=32,  # scaling numerator -- presumably scale = lora_alpha / r; confirm in the PEFT docs
    lora_dropout=0.05  # dropout probability used inside the LoRA layers
)

In [38]:
lora_config

LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=16, target_modules={'q_proj', 'embed_tokens', 'v_proj'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [100]:
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")

In [101]:
model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features

In [102]:
# Trainable-parameter count of the base model before any adapter is attached;
# reuses the notebook's helper instead of repeating its body inline.
count_trainable_params(model)

'331,196,416'

In [103]:
# Wrap the base model with LoRA adapters. The later printout of `model` shows
# the adapter layers on `model` itself, i.e. the base model is modified in place.
lora_model = get_peft_model(model, lora_config)



In [104]:
lora_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): OPTForCausalLM(
      (model): OPTModel(
        (decoder): OPTDecoder(
          (embed_tokens): lora.Embedding(
            (base_layer): Embedding(50272, 512, padding_idx=1)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict()
            (lora_B): ModuleDict()
            (lora_embedding_A): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 16x50272])
            (lora_embedding_B): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 512x16])
            (lora_magnitude_vector): ModuleDict()
          )
          (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
          (project_out): Linear(in_features=1024, out_features=512, bias=False)
          (project_in): Linear(in_features=512, out_features=1024, bias=False)
          (layers): ModuleList(
            (0-23): 2

In [105]:
lora_model.print_trainable_parameters()

trainable params: 2,385,408 || all params: 333,581,824 || trainable%: 0.7151


In [106]:
model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): lora.Embedding(
        (base_layer): Embedding(50272, 512, padding_idx=1)
        (lora_dropout): ModuleDict(
          (default): Dropout(p=0.05, inplace=False)
        )
        (lora_A): ModuleDict()
        (lora_B): ModuleDict()
        (lora_embedding_A): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 16x50272])
        (lora_embedding_B): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 512x16])
        (lora_magnitude_vector): ModuleDict()
      )
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=T

In [114]:
# Remove the LoRA layers and get the base model back; the printout that follows
# shows plain Embedding/Linear modules again.
# NOTE(review): per the PEFT docs, unload() discards the adapter weights without
# merging them (merge_and_unload() is the merging variant) -- confirm for the installed version.
unloaded_model = lora_model.unload()

In [115]:
unloaded_model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features

In [116]:
# Identity check: is the object returned by unload() the very same object that
# `model` refers to (not a copy)?
model is unloaded_model

True