# Calculation of trainable parameters

In [1]:
import torch
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer
)

torch.cuda.empty_cache()

In [2]:
# Device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Model name
model_name = "Qwen/Qwen2.5-7B"

# Load the pre-trained model
print(f"Loading model {model_name}...")
# Config for 4 bit quantization
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Config for 8 bit quantization
nf8_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_quant_type="nf8",
    bnb_8bit_use_double_quant=True,
    bnb_8bit_compute_dtype=torch.bfloat16
)


model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    torch_dtype=torch.float16,  # Use float16 for mixed precision training
    device_map="auto",  # Distribute the model automatically across GPUs
    # quantization_config=nf4_config,  # Use the bitsandbytes quantization NF4 config
    quantization_config=nf8_config,  # Use the bitsandbytes quantization NF8 config
)

model

Unused kwargs: ['bnb_8bit_quant_type', 'bnb_8bit_use_double_quant', 'bnb_8bit_compute_dtype']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading model Qwen/Qwen2.5-7B...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear8bitLt(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear8bitLt(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear8bitLt(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear8bitLt(in_features=3584, out_features=3584, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear8bitLt(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear8bitLt(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear8bitLt(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((35

In [None]:
peft_config = LoraConfig(
    lora_alpha=32,  # Scaling factor for LoRA updates
    lora_dropout=0.05,  # Dropout rate applied to LoRA layers
    r=64,  # Rank of the LoRA decomposition
    bias="none",  # No bias is added to the LoRA layers
    task_type="CAUSAL_LM",  # Specify the task as causal language modeling
    target_modules=[  # Modules to apply LoRA to
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

Our base model is structured like this: \
Qwen2ForCausalLM(
  (model): Qwen2Model( \
    (embed_tokens): Embedding(152064, 3584) \
    (layers): ModuleList( \
      (0-27): 28 x Qwen2DecoderLayer( \
        (self_attn): Qwen2SdpaAttention( \
          (q_proj): Linear8bitLt(in_features=3584, out_features=3584, bias=True) \
          (k_proj): Linear8bitLt(in_features=3584, out_features=512, bias=True) \
          (v_proj): Linear8bitLt(in_features=3584, out_features=512, bias=True) \
          (o_proj): Linear8bitLt(in_features=3584, out_features=3584, bias=False) \
          (rotary_emb): Qwen2RotaryEmbedding() \
        ) \
        (mlp): Qwen2MLP( \
          (gate_proj): Linear8bitLt(in_features=3584, out_features=18944, bias=False) \
          (up_proj): Linear8bitLt(in_features=3584, out_features=18944, bias=False) \
          (down_proj): Linear8bitLt(in_features=18944, out_features=3584, bias=False) \
          (act_fn): SiLU() \
        ) \
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06) \
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06) \
      ) \
    ) \
    (norm): Qwen2RMSNorm((3584,), eps=1e-06) \
  ) \
  (lm_head): Linear(in_features=3584, out_features=152064, bias=False) \
)

We are training the following layers:
- q_proj
- k_proj
- v_proj
- o_proj
- gate_proj
- up_proj
- down_proj


Let's calculate the number of parameters:

In [4]:
in_features = 3584
q_o_out_features = 3584
k_v_out_features = 512
gate_up_out_features = 18944
down_in_features = 18944
down_out_features = 3584

r = 64

LoRa applied to a m x n matrix constists on creating A and B matrices such that A dim = m x r and B dim = r x n

In [5]:
q_proj_params = in_features * r + r * q_o_out_features
k_proj_params = in_features * r + r * k_v_out_features
v_proj_params = in_features * r + r * k_v_out_features
o_proj_params = q_o_out_features * r + r * in_features
gate_proj_params = in_features * r + r * gate_up_out_features
up_proj_params = in_features * r + r * gate_up_out_features
down_proj_params = down_in_features * r + r * down_out_features

In [7]:
print(f"Calculating LoRA paremeters for rank {r}...")
print(f"q_proj: {in_features} x {r} + {r} x {q_o_out_features} = {q_proj_params}")
print(f"k_proj: {in_features} x {r} + {r} x {k_v_out_features} = {k_proj_params}")
print(f"v_proj: {in_features} x {r} + {r} x {k_v_out_features} = {v_proj_params}")
print(f"o_proj: {q_o_out_features} x {r} + {r} x {in_features} = {o_proj_params}")
print(f"gate_proj: {in_features} x {r} + {r} x {gate_up_out_features} = {gate_proj_params}")
print(f"up_proj: {in_features} x {r} + {r} x {gate_up_out_features} = {up_proj_params}")
print(f"down_proj: {down_in_features} x {r} + {r} x {down_out_features} = {down_proj_params}")
print()
print(f"This transformer layer is applied 28 times in the model, so the total number of parameters is:")
total_params = 28 * (q_proj_params + k_proj_params + v_proj_params + o_proj_params + gate_proj_params + up_proj_params + down_proj_params)
print(f"{total_params}")

Calculating LoRA paremeters for rank 64...
q_proj: 3584 x 64 + 64 x 3584 = 458752
k_proj: 3584 x 64 + 64 x 512 = 262144
v_proj: 3584 x 64 + 64 x 512 = 262144
o_proj: 3584 x 64 + 64 x 3584 = 458752
gate_proj: 3584 x 64 + 64 x 18944 = 1441792
up_proj: 3584 x 64 + 64 x 18944 = 1441792
down_proj: 18944 x 64 + 64 x 3584 = 1441792

This transformer layer is applied 28 times in the model, so the total number of parameters is:
161480704
