In [1]:
from ct_model import DeltaModel, dispatch_default
from ct_bnb import Linear4bit, dispatch_bnb_4bit
from ct_layer import DeltaLayer
from ct_config import CTConfig
from block_optim import BlockOptimizer
from prepare_data import gen_dataloader

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch



In [2]:
import os

# Point the Hugging Face cache at a local directory and route HTTP(S) traffic
# through the proxy.
# NOTE(review): huggingface_hub reads HF_HOME when it is first imported; if
# `transformers` was already imported in an earlier cell this assignment may
# come too late — confirm the cache location actually changes.
os.environ.update(
    HF_HOME='/home/ubuntu/date/hf_cache',
    HTTP_PROXY="http://10.24.59.12:7890",
    HTTPS_PROXY="http://10.24.59.12:7890",
)


In [3]:
# Device index handed to `device_map` below, and the checkpoint to load.
device_id = 2
model_id = "Qwen/Qwen1.5-0.5B"

# 4-bit quantization type: either "fp4" or "nf4".
# NOTE(review): the QLoRA guidance recommends "nf4" when training adapters on
# top of a 4-bit base model — confirm that "fp4" is intentional here.
bnb_4bit_quant_type = "fp4"

# Activate nested (double) quantization for the 4-bit base model.
use_nested_quant = True

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    # Single entry keyed by the empty string: presumably places the whole
    # model on `device_id` — verify against the transformers/accelerate docs.
    device_map={"": device_id},
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Adapter configuration: rank 32, alpha 1, no dropout, applied to the seven
# projection module names listed in `target_modules`.
config = CTConfig(
    r=32,
    delta_alpha=1,
    delta_dropout=0,
    init_lora_weights=False,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

In [5]:
# Wrap the quantized base model with DeltaModel using `config`.
# The third argument is presumably the adapter name: "default" is the key that
# shows up under `delta_dropout` in the module repr — confirm against ct_model.
delta_model = DeltaModel(model, config, "default")

In [8]:
def apply_to_innermost_layers(module, operation, prefix=''):
    """
    Recursively traverse all submodules of a model and apply a specific operation
    to the innermost layers, passing the full name of the layer.

    Args:
        module: Object exposing ``named_children()`` that yields
            ``(name, child)`` pairs (e.g. a ``torch.nn.Module``).
        operation: Callable invoked as ``operation(leaf_module, full_name)`` for
            every module without children. When ``None``, the last component
            of the leaf's dotted name (e.g. ``delta_A``) is printed instead.
        prefix: Dotted name of ``module`` relative to the traversal root; leave
            empty for the root call.
    """
    is_leaf = True
    for name, layer in module.named_children():
        is_leaf = False
        new_prefix = f"{prefix}.{name}" if prefix else name
        apply_to_innermost_layers(layer, operation, new_prefix)

    # Only modules without children count as innermost layers.
    if not is_leaf:
        return

    if operation is not None:
        operation(module, prefix)
    else:
        # No operation supplied: fall back to listing the short leaf names,
        # e.g. delta_theta, delta_A.
        print(prefix.split(".")[-1])

In [9]:
# No operation is supplied, so this only prints the last name component of
# every leaf module in delta_model.
apply_to_innermost_layers(delta_model, None)

embed_tokens
base_layer
default
delta_theta
delta_A
delta_B
delta_embedding
base_layer
default
delta_theta
delta_A
delta_B
delta_embedding
base_layer
default
delta_theta
delta_A
delta_B
delta_embedding
base_layer
default
delta_theta
delta_A
delta_B
delta_embedding
rotary_emb
base_layer
default
delta_theta
delta_A
delta_B
delta_embedding
base_layer
default
delta_theta
delta_A
delta_B
delta_embedding
base_layer
default
delta_theta
delta_A
delta_B
delta_embedding
act_fn
input_layernorm
post_attention_layernorm
base_layer
default
delta_theta
delta_A
delta_B
delta_embedding
base_layer
default
delta_theta
delta_A
delta_B
delta_embedding
base_layer
default
delta_theta
delta_A
delta_B
delta_embedding
base_layer
default
delta_theta
delta_A
delta_B
delta_embedding
rotary_emb
base_layer
default
delta_theta
delta_A
delta_B
delta_embedding
base_layer
default
delta_theta
delta_A
delta_B
delta_embedding
base_layer
default
delta_theta
delta_A
delta_B
delta_embedding
act_fn
input_layernorm
post_attenti

In [10]:
# Print the qualified name of every parameter whose name contains "lora".
# NOTE(review): the recorded run printed nothing, and the submodules listed
# earlier are named delta_* — confirm whether "lora" is the intended filter.
for param_name, _param in delta_model.named_parameters():
    if "lora" not in param_name:
        continue
    print(param_name)

In [11]:
def infer_param_groups(named_parameters_list, include_embedding=False, include_lm_head=False):
    """Infer block-wise parameter groups from parameter names.

    Names are divided into:
        * embedding
        * transformer layers
        * lm_head and others

    Args:
        named_parameters_list: Iterable of ``(name, parameter)`` pairs; only
            the names are inspected.
        include_embedding: When True, names matching ``...embed<xxx>.`` form
            their own group(s). When False they are treated as "others".
        include_lm_head: When True, all names that belong to neither a
            transformer layer nor an included embedding are appended as one
            final group holding the full parameter names.

    Returns:
        A list of groups in first-seen order. Each layer/embedding group is a
        one-element list holding the shared name prefix (for example
        ``['model.model.layers.0.']``). The optional last group lists the
        remaining parameter names.
    """
    import re

    # The dot after "layers" is escaped so that only a real ``layers.<idx>.``
    # path segment counts as a transformer block; an unescaped dot would also
    # accept names such as ``layers_norm.``.
    layer_pattern = re.compile(r'.*layers\.[^.]*\.')
    embed_pattern = re.compile(r'.*embed[^.]*\.')

    block_prefix_list = []
    lm_head_and_other_params = []

    for name, _ in named_parameters_list:
        # Skip names already covered by a recorded prefix.
        if any(group[0] in name for group in block_prefix_list):
            continue

        layer_match = layer_pattern.match(name)
        if layer_match:
            block_prefix_list.append([layer_match.group(0)])
            continue

        embed_match = embed_pattern.match(name) if include_embedding else None
        if embed_match:
            block_prefix_list.append([embed_match.group(0)])
        else:
            lm_head_and_other_params.append(name)

    if include_lm_head:
        block_prefix_list.append(lm_head_and_other_params)

    return block_prefix_list

In [12]:
# Infer the per-layer name prefixes for delta_model; with the default flags the
# embedding and lm_head/other groups are left out of the result.
infer_param_groups(list(delta_model.named_parameters()))

[['model.model.layers.0.'],
 ['model.model.layers.1.'],
 ['model.model.layers.2.'],
 ['model.model.layers.3.'],
 ['model.model.layers.4.'],
 ['model.model.layers.5.'],
 ['model.model.layers.6.'],
 ['model.model.layers.7.'],
 ['model.model.layers.8.'],
 ['model.model.layers.9.'],
 ['model.model.layers.10.'],
 ['model.model.layers.11.'],
 ['model.model.layers.12.'],
 ['model.model.layers.13.'],
 ['model.model.layers.14.'],
 ['model.model.layers.15.'],
 ['model.model.layers.16.'],
 ['model.model.layers.17.'],
 ['model.model.layers.18.'],
 ['model.model.layers.19.'],
 ['model.model.layers.20.'],
 ['model.model.layers.21.'],
 ['model.model.layers.22.'],
 ['model.model.layers.23.']]

In [15]:
# Inspect the q_proj module of decoder layer 0 on the base model object.
model.model.layers[0].self_attn.q_proj

lora.Linear4bit(
  (base_layer): Linear4bit(in_features=1024, out_features=1024, bias=True)
  (delta_dropout): ModuleDict(
    (default): Identity()
  )
  (delta_theta): ModuleDict()
  (delta_A): ModuleDict()
  (delta_B): ModuleDict()
  (delta_embedding): ParameterDict()
)

In [18]:
# Example dotted leaf-module name, used to try resolving a module from its name.
prefix = "model.model.layers.0.self_attn.q_proj.delta_A"

In [22]:
# Split the dotted leaf name into: the path up to the layer list, the layer
# index, and the "<block>.<projection>" pair that owns the leaf.
path_parts = prefix.split(".")
layer_num = path_parts[3]
layer_prefix = ".".join(path_parts[:3])
attention_name = ".".join(path_parts[-3:-1])

# Resolve the owning module by its dotted path instead of eval()-ing a string
# built from the name. The path is relative to delta_model, whose first child
# is named "model" in the module repr.
# NOTE(review): assumes DeltaModel is a torch.nn.Module (it exposes
# named_children/named_parameters) — confirm get_submodule is available.
the_layer = delta_model.get_submodule(f"{layer_prefix}.{layer_num}.{attention_name}")

In [23]:
# Display the module that was resolved from `prefix`.
the_layer

lora.Linear4bit(
  (base_layer): Linear4bit(in_features=1024, out_features=1024, bias=True)
  (delta_dropout): ModuleDict(
    (default): Identity()
  )
  (delta_theta): ModuleDict()
  (delta_A): ModuleDict()
  (delta_B): ModuleDict()
  (delta_embedding): ParameterDict()
)

In [13]:
"model.model.layers.0." in "model.model.layers.0.self_attn.q_proj.base_layer"

True

In [7]:
# Build the dataloader with the project helper, passing the tokenizer loaded above.
dataloader = gen_dataloader(tokenizer)

Map (num_proc=4):   0%|          | 0/800 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/200 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/800 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/200 [00:00<?, ? examples/s]

In [8]:
# Display the module tree of the wrapped model.
delta_model

DeltaModel(
  (model): Qwen2ForCausalLM(
    (model): Qwen2Model(
      (embed_tokens): Embedding(151936, 1024)
      (layers): ModuleList(
        (0-23): 24 x Qwen2DecoderLayer(
          (self_attn): Qwen2SdpaAttention(
            (q_proj): lora.Linear4bit(
              (base_layer): Linear4bit(in_features=1024, out_features=1024, bias=True)
              (delta_dropout): ModuleDict(
                (default): Identity()
              )
              (delta_theta): ModuleDict()
              (delta_A): ModuleDict()
              (delta_B): ModuleDict()
              (delta_embedding): ParameterDict()
            )
            (k_proj): lora.Linear4bit(
              (base_layer): Linear4bit(in_features=1024, out_features=1024, bias=True)
              (delta_dropout): ModuleDict(
                (default): Identity()
              )
              (delta_theta): ModuleDict()
              (delta_A): ModuleDict()
              (delta_B): ModuleDict()
              (delta_embedding):

In [23]:
def apply_to_innermost_layers(module, operation, prefix=''):
    """
    Recursively traverse all submodules of a model and apply a specific operation
    to the innermost layers, passing the full name of the layer.

    Args:
        module: Object exposing ``named_children()`` that yields
            ``(name, child)`` pairs (e.g. a ``torch.nn.Module``).
        operation: Callable invoked as ``operation(leaf_module, full_name)`` for
            every module without children. When ``None``, the full dotted name
            of each leaf is printed instead.
        prefix: Dotted name of ``module`` relative to the traversal root; leave
            empty for the root call.
    """
    is_leaf = True
    for name, layer in module.named_children():
        is_leaf = False
        new_prefix = f"{prefix}.{name}" if prefix else name
        apply_to_innermost_layers(layer, operation, new_prefix)

    # Only modules without children count as innermost layers.
    if not is_leaf:
        return

    if operation is not None:
        operation(module, prefix)
    else:
        # No operation supplied: fall back to listing the full leaf names.
        print(prefix)

# Define the operation you want to perform on each innermost layer
# def my_operation(layer, layer_name):
#     print(f"Applying operation to innermost layer: {layer_name} ({type(layer)})")
#     # Example operation: print layer weights if it's a Linear layer
#     if isinstance(layer, nn.Linear):
#         print(f"Layer weights: {layer.weight}")

In [24]:
# No operation is supplied, so this only prints the full dotted name of every
# leaf module in delta_model.
apply_to_innermost_layers(delta_model, None)

model.model.embed_tokens
model.model.layers.0.self_attn.q_proj.base_layer
model.model.layers.0.self_attn.q_proj.delta_dropout.default
model.model.layers.0.self_attn.q_proj.delta_theta
model.model.layers.0.self_attn.q_proj.delta_A
model.model.layers.0.self_attn.q_proj.delta_B
model.model.layers.0.self_attn.q_proj.delta_embedding
model.model.layers.0.self_attn.k_proj.base_layer
model.model.layers.0.self_attn.k_proj.delta_dropout.default
model.model.layers.0.self_attn.k_proj.delta_theta
model.model.layers.0.self_attn.k_proj.delta_A
model.model.layers.0.self_attn.k_proj.delta_B
model.model.layers.0.self_attn.k_proj.delta_embedding
model.model.layers.0.self_attn.v_proj.base_layer
model.model.layers.0.self_attn.v_proj.delta_dropout.default
model.model.layers.0.self_attn.v_proj.delta_theta
model.model.layers.0.self_attn.v_proj.delta_A
model.model.layers.0.self_attn.v_proj.delta_B
model.model.layers.0.self_attn.v_proj.delta_embedding
model.model.layers.0.self_attn.o_proj.base_layer
model.model