In [1]:
from ct_model import DeltaModel, dispatch_default
from ct_bnb import Linear4bit, dispatch_bnb_4bit
from ct_layer import DeltaLayer
from ct_config import CTConfig
from ct_optim import BlockOptimizer
from prepare_data import gen_dataloader

from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

from tqdm import tqdm



In [2]:
device_id = 2

# Quantization type (fp4 or nf4), According to QLoRA paper, for training 4-bit base models (e.g. using LoRA adapters) one should use
bnb_4bit_quant_type = "fp4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = True

model_id = "Qwen/Qwen1.5-0.5B"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":device_id})

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
config = CTConfig(
             r=32,
             delta_alpha=32,
             target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                             "gate_proj", "up_proj", "down_proj"],
             delta_dropout=0,
             init_lora_weights=False,
         )

In [4]:
# config = CTConfig(
#              r=32,
#              delta_alpha=32,
#              target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
#              delta_dropout=0,
#              init_lora_weights=False,
#          )

In [5]:
delta_model = DeltaModel(model, config, "default")

In [6]:
base_optimizer = AdamW(model.parameters(), lr=1e-4)

In [7]:
optimizer = BlockOptimizer(base_optimizer,
                           list(delta_model.named_parameters()),
                           None,
                           10,
                           switch_mode="ascending",
                           model=delta_model)

Now init the block adapter ['model.model.layers.0.']
Parameters with the following prefix will be trainable: ['model.model.layers.0.']


In [8]:
train_dataloader = gen_dataloader(tokenizer)

Using the latest cached version of the module from /home/ubuntu/date/hf_cache/modules/datasets_modules/datasets/eli5_category/80106cc49322f1f5075e1387be4a5b74b95e0f56c40ff142b8999d0606aa1908 (last modified on Wed Jun  5 22:09:48 2024) since it couldn't be found locally at eli5_category, or remotely on the Hugging Face Hub.


Map (num_proc=4):   0%|          | 0/800 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/200 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/800 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/200 [00:00<?, ? examples/s]

In [9]:
for step, batch in enumerate(tqdm(train_dataloader)):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]

    outputs = delta_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss

    if step % 5 ==0:
        print(loss)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  0%|          | 1/800 [00:00<12:30,  1.07it/s]

tensor(3.1361, grad_fn=<ToCopyBackward0>)


  1%|          | 6/800 [00:01<03:08,  4.20it/s]

tensor(4.5185, grad_fn=<ToCopyBackward0>)


  1%|          | 8/800 [00:02<02:50,  4.64it/s]

Now init the block adapter ['model.model.layers.1.']
Parameters with the following prefix will be trainable: ['model.model.layers.1.']


  1%|▏         | 11/800 [00:03<04:39,  2.82it/s]

tensor(3.6131, grad_fn=<ToCopyBackward0>)


  2%|▏         | 16/800 [00:04<02:55,  4.46it/s]

tensor(3.3395, grad_fn=<ToCopyBackward0>)


  2%|▏         | 18/800 [00:05<02:46,  4.70it/s]

Now init the block adapter ['model.model.layers.2.']
Parameters with the following prefix will be trainable: ['model.model.layers.2.']


  3%|▎         | 21/800 [00:06<04:23,  2.96it/s]

tensor(3.1438, grad_fn=<ToCopyBackward0>)


  3%|▎         | 26/800 [00:07<02:53,  4.47it/s]

tensor(3.8793, grad_fn=<ToCopyBackward0>)


  4%|▎         | 28/800 [00:08<02:45,  4.68it/s]

Now init the block adapter ['model.model.layers.3.']
Parameters with the following prefix will be trainable: ['model.model.layers.3.']


  4%|▍         | 31/800 [00:09<04:15,  3.01it/s]

tensor(2.8666, grad_fn=<ToCopyBackward0>)


  4%|▍         | 36/800 [00:10<02:47,  4.56it/s]

tensor(3.0670, grad_fn=<ToCopyBackward0>)


  5%|▍         | 38/800 [00:11<02:38,  4.80it/s]

Now init the block adapter ['model.model.layers.4.']
Parameters with the following prefix will be trainable: ['model.model.layers.4.']


  5%|▌         | 41/800 [00:12<04:10,  3.04it/s]

tensor(3.7212, grad_fn=<ToCopyBackward0>)


  6%|▌         | 46/800 [00:13<02:46,  4.53it/s]

tensor(3.1999, grad_fn=<ToCopyBackward0>)


  6%|▌         | 48/800 [00:13<02:37,  4.79it/s]

Now init the block adapter ['model.model.layers.5.']
Parameters with the following prefix will be trainable: ['model.model.layers.5.']


  6%|▋         | 51/800 [00:15<04:08,  3.01it/s]

tensor(3.0427, grad_fn=<ToCopyBackward0>)


  7%|▋         | 56/800 [00:16<02:42,  4.58it/s]

tensor(2.4293, grad_fn=<ToCopyBackward0>)


  7%|▋         | 58/800 [00:16<02:34,  4.81it/s]

Now init the block adapter ['model.model.layers.6.']
Parameters with the following prefix will be trainable: ['model.model.layers.6.']


  8%|▊         | 61/800 [00:18<04:03,  3.03it/s]

tensor(2.6852, grad_fn=<ToCopyBackward0>)


  8%|▊         | 66/800 [00:19<02:41,  4.56it/s]

tensor(3.3130, grad_fn=<ToCopyBackward0>)


  8%|▊         | 68/800 [00:19<02:30,  4.85it/s]

Now init the block adapter ['model.model.layers.7.']
Parameters with the following prefix will be trainable: ['model.model.layers.7.']


  9%|▉         | 71/800 [00:21<03:59,  3.05it/s]

tensor(2.4889, grad_fn=<ToCopyBackward0>)


 10%|▉         | 76/800 [00:22<02:38,  4.58it/s]

tensor(3.0927, grad_fn=<ToCopyBackward0>)


 10%|▉         | 78/800 [00:22<02:28,  4.85it/s]

Now init the block adapter ['model.model.layers.8.']
Parameters with the following prefix will be trainable: ['model.model.layers.8.']


 10%|█         | 81/800 [00:23<03:52,  3.09it/s]

tensor(3.7633, grad_fn=<ToCopyBackward0>)


 11%|█         | 86/800 [00:24<02:33,  4.66it/s]

tensor(3.4722, grad_fn=<ToCopyBackward0>)


 11%|█         | 88/800 [00:25<02:25,  4.91it/s]

Now init the block adapter ['model.model.layers.9.']
Parameters with the following prefix will be trainable: ['model.model.layers.9.']


 11%|█▏        | 91/800 [00:26<03:46,  3.12it/s]

tensor(2.9079, grad_fn=<ToCopyBackward0>)


 12%|█▏        | 96/800 [00:27<02:31,  4.66it/s]

tensor(3.7587, grad_fn=<ToCopyBackward0>)


 12%|█▏        | 98/800 [00:28<02:24,  4.85it/s]

Now init the block adapter ['model.model.layers.10.']
Parameters with the following prefix will be trainable: ['model.model.layers.10.']


 13%|█▎        | 101/800 [00:29<03:45,  3.10it/s]

tensor(4.2594, grad_fn=<ToCopyBackward0>)


 13%|█▎        | 106/800 [00:30<02:28,  4.67it/s]

tensor(3.1296, grad_fn=<ToCopyBackward0>)


 13%|█▎        | 107/800 [00:30<03:19,  3.48it/s]


KeyboardInterrupt: 

In [10]:
delta_model

DeltaModel(
  (model): Qwen2ForCausalLM(
    (model): Qwen2Model(
      (embed_tokens): Embedding(151936, 1024)
      (layers): ModuleList(
        (0-9): 10 x Qwen2DecoderLayer(
          (self_attn): Qwen2SdpaAttention(
            (q_proj): lora.Linear4bit(
              (base_layer): Linear4bit(in_features=1024, out_features=1024, bias=True)
              (delta_dropout): ModuleDict(
                (default): Identity()
              )
              (delta_theta): ModuleDict()
              (delta_A): ModuleDict(
                (default): Linear(in_features=1024, out_features=32, bias=True)
              )
              (delta_B): ModuleDict(
                (default): Linear(in_features=32, out_features=1024, bias=False)
              )
              (delta_embedding): ParameterDict()
            )
            (k_proj): lora.Linear4bit(
              (base_layer): Linear4bit(in_features=1024, out_features=1024, bias=True)
              (delta_dropout): ModuleDict(
              