In [1]:
from ct_model import DeltaModel, dispatch_default
from ct_bnb import Linear4bit, dispatch_bnb_4bit
from ct_layer import DeltaLayer
from ct_config import CTConfig
from ct_optim import BlockOptimizer
from prepare_data import gen_dataloader

from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

from tqdm import tqdm



In [2]:
import os
os.environ['HF_HOME'] = '/home/ubuntu/date/hf_cache'

os.environ['HTTP_PROXY'] = "http://10.24.59.12:7890"
os.environ['HTTPS_PROXY'] = "http://10.24.59.12:7890"


In [3]:
# Checkpoint to load and the device index the whole model is placed on.
model_id = "Qwen/Qwen1.5-0.5B"
device_id = 2

# 4-bit quantization data type; the two accepted values are "fp4" and "nf4".
# NOTE(review): the QLoRA paper recommends "nf4" when training adapters on a
# 4-bit base model — confirm that "fp4" is the intended choice here.
bnb_4bit_quant_type = "fp4"

# Enable nested (double) quantization for the 4-bit base model.
use_nested_quant = True

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# The single "" key in `device_map` assigns every module to `device_id`
# (Hugging Face device_map convention).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": device_id},
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Adapter configuration consumed when the base model is wrapped.
# NOTE(review): `r`, `delta_alpha` and `delta_dropout` look like LoRA-style
# rank / scaling / dropout settings — confirm against CTConfig.
config = CTConfig(
    r=32,
    delta_alpha=1,
    delta_dropout=0,
    init_lora_weights=False,
    # Module-name suffixes to adapt; presumably the attention projections
    # followed by the MLP projections of each decoder layer.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

In [5]:
# Wrap the quantized base model using the CT config. "default" is the third
# positional argument (presumably the adapter name — confirm against
# DeltaModel's signature).
delta_model = DeltaModel(
    model,
    config,
    "default",
)

In [6]:
# train_dataloader = gen_dataloader()

In [7]:
# Inner optimizer that BlockOptimizer wraps.
# NOTE(review): this is built from `model.parameters()`, whereas
# BlockOptimizer is given `delta_model.named_parameters()` — confirm both
# refer to the same set of parameters.
base_optimizer = AdamW(
    model.parameters(),
    lr=1e-4,
)

In [8]:
# Block-wise optimizer wrapping the AdamW instance; it is handed the full list
# of (name, parameter) pairs from the wrapped model.
optimizer = BlockOptimizer(
    base_optimizer,
    list(delta_model.named_parameters()),
    None,  # third positional argument left unset — NOTE(review): confirm what it controls
    switch_mode="ascending",  # presumably the order in which blocks are visited — confirm
    model=delta_model,
)

Now init the block adapter ['model.model.layers.0.']
Parameters with the following prefix will be trainable: ['model.model.layers.0.']


In [9]:
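# Training loop, currently disabled: it iterates `train_dataloader`, whose
# creation via `gen_dataloader()` is also commented out earlier in this notebook.
# for step, batch in enumerate(tqdm(train_dataloader)):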
#     input_ids = batch["input_ids"]
#     attention_mask = batch["attention_mask"]
#     labels = batch["labels"]

#     outputs = delta_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#     loss = outputs.loss

#     if step % 20 ==0:
#         print(loss)

#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()