In [5]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [6]:
import transformers
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from datasets import load_dataset
from transformers import AutoTokenizer, Trainer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import os

In [7]:
# Naming the run
model_name = 'microsoft/phi-2'
project = "Evol-Instruct-Code-80k-v1-10-precent"
base_model_name = model_name.split("/")[1]
run_name = base_model_name + "-" + project
output_dir = "./" + run_name
device = 'cpu'

In [8]:
# Load the dataset
huggingface_dataset_name = "nickrosh/Evol-Instruct-Code-80k-v1"
# For the sake of this example, we will only use the first 10% of the training set
train_dataset = load_dataset(huggingface_dataset_name, split="train[:10%]")
print(train_dataset)

Dataset({
    features: ['instruction', 'output'],
    num_rows: 7826
})


In [9]:
# Sadly, we can only train quantized model on GPU using BitsandBytes, which is a warper for CUDA I believe.
# For cpu, use float32.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    trust_remote_code=True,
    # This is the PR that adds gradient checkpointing. It is not merged yet. This kind of thing is the cost of being on the bleeding edge.
    revision="refs/pr/23",
)
model.to(device)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.29it/s]


PhiForCausalLM(
  (transformer): PhiModel(
    (embd): Embedding(
      (wte): Embedding(51200, 2560)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (h): ModuleList(
      (0-31): 32 x ParallelBlock(
        (ln): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
        (mixer): MHA(
          (rotary_emb): RotaryEmbedding()
          (Wqkv): Linear(in_features=2560, out_features=7680, bias=True)
          (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (inner_attn): SelfAttention(
            (drop): Dropout(p=0.0, inplace=False)
          )
          (inner_cross_attn): CrossAttention(
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
        (mlp): MLP(
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (act): NewGELUActivation()
        )
      )
    )
  )
  (lm

In [10]:
# Load and config the tokenizer

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="left",
    add_eos_token=True,
    use_fast=True,
)

# add special tokens for ChatML formatting and a pad token
tokenizer.add_tokens(["<|im_start|>", "<PAD>"])
tokenizer.pad_token = "<PAD>"
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))
# For the sake of memory and time, we will only use 64 tokens
tokenizer.model_max_length = 8
# resize model embeddings
model.resize_token_embeddings(
    new_num_tokens=len(tokenizer),
    pad_to_multiple_of=64)   # phi2 default is 64, see configuration_phi.py
model.config.eos_token_id = tokenizer.eos_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Define the tokenizing function to tokenize the dataset
def tokenize_function(data_point):
    def get_prompt(inst):
        return f"""
        # System:
        You are a helpful AI assistant. Follow the instruction. 
        # INSTRUCTION:
        {inst}
        # CODE:
        """
    prompts = [get_prompt(inst) for inst in data_point['instruction']]
    data_point['input_ids'] = tokenizer(
        prompts,
        truncation=True,
        max_length=tokenizer.model_max_length,
        padding='max_length',
    ).input_ids
    data_point['labels'] = tokenizer(
        data_point['output'],
        truncation=True,
        max_length=tokenizer.model_max_length,
        padding='max_length',
    ).input_ids
    return data_point


tokenized_train_dataset = train_dataset.map(
    tokenize_function, batched=True, num_proc=os.cpu_count())

In [12]:
# Apply the Peft adapter
config = LoraConfig(
    r=1,
    lora_alpha=16,
    target_modules=[
        'Wqkv', 'out_proj'
    ],
    bias="none",
    lora_dropout=0.05,
    # because we added new tokens
    modules_to_save=["lm_head", "embed_tokens"],
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


/home/heichtai/Projects/tutorial-finetune-LLM/.venv/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


In [13]:
# We can see the number of trainable parameters are only few percent of the original model
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


print_trainable_parameters(model)

trainable params: 129325184 || all params: 2904420608 || trainable%: 4.4527016384536


In [14]:
# Define the trainer
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=0,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        max_steps=100,
        learning_rate=2.5e-5,
        lr_scheduler_type="constant",
        logging_steps=50,
        dataloader_num_workers=4,
        optim="adamw",
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save the model checkpoint every logging step
        save_steps=50,                # Save checkpoints every 50 steps
        report_to="tensorboard",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(
        tokenizer, mlm=False),
)

ValueError: adamw is not a valid OptimizerNames, please select one of ['adamw_hf', 'adamw_torch', 'adamw_torch_fused', 'adamw_torch_xla', 'adamw_torch_npu_fused', 'adamw_apex_fused', 'adafactor', 'adamw_anyprecision', 'sgd', 'adagrad', 'adamw_bnb_8bit', 'adamw_8bit', 'lion_8bit', 'lion_32bit', 'paged_adamw_32bit', 'paged_adamw_8bit', 'paged_lion_32bit', 'paged_lion_8bit', 'rmsprop']

In [None]:
# Train!
trainer.train()
model.save_pretrained(output_dir)

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


NameError: name 'str2optimizer32bit' is not defined