Use PEFT and bitsandbytes to fine-tune a LoRA adapter on top of a 4-bit quantized Llama 3.1 8B Instruct checkpoint.

Reference:
- https://www.youtube.com/watch?v=Us5ZFp16PaU

In [2]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q transformers peft
# !pip install -q --upgrade transformers
# !pip install -q --upgrade vllm

In [6]:
from huggingface_hub import notebook_login

# Prompt for a Hugging Face access token inside the notebook.
# NOTE(review): presumably required to download the gated Llama checkpoint and to
# push the adapter to the Hub later in this notebook — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
# List the GPUs visible on this machine (name and UUID).
!nvidia-smi -L

GPU 0: NVIDIA GeForce RTX 3050 Laptop GPU (UUID: GPU-46fda6e2-6c9b-ad69-0bfe-63fbe6f90e3b)


In [3]:
import os
# Expose only GPU 0 to CUDA; this is set before torch is imported.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


# Base checkpoint to quantize and fine-tune.
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# bitsandbytes settings: load the weights in 4-bit NF4 with double quantization
# and run the computation in fp16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    # NOTE(review): the device_map below places the whole model on a single GPU,
    # so this CPU-offload flag presumably has no effect here — confirm.
    llm_int8_enable_fp32_cpu_offload=True
)

current_device = torch.cuda.current_device()
# Load the quantized model; the '' key maps every module to `current_device`.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={'':current_device}
)


tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the EOS token for padding.
# NOTE(review): presumably because this tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
# Report GPU memory for the device the model was loaded on.
# Values are divided by 1024**3 before printing.
bytes_per_unit = 1024**3
device_props = torch.cuda.get_device_properties(current_device)
memory_rows = (
    ("Total device memory: \t", device_props.total_memory),
    ("Allocated for tensors: \t", torch.cuda.memory_allocated(current_device)),
    ("Total reserved memory: \t", torch.cuda.memory_reserved(current_device)),
)
for row_label, row_bytes in memory_rows:
    print(f"{row_label}{row_bytes/bytes_per_unit:.2f} GB")

Total device memory: 	4.00 GB
Allocated for tensors: 	5.31 GB
Total reserved memory: 	5.40 GB


# Freezing the original weights

In [5]:
# Freeze every base-model weight; only adapters added later are meant to train.
for base_param in model.parameters():
    base_param.requires_grad = False
    is_vector_param = base_param.ndim == 1
    if is_vector_param:
        # 1-D parameters (e.g. layernorm) are kept in fp32 for numerical stability
        base_param.data = base_param.data.float()


# Store fewer activations during the forward pass (gradient checkpointing).
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    """Sequential container whose forward output is cast to float32."""

    def forward(self, x):
        # Run the wrapped modules as usual, then upcast the result.
        wrapped_output = nn.Sequential.forward(self, x)
        return wrapped_output.float()
# Wrap the existing LM head so that its output is returned as float32.
model.lm_head = CastOutputToFloat(model.lm_head)

# Setting up LoRA adapters

In [6]:
def print_trainable_parameters(model):
    """Print how many of ``model``'s parameters are trainable.

    Iterates over ``model.named_parameters()``, sums ``numel()`` for all
    parameters and for those with ``requires_grad`` set, and prints both
    counts plus the trainable percentage.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.

    NOTE(review): for 4-bit quantized weights ``numel()`` presumably reports
    the packed storage size, so the total may undercount the logical
    parameter count — confirm.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        param_count = param.numel()
        all_param += param_count
        if param.requires_grad:
            trainable_params += param_count
    # A model without parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(f"trainable params: {trainable_params} out of {all_param} \t trainable%: {trainable_pct}")

In [16]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,  # rank of the low-rank LoRA update matrices (not the number of attention heads)
    lora_alpha=32,  # LoRA scaling factor
    lora_dropout=0.05,  # dropout applied inside the LoRA layers
    bias="none",  # no bias terms are trained
    task_type="CAUSAL_LM"
)


# Wrap the base model with LoRA adapters and report how many parameters train.
# NOTE(review): this rebinds `model`, so re-running the cell presumably wraps an
# already-wrapped model again — re-run from the model-loading cell instead.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 3407872 out of 4544008192 	 trainable%: 0.07499704789264605


# Data

In [8]:
import transformers
from datasets import load_dataset
# Load the English quotes dataset by its Hub id.
data = load_dataset("Abirate/english_quotes")

In [9]:
def merge_columns(example):
    """Add a ``prediction`` field of the form ``"<quote> ->: <tags>"``.

    The record is modified in place and also returned, as expected by
    ``Dataset.map``.
    """
    quote_text = example["quote"]
    tags_text = str(example["tags"])
    example["prediction"] = " ->: ".join((quote_text, tags_text))
    return example

# Add the `prediction` column to every training record, then preview the first five.
data['train'] = data['train'].map(merge_columns)
data['train']["prediction"][:5]

["“Be yourself; everyone else is already taken.” ->: ['be-yourself', 'gilbert-perreira', 'honesty', 'inspirational', 'misattributed-oscar-wilde', 'quote-investigator']",
 "“I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best.” ->: ['best', 'life', 'love', 'mistakes', 'out-of-control', 'truth', 'worst']",
 "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.” ->: ['human-nature', 'humor', 'infinity', 'philosophy', 'science', 'stupidity', 'universe']",
 "“So many books, so little time.” ->: ['books', 'humor']",
 "“A room without books is like a body without a soul.” ->: ['books', 'simile', 'soul']"]

In [10]:
# Inspect one full training record, including the new `prediction` field.
data['train'][0]

{'quote': '“Be yourself; everyone else is already taken.”',
 'author': 'Oscar Wilde',
 'tags': ['be-yourself',
  'gilbert-perreira',
  'honesty',
  'inspirational',
  'misattributed-oscar-wilde',
  'quote-investigator'],
 'prediction': "“Be yourself; everyone else is already taken.” ->: ['be-yourself', 'gilbert-perreira', 'honesty', 'inspirational', 'misattributed-oscar-wilde', 'quote-investigator']"}

In [11]:
def tokenize_predictions(samples):
    """Tokenize the ``prediction`` column of a batch of records."""
    return tokenizer(samples['prediction'])


# Batched map: the tokenizer receives a list of prediction strings per call.
data = data.map(tokenize_predictions, batched=True)

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

In [12]:
# Display the dataset structure to confirm the tokenizer columns were added.
data

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags', 'prediction', 'input_ids', 'attention_mask'],
        num_rows: 2508
    })
})

# Train

In [17]:
# Short fp16 run: 200 optimizer steps, each accumulating 2 batches of 4 examples.
training_args = transformers.TrainingArguments(
    output_dir='outputs',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    warmup_steps=50,
    max_steps=200,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
)
# mlm=False selects causal (next-token) language modeling rather than masked LM.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data['train'],
    data_collator=causal_lm_collator,
)
# Silence the warnings during training. Re-enable for inference.
model.config.use_cache = False
trainer.train()

max_steps is given, it will override any value given in num_train_epochs


  0%|          | 0/200 [00:00<?, ?it/s]

{'loss': 3.068, 'grad_norm': 2.308359146118164, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.0}


# Share adapters on Hub

In [None]:
# Hub repo ids have the form "namespace/name" with a single "/".
# `model_id` is itself "org/name", so only its last component is used here;
# embedding it whole would produce an invalid three-part repo id.
adapter_repo_id = f"1000EquilibriumsEqualsChaos/{model_id.split('/')[-1]}_test"
model.push_to_hub(
    adapter_repo_id,
    token=True,  # use the locally stored token; replaces the deprecated `use_auth_token`
    commit_message="Testing as it is.",
    private=True
)

References:
- https://colab.research.google.com/drive/14xo6sj4dARk8lXZbOifHEn1f_70qNAwy?usp=sharing