# Quantized LLM Fine-Tuning Test (QLoRA)

Source: https://huggingface.co/blog/dvgodoy/fine-tuning-llm-hugging-face

# Load the model
Phi-3 Mini 4K Instruct (3.8B parameters)

In [2]:
import bitsandbytes

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import os

# QLoRA quantisation configuration.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the base weights in 4-bit
    bnb_4bit_quant_type="nf4",  # QLoRA
    bnb_4bit_use_double_quant=True,  # QLoRA
    bnb_4bit_compute_dtype=torch.float32,
)

model_id = "microsoft/Phi-3-mini-4k-instruct"
# NOTE: placement is requested through `device_map`. `from_pretrained` has no
# `device` keyword; an unknown keyword is forwarded to the model constructor
# and fails with a TypeError.
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map="cuda:0"
)

# The tokenizer is loaded from the same checkpoint id as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Report the model's memory footprint in MB (get_memory_footprint() / 1e6).
print(model.get_memory_footprint() / 1e6)

# Set up Low-Rank Adapters (LoRA)
Low-rank adapters will be attached to every quantized layer.
These adapters will be updated during training, while the layers they are attached to remain frozen.
The adapters take up about 1% of the size of the original layers, which dramatically reduces the size of the fine-tuned model checkpoints.

In [None]:
import peft

# Prepare the quantized model for training: this improves numerical
# stability by turning every non-quantized layer to full precision (FP32).
model = peft.prepare_model_for_kbit_training(model)

# Adapter hyper-parameters.
config = peft.LoraConfig(
    task_type="CAUSAL_LM",
    # Newer models, such as Phi-3 at time of writing, may require the
    # target modules to be listed by hand.
    target_modules=["o_proj", "qkv_proj", "gate_up_proj", "down_proj"],
    r=8,  # rank of the adapter
    lora_alpha=16,  # multiplier, usually 2 * r
    lora_dropout=0.05,
    bias="none",
)

# Wrap the prepared model with the LoRA configuration above.
model = peft.get_peft_model(model, config)

# Report the memory footprint again (in MB) now that the adapters are attached.
print(model.get_memory_footprint() / 1e6)

# Load the supervised dataset
For this toy example we will be loading an English-to-Yoda-speak dataset.

# `load_dataset` comes from the Hugging Face `datasets` library; it was used
# here without ever being imported, which raises a NameError.
from datasets import load_dataset

# Load the "train" split of the English-to-Yoda-speak dataset.
dataset = load_dataset("dvgodoy/yoda_sentences", split="train")
dataset

In [None]:
# Reshape the dataset into a form SFTTrainer accepts:
# "sentence" becomes "prompt", "translation_extra" becomes "completion",
# and the "translation" column is dropped.
dataset = (
    dataset
    .rename_column("sentence", "prompt")
    .rename_column("translation_extra", "completion")
    .remove_columns(["translation"])
)

# Supervised Fine-Tuning with SFTTrainer

In [None]:
from trl import SFTTrainer, SFTConfig

sft_config = SFTConfig(
    # ---- Group 1: memory usage --------------------------------------
    # These settings squeeze the most out of the GPU's RAM.
    # Gradient checkpointing saves a lot of memory.
    gradient_checkpointing=True,
    # Set to avoid exceptions in newer versions of PyTorch.
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # Initial (micro) batch size to start off with.
    per_device_train_batch_size=16,
    # If that batch size would cause OOM, halve it until it works.
    auto_find_batch_size=True,
    # No accumulation: the batch used for each update is the same (1x)
    # as the micro-batch.
    gradient_accumulation_steps=1,

    # ---- Group 2: dataset-related -----------------------------------
    max_seq_length=64,
    # Packing the dataset means no padding is needed.
    packing=True,

    # ---- Group 3: typical training parameters -----------------------
    num_train_epochs=10,
    learning_rate=3e-4,
    # 8-bit Adam optimizer; doesn't help much when using LoRA.
    optim="paged_adamw_8bit",

    # ---- Group 4: logging and output --------------------------------
    logging_steps=10,
    logging_dir="./logs",
    output_dir="./phi3-mini-yoda-adapter",
    report_to="none",
)

In [None]:
# Build the trainer from the LoRA-wrapped model, the tokenizer, the
# prepared dataset and the SFT configuration.
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=dataset,
    processing_class=tokenizer,
)

In [None]:
# Run the fine-tuning loop (should take ~30m).
trainer.train()

# Query the new model

In [None]:

def generate_prompt(tokenizer, text):
    """Render *text* as a single user message using the chat template.

    The message is wrapped in the conversational format (one "user" turn)
    and passed to ``tokenizer.apply_chat_template`` with tokenization
    disabled and the generation prompt appended at the end.

    Args:
        tokenizer: object exposing ``apply_chat_template``.
        text: content of the user message.

    Returns:
        Whatever ``apply_chat_template`` returns for that single message.
    """
    messages = [{"role": "user", "content": text}]
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

In [None]:
def generate(model, tokenizer, prompt, max_new_tokens = 64, skip_special_tokens=False):
    """Generate a continuation of *prompt* and return it decoded.

    Args:
        model: object exposing ``device``, ``eval()`` and ``generate()``.
        tokenizer: callable tokenizer exposing ``eos_token_id`` and
            ``batch_decode()``.
        prompt: text to tokenize; no special tokens are added to it.
        max_new_tokens: forwarded to ``model.generate``.
        skip_special_tokens: forwarded to ``tokenizer.batch_decode``.

    Returns:
        The first element of the decoded batch.
    """
    # Tokenize without adding special tokens and move the encoded
    # inputs to the model's device.
    encoded = tokenizer(prompt, add_special_tokens=False, return_tensors="pt")
    inputs = encoded.to(model.device)

    # Inference only: switch the model to evaluation mode.
    model.eval()

    sequences = model.generate(
        **inputs,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=max_new_tokens,
    )

    # Only the first decoded sequence is returned.
    return tokenizer.batch_decode(
        sequences, skip_special_tokens=skip_special_tokens
    )[0]

In [None]:
# Build the chat-formatted prompt, then generate a reply with the
# fine-tuned model. `generate_prompt` takes the tokenizer as its first
# argument; omitting it raises a TypeError.
generate(
    model, tokenizer,
    generate_prompt(tokenizer, "I must destroy your cheese.")
)

# Save the fine-tuned model.

In [None]:
# Save the trained model to a local directory.
trainer.save_model(output_dir="local-phi3-mini-yoda-adapter")