# `transformers` meets `bitsandbytes` for democratizing Large Language Models (LLMs) through 4-bit quantization

QLoRA training on a social media dataset

In [None]:
# Environment setup: upgrade pip, then install the libraries used by this notebook.
# `-q` keeps pip quiet and `-U` upgrades an already-installed package.
!pip install --upgrade pip
!pip install -q -U bitsandbytes
# transformers, peft and accelerate are installed from the current source of their
# GitHub repositories; no version or commit is pinned here.
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

Load the model, GPT-NeoX-20B (note that the model itself is around 40GB in half precision).

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# Hub identifier of the checkpoint that will be fine-tuned.
model_id = "EleutherAI/gpt-neox-20b"

# Quantization options forwarded to BitsAndBytesConfig: 4-bit loading with the
# "nf4" quant type, double quantization enabled, and bfloat16 as compute dtype.
quantization_options = dict(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
bnb_config = BitsAndBytesConfig(**quantization_options)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# device_map={"": 0} maps the root module (i.e. the whole model) to device index 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
)

Apply some preprocessing to the model to prepare it for training. For that use the `prepare_model_for_kbit_training` method from PEFT.

In [None]:
# Enable gradient checkpointing on the quantized model before handing it to PEFT.
model.gradient_checkpointing_enable()
# Let PEFT apply its k-bit training preparation; the returned model replaces `model`.
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Iterates over ``model.named_parameters()``, sums ``numel()`` over all
    parameters and over those with ``requires_grad`` set, and prints both
    totals together with the trainable percentage.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs where ``param`` has ``numel()`` and
            ``requires_grad``.

    Returns:
        None. The summary is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without parameters would otherwise raise ZeroDivisionError;
    # report 0.0% in that case instead of crashing.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
# LoRA hyperparameters, collected in one place before building the config.
# Adapters are attached to the modules named "query_key_value".
lora_settings = {
    "r": 8,
    "lora_alpha": 32,
    "target_modules": ["query_key_value"],
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
}
config = LoraConfig(**lora_settings)

# Wrap the prepared model with the LoRA adapters and report how much of it trains.
model = get_peft_model(model, config)
print_trainable_parameters(model)

Load the datasets, concatenate them, and tokenize the `content` field

In [None]:
from datasets import load_dataset
import datasets

# Directories holding the two JSON corpora; each is loaded as a 'train' split.
dataset_dirs = [
    '/home/paperspace/trainingModel/modelDataset/ExtraDataProcessed',
    '/home/paperspace/trainingModel/modelDataset/MastodonProcessed',
]
dataset1, dataset2 = [
    load_dataset('json', data_dir=directory, split='train')
    for directory in dataset_dirs
]

# Merge both corpora into a single training set.
data = datasets.concatenate_datasets([dataset1, dataset2])

def preprocess_function(examples):
    """Tokenize the "content" field of a batch, using the same text as the target.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    Relies on the module-level ``tokenizer``.
    """
    texts = examples["content"]
    return tokenizer(
        texts,
        text_target=texts,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

# Tokenize the combined dataset in batches. The result is assigned back to `data`,
# so re-running this cell maps the already-tokenized dataset again.
data = data.map(preprocess_function, batched=True)

Run the cell below to run the training

In [None]:
import transformers

# needed for gpt-neo-x tokenizer
tokenizer.pad_token = tokenizer.eos_token

# Optimisation settings: 4 samples per device with 8 accumulation steps,
# 100 warmup steps out of 500 total, logging every 50 steps.
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    warmup_steps=100,
    max_steps=500,
    learning_rate=1e-4,
    fp16=True,
    logging_steps=50,
    output_dir="path/to/save/model",
    optim="paged_adamw_8bit",
)

# Collator configured for causal language modelling (mlm=False).
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    train_dataset=data,
    args=training_args,
    data_collator=causal_lm_collator,
)

# Gradient checkpointing is enabled with its default arguments (no use_reentrant),
# and the config's use_cache flag is switched off before training starts.
model.gradient_checkpointing_enable()
model.config.use_cache = False

trainer.train()

In [None]:
# Take care of distributed/parallel training: if the trained model is wrapped
# (exposes a `.module` attribute), save the wrapped module instead of the wrapper.
if hasattr(trainer.model, 'module'):
    model_to_save = trainer.model.module
else:
    model_to_save = trainer.model
model_to_save.save_pretrained("path/to/save/model")

In [None]:
from peft import PeftModel

# Directory the adapter was saved to by the previous cell.
adapter_dir = 'path/to/save/model'

# Kept for inspection; loading the weights below reads the same saved config.
lora_config = LoraConfig.from_pretrained(adapter_dir)

# get_peft_model() only injects a freshly initialised adapter; it never loads the
# saved weights. On a model that is already a PeftModel it would also re-create the
# existing "default" adapter, discarding what was trained. Load the saved weights.
if isinstance(model, PeftModel):
    # Same session: the model is already wrapped, so load the saved weights into
    # the existing "default" adapter (the name get_peft_model uses by default).
    model.load_adapter(adapter_dir, adapter_name="default")
else:
    # Fresh session: `model` is the plain base model; attach the saved adapter to it.
    model = PeftModel.from_pretrained(model, adapter_dir)

Ask the model a question

In [None]:
# Prompt to complete and the device the model was loaded on.
text = "Your question here"
device = "cuda:0"

# Training leaves the model in train mode, where LoRA dropout is still applied.
# Switch to eval mode so generation is not perturbed by dropout.
model.eval()

# Tokenize the prompt, move the tensors to the model's device, and generate
# up to 100 new tokens.
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))