# `transformers` meets `bitsandbytes` for democratizing Large Language Models (LLMs) through 4-bit quantization

ReFT (LoReFT) Training on Social Media Dataset

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install git+https://github.com/stanfordnlp/pyreft.git

Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/stanfordnlp/pyreft.git
  Cloning https://github.com/stanfordnlp/pyreft.git to /tmp/pip-req-build-641gnwm1
  Running command git clone --filter=blob:none --quiet https://github.com/stanfordnlp/pyreft.git /tmp/pip-req-build-641gnwm1
  Resolved https://github.com/stanfordnlp/pyreft.git to commit b4c82d363494eee0560d74c1be2efc931f5f7045
  Running command git submodule update --init --recursive -q
  Preparing metadata (setup.py) ... [?25ldone








Load the model: GPT-NeoX-20B (note that the model itself is around 40 GB in half precision).

In [None]:
import torch
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from pyreft import get_reft_model, ReftConfig, LoreftIntervention, ReftTrainerForCausalLM

# Step 1: Load the base model and tokenizer
# `model_name` is the checkpoint id used for both the tokenizer and the model.
model_name = "EleutherAI/gpt-neox-20B"  # Replace with the desired model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token so that later cells can
# pad sequences with this tokenizer.
tokenizer.pad_token = tokenizer.eos_token
# No dtype, device or quantization arguments are passed here.
# NOTE(review): confirm that the precision this loads in fits the available
# memory; the size quoted in the notebook text is for half precision.
model = AutoModelForCausalLM.from_pretrained(model_name)


tokenizer_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/457k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/60.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/46 [00:00<?, ?it/s]

model-00001-of-00046.safetensors:   0%|          | 0.00/926M [00:00<?, ?B/s]

model-00002-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00003-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Load Dataset and Map

In [None]:
from datasets import load_dataset
import datasets

# Load the 'train' split of each JSON corpus directory.
source_dirs = (
    '/home/paperspace/trainingModel/modelDataset/ExtraDataProcessed',
    '/home/paperspace/trainingModel/modelDataset/MastodonProcessed',
)
dataset1, dataset2 = [
    load_dataset('json', data_dir=source_dir, split='train')
    for source_dir in source_dirs
]

# Stack the two corpora into a single dataset.
data = datasets.concatenate_datasets([dataset1, dataset2])


def preprocess_function(examples):
    """Tokenize a batch of examples.

    Reads the "content" field of `examples` and hands it to the notebook-level
    `tokenizer` both as the input text and as `text_target`, with truncation
    enabled and padding to a fixed length of 128.

    Returns whatever the tokenizer call returns for the batch.
    """
    texts = examples["content"]
    return tokenizer(
        texts,
        text_target=texts,
        truncation=True,
        padding="max_length",
        max_length=128,
    )


# batched=True: the function is applied to batches of examples.
data = data.map(preprocess_function, batched=True)

Resolving data files:   0%|          | 0/60 [00:00<?, ?it/s]

Map:   0%|          | 0/1253762 [00:00<?, ? examples/s]

ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [None]:
# Step 3: Set up the ReFT configuration
# One intervention is described: layer 19, component "block_output", with a
# low-rank dimension of 4. The intervention's embedding size is taken from the
# loaded model's config.
hidden_size = model.config.hidden_size
loreft = LoreftIntervention(embed_dim=hidden_size, low_rank_dimension=4)
representation = {
    "layer": 19,
    "component": "block_output",
    "low_rank_dimension": 4,
    "intervention": loreft,
}
reft_config = ReftConfig(representations=representation)

# Step 4: Get the ReFT model
# set_device=False is passed explicitly.
# NOTE(review): presumably this leaves device placement to later code - confirm.
reft_model = get_reft_model(model, reft_config, set_device=False)

Run the cell below to start training.

In [None]:
# Step 5: Set up the training arguments
training_args = transformers.TrainingArguments(
    output_dir="outputs_reft",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,  # 4 * 8 = 32 sequences per device per optimizer step
    warmup_steps=100,
    max_steps=500,  # takes precedence over num_train_epochs
    learning_rate=1e-4,
    fp16=True,
    logging_steps=50,
    # With the default (True) the Trainer drops every dataset column that is not
    # a parameter of the wrapped model's forward(). The recorded failure of this
    # cell shows only `labels` reaching the collator, i.e. `input_ids` and
    # `attention_mask` were stripped, so column pruning is turned off here.
    remove_unused_columns=False,
)

# Because pruning is off, hand the trainer only the tokenized fields: raw
# columns such as "content" hold strings and cannot be padded into tensors.
# The collator below (mlm=False) builds `labels` from `input_ids` itself.
train_dataset = data.select_columns(["input_ids", "attention_mask"])

# Step 6: Create the trainer
# NOTE(review): the ReFT trainer may also expect per-example intervention
# position fields produced by pyreft's own data utilities - confirm against the
# pyreft documentation before relying on this run.
trainer = ReftTrainerForCausalLM(
    model=reft_model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Step 7: Train the model
trainer.train()

max_steps is given, it will override any value given in num_train_epochs


ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['labels']

In [None]:
# If the trainer's model has a `.module` attribute, save that inner object;
# otherwise save the trainer's model itself.
trained_model = trainer.model
model_to_save = getattr(trained_model, "module", trained_model)
# NOTE(review): confirm that the ReFT-wrapped model exposes `save_pretrained`
# and that it writes what the reload step expects to find in "outputs_reft".
model_to_save.save_pretrained("outputs_reft")

In [None]:
# Rebuild a ReFT-wrapped model from the configuration stored in 'outputs_reft'.
# `ReftConfig` and `get_reft_model` are the names this notebook imports from
# pyreft; the module itself is never bound to the name `pyreft`, so the
# `pyreft.`-qualified spelling raises NameError.
# NOTE(review): only a configuration is read here - confirm whether the trained
# intervention weights are restored by this path or need a separate load step.
# NOTE(review): `model` is rebound to the ReFT wrapper; any later
# `model.generate(...)` call must match the wrapper's calling convention.
reft_config = ReftConfig.from_pretrained('outputs_reft')
model = get_reft_model(model, reft_config)

Ask the model a question

In [None]:
text = "Is bitcoin a good cryptocurrency?"
device = "cuda:0"

# Tokenize the prompt into PyTorch tensors and move them to `device`.
# NOTE(review): nothing in this cell moves `model` to `device` - confirm that
# the model and the inputs end up on the same device.
encoded_prompt = tokenizer(text, return_tensors="pt")
inputs = encoded_prompt.to(device)

# Generate at most 20 new tokens, then decode the first returned sequence with
# special tokens stripped.
# NOTE(review): if `model` is a ReFT wrapper at this point, its `generate` may
# take different arguments and return a different structure - confirm.
outputs = model.generate(**inputs, max_new_tokens=20)
first_sequence = outputs[0]
print(tokenizer.decode(first_sequence, skip_special_tokens=True))