# Using 🤗 PEFT & bitsandbytes to finetune a LoRa checkpoint




In [1]:
!pip install -q bitsandbytes datasets accelerate loralib -U
#!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git
!pip install transformers peft ipywidgets -U

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
# Does work
import sys
sys.path.append('/home/jovyan/.local/bin')
sys.path.append('/home/jovyan/.local/lib/python3.8/site-packages')
import sys
display(sys.path)

['/workspace/pv-data/InternFolders/zenchang/FalconAI/Web_App/models',
 '/home/jovyan/.conda/envs/gpt/lib/python311.zip',
 '/home/jovyan/.conda/envs/gpt/lib/python3.11',
 '/home/jovyan/.conda/envs/gpt/lib/python3.11/lib-dynload',
 '',
 '/home/jovyan/.conda/envs/gpt/lib/python3.11/site-packages',
 '/home/jovyan/.local/bin',
 '/home/jovyan/.local/lib/python3.8/site-packages']

In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
!nvidia-smi -L

GPU 0: NVIDIA A100-SXM4-40GB (UUID: GPU-2e627182-d6e4-5bd2-0ce8-3e5611885128)


### Setup the model

In [5]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, GPTJForCausalLM

# model = AutoModelForCausalLM.from_pretrained(
#     "bigscience/bloom-7b1",
#     load_in_8bit=True,
#     device_map='auto',
# )

#tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-7b1")

model_name = 'EleutherAI/gpt-j-6B'
model = GPTJForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    device_map='auto',
)



In [6]:
# Get tokenizer stuff
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained("./models/tokenizer/")

('./models/tokenizer/tokenizer_config.json',
 './models/tokenizer/special_tokens_map.json',
 './models/tokenizer/vocab.json',
 './models/tokenizer/merges.txt',
 './models/tokenizer/added_tokens.json',
 './models/tokenizer/tokenizer.json')

### Freezing the original weights


In [7]:
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

### Setting up the LoRa Adapters

In [8]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [9]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16, #attention heads
    lora_alpha=32, #alpha scaling
    # target_modules=["q_proj", "v_proj"], #if you know the
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # set this for CLM or Seq2Seq
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 7340032 || all params: 6058222816 || trainable%: 0.12115817167725645


## Data

In [10]:
import transformers
from datasets import load_dataset

# data = load_dataset("Abirate/english_quotes")

data = load_dataset("text", data_files={"train": "train.txt", "validation": "test.txt"})
display (data)
display(data['train'][0])


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 51576
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 12894
    })
})

{'text': '<BOS> 2.82  RADAR HANDBOOK 6x9 Handbook  / Radar Handbook / Skolnik / 148547-3 / Chapter 2 When doppler shifts are introduced by digital means as described above, the accu - racy of the I and Q representation of the original input data becomes an important  consideration. Any dc offset, amplitude imbalance, quadrature phase error, or nonlin - earity will result in the generation of undesired sidebands that will appear as residue  at the canceler output. A discussion of A/D conversion considerations was presented  in Section 2.13. <EOS>'}

In [11]:
def merge_columns(example):
    example["prediction"] = example["quote"] + " ->: " + str(example["tags"])
    return example

# data['train'] = data['train'].map(merge_columns)
# display(data['train'])

display(data['train'])
display(data['train']["text"][:5])


Dataset({
    features: ['text'],
    num_rows: 51576
})

['<BOS> 2.82  RADAR HANDBOOK 6x9 Handbook  / Radar Handbook / Skolnik / 148547-3 / Chapter 2 When doppler shifts are introduced by digital means as described above, the accu - racy of the I and Q representation of the original input data becomes an important  consideration. Any dc offset, amplitude imbalance, quadrature phase error, or nonlin - earity will result in the generation of undesired sidebands that will appear as residue  at the canceler output. A discussion of A/D conversion considerations was presented  in Section 2.13. <EOS>',
 '<BOS> This time delay, which istheobserved quantity, may then beregarded asameasure ofthe rate ofchange ofphase with frequency ofthereturned components. 5.4. Class ofSystems Considered.—In this chapter wewill consider that class ofsystems inwhich the modulation issuch that energy is emitted all, ornearly all,the time. <EOS>',
 '<BOS> \x17\x16\x11 \x00 0HYS\x0e\x003CI\x0e\x002ES\x0e \x000APER\x00\x12\x19\x10 \x00\x11\x19\x16\x16\x0e \x11\x16\x0e\x00

In [12]:
data['train'][0]

{'text': '<BOS> 2.82  RADAR HANDBOOK 6x9 Handbook  / Radar Handbook / Skolnik / 148547-3 / Chapter 2 When doppler shifts are introduced by digital means as described above, the accu - racy of the I and Q representation of the original input data becomes an important  consideration. Any dc offset, amplitude imbalance, quadrature phase error, or nonlin - earity will result in the generation of undesired sidebands that will appear as residue  at the canceler output. A discussion of A/D conversion considerations was presented  in Section 2.13. <EOS>'}

In [13]:
#data = data.map(lambda samples: tokenizer(samples['prediction']), batched=True)
##data = data.map(lambda samples: tokenizer(samples), batched=True)

data = data.map(lambda samples: tokenizer(samples['text']), batched=True)

In [14]:
data

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 51576
    })
    validation: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 12894
    })
})

### Training

In [16]:

tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=data['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
#         max_steps=200,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir='outputs'
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = True # silence the warnings. Please re-enable for inference!
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,4.3544
2,3.2962
3,3.8343
4,3.9673
5,4.3279
6,3.3691
7,3.4706
8,3.9768
9,4.4882
10,3.8531


../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [300,0,0], thread: [0,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [300,0,0], thread: [1,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [300,0,0], thread: [2,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [300,0,0], thread: [3,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [300,0,0], thread: [4,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block:

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Save model to disk
trainer.save_model()

## Share adapters on the 🤗 Hub

In [None]:
# Get write token from huggingface
! python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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')"


In [None]:
# fine_tuned_model = GPTJForCausalLM.from_pretrained("./distilgptj-finetuned-radar")
model.push_to_hub("Fiery101/distilgptj-finetuned-radar", create_pr=1)
tokenizer.push_to_hub("Fiery101/distilgptj-finetuned-radar")
# model.push_to_hub("samwit/bloom-7b1-lora-tagger",
#                   use_auth_token=True,
#                   commit_message="basic training",
#                   private=True)

## Load adapters from the Hub

In [None]:
# import torch
# from peft import PeftModel, PeftConfig
# from transformers import AutoModelForCausalLM, AutoTokenizer

# peft_model_id = "Fiery101/distilgptj-finetuned-radar"
# config = PeftConfig.from_pretrained(peft_model_id)
# model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto')
# tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# # Load the Lora model
# model = PeftModel.from_pretrained(model, peft_model_id)

## Inference

In [None]:
batch = tokenizer("“Training models with PEFT and LoRa is cool” ->: ", return_tensors='pt')

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))