# PEFT: Parameter-Efficient Fine-Tuning of Billion-Scale Models on Low-resource Hardware

LoRA Paper: [LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS](https://arxiv.org/pdf/2106.09685.pdf)  
Hugging Face Blog: [PEFT](https://huggingface.co/blog/peft)  
My Colab: [LoRA](https://colab.research.google.com/drive/1zEpRSszKeIb7K2i0cDPnc5xHvIm_b4pD#scrollTo=a3axb36deU8E)


In [None]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pypr

In [None]:
!nvidia-smi -L

GPU 0: NVIDIA A100-SXM4-40GB (UUID: GPU-51305440-1252-7999-ae53-632896453c27)


# Set up the model

In [None]:
import os
# Restrict CUDA to GPU 0. This must be set before torch initialises CUDA, and
# the variable name is plural: a misspelled name is silently ignored.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

# Base checkpoint that the LoRA adapter will be trained on top of.
model_name = "bigscience/bloom-7b1"

# Load the weights quantized to 8-bit and let accelerate place the layers on
# the available device(s).
# NOTE(review): newer transformers releases prefer passing
# quantization_config=BitsAndBytesConfig(load_in_8bit=True) - confirm against
# the installed version.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    device_map='auto',
)

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/739 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

# Freezing the original weights and casting to fp32 for stability

In [None]:
# Freeze every pretrained weight; only the adapter added later gets trained.
for weight in model.parameters():
    weight.requires_grad = False
    # 1-D parameters (e.g. layernorm) are small, so keep them in fp32 for stability.
    if weight.dim() == 1:
        weight.data = weight.data.float()

# Store fewer activations during the forward pass to save memory.
model.gradient_checkpointing_enable()
# Enable gradients on the input embeddings' output; useful when adapter weights
# are trained while the base model weights stay fixed.
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    """Sequential container whose forward result is always returned as float32."""

    def forward(self, x):
        # Run the wrapped modules as usual, then cast the result to fp32.
        result = super().forward(x)
        return result.to(torch.float32)
# Wrap the LM head so that whatever it returns is cast to fp32.
model.lm_head = CastOutputToFloat(model.lm_head)

# Setting up the LoRA adapter

In [None]:
def print_trainable_parameters(model):
    """Print the number of trainable parameters in the model.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and ``requires_grad``.

    The printed line reports the trainable count, the total count and the
    trainable share in percent. A model without any parameters reports 0.0%
    instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the division: an empty model has nothing to take a share of.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0

    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
from peft import LoraConfig, get_peft_model

# Hyper-parameters of the LoRA adapter that is attached to the frozen base model.
config = LoraConfig(
    r=16, # LoRA rank (PEFT calls it the "Lora attention dimension"); not a number of attention heads
    lora_alpha=32, # alpha parameter used for LoRA scaling
    # target_modules=["q_proj", "v_proj"], # set explicitly if you know which modules to adapt; NOTE(review): confirm these names exist in the chosen model
    lora_dropout=0.05, # dropout probability applied in the LoRA layers
    bias="none", # no bias parameters are trained
    task_type="CAUSAL_LM" # set this for CLM or seq2seq
)

# Wrap the base model with the adapter and report how many parameters are trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 7864320 || all params: 7076880384 || trainable%: 0.11112693126452029


# Data

In [None]:
import transformers
from datasets import load_dataset

# Load the English quotes dataset; the result is a DatasetDict with a "train" split.
data = load_dataset("Abirate/english_quotes")

Downloading readme:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/647k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Peek at the first training record to see the available fields.
data["train"][0]

{'quote': '“Be yourself; everyone else is already taken.”',
 'author': 'Oscar Wilde',
 'tags': ['be-yourself',
  'gilbert-perreira',
  'honesty',
  'inspirational',
  'misattributed-oscar-wilde',
  'quote-investigator']}

In [None]:
def merge_columns(example):
    """Add a "prediction" field that joins the quote and its tags.

    The value is the quote text, the separator " ->: ", and the string form of
    the tags. The example is modified in place and also returned.
    """
    prompt_prefix = example["quote"] + " ->: "
    tags_as_text = str(example["tags"])
    example["prediction"] = prompt_prefix + tags_as_text
    return example

# Add the "prediction" column to every training row, then preview the first five values.
data["train"] = data["train"].map(merge_columns)
data["train"]["prediction"][:5]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

["“Be yourself; everyone else is already taken.” ->: ['be-yourself', 'gilbert-perreira', 'honesty', 'inspirational', 'misattributed-oscar-wilde', 'quote-investigator']",
 "“I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best.” ->: ['best', 'life', 'love', 'mistakes', 'out-of-control', 'truth', 'worst']",
 "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.” ->: ['human-nature', 'humor', 'infinity', 'philosophy', 'science', 'stupidity', 'universe']",
 "“So many books, so little time.” ->: ['books', 'humor']",
 "“A room without books is like a body without a soul.” ->: ['books', 'simile', 'soul']"]

## Tokenizer

In [None]:
# Tokenize the "prediction" text in batches; this adds the input_ids and attention_mask columns.
data = data.map(lambda samples: tokenizer(samples["prediction"]), batched=True)

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

In [None]:
# Show the dataset structure after tokenization.
data

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags', 'prediction', 'input_ids', 'attention_mask'],
        num_rows: 2508
    })
})

In [None]:
# Checking attention mask

# for e in data["train"]["attention_mask"][0:3]:
#   print(e, sep=' ', end='\n')

# Training

In [None]:
# Effective batch size per GPU is
# per_device_train_batch_size * gradient_accumulation_steps = 4 * 4 = 16.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    max_steps=200,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=140,
)

# Collator for language modeling with masked-LM mode switched off (mlm=False).
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    data_collator=causal_lm_collator,
)

# Turning the cache off silences the warning during training.
# Please re-enable it for inference!
model.config.use_cache = False
trainer.train()

Step,Training Loss
140,2.0202


TrainOutput(global_step=200, training_loss=1.999678497314453, metrics={'train_runtime': 727.4499, 'train_samples_per_second': 4.399, 'train_steps_per_second': 0.275, 'total_flos': 1.3312956044279808e+16, 'train_loss': 1.999678497314453, 'epoch': 1.28})

In [None]:
# if getting system recursion limit error
# import sys

# print(sys.getrecursionlimit())
# sys.setrecursionlimit(10000)

1000


# Inference

In [None]:
# Tokenize the prompt and move the tensors to the device the model runs on, so
# generate() does not receive CPU inputs for a GPU model.
batch = tokenizer("“In the interviews, be yourself” ->: ", return_tensors='pt').to(model.device)

# Switch to eval mode so dropout (lora_dropout=0.05) is disabled while generating.
model.eval()
# The cache was turned off for training; turn it back on for inference.
model.config.use_cache = True

# torch.autocast("cuda") replaces the deprecated torch.cuda.amp.autocast().
with torch.autocast("cuda"):
  output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))



 “In the interviews, be yourself” ->:  ['interviews', 'self-confidence', 'self-esteem', 'self-identity', 'self-image', 'self-reliance', 'self-worth', 'self-validation']
            ['interviews'


# References
- https://huggingface.co/blog/peft
- https://huggingface.co/docs/peft/conceptual_guides/lora
- [Sam Witteveen](https://www.youtube.com/watch?v=Us5ZFp16PaU), and his [Colab](https://colab.research.google.com/drive/14xo6sj4dARk8lXZbOifHEn1f_70qNAwy?usp=sharing#scrollTo=MDqJWba-tpnv)


# Further Readings
- https://medium.com/@rajatsharma_33357/fine-tuning-llama-using-lora-fb3f48a557d5
- https://www.databricks.com/blog/efficient-fine-tuning-lora-guide-llms
- https://huggingface.co/docs/peft/conceptual_guides/lora