# Fine-tune Llama 2

Inspired by Younes Belkada's [GitHub Gist](https://gist.github.com/younesbelkada/9f7f75c94bdc1981c8ca5cc937d4a4da).


In [1]:
!pip install -q accelerate==0.21.0 transformers==4.31.0 trl==0.4.7 peft==0.4.0 bitsandbytes==0.40.2

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/244.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━

In [2]:
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    logging,
    pipeline,
)
from trl import SFTTrainer

In [3]:
# Llama-2 model from the Hugging Face hub.
# This identifier is passed to both load_tokenizer and load_base_model in later cells.
model_name = "daryl149/llama-2-7b-chat-hf"

In [42]:
# We will use the Guanaco-llama2 dataset ("train" split only).
# The trainer cell later reads its "text" column via dataset_text_field="text".
dataset = load_dataset("mlabonne/guanaco-llama2", split="train")

In [5]:
# dataset['text']

In [6]:
def load_tokenizer(model_name):
  """
  Fetches the pretrained tokenizer of a checkpoint and configures its padding

  Parameters:
  ----------
    model_name  : Checkpoint identifier handed to AutoTokenizer.from_pretrained
                  (loaded with trust_remote_code=True)

  Returns:
  ----------
    The tokenizer, with its pad token set to its EOS token and
    padding_side set to "right"

  """

  tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

  # Pad on the right, reusing the end-of-sequence token as the padding token.
  tok.padding_side = "right"
  tok.pad_token = tok.eos_token

  return tok

In [7]:
def create_bnb_config_4bits(load_in_4bit=True,
                            bnb_4bit_quant_type="nf4",
                            bnb_4bit_compute_dtype="float16",
                            bnb_4bit_use_double_quant=False,
                            ):
  """
  Builds the bitsandbytes configuration used to quantize the model

  Parameters:
  ----------
    load_in_4bit              : Whether to load the model in 4 bits. Bool, optional, defaults to True
    bnb_4bit_quant_type       : Quantization data type, "fp4" or "nf4" (Normal Float 4)
    bnb_4bit_compute_dtype    : Data type used for computation
    bnb_4bit_use_double_quant : Nested quantization: the quantization constants of the
                                first quantization are quantized again

  Returns:
  ----------
    A BitsAndBytesConfig built from exactly these four settings

  """

  # Collect the settings under the keyword names BitsAndBytesConfig expects.
  quantization_options = {
      "load_in_4bit": load_in_4bit,
      "bnb_4bit_quant_type": bnb_4bit_quant_type,
      "bnb_4bit_compute_dtype": bnb_4bit_compute_dtype,
      "bnb_4bit_use_double_quant": bnb_4bit_use_double_quant,
  }

  return BitsAndBytesConfig(**quantization_options)

In [9]:
def load_base_model(model_name,
                    bnb_config,
                    device_map={"": 0},
                    use_cache=False,
                    llama_tp=1):
  """
  Loads the base model object for fine tuning

  Parameters:
  ----------
    model_name  : Model checkpoint that we want to fine tune
    bnb_config  : bitsandbytes parameters (BitsAndBytesConfig) passed as quantization_config
    device_map  : Device map forwarded to from_pretrained. A dict is copied first so the
                  shared default dict can never be modified by the callee
    use_cache   : Value assigned to model.config.use_cache. Defaults to False
                  (NOTE(review): presumably disabled because the cache is not needed
                  during training - confirm)
    llama_tp    : Value assigned to model.config.pretraining_tp. Defaults to 1
                  (NOTE(review): Llama-specific config field - confirm semantics
                  against the LlamaConfig documentation)

  Returns:
  ----------
    Base Model object
  """

  # Never hand the (mutable, shared) default dict itself to the library.
  if isinstance(device_map, dict):
    device_map = dict(device_map)

  model = AutoModelForCausalLM.from_pretrained(
                                                  model_name,
                                                  quantization_config=bnb_config,
                                                  device_map=device_map
                                              )

  # Honour the caller's settings instead of hard-coding False / 1.
  model.config.use_cache = use_cache
  model.config.pretraining_tp = llama_tp

  return model

In [10]:
def get_lora_config(r_lora=64, alpha=16, dropout_prob=0.05, bias_lora='none', task="CAUSAL_LM"):
  """
  Builds the LoraConfig describing the LoRA adapters to train

  Parameters:
  ----------
    r_lora        : LoRA attention dimension (rank). A lower rank gives smaller update
                    matrices with fewer trainable parameters
    alpha         : LoRA scaling factor
    dropout_prob  : Dropout probability for LoRA layers
    bias_lora     : Which bias parameters should be trained: 'none', 'all' or 'lora_only'
    task          : Task type of the model

  Returns :
  ----------
    A LoraConfig built from these five settings
  """

  # Map this function's argument names onto LoraConfig's keyword names.
  lora_options = dict(
      r=r_lora,
      lora_alpha=alpha,
      lora_dropout=dropout_prob,
      bias=bias_lora,
      task_type=task,
  )

  return LoraConfig(**lora_options)


In [11]:
def training_arguments(output_dir="./llama2_finetuned",
                       per_device_train_batch_size = 8,
                       gradient_accumulation_steps = 4,
                       max_grad_norm = 0.3,
                       model_optimizer = "paged_adamw_8bit",
                       weight_decay=0.001,
                       learning_rate = 2e-4,
                       lr_scheduler_type = "linear",
                       num_train_epochs = 1,
                       max_steps = 20,
                       save_strategy = "epoch",
                       save_steps = 10,
                       logging_steps = 10,
                       fp16 = True,
                       bf16 = False,
                       gradient_checkpointing = True,
                       group_by_length = True,
                       push_to_hub = False
                       ):
  """
  Bundles the training hyperparameters into a TrainingArguments object

  Parameters  :
  ----------
    output_dir                  : Output directory where the model predictions and checkpoints will be saved
    per_device_train_batch_size : Batch size per GPU for training
    gradient_accumulation_steps : Number of update steps to accumulate the gradients for
    max_grad_norm               : Maximum gradient norm (gradient clipping)
    model_optimizer             : Optimizer used to fine tune the model (forwarded as `optim`)
    weight_decay                : Weight decay to apply to all layers except bias/LayerNorm weights
    learning_rate               : Learning rate for the optimizer
    lr_scheduler_type           : Learning rate schedule. Can take linear, constant or cosine
    num_train_epochs            : Number of training epochs
    max_steps                   : Number of training steps (overrides num_train_epochs)
    save_strategy               : Checkpoint save strategy during training. Possible values are: "no", "epoch", "steps"
    save_steps                  : Number of steps between checkpoint saves
    logging_steps               : Number of steps between log updates
    fp16                        : Enable fp16 training
    bf16                        : Enable bf16 training
    gradient_checkpointing      : Enable gradient checkpointing
    group_by_length             : Group sequences into batches with same length
    push_to_hub                 : Whether to push the model to the Hub

  Returns :
  ----------
    A TrainingArguments object built from these settings
  """

  # Every argument is forwarded under its own name, except `model_optimizer`,
  # which TrainingArguments receives as `optim`.
  hf_kwargs = {
      "output_dir": output_dir,
      "per_device_train_batch_size": per_device_train_batch_size,
      "gradient_accumulation_steps": gradient_accumulation_steps,
      "max_grad_norm": max_grad_norm,
      "optim": model_optimizer,
      "weight_decay": weight_decay,
      "learning_rate": learning_rate,
      "lr_scheduler_type": lr_scheduler_type,
      "num_train_epochs": num_train_epochs,
      "max_steps": max_steps,
      "save_strategy": save_strategy,
      "save_steps": save_steps,
      "logging_steps": logging_steps,
      "fp16": fp16,
      "bf16": bf16,
      "gradient_checkpointing": gradient_checkpointing,
      "group_by_length": group_by_length,
      "push_to_hub": push_to_hub,
  }

  return TrainingArguments(**hf_kwargs)

In [12]:
def set_supervised_finetuning_parameters( model,
                                          lora_config,
                                          tokenizer,
                                          trainig_args,
                                          train_dataset=None,
                                          dataset_text_field="text",
                                          max_seq_length=None,
                                         ):
  """
  Sets supervised fine-tuning parameters

  Parameters  :
  ----------
    model              : Model object to fine tune
    lora_config        : LoRA configuration (forwarded as `peft_config`)
    tokenizer          : Tokenizer of the model
    trainig_args       : Training arguments (forwarded as `args`)
    train_dataset      : Train dataset for fine tuning the model. When omitted (None),
                         the notebook-level `dataset` variable is looked up at call time
    dataset_text_field : Text feature of the train dataset
    max_seq_length     : Maximum sequence length to use

  Returns :
  ----------
    Returns an SFTTrainer object for supervised fine-tuning (created with packing=False)
  """

  # Resolve the fallback when the function is called, not when it is defined:
  # the definition no longer requires `dataset` to exist already, and a dataset
  # reloaded later is picked up instead of a stale reference.
  if train_dataset is None:
    train_dataset = dataset

  return SFTTrainer(
                      model=model,
                      train_dataset=train_dataset,
                      peft_config=lora_config,
                      dataset_text_field=dataset_text_field,
                      max_seq_length=max_seq_length,
                      tokenizer=tokenizer,
                      args=trainig_args,
                      packing=False
                  )


In [13]:
# Tokenizer for the base checkpoint; load_tokenizer sets the pad token to EOS and pads on the right.
tokenizer = load_tokenizer(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [14]:
# 4-bit NF4 quantization settings, float16 compute, no nested quantization.
bnb_config = create_bnb_config_4bits(
  load_in_4bit=True,
  bnb_4bit_quant_type="nf4",
  bnb_4bit_compute_dtype="float16",
  bnb_4bit_use_double_quant=False,
)

In [15]:
# LoRA adapter settings: rank 64, scaling 16, 5% dropout, no bias training.
lora_config = get_lora_config(
  r_lora=64,
  alpha=16,
  dropout_prob=0.05,
  bias_lora='none',
  task="CAUSAL_LM",
)

In [16]:
# Load the quantized base model with the bitsandbytes settings defined above.
# NOTE(review): device_map={"": 0} presumably places the whole model on device 0 - confirm.
model = load_base_model(
  model_name,
  bnb_config,
  device_map={"": 0},
  use_cache=False,
  llama_tp=1,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [17]:
training_arguments = training_arguments(output_dir="./llama2_finetuned",
                                          per_device_train_batch_size = 4,
                                          gradient_accumulation_steps = 4,
                                          max_grad_norm = 0.3,
                                          model_optimizer = "paged_adamw_8bit",
                                          weight_decay=0.001,
                                          learning_rate = 2e-4,
                                          lr_scheduler_type = "linear",
                                          num_train_epochs = 1,
                                          max_steps = -1,
                                          save_strategy = "epoch",
                                          save_steps = 10,
                                          logging_steps = 10,
                                          fp16 = True,
                                          bf16 = False,
                                          gradient_checkpointing = True,
                                          group_by_length = True,
                                          push_to_hub = False
                                          )

In [19]:
trainer = set_supervised_finetuning_parameters( model=model,
                                                train_dataset=dataset,
                                                lora_config=lora_config,
                                                dataset_text_field="text",
                                                max_seq_length=None,
                                                tokenizer=tokenizer,
                                                trainig_args=training_arguments,
                                                )



Map:   0%|          | 0/9846 [00:00<?, ? examples/s]

In [20]:
# Train model
# Runs the fine-tuning loop on the trainer built above; the returned
# TrainOutput (global step, training loss, metrics) is shown as the cell result.
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,1.4029
20,1.3021
30,1.2999
40,1.4644
50,1.6314
60,1.1615
70,1.228
80,1.2168
90,1.4381
100,1.5867


TrainOutput(global_step=615, training_loss=1.3183420817057292, metrics={'train_runtime': 3796.8826, 'train_samples_per_second': 2.593, 'train_steps_per_second': 0.162, 'total_flos': 8.269550242578432e+16, 'train_loss': 1.3183420817057292, 'epoch': 1.0})

In [43]:
# Directory for the final saved model. Note this differs from the
# "./llama2_finetuned" directory given to the training arguments.
output_dir="./finetuned_llama2"

In [44]:
# Save trained model
# NOTE(review): the trainer was created with a peft_config, so trainer.model is
# presumably a PEFT-wrapped model and this likely writes only the LoRA adapter
# weights rather than the full base model - confirm before relying on it.
trainer.model.save_pretrained(output_dir)