# Fine-tune Llama 2 in Google Colab


### Installing required packages for Google Colab

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
# What each pinned package is for:
# peft: Parameter-Efficient Fine-Tuning (PEFT) methods.
# accelerate: a library that enables the same PyTorch code to be run across any distributed configuration by adding just four lines of code.
# bitsandbytes: a lightweight wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and quantization functions.
# transformers: provides APIs and tools to easily download and train state-of-the-art pretrained models.
# trl: a full-stack library providing a set of tools to train transformer language models with Reinforcement Learning, from the Supervised Fine-Tuning step (SFT) and the Reward Modeling step (RM) to the Proximal Policy Optimization (PPO) step.



In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, #predict next token task
    AutoTokenizer, #class for tokenization (from words to ids)
    BitsAndBytesConfig,
    HfArgumentParser, #
    TrainingArguments, #clear
    pipeline, # Pipeline to generate outputs from model
    logging, #to log training loss and results
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer #Supervised Finetuning Trainer (SFT Trainer)

### Connecting to *Hugging Face*

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook. The credentials are
# needed later on, because this notebook pushes the model and tokenizer to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Defining important parameters

In [None]:
# Base model to fine-tune, identified by its Hugging Face Hub repository id
model_name = "NousResearch/llama-2-7b-chat-hf"


# Fine-tuned model name: used as the local directory the trained model is saved
# to, and as the repository name when pushing to the Hub
new_model = "llama-2-AZ"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension (passed to LoraConfig as `r`)
# Alternative kept for reference (QLoRA will use a rank of 64): lora_r = 64
lora_r = 16
# Alpha parameter for LoRA scaling
lora_alpha = 16
# Alternative kept for reference: lora_alpha = 8

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models. Stored as the *name* of a torch dtype;
# it is resolved to the dtype object later with getattr(torch, ...).
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100).
# Both are switched off here.
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
# NOTE(review): this value is not passed to the TrainingArguments(...) call in
# this notebook, so it currently has no effect.
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for.
# Gradient accumulation is a technique for training with a bigger effective
# batch size than the machine could normally fit into memory: gradients are
# accumulated over several batches, and the optimizer only steps after that
# number of batches has been processed.
gradient_accumulation_steps = 1

# Enable gradient checkpointing.
# Gradient checkpointing saves GPU memory (not disk space) by recomputing
# activations during the backward pass instead of keeping them all, at the cost
# of a slower backward pass.
# NOTE(review): this value is not passed to the TrainingArguments(...) call in
# this notebook, so it currently has no effect there.
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping), to prevent exploding gradients
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights.
# Weight decay is a regularization technique that adds a small penalty,
# usually the L2 norm of the weights (all the weights of the model), to the
# loss function.
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule: the shape the learning rate follows over the training
# steps (it depends on the step count, not on the model's performance)
lr_scheduler_type = "cosine"

# Number of training steps (a positive value overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate).
# Warm-up is a way to reduce the primacy effect of the early training examples.
# Without it, you may need to run a few extra epochs to get the desired
# convergence, as the model un-trains those early superstitions.
#   Constant warm-up: use a learning rate lower than the base learning rate for
#   the first few steps.
#   Gradual warm-up: in the first few steps the learning rate starts below the
#   base learning rate and is increased gradually towards it as the step
#   number increases.
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
# NOTE(review): 0 presumably means no intermediate checkpoints are written - confirm
save_steps = 0

# Log every X updates steps
logging_steps = 5

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use; None is passed through to SFTTrainer as-is
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0 (the empty-string key stands for the whole model)
device_map = {"": 0}

### Loading the dataset

In [None]:
# Load dataset
# Full training split, kept for reference:
#dataset = load_dataset("roneneldan/TinyStories", split="train")
# Normally we should load all of the training data, but here we only use a small
# slice to get started quickly with the code.
# NOTE: "train[1:500]" skips example 0 and yields 499 examples (indices 1-499),
# not 500; use "train[:500]" if exactly the first 500 examples are wanted.
dataset = load_dataset("roneneldan/TinyStories", split="train[1:500]")


Downloading readme:   0%|          | 0.00/1.02k [00:00<?, ?B/s]



Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/249M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/248M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/246M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/248M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.99M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [None]:
# Display the dataset summary (its features and number of rows)
dataset

Dataset({
    features: ['text'],
    num_rows: 499
})

### Configuration of bitsandbytes

In [None]:
# Configure bitsandbytes for 4-bit quantization.

# The compute dtype is configured by name, so resolve it to the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Collect the quantization options first, then hand them to BitsAndBytesConfig.
quantization_settings = {
    "load_in_4bit": use_4bit,
    "bnb_4bit_quant_type": bnb_4bit_quant_type,
    "bnb_4bit_compute_dtype": compute_dtype,
    "bnb_4bit_use_double_quant": use_nested_quant,
}
bnb_config = BitsAndBytesConfig(**quantization_settings)

In [None]:
# Check GPU compatibility with bfloat16.
# Only relevant when the 4-bit model computes in float16: in that setup, a GPU
# whose compute capability has a major version of 8 or more gets a hint to
# switch to bf16.
# torch.cuda.is_available() is checked first so that the capability query is
# never made on a runtime without a usable CUDA device, where it would raise
# instead of simply skipping the hint.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

### Loading the model

In [None]:
# Load base model, quantized according to bnb_config and placed according to device_map
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# Turn off the generation cache flag on the model config
# NOTE(review): presumably because the cache is of no use during training - confirm
model.config.use_cache = False
# NOTE(review): presumably selects the standard (non tensor-parallel) Llama code
# path - confirm against the transformers LlamaConfig documentation
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

### Loading the model's tokenizer

In [None]:
# Load LLaMA tokenizer (the one published with the base model)
# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to run; confirm it is actually needed for this tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

### Loading LoRA configuration


In [None]:
# Build the LoRA configuration from the QLoRA parameters defined earlier.
peft_config = LoraConfig(
    # Kind of task the adapter is trained for
    task_type="CAUSAL_LM",
    # Adapter size, scaling and regularization
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    # Bias setting for the adapter
    bias="none",
)

### Loading the training arguments

In [None]:
# Set training parameters.
# The options are grouped by concern and then unpacked into TrainingArguments;
# the values come from the parameters defined earlier in the notebook.
# NOTE(review): `gradient_checkpointing` and `per_device_eval_batch_size` are
# defined with the other parameters but are not forwarded here.
training_options = {
    # Outputs, logging and checkpointing
    "output_dir": output_dir,
    "logging_steps": logging_steps,
    "save_steps": save_steps,
    "report_to": "tensorboard",
    "push_to_hub": True,
    # Training length and batching
    "num_train_epochs": num_train_epochs,
    "max_steps": max_steps,
    "per_device_train_batch_size": per_device_train_batch_size,
    "gradient_accumulation_steps": gradient_accumulation_steps,
    "group_by_length": group_by_length,
    # Optimizer and learning-rate schedule
    "optim": optim,
    "learning_rate": learning_rate,
    "lr_scheduler_type": lr_scheduler_type,
    "warmup_ratio": warmup_ratio,
    "weight_decay": weight_decay,
    "max_grad_norm": max_grad_norm,
    # Numeric precision
    "fp16": fp16,
    "bf16": bf16,
}
training_arguments = TrainingArguments(**training_options)

In [None]:
# Set supervised fine-tuning parameters and build the trainer.
trainer = SFTTrainer(
    # What is trained, and with which adapter configuration
    model=model,
    peft_config=peft_config,
    tokenizer=tokenizer,
    # Training data: the "text" column of the dataset
    train_dataset=dataset,
    dataset_text_field="text",
    # Sequence handling
    max_seq_length=max_seq_length,
    packing=packing,
    # Generic training arguments built in the previous cell
    args=training_arguments,
)



Map:   0%|          | 0/499 [00:00<?, ? examples/s]

### Training the model

In [None]:
# Train model; the value returned by train() is shown as the cell's output
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
5,1.6616
10,1.582
15,1.5149
20,1.5313
25,1.5427
30,1.51
35,1.5144
40,1.4519
45,1.3991
50,1.3992


TrainOutput(global_step=125, training_loss=1.4161358222961427, metrics={'train_runtime': 364.5944, 'train_samples_per_second': 1.369, 'train_steps_per_second': 0.343, 'total_flos': 2182749625294848.0, 'train_loss': 1.4161358222961427, 'epoch': 1.0})

### Save the model to disk

In [None]:
# Save trained model into the local directory named by new_model; it is read
# back from there later with PeftModel.from_pretrained(base_model, new_model)
trainer.model.save_pretrained(new_model)

In [None]:
# %load_ext tensorboard
# %tensorboard --logdir results/runs

### Generate text using a prompt

In [None]:
# Ignore warnings: only messages at CRITICAL level are let through
logging.set_verbosity(logging.CRITICAL)

# Run the text-generation pipeline with our new model
prompt = "What is a large language model?"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)
# The prompt is wrapped in the <s>[INST] ... [/INST] template before generation
result = pipe(f"<s>[INST] {prompt} [/INST]")
first_output = result[0]
print(first_output['generated_text'])



<s>[INST] What is a large language model? [/INST]  A large language model is a type of artificial intelligence (AI) model that is trained on a large dataset of text. It is designed to learn and understand the patterns and structures of language, and can be used for a variety of tasks such as language translation, text summarization, and language generation.

Large language models are typically trained on vast amounts of text data, such as books, articles, and websites. The more data the model is trained on, the better it can understand and generate language. These models are often called "large" because they can process and analyze large amounts of text quickly and accurately.

Some examples of large language models include:

1. BERT (Bidirectional Encoder Representations from Transformers): Developed by Google, BERT is a powerful language model that has achieved state-of-the-art results on a wide range of


In [None]:
# Ignore warnings: only messages at CRITICAL level are let through
logging.set_verbosity(logging.CRITICAL)

# Run the text-generation pipeline with our new model
prompt = "Tell me a story about a little girl"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)
# The prompt is wrapped in the <s>[INST] ... [/INST] template before generation
result = pipe(f"<s>[INST] {prompt} [/INST]")
first_output = result[0]
print(first_output['generated_text'])

<s>[INST] Tell me a story about a little girl [/INST]  Once upon a time, there was a little girl named Lily. She was very curious and loved to explore the world around her. One day, she was playing in the garden when she saw a little bird. The bird was so small and cute, and Lily wanted to catch it.

She tried to chase the bird, but it was too fast. The bird flew away, and Lily was left feeling disappointed. She wanted to catch the bird so badly, but it was too quick for her.

Just then, a kind old man appeared. He smiled at Lily and said, "Don't worry, little one. I'll help you catch the bird." He took Lily's hand and they both chased after the bird.

After a while, they finally caught the bird. Lily was so happy and thanked the old


### Save the fine-tuned model to Hugging Face

We have to restart the session and reload the base model in float16 instead of 4-bit, because the 4-bit model cannot be pushed to Hugging Face directly. We therefore merge the LoRA weights into the float16 base model and then push the merged model to Hugging Face.

In [None]:
# Reload the base model in FP16 and merge it with the saved LoRA weights
fp16_load_options = {
    "low_cpu_mem_usage": True,
    "return_dict": True,
    "torch_dtype": torch.float16,
    "device_map": device_map,
}
base_model = AutoModelForCausalLM.from_pretrained(model_name, **fp16_load_options)

# Attach the weights saved under new_model, then fold them into the base model
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload the tokenizer as well, with the same padding setup as for training,
# so that it can be saved together with the merged model
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Pushing requires being logged in to the Hugging Face Hub; alternative login from a shell:
# !huggingface-cli login

# Push the merged model, then its tokenizer, to the Hub repository named by new_model
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HafsaaO/llama-2-AZ/commit/5a73c1031666a19eb08f3864e78e1824777d4e19', commit_message='Upload tokenizer', commit_description='', oid='5a73c1031666a19eb08f3864e78e1824777d4e19', pr_url=None, pr_revision=None, pr_num=None)