<a href="https://colab.research.google.com/github/1028Luo/LLM-Domain-Specific-Assistant/blob/main/finetune_LLAMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning LLAMA

In [1]:
# install
!pip install -q accelerate==1.2.0
!pip install -q peft==0.14.0
!pip install -q bitsandbytes==0.45.0
!pip install -q transformers==4.47.1
!pip install -q trl==0.13.0
!pip install -q huggingface_hub
!pip install -q datasets==3.2.0
!pip install numba


# import
import os
import torch

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

from peft import LoraConfig, PeftModel
from trl import SFTTrainer, SFTConfig
from huggingface_hub import login
from numba import cuda

device = 'cuda'  # Use GPU

# Log in to Hugging Face
from google.colab import userdata
my_hugging_face_token = userdata.get('huggingface_token')
login(token=my_hugging_face_token)





[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.3/336.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# load model and tokenizer
model_name = "meta-llama/Llama-3.2-3B"
#model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
#tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

new_model = "/llama-3-3b-workout"

# dataset
dataset_name = "Jiexing1028/workout-plan"
dataset = load_dataset(dataset_name, split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/309 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/155k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/147 [00:00<?, ? examples/s]

In [3]:
# examine dataset

print(dataset)

def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['question'])):
        text = f"### Question: {example['question'][i]}\n ### Answer: {example['answer'][i]}"
        output_texts.append(text)
    return output_texts


temp = formatting_prompts_func(dataset)
print(temp)

Dataset({
    features: ['question', 'answer'],
    num_rows: 147
})
["### Question: I'm 35 years old and weigh 150 pounds, what kind of workout plan is best for my age group to lose weight?\n ### Answer: **Personalized Workout Plan for a 35-year-old weighing 150 pounds**\n\nAs we age, our metabolism slows down, and losing weight can become more challenging. Given your age and weight, I recommend a workout plan that combines cardiovascular exercises with strength training to boost your metabolism and burn fat.\n\n**Recommended Workout Plan:**\n\n1. **Warm-up (5-10 minutes)**: Light cardio such as jogging, cycling, or rowing to get your heart rate up and loosen your muscles.\n2. **Resistance Training (30-40 minutes, 3 times a week)**:\n\t* Squats (3 sets of 12 reps): Works your legs, glutes, and core muscles, which are essential for burning calories.\n\t* Push-ups (3 sets of 12 reps): Targets your chest, shoulders, and triceps, improving overall upper body strength.\n\t* Lunges (3 sets 

In [4]:
# Parameters

######## Lora ########
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

######## Quantization using bitsandbytes ########

# Qlora stores model in 4 bit
use_4bit = True

# Quantization type (fp4 or nf4)
# fp4 is float-point 4, nf4 is a variant of fp4 tailored for deep learning
bnb_4bit_quant_type = "nf4"

# Dequantizes weights from 4 bit to 16 bit when they are needed for computation,
# hence the low vram usage
# Use higher precision in computation for numerical stability
bnb_4bit_compute_dtype = "float16"

# Double quantization
use_nested_quant = False

# compute type becomes torch.float16
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

######## SFTTrainer ########
_SFTConfig = SFTConfig(
    output_dir=new_model,
    dataset_text_field="text",
    max_seq_length=256,
    report_to="tensorboard", # by default needs login to wandb
    num_train_epochs = 10,
    logging_steps = 25, # log traning loss
    gradient_checkpointing = True, # reduce VRAM use by only saving some gradient
    warmup_ratio = 0.03, # gradually increase lr in the first 3% of steps, helps avoid training instability
    lr_scheduler_type = "cosine", # lr starts at max and then drop to near 0 following a cosine path
    weight_decay = 0.001, # a term in the loss function that descourage high value of weights
    optim = "paged_adamw_32bit" # a variant of adam using page to better manage memory
    )


In [5]:





# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training



trainer = SFTTrainer(
    model,
    train_dataset=dataset,
    args=_SFTConfig,
    peft_config=peft_config,
    formatting_func=formatting_prompts_func,
)

trainer.train()

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
25,1.412
50,1.3038
75,1.1722
100,1.0652
125,0.9873
150,0.9679
175,0.9441


TrainOutput(global_step=190, training_loss=1.1079910730060778, metrics={'train_runtime': 2143.3842, 'train_samples_per_second': 0.686, 'train_steps_per_second': 0.089, 'total_flos': 6374864364503040.0, 'train_loss': 1.1079910730060778, 'epoch': 10.0})

In [6]:
# test inference
prompt = "I'm 25 years old and have severe elbow injury, what exercises should I avoid to prevent further injury?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

baseline_model = "meta-llama/Llama-3.2-3B-instruct"
baseline_tokenizer = AutoTokenizer.from_pretrained(baseline_model)
baseline_model = AutoModelForCausalLM.from_pretrained(
    baseline_model,
    load_in_4bit=True)

pipe = pipeline(task="text-generation", model=baseline_model, tokenizer=baseline_tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])



Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  return fn(*args, **kwargs)


<s>[INST] I'm 25 years old and have severe elbow injury, what exercises should I avoid to prevent further injury? [/INST] <s>[INST] I'm 25 years old and have severe elbow injury, what exercises should I avoid to prevent further injury? [/INST]
I'm 25 years old and have severe elbow injury, what exercises should I avoid to prevent further injury?
I'm 25 years old and have severe elbow injury, what exercises should I avoid to prevent further injury?
I'm 25 years old and have severe elbow injury, what exercises should I avoid to prevent further injury?
I'm 25 years old and have severe elbow injury, what exercises should I avoid to prevent further injury?
I'm 25 years old and have severe elbow injury, what exercises should I avoid to prevent further injury?
I'm 25 years old and have severe elbow injury, what exercises should I avoid to prevent further injury?
I'm 25 years old and have


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<s>[INST] I'm 25 years old and have severe elbow injury, what exercises should I avoid to prevent further injury? [/INST] 

The exercises you should avoid include those that put excessive stress on the elbow joint, such as:

*   **Weightlifting**: Avoid heavy weights or resistance training that can put excessive strain on the elbow joint.
*   **Pumping or bending**: Avoid activities that involve repetitive bending or straightening of the elbow, such as pumping weights or bending at the elbows.
*   **Throwing or serving**: Avoid activities that involve throwing or serving, such as tennis, basketball, or volleyball, as they can put excessive stress on the elbow joint.
*   **Golfing**: Avoid golfing, as it involves repetitive swinging and bending at the elbows, which can put excessive stress on the elbow joint.

**Recommended Exercises**

Instead, focus on exercises that strengthen the muscles around the elbow joint, such as:

*   **Elbow flexion exercises


In [None]:
# 3B works much better than 1B



In [8]:
# Save trained model
trainer.push_to_hub()
#trainer.model.save_pretrained(push_to_hub=True, repo_name=new_model)
print('model pushed')

adapter_model.safetensors:   0%|          | 0.00/18.4M [00:00<?, ?B/s]

events.out.tfevents.1738658856.c8e37cf85553.540.0:   0%|          | 0.00/6.84k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.56k [00:00<?, ?B/s]

model pushed


In [None]:
# Empty VRAM
del model
del pipe
del trainer
import gc
gc.collect()
gc.collect()

2464

In [None]:
# free VRAM
device = cuda.get_current_device()
device.reset()
print('finished')

finished
