<a href="https://colab.research.google.com/github/1028Luo/LLM-Domain-Specific-Assistant/blob/main/finetune_LLAMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning LLAMA

In [1]:
# install
!pip install -q accelerate==1.2.0
!pip install -q peft==0.14.0
!pip install -q bitsandbytes==0.45.0
!pip install -q transformers==4.47.1
!pip install -q trl==0.13.0
!pip install -q huggingface_hub
!pip install -q datasets==3.2.0
!pip install numba


# import
import os
import torch

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

from peft import LoraConfig, PeftModel
from trl import SFTTrainer, SFTConfig
from huggingface_hub import login
from numba import cuda

device = 'cuda'  # Use GPU

# Log in to Hugging Face
from google.colab import userdata
my_hugging_face_token = userdata.get('huggingface_token')
login(token=my_hugging_face_token)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.3/336.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m87.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m75.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# model name and tokenizer
base_model = "meta-llama/Llama-3.2-3B"

inst_model = "meta-llama/Llama-3.2-3B-instruct"
inst_tokenizer = AutoTokenizer.from_pretrained(inst_model)


#model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
#tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

new_model = "/llama-3-3b-workout"

# dataset
dataset_name = "Jiexing1028/workout-plan"
dataset = load_dataset(dataset_name, split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/311 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/773k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/780 [00:00<?, ? examples/s]

In [3]:
# examine dataset and format it

print(dataset)

LLAMA_31_prompt_template = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{sys_msg}<|eot_id|><|start_header_id|>user<|end_header_id|>

{user_msg}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{asst_msg}<|eot_id|>
"""


def formatting_prompts_func(example): # change this to fit data format of other model
    output_texts = []
    for i in range(len(example['question'])):
        #text = f"### Question: {example['question'][i]}\n ### Answer: {example['answer'][i]}"
        text = LLAMA_31_prompt_template.format(sys_msg="you are a helpful assistant in generating personalised workout plans", user_msg= example['question'][i], asst_msg= {example['answer'][i]})
        output_texts.append(text)
    return output_texts


temp = formatting_prompts_func(dataset)
print(temp)

Dataset({
    features: ['question', 'answer'],
    num_rows: 780
})
['\n<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nyou are a helpful assistant in generating personalised workout plans<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nI\'m 35 years old and have a desk job, what\'s the best workout plan to help me lose weight and improve overall health?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{"Here\'s a possible answer:\\n\\n**Personalized Workout Plan:**\\n\\nAs a 35-year-old with a desk job, your sedentary lifestyle can lead to weight gain and increased risk of chronic diseases. To help you lose weight and improve overall health, I recommend a workout plan that combines cardiovascular exercise, strength training, and flexibility exercises.\\n\\n**Warm-up (5-10 minutes):**\\n\\n* Light cardio such as brisk walking, jogging in place, or jumping jacks to get your heart rate up and loosen your muscles.\\n\\n**Monday ( Upper Body and Cardio):**\\n\

In [4]:
# Parameters

######## Lora ########
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

######## Quantization using bitsandbytes ########

# Qlora stores model in 4 bit
use_4bit = True

# Quantization type (fp4 or nf4)
# fp4 is float-point 4, nf4 is a variant of fp4 tailored for deep learning
bnb_4bit_quant_type = "nf4"

# Dequantizes weights from 4 bit to 16 bit when they are needed for computation,
# hence the low vram usage
# Use higher precision in computation for numerical stability
bnb_4bit_compute_dtype = "float16"

# Double quantization
use_nested_quant = False

# compute type becomes torch.float16
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

######## SFTTrainer ########
_SFTConfig = SFTConfig(
    output_dir=new_model,
    dataset_text_field="text", # will be used only when no formatting function
    max_seq_length=1024, # default to model maximum
    report_to="tensorboard", # by default needs login to wandb
    num_train_epochs = 3,
    logging_steps = 25, # log traning loss
    gradient_checkpointing = True, # reduce VRAM use by only saving some gradient
    warmup_ratio = 0.03, # gradually increase lr in the first 3% of steps, helps avoid training instability
    lr_scheduler_type = "cosine", # lr starts at max and then drop to near 0 following a cosine path
    weight_decay = 0.001, # a term in the loss function that descourage high value of weights
    optim = "paged_adamw_32bit" # a variant of adam using page to better manage memory
    )


In [5]:


# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.chat_template = inst_tokenizer.chat_template
tokenizer.eos_token_id = inst_tokenizer.eos_token_id


# tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

inst_tokenizer.pad_token = inst_tokenizer.eos_token
# aaa
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id

trainer = SFTTrainer(
    model,
    train_dataset=dataset,
    args=_SFTConfig,
    peft_config=peft_config,
    formatting_func=formatting_prompts_func,
    tokenizer = inst_tokenizer
)

trainer.train()

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

  trainer = SFTTrainer(


Map:   0%|          | 0/780 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
25,1.4533
50,1.3359
75,1.201
100,1.0863
125,1.0143
150,0.9344
175,0.8892
200,0.8707
225,0.8643
250,0.8635


TrainOutput(global_step=294, training_loss=1.0226926089955024, metrics={'train_runtime': 4594.573, 'train_samples_per_second': 0.509, 'train_steps_per_second': 0.064, 'total_flos': 3.075872055872717e+16, 'train_loss': 1.0226926089955024, 'epoch': 3.0})

# Inference

In [7]:
# prompt

LLAMA_31_chat_template = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{sys_msg}<|eot_id|><|start_header_id|>user<|end_header_id|>

{user_msg}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# AAA: need prompt template here? tokenize prompt here to see what it looks like
# test inference
prompt = "I'm 25 years old and have severe elbow injury, what exercises should I avoid to prevent further injury?"






In [None]:
# inference with the instruct model
inst_model = AutoModelForCausalLM.from_pretrained(
    inst_model,
    load_in_4bit=True)

pipe = pipeline(task="text-generation", model=inst_model, tokenizer=inst_tokenizer, max_length=200, return_full_text=True)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<s>[INST] I'm 25 years old and have severe elbow injury, what exercises should I avoid to prevent further injury? [/INST] 

The exercises you should avoid to prevent further injury are:

*   **Bending or flexing movements**: Avoid activities that involve bending or flexing your elbow, such as heavy lifting, bending, or twisting. This can put additional stress on the joint and exacerbate the injury.
*   **Prolonged overhead activities**: Activities that involve overhead movements, such as lifting, throwing, or swinging, can also put excessive stress on the elbow joint.
*   **Contact sports**: Avoid contact sports that involve elbow strikes, such as boxing, wrestling, or rugby.
*   **Heavy weightlifting**: Avoid heavy weightlifting or resistance exercises that involve bending or flexing the elbow.

Exercises to avoid are:

*   **Bicep curls**: Avoid bicep curls, which can put additional stress on the elbow joint.
*  


In [9]:
# inference with the fine tuned model


messages = [
    {"role": "system", "content": "you are a helpful assistant in generating personalised workout plans."},
    {"role": "user", "content": "I'm 25 years old and have severe elbow injury, what exercises should I avoid to prevent further injury?"},
]

# This is required because the base model have different eos
# So here use the eos token of the instruct model
# To see a bit more of this:
#   print(model.config)
print(tokenizer.eos_token_id)
print(inst_tokenizer.eos_token_id)
model.config.pad_token_id = inst_tokenizer.pad_token_id
model.config.eos_token_id = inst_tokenizer.eos_token_id

tokenizer.chat_template = inst_tokenizer.chat_template


pipe = pipeline(task="text-generation", model=model, tokenizer=inst_tokenizer, max_length=250, return_full_text=False, temperature=1.2, repetition_penalty=1.1)
#result = pipe(LLAMA_31_chat_template.format(sys_msg="you are a helpful assistant in generating personalised workout plans", user_msg=prompt))
#result = pipe(prompt)
result = pipe(messages)
print(result[0]['generated_text'])





Device set to use cuda:0


128009
128009
[{'role': 'system', 'content': 'you are a helpful assistant in generating personalised workout plans.'}, {'role': 'user', 'content': "I'm 25 years old and have severe elbow injury, what exercises should I avoid to prevent further injury?"}, {'role': 'assistant', 'content': "The combination of weight loss and exercise is essential for maintaining a healthy lifestyle. As you're seeking personalized advice, consider the impact your age has on potential side effects. Firstly, weight loss shouldn't involve any risk of injuring the elbow! It's best advised that any form of resistance training involving heavy weights be avoided or modified to minimize stress on this delicate joint. This recommendation holds especially true during the current phase when you'll be embarking upon intensive aerobic efforts.\n\nAssuming no underlying medical conditions, here's a modified routine based on low-impact aerobics exercises that will help you achieve and maintain weight loss while preservin

In [None]:
print(model.config)
print(tokenizer.eos_token_id)
print(inst_tokenizer.eos_token_id)

LlamaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "meta-llama/Llama-3.2-3B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128009,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pad_token_id": 128009,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bi

In [None]:
# 3B works much better than 1B
# TODO:
# 1. print pretrain's tokenizer.(promt) and instruct's, see what's the difference
# 2. find why the fine-tuned repeat itself
#   2.1 repetition penalty and temperature help reduce repetition
#   2.2 it does not learn to stop generating output, so when repetition is fixed by 2.1,
#   it keeps asking itself questions based on the context
# 3. why it does not generate output
#   3.1 something wrong with promt template
#   3.2 try format a message instead of using prompt template
# 4. Why the output has strange symbols
#   4.1 training and inferencing with inst tokenizer still gives strange symbol,
#       try using base tokenizer, add chat template to it and change eos id
# 5. Bigger dataset really helped.
#   5.1 increaseing data size from 150 to 350 improves performance by a lot.
#       Cleaning low quality data also helps.

In [8]:
# Save trained model
trainer.push_to_hub(commit_message = "increased dataset sizeaaa")
#trainer.model.save_pretrained(push_to_hub=True, repo_name=new_model)
print('model pushed')

No files have been modified since last commit. Skipping to prevent empty commit.


model pushed


In [None]:
# Empty VRAM
del model
del pipe
del trainer
import gc
gc.collect()
gc.collect()

2464

In [None]:
# free VRAM
device = cuda.get_current_device()
device.reset()
print('finished')

finished
