<a href="https://colab.research.google.com/github/CallieHsu/Fine-tune-LLM-Google-Colab/blob/master/fine_tune_tinyllama_by_colab_docker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tiny Llama Fine-Tuning using QLoRA

## 安裝所需套件

In [None]:
# Install the Python packages this notebook needs.
# accelerate, peft, bitsandbytes and trl are pinned; transformers and
# huggingface_hub are left unpinned.
!pip install -q --upgrade pip
!pip install -q accelerate==0.26.1 peft==0.7.1 bitsandbytes==0.42.0 transformers trl==0.7.10 huggingface_hub

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m498.4 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m660.7 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m662.2 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.8/79.8 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

## Check your Nvidia GPU

In [None]:
# Show the NVIDIA GPUs visible to this runtime (driver/CUDA version, memory use).
!nvidia-smi

Tue Feb 20 08:49:06 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.141.10   Driver Version: 470.141.10   CUDA Version: 12.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-DGXS...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   41C    P0    38W / 300W |     72MiB / 32508MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-DGXS...  On   | 00000000:08:00.0 Off |                    0 |
| N/A   40C    P0    38W / 300W |      5MiB / 32508MiB |      0%      Default |
|       

In [None]:
# 匯入必要的模組和套件
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

## 設定參數

- model_name : 預訓練模型的名稱。
- dataset_name : 訓練數據集的文件路徑。
- new_model : 新模型的名稱。
- lora_r, lora_alpha, lora_dropout : LoRA 的相關參數。
- use_4bit : 是否以 4 位元（4-bit）量化載入基礎模型。
- bnb_4bit_compute_dtype : 4 位元量化模型進行運算時使用的數據類型（float16 或 bfloat16）。
- bnb_4bit_quant_type : 4 位元量化的類型（fp4 或 nf4）。
- use_nested_quant : 是否使用嵌套量化。
- output_dir : 輸出的目錄。
- num_train_epochs : 訓練的週期數。
- fp16 : 是否使用 16 位浮點數。
- bf16 : 是否使用 bfloat16。
- per_device_train_batch_size : 每個設備的訓練批次大小。
- per_device_eval_batch_size : 每個設備的評估批次大小。
- gradient_accumulation_steps : 梯度累積的步數。
- gradient_checkpointing : 是否使用梯度檢查點。
- max_grad_norm : 梯度的最大範數。
- learning_rate : 學習速率。
- weight_decay : 權重衰減。
- optim : 優化器。
- lr_scheduler_type : 學習速率的調整方式。
- max_steps : 最大的訓練步數。
- warmup_ratio : 學習速率的熱身比例。
- group_by_length : 是否根據句子的長度將它們分組。
- save_steps : 每隔多少訓練步數保存一次模型 checkpoint。
- logging_steps : 每隔多少訓練步數記錄一次日誌。
- max_seq_length : 輸入序列的最大長度。
- packing : 是否打包序列。
- device_map : 模型要載入到哪個裝置（例如 `{"": 0}` 代表全部放在第 0 張 GPU，`"auto"` 代表自動分配）。
- 詳細參數：https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments

In [None]:
# Global settings for the whole notebook; the cells below read these names directly.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v0.6" # Hugging Face model name of the base model to fine-tune
new_model = "tiny-llama-shuttle-xpc-cube-en5000" # name of the new (fine-tuned) model

################################################################################
# Quantized LLMs with Low-Rank Adapters (QLoRA) parameters
################################################################################
lora_r = 64  # forwarded to LoraConfig as r
lora_alpha = 32
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters: a lightweight wrapper around custom CUDA functions,
# in particular 8-bit optimizers, matrix multiplication and quantization
################################################################################
use_4bit = True
bnb_4bit_compute_dtype = "float16" # float16 or bfloat16 (looked up as an attribute of torch)
bnb_4bit_quant_type = "nf4" # fp4 or nf4
use_nested_quant = False  # forwarded as bnb_4bit_use_double_quant

################################################################################
# TrainingArguments parameters
################################################################################
output_dir = "./results"
num_train_epochs = 5000
fp16 = False
bf16 = False
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 50
logging_steps = 50

################################################################################
# Supervised finetuning (SFT) parameters
################################################################################
max_seq_length = None  # no explicit sequence-length limit is handed to SFTTrainer
packing = False
device_map = {"": 0} # {"": 0} or "auto"

## 讀取資料集 & 前處理

Tiny-llama
訓練用chat prompt template格式:

```<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n{answer}<|im_end|>\n```

問答資料預先整理成JSON格式, example:
```
{"input": "What models are part of the Shuttle XPC cube series?", "output": "The Shuttle XPC cube series includes SH610R4, SW580R8, SH510R4, SH570R8, SH570R6, SH370R6 V2, SH310R4 V2, SH370R8, SH310R4, SH370R6, SZ270R9, SZ270R8."}
{"input": "Can you list the different models in the Shuttle XPC cube series?", "output": "The Shuttle XPC cube series comprises SH610R4, SW580R8, SH510R4, SH570R8, SH570R6, SH370R6 V2, SH310R4 V2, SH370R8, SH310R4, SH370R6, SZ270R9, SZ270R8."}
```

In [None]:

# Path of the question/answer training data (one JSON object per line).
dataset_file = './shuttle-xpc-cube-en.jsonl'

def format_chat(prompt, response):
    """Render one question/answer pair as a two-turn chat training sample.

    Args:
        prompt: The user's question; interpolated into the user turn.
        response: The expected answer; interpolated into the assistant turn.

    Returns:
        A string holding the user turn followed by the assistant turn, each
        wrapped in ``<|im_start|>`` / ``<|im_end|>`` markers and terminated
        by a newline.
    """
    user_turn = f"<|im_start|>user\n{prompt}<|im_end|>\n"
    assistant_turn = f"<|im_start|>assistant\n{response}<|im_end|>\n"
    return user_turn + assistant_turn

def preprocess_data(data_entry):
    """Build the ``text`` column for a batch of question/answer records.

    Args:
        data_entry: A mapping whose ``'input'`` and ``'output'`` entries are
            parallel sequences of questions and answers.

    Returns:
        A dict with a single ``'text'`` key holding one chat-formatted string
        per question/answer pair, in the same order as the inputs.
    """
    questions = data_entry['input']
    answers = data_entry['output']
    return {
        'text': [
            format_chat(question, answer)
            for question, answer in zip(questions, answers)
        ]
    }

# Read the JSON-lines file as a single "train" split.
ds = load_dataset('json', data_files=dataset_file, split="train")

# Hold out 10% of the records for validation (train: 90%, test: 10%);
# the fixed seed keeps the split reproducible.
ds_train_valid = ds.train_test_split(test_size=0.1, seed=42)

# preprocess_data zips whole 'input'/'output' columns, so it has to be
# applied with batched=True; the same mapping is run over both splits.
train_dataset, valid_dataset = [
    ds_train_valid[split_name].map(preprocess_data, batched=True)
    for split_name in ('train', 'test')
]

## 下載模型及微調模型

In [None]:
# bitsandbytes quantization settings: resolve the dtype name (e.g. "float16")
# to the matching torch dtype object first.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# When computing in float16, check whether the GPU could run bfloat16 instead
# (compute capability major version 8 or above) and tell the user if so.
if use_4bit and compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print(
            "=" * 80,
            "Your GPU supports bfloat16: accelerate training with bf16=True",
            "=" * 80,
            sep="\n",
        )

# Load the pretrained causal language model, applying the quantization config
# and placing it according to device_map.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# Config tweaks applied to the loaded model before it is handed to the trainer.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load the tokenizer that belongs to the model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# LoRA adapter settings for PEFT (Parameter-Efficient Fine-Tuning),
# built from the lora_* values in the parameter cell.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Training arguments handed to SFTTrainer below; every value comes from the
# parameter cell except the reporting/evaluation options set inline.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    # Forward the evaluation batch size and the gradient-checkpointing switch
    # from the parameter cell so they are not silently left at library defaults.
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard", #"all"
    evaluation_strategy="steps",
    eval_steps=5,  # run evaluation every 5 training steps
    load_best_model_at_end=True  # reload the checkpoint with the best evaluation result when training ends
)

# Supervised fine-tuning with SFTTrainer.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset, # the validation split is passed in here
    peft_config=peft_config,
    dataset_text_field="text",  # the column produced by preprocess_data
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Start training.
trainer.train()

# Save the fine-tuned model under the new_model directory.
trainer.model.save_pretrained(new_model)

In [None]:
# Load the TensorBoard notebook extension and open it on the training logs.
%load_ext tensorboard
%tensorboard --logdir results/runs

## 建立chat prompt模板

In [None]:
# def formatted_prompt(question)-> str:
#     return f"### user:\n{question}\n### assistant:\n"

def template_prompt(question) -> str:
    """Wrap a question in the chat prompt used for inference.

    The layout is modelled on the output format of the transformers
    ``apply_chat_template`` API: a fixed system turn, the user turn holding
    ``question``, and an opened assistant turn for the model to complete.

    NOTE(review): the training samples built by ``format_chat`` use
    ``<|im_start|>`` / ``<|im_end|>`` markers, which differs from this
    layout - confirm the mismatch is intended.

    Args:
        question: The user's question; interpolated into the user turn.

    Returns:
        The full prompt string, ending with ``<|assistant|>`` and a newline.
    """
    system_message = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
    )
    return (
        "<|system|>\n"
        f"{system_message}</s>\n"
        "<|user|>\n"
        f"{question}</s>\n"
        "<|assistant|>\n"
    )

## 模型合併並儲存



In [None]:
# Directory that receives the merged model and tokenizer (change to your own path).
# NOTE(review): by default this is the same directory the LoRA adapter was
# saved to after training - confirm that mixing both sets of files is intended.
model_path = new_model

# Reload the base model in FP16, attach the LoRA weights stored under
# new_model, and fold them into the base weights.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Reload the tokenizer so it can be saved next to the merged model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Write the merged model and the tokenizer to model_path.
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('./tiny-llama-shuttle-xpc-cube-en2000/tokenizer_config.json',
 './tiny-llama-shuttle-xpc-cube-en2000/special_tokens_map.json',
 './tiny-llama-shuttle-xpc-cube-en2000/tokenizer.json')

In [None]:
# Work around a unicode problem in Colab by forcing the reported preferred
# encoding to UTF-8.
import locale

# Show what the environment reports before the override is installed.
print(locale.getpreferredencoding())


def getpreferredencoding(do_setlocale=True):
    """Return "UTF-8" unconditionally.

    Args:
        do_setlocale: Accepted so the signature matches
            ``locale.getpreferredencoding``; the value is ignored.

    Returns:
        The string ``"UTF-8"``.
    """
    return "UTF-8"


# Replace the stdlib function so later callers see the forced value.
setattr(locale, "getpreferredencoding", getpreferredencoding)

## 上傳至HuggingFace

In [None]:
# Log in to the Hugging Face Hub; use a token with WRITE permission so the
# upload further down is allowed.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Print the account the stored token belongs to.
!huggingface-cli whoami

# Upload the model and the tokenizer to a Hub repository named after new_model.
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

calliehsu


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/calliehsu/tiny-llama-shuttle-xpc-cube-en2000/commit/16db04d7c59560dba7ad7c286a12201f594cb2a4', commit_message='Upload tokenizer', commit_description='', oid='16db04d7c59560dba7ad7c286a12201f594cb2a4', pr_url=None, pr_revision=None, pr_num=None)

# 載入微調後的模型並執行推論

### Fine tune model

In [None]:
# Reload the saved fine-tuned model and its tokenizer from disk for inference.
# from google.colab import drive
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

# Directory the fine-tuned model was saved to.
model_path = new_model

# Load in float16; weights that cannot be placed by device_map="auto" may be
# offloaded to the "offload" folder.
model = AutoModelForCausalLM.from_pretrained(model_path,
                         device_map="auto",
                         offload_folder="offload",
                         torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# Sample questions about the Shuttle XPC cube series for the fine-tuned model.
prompts = [
    "Provide the names of the models within the Shuttle XPC cube series.",
    "What is the processor support for the Shuttle XPC cube SH610R4?",
    "Which generation of Intel processors does the Shuttle XPC cube SW580R8 support?",
    "What are the CPU choices for the Shuttle XPC cube SH510R4?",
    "Tell me about the chipset used in the Shuttle XPC cube SH570R8.",
    "What are the dimensions of the Shuttle XPC cube SZ270R9?",
    "What is the maximum power output of the power supply in the Shuttle XPC cube SH310R4 V2?",
    "Please provide me the comparison of CPU specifications of Shuttle xpc cube series."
]

gen = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
for question in prompts:
    print("Q: ", question)

    result = gen(
        template_prompt(question),
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=256,
    )
    # Keep only what follows the last assistant marker, i.e. the model's answer.
    answer = result[0]['generated_text'].rpartition("<|assistant|>")[2].strip()
    print("A: ", answer, "\n")

### Raw pretrained model

In [None]:
# Load the original pretrained model (no fine-tuning) with the same
# quantization config and device placement, for comparison.
model0 = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model0.config.use_cache = False

# Load the tokenizer that belongs to the model and reuse the
# end-of-sequence token as the padding token.
tokenizer0 = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer0.pad_token = tokenizer0.eos_token

In [None]:
# Sample questions (in Chinese) for the raw pretrained model.
prompts = [
    "Shuttle xpc cube系列有哪些型號?",
    "SH610R4的規格是?",
    "SH510R4的技術規格?",
    "SH370R6的規格是什麼?",
    "SH370R6支援哪些CPU?",
    "SH570R8的操作溫度範圍是多少?",
    "SH370R6 V2的規格是什麼?",
    "SZ270R9的尺寸是多少?",
]

pipe = pipeline(task="text-generation", model=model0, tokenizer=tokenizer0)
for question in prompts:
    print("Q: ", question)

    result = pipe(
        template_prompt(question),
        num_return_sequences=1,
        eos_token_id=tokenizer0.eos_token_id,
        max_new_tokens=128,
        repetition_penalty=1.1
    )
    # Keep only what follows the last assistant marker, i.e. the model's answer.
    answer = result[0]['generated_text'].rpartition("<|assistant|>")[2].strip()
    print("A: ", answer, "\n")

**code reference:**
- Llama2 finetune: https://mlabonne.github.io/blog/posts/Fine_Tune_Your_Own_Llama_2_Model_in_a_Colab_Notebook.html