# Fine-tune Llama 3 with QLoRA

> 🗣️ [Large Language Model Course](https://github.com/mlabonne/llm-course)

❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne).

You can run this notebook on Google Colab (I use an L4 GPU).

In [1]:
!pip install -qqq -U transformers datasets huggingface_hub accelerate peft bitsandbytes wandb trl tqdm --progress-bar off
!FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install -qqq -U flash-attn --no-build-isolation pip install flash-attn --progress-bar off

In [2]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import gc
import os

import torch
import wandb
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer

# Model
base_model = "meta-llama/Meta-Llama-3-8B"
base2_model = "AlienKevin/Meta-Llama-3-8B-tagllm-lang-1-fixed-embed"
new_model = "Meta-Llama-3-8B-tagllm-pos-1-fixed-embed"

# Defined in the secrets tab in Google Colab
wb_token = '1d395c70839c926f2dce7fc9403ad88f09e490ba'
wandb.login(key=wb_token)

# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkevinxli[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

num_token_per_tag = 1
lang_tokens = ['<|TOK0|>', '<|TOK1|>', '<|TOK2|>']
pos_tokens = ['<|TOK200|>']

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base2_model, additional_special_tokens=lang_tokens + pos_tokens)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation,
)
# Exclude the new translation_tag, expand embedding again after loading the LoRA weights
model.resize_token_embeddings(len(tokenizer) - len(pos_tokens))
model = PeftModel.from_pretrained(model, base2_model)
model = model.merge_and_unload()
model.resize_token_embeddings(len(tokenizer))
model = prepare_model_for_kbit_training(model)

tokenizer_config.json:   0%|          | 0.00/51.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/420 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/729 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/4.37G [00:00<?, ?B/s]



In [5]:
peft_model = get_peft_model(model, peft_config)
print(peft_model.print_trainable_parameters())
print(peft_model)

trainable params: 41,943,040 || all params: 8,072,237,056 || trainable%: 0.5195962371895932
None
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128260, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaFlashAttention2(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterD

In [11]:
from datasets import load_dataset, features

def patch_v(tag):
    if tag == 'V':
        return 'VERB'
    else:
        return tag

def get_dataset(num_existing_tokens=0):
    #default only option
    dataset = load_dataset("hkcancor", "default")

    single_lang = ["eng", "yue", "cmn"]

    tag_name_dict = {}
    for lang in single_lang:
        tag_name_dict[lang] = "".join([f'<|TOK{i}|>' for i in range(num_existing_tokens, num_existing_tokens + num_token_per_tag)])
        num_existing_tokens += num_token_per_tag

    source_upos = dataset['train'].features["pos_tags_ud"].feature
    print("Source upos:", source_upos)
    target_upos = features.ClassLabel(
        names=[
            "NOUN",
            "PUNCT",
            "ADP",
            "NUM",
            "SYM",
            "SCONJ",
            "ADJ",
            "PART",
            "DET",
            "CCONJ",
            "PROPN",
            "PRON",
            "X",
            "_",
            "ADV",
            "INTJ",
            "VERB",
            "AUX",
        ]
    )
    print("Target upos:", target_upos)

    def preprocess_function(examples):
        examples["inputs"] = [tag_name_dict["yue"] + ' '.join(examples["tokens"][i]) + "\n" + 
            ''.join(pos_tokens) + ' '.join(patch_v(source_upos.int2str(tag)).lower()
            for tag in examples["pos_tags_ud"][i]) for i in range(len(examples["tokens"]))]
        return examples
    
    dataset['train'] = dataset['train'].map(preprocess_function, remove_columns=
        ['tokens', 'conversation_id', 'pos_tags_prf', 'pos_tags_ud', 'speaker', 'transcriptions', 'turn_number', 'tokens'], batched=True)
    
    dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
    train_dataset, eval_dataset = dataset['train'], dataset['test']
    return train_dataset, eval_dataset, tag_name_dict

In [12]:
train_dataset, eval_dataset, tag_name_dict = get_dataset()

Source upos: ClassLabel(names=['NUM', 'ADP', 'INTJ', 'PROPN', 'ADJ', 'V', 'DET', 'ADV', 'CCONJ', 'PRON', 'X', 'PART', 'AUX', 'VERB', 'NOUN', 'PUNCT'], id=None)
Target upos: ClassLabel(names=['NOUN', 'PUNCT', 'ADP', 'NUM', 'SYM', 'SCONJ', 'ADJ', 'PART', 'DET', 'CCONJ', 'PROPN', 'PRON', 'X', '_', 'ADV', 'INTJ', 'VERB', 'AUX'], id=None)


In [13]:
train_dataset[:10]

{'inputs': ['<|TOK1|>噉 考 唔 考 到 啊 ？\n<|TOK200|>cconj verb adv verb part part punct',
  '<|TOK1|>誒 ， 一 隻 西施 ， 一 隻 拉薩 。\n<|TOK200|>intj punct num noun propn punct num noun propn punct',
  '<|TOK1|>我 記得 - 記得 嗰陣時 返 羅師 都 係 - 都 係 唔使 一 個 鐘頭 咋 喎 。 即係 開頭 - 係 開頭 返 返 - 即係 最 開頭 嘅 時候 返學 就 要 最 - 最 多 時間 囖 。 成 一 個 鐘頭 零 三 個 字 ， 但係 返 熟 咗 呢 知道 啲 路 點 行 嚹 嗰啲 呢 …\n<|TOK200|>pron verb punct verb pron verb propn adv verb punct adv verb aux num noun noun part part punct cconj adv punct verb adv verb part punct cconj adv adv part noun verb adv verb adv punct adv adj noun part punct adj num noun noun num num noun noun punct cconj verb adj part part verb noun noun pron verb part pron part punct',
  '<|TOK1|>唔好 啊 。 哩啲 咁 technical ， 冇 興趣 啊 。\n<|TOK200|>aux part punct pron adv adj punct verb noun part punct',
  '<|TOK1|>諗 清楚 啲 ， 你 讀 語文 你 應該 知 哩個 ， 哩 句 說話 嘅 意思 𡃉 喎 。\n<|TOK200|>verb adj part punct pron verb noun pron aux verb pron punct pron noun noun part noun part part punct',
  '<|TOK1|>噉 咪 係 囖 。\n<|TOK200|>cconj ad

In [15]:
eval_dataset[:10]

{'inputs': ['<|TOK1|>啲 - 啲 sales 嘅 質素 下降 ， 會 唔 會 好似 同 醫管局 嗰啲 …\n<|TOK200|>noun punct noun noun part noun verb punct aux adv aux verb adp propn pron punct',
  '<|TOK1|>唉 ， 有 乜嘢 辦法 啊 ， 興 吖 嗎 ， 咁 興 啊 。\n<|TOK200|>intj punct verb pron noun part punct verb part part punct adv verb part punct',
  '<|TOK1|>夠 嚹 。\n<|TOK200|>verb part punct',
  '<|TOK1|>噉 另外 就 同 佢 太太 之間 呢 亦都 有 個 中年 婚姻 危機 𡃉 。 即係 覺得 太太 ， 冇 理由 我 太太 會 重 - 重 愛 我 吖 。 即係 我 事業 又 唔 得 ， 要 樣 冇 樣 ， 要 錢 冇 錢 噉樣 。 噉 另外 有 個 黑人 嘅 。 就 係 鬍鬚 阿伯 ， 就 成 五十 歲 嘅 。 點解 會 揾 到 佢 ？ 因為 佢 要 揾 個 人 教 跳舞 吖 嗎 。 而 哩個 阿伯 係 識 跳 所有 爵士舞 ， 只不過 係 因為 骨頭 硬 。 即係 跳 起 身 ， 跌落 地下 之後 就 起 唔 到 身 𡃉 嘞 。 但係 之前 嗰 段 係 得 嘅 。\n<|TOK200|>cconj cconj adv adp pron noun adv part adv verb noun noun noun noun part punct cconj verb noun punct verb noun pron noun aux adv punct adv verb pron part punct cconj pron noun adv adv verb punct aux noun verb noun punct aux noun verb noun cconj punct cconj cconj verb noun noun part punct adv verb noun noun punct adv num num noun part punct pron aux verb 

In [16]:
# Used to suppress:
# Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.
tokenizer.pad_token = tokenizer.eos_token

In [17]:
args = TrainingArguments(
    learning_rate=5e-05,
    lr_scheduler_type="linear",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    optim="paged_adamw_8bit",
    num_train_epochs=1,
    evaluation_strategy="steps",
    eval_steps=0.2,
    save_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    report_to="wandb",
    output_dir=f"./results-{new_model}/",
)

trainer = SFTTrainer(
    model=model,
    args=args,
    max_seq_length=512,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="inputs",
    tokenizer=tokenizer,
    peft_config=peft_config,
)
trainer.train()
trainer.save_model(new_model)

Map:   0%|          | 0/9720 [00:00<?, ? examples/s]

Map:   0%|          | 0/1081 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.


Step,Training Loss,Validation Loss
243,1.5852,1.713258
486,1.2777,1.594335
729,1.7293,1.550665
972,1.7879,1.510303
1215,1.4942,1.495272




In [18]:
trainer.push_to_hub()

adapter_model.safetensors:   0%|          | 0.00/4.37G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AlienKevin/results-Meta-Llama-3-8B-tagllm-pos-1-fixed-embed/commit/036d0631025e7376503662b08c4a13028e3f221b', commit_message='End of training', commit_description='', oid='036d0631025e7376503662b08c4a13028e3f221b', pr_url=None, pr_revision=None, pr_num=None)