# Fine-tune Llama 3 with QLoRA

> üó£Ô∏è [Large Language Model Course](https://github.com/mlabonne/llm-course)

‚ù§Ô∏è Created by [@maximelabonne](https://twitter.com/maximelabonne).

You can run this notebook on Google Colab (I use an L4 GPU).

In [1]:
!pip install -qqq -U transformers datasets huggingface_hub accelerate peft bitsandbytes wandb trl --progress-bar off
!FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install -qqq -U flash-attn --no-build-isolation pip install flash-attn --progress-bar off

In [2]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [3]:
import gc
import os

import torch
import wandb
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer

# Model
base_model = "meta-llama/Meta-Llama-3-8B"
new_model = "Meta-Llama-3-8B-qlora-pos-no-lang"

# Defined in the secrets tab in Google Colab
wb_token = '1d395c70839c926f2dce7fc9403ad88f09e490ba'
wandb.login(key=wb_token)

# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkevinxli[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

pos_tokens = ['pos:']

tokenizer = AutoTokenizer.from_pretrained(base_model)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation,
)
model = prepare_model_for_kbit_training(model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
peft_model = get_peft_model(model, peft_config)
print(peft_model.print_trainable_parameters())
print(peft_model)

trainable params: 41,943,040 || all params: 8,072,204,288 || trainable%: 0.5195983464188562
None
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaFlashAttention2(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterD

In [6]:
from datasets import load_dataset, features

def patch_v(tag):
    if tag == 'V':
        return 'VERB'
    else:
        return tag

def get_dataset(num_existing_tokens=0):
    #default only option
    dataset = load_dataset("hkcancor", "default")

    single_lang = ["eng", "yue", "cmn"]

    tag_name_dict = {}
    for lang in single_lang:
        tag_name_dict[lang] = f'{lang}:'

    source_upos = dataset['train'].features["pos_tags_ud"].feature
    print("Source upos:", source_upos)
    target_upos = features.ClassLabel(
        names=[
            "NOUN",
            "PUNCT",
            "ADP",
            "NUM",
            "SYM",
            "SCONJ",
            "ADJ",
            "PART",
            "DET",
            "CCONJ",
            "PROPN",
            "PRON",
            "X",
            "_",
            "ADV",
            "INTJ",
            "VERB",
            "AUX",
        ]
    )
    print("Target upos:", target_upos)

    def preprocess_function(examples):
        examples["inputs"] = [tag_name_dict["yue"] + ' '.join(examples["tokens"][i]) + "\n" + 
            ''.join(pos_tokens) + ' '.join(patch_v(source_upos.int2str(tag)).lower()
            for tag in examples["pos_tags_ud"][i]) for i in range(len(examples["tokens"]))]
        return examples
    
    dataset['train'] = dataset['train'].map(preprocess_function, remove_columns=
        ['tokens', 'conversation_id', 'pos_tags_prf', 'pos_tags_ud', 'speaker', 'transcriptions', 'turn_number', 'tokens'], batched=True)
    
    dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
    train_dataset, eval_dataset = dataset['train'], dataset['test']
    return train_dataset, eval_dataset, tag_name_dict

In [7]:
train_dataset, eval_dataset, tag_name_dict = get_dataset()

Source upos: ClassLabel(names=['NUM', 'ADP', 'INTJ', 'PROPN', 'ADJ', 'V', 'DET', 'ADV', 'CCONJ', 'PRON', 'X', 'PART', 'AUX', 'VERB', 'NOUN', 'PUNCT'], id=None)
Target upos: ClassLabel(names=['NOUN', 'PUNCT', 'ADP', 'NUM', 'SYM', 'SCONJ', 'ADJ', 'PART', 'DET', 'CCONJ', 'PROPN', 'PRON', 'X', '_', 'ADV', 'INTJ', 'VERB', 'AUX'], id=None)


In [8]:
train_dataset[:10]

{'inputs': ['yue:Âôâ ËÄÉ Âîî ËÄÉ Âà∞ Âïä Ôºü\npos:cconj verb adv verb part part punct',
  'yue:Ë™í Ôºå ‰∏Ä Èöª Ë•øÊñΩ Ôºå ‰∏Ä Èöª ÊãâËñ© „ÄÇ\npos:intj punct num noun propn punct num noun propn punct',
  'yue:Êàë Ë®òÂæó - Ë®òÂæó Âó∞Èô£ÊôÇ Ëøî ÁæÖÂ∏´ ÈÉΩ ‰øÇ - ÈÉΩ ‰øÇ Âîî‰Ωø ‰∏Ä ÂÄã ÈêòÈ†≠ Âíã Âñé „ÄÇ Âç≥‰øÇ ÈñãÈ†≠ - ‰øÇ ÈñãÈ†≠ Ëøî Ëøî - Âç≥‰øÇ ÊúÄ ÈñãÈ†≠ ÂòÖ ÊôÇÂÄô ËøîÂ≠∏ Â∞± Ë¶Å ÊúÄ - ÊúÄ Â§ö ÊôÇÈñì Âõñ „ÄÇ Êàê ‰∏Ä ÂÄã ÈêòÈ†≠ Èõ∂ ‰∏â ÂÄã Â≠ó Ôºå ‰ΩÜ‰øÇ Ëøî ÁÜü Âíó Âë¢ Áü•ÈÅì Âï≤ Ë∑Ø Èªû Ë°å Âöπ Âó∞Âï≤ Âë¢ ‚Ä¶\npos:pron verb punct verb pron verb propn adv verb punct adv verb aux num noun noun part part punct cconj adv punct verb adv verb part punct cconj adv adv part noun verb adv verb adv punct adv adj noun part punct adj num noun noun num num noun noun punct cconj verb adj part part verb noun noun pron verb part pron part punct',
  'yue:ÂîîÂ•Ω Âïä „ÄÇ Âì©Âï≤ ÂíÅ technical Ôºå ÂÜá ËààË∂£ Âïä „ÄÇ\npos:aux part punct pron adv adj punct verb noun part punct',
  'yue:Ë´ó Ê∏ÖÊ•ö Âï≤ Ôºå ‰Ω†

In [9]:
eval_dataset[:10]

{'inputs': ['yue:Âï≤ - Âï≤ sales ÂòÖ Ë≥™Á¥† ‰∏ãÈôç Ôºå ÊúÉ Âîî ÊúÉ Â•Ω‰ºº Âêå ÈÜ´ÁÆ°Â±Ä Âó∞Âï≤ ‚Ä¶\npos:noun punct noun noun part noun verb punct aux adv aux verb adp propn pron punct',
  'yue:Âîâ Ôºå Êúâ ‰πúÂò¢ Ëæ¶Ê≥ï Âïä Ôºå Ëàà Âêñ Âóé Ôºå ÂíÅ Ëàà Âïä „ÄÇ\npos:intj punct verb pron noun part punct verb part part punct adv verb part punct',
  'yue:Â§† Âöπ „ÄÇ\npos:verb part punct',
  'yue:Âôâ Âè¶Â§ñ Â∞± Âêå ‰Ω¢ Â§™Â§™ ‰πãÈñì Âë¢ ‰∫¶ÈÉΩ Êúâ ÂÄã ‰∏≠Âπ¥ Â©öÂßª Âç±Ê©ü °Éâ „ÄÇ Âç≥‰øÇ Ë¶∫Âæó Â§™Â§™ Ôºå ÂÜá ÁêÜÁî± Êàë Â§™Â§™ ÊúÉ Èáç - Èáç ÊÑõ Êàë Âêñ „ÄÇ Âç≥‰øÇ Êàë ‰∫ãÊ•≠ Âèà Âîî Âæó Ôºå Ë¶Å Ê®£ ÂÜá Ê®£ Ôºå Ë¶Å Èå¢ ÂÜá Èå¢ ÂôâÊ®£ „ÄÇ Âôâ Âè¶Â§ñ Êúâ ÂÄã Èªë‰∫∫ ÂòÖ „ÄÇ Â∞± ‰øÇ È¨çÈ¨ö Èòø‰ºØ Ôºå Â∞± Êàê ‰∫îÂçÅ Ê≠≤ ÂòÖ „ÄÇ ÈªûËß£ ÊúÉ Êèæ Âà∞ ‰Ω¢ Ôºü Âõ†ÁÇ∫ ‰Ω¢ Ë¶Å Êèæ ÂÄã ‰∫∫ Êïô Ë∑≥Ëàû Âêñ Âóé „ÄÇ ËÄå Âì©ÂÄã Èòø‰ºØ ‰øÇ Ë≠ò Ë∑≥ ÊâÄÊúâ ÁàµÂ£´Ëàû Ôºå Âè™‰∏çÈÅé ‰øÇ Âõ†ÁÇ∫ È™®È†≠ Á°¨ „ÄÇ Âç≥‰øÇ Ë∑≥ Ëµ∑ Ë∫´ Ôºå Ë∑åËêΩ Âú∞‰∏ã ‰πãÂæå Â∞± Ëµ∑ Âîî Âà∞ Ë∫´ °Éâ Âòû „ÄÇ ‰ΩÜ‰øÇ ‰πãÂâç Âó∞ ÊÆµ ‰øÇ Âæó ÂòÖ „

In [10]:
# Used to suppress:
# Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.
tokenizer.pad_token = tokenizer.eos_token

In [11]:
args = TrainingArguments(
    learning_rate=5e-05,
    lr_scheduler_type="linear",
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    gradient_accumulation_steps=1,
    optim="paged_adamw_8bit",
    num_train_epochs=1,
    evaluation_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    report_to="wandb",
    output_dir=f"./results-{new_model}/",
)

trainer = SFTTrainer(
    model=model,
    args=args,
    max_seq_length=512,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="inputs",
    tokenizer=tokenizer,
    peft_config=peft_config,
)
trainer.train()
trainer.save_model(new_model)

Map:   0%|          | 0/9720 [00:00<?, ? examples/s]

Map:   0%|          | 0/1081 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.


Step,Training Loss,Validation Loss
162,1.5449,1.650903
324,1.1981,1.528955
486,1.5567,1.483526
648,1.527,1.446849
810,1.3092,1.431867




In [12]:
trainer.push_to_hub()

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AlienKevin/results-Meta-Llama-3-8B-qlora-pos-no-lang/commit/c7c4b1f0ff1aca15af818d37880684252607984e', commit_message='End of training', commit_description='', oid='c7c4b1f0ff1aca15af818d37880684252607984e', pr_url=None, pr_revision=None, pr_num=None)