In [3]:
# !sudo apt -qq install build-essential -y
import torch
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install -qqq -U "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install -qqq -U --no-deps packaging ninja einops xformers trl peft accelerate bitsandbytes
    !FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install -qqq -U flash-attn --no-build-isolation pip install flash-attn --progress-bar off
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
!pip install -qqq -U wandb huggingface_hub

In [4]:
# Interactive Hugging Face login (renders a login widget in the notebook output).
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
import os
from getpass import getpass

import wandb

# Never hard-code the API key in the notebook: read it from the WANDB_API_KEY
# environment variable and fall back to an interactive, non-echoing prompt.
# Any key that was previously stored in this cell should be treated as leaked and rotated.
wb_token = os.environ.get("WANDB_API_KEY") or getpass("Weights & Biases API key: ")
wandb.login(key=wb_token)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkevinxli[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [6]:
from unsloth import FastLanguageModel
import torch

# Checkpoint to load, and the name for this run (used below to build the trainer's
# output_dir and the reload path as f"results-{new_model}").
base_model = "results-Meta-Llama-3-8B-tagllm-lang-1-reserved-unsloth"
new_model = "Meta-Llama-3-8B-tagllm-pos-1-reserved-unsloth"

# Loading options; the same three values are reused by the trainer and the reload cell.
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
max_seq_length = 512 # Choose any! We auto support RoPE Scaling internally!

model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit = load_in_4bit,
    dtype = dtype,
    max_seq_length = max_seq_length,
    model_name = base_model,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.65 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unsloth 2024.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [7]:
# model = FastLanguageModel.get_peft_model(
#     model,
#     r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
#     target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
#                       "gate_proj", "up_proj", "down_proj",],
#     modules_to_save = ["embed_tokens"],
#     lora_alpha = 32,
#     lora_dropout = 0, # Supports any, but = 0 is optimized
#     bias = "none",    # Supports any, but = "none" is optimized
#     # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
#     use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
#     random_state = 3407,
#     use_rslora = False,  # We support rank stabilized LoRA
#     loftq_config = None, # And LoftQ
# )

<a name="Data"></a>
### Data Prep
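The original template used the Alpaca dataset from [yahma](https://huggingface.co/datasets/yahma/alpaca-cleaned), which is a filtered version of 52K of the original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html). This notebook instead loads the `hkcancor` dataset and turns each utterance into a single training string: a language tag token, the space-separated tokens, a newline, the POS marker token, and the lower-cased UD part-of-speech tags. You can replace this code section with your own data prep.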

**[NOTE]** To train only on completions (ignoring the user's input) read TRL's docs [here](https://huggingface.co/docs/trl/sft_trainer#train-on-completions-only).

**[NOTE]** Remember to add the **EOS_TOKEN** to the tokenized output!! Otherwise you'll get infinite generations!

If you want to use the `ChatML` template for ShareGPT datasets, try our conversational [notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing).

For text completions like novel writing, try this [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing).

In [8]:
# Number of consecutive reserved tokens that make up one language tag.
num_token_per_tag = 1
# Marker token(s) placed in front of the tag sequence in every training string.
pos_tokens = ["<|reserved_special_token_200|>"]

In [9]:
# Appended to the end of every training string built in get_dataset(); without it
# generation would not learn where to stop.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

from datasets import load_dataset, features

def patch_v(tag):
    """Map the non-standard tag name 'V' to 'VERB'; return every other tag unchanged."""
    return 'VERB' if tag == 'V' else tag

def get_dataset(num_existing_tokens=0, lang="yue", test_size=0.1, seed=42):
    """Load HKCanCor and format every utterance as one POS-tagging training string.

    Each entry of the resulting "inputs" column is laid out as: the language tag,
    the utterance tokens joined by spaces, a newline, the joined ``pos_tokens``
    marker, the lower-cased UD tag names joined by spaces, and ``EOS_TOKEN``.

    Reads the notebook-level names ``num_token_per_tag``, ``pos_tokens``,
    ``EOS_TOKEN`` and ``patch_v``.

    Args:
        num_existing_tokens: Index of the first reserved special token to use for
            language tags. Languages are assigned consecutive runs of
            ``num_token_per_tag`` tokens in the order "eng", "yue", "cmn".
        lang: Key of the language tag prepended to every example (default "yue",
            the value that was previously hard-coded).
        test_size: Fraction of examples held out for evaluation.
        seed: Seed for the train/eval split.

    Returns:
        Tuple ``(train_dataset, eval_dataset, tag_name_dict)`` where the datasets
        contain only the "inputs" column and ``tag_name_dict`` maps each language
        code to its tag string.

    Raises:
        ValueError: If ``lang`` is not one of the known language codes.
    """
    single_lang = ["eng", "yue", "cmn"]
    if lang not in single_lang:
        raise ValueError(f"Unknown language {lang!r}; expected one of {single_lang}")

    # "default" is the only configuration used here.
    dataset = load_dataset("hkcancor", "default")

    # Every language gets its own run of consecutive reserved special tokens.
    tag_name_dict = {}
    next_token_id = num_existing_tokens
    for lang_code in single_lang:
        tag_name_dict[lang_code] = "".join(
            f'<|reserved_special_token_{i}|>'
            for i in range(next_token_id, next_token_id + num_token_per_tag)
        )
        next_token_id += num_token_per_tag

    source_upos = dataset['train'].features["pos_tags_ud"].feature
    print("Source upos:", source_upos)
    # Printed for reference only; the training strings use the source tag names
    # (with 'V' patched to 'VERB'), not these label ids.
    target_upos = features.ClassLabel(
        names=[
            "NOUN",
            "PUNCT",
            "ADP",
            "NUM",
            "SYM",
            "SCONJ",
            "ADJ",
            "PART",
            "DET",
            "CCONJ",
            "PROPN",
            "PRON",
            "X",
            "_",
            "ADV",
            "INTJ",
            "VERB",
            "AUX",
        ]
    )
    print("Target upos:", target_upos)

    lang_prefix = tag_name_dict[lang]
    pos_prefix = ''.join(pos_tokens)

    def preprocess_function(examples):
        # Batched map: `examples` holds parallel lists, one entry per utterance.
        examples["inputs"] = [
            lang_prefix + ' '.join(tokens) + "\n"
            + pos_prefix
            + ' '.join(patch_v(source_upos.int2str(tag)).lower() for tag in tags)
            + EOS_TOKEN
            for tokens, tags in zip(examples["tokens"], examples["pos_tags_ud"])
        ]
        return examples

    # Drop the raw columns so only "inputs" remains (each column listed once).
    raw_columns = ['tokens', 'conversation_id', 'pos_tags_prf', 'pos_tags_ud',
                   'speaker', 'transcriptions', 'turn_number']
    dataset['train'] = dataset['train'].map(
        preprocess_function, remove_columns=raw_columns, batched=True)

    split = dataset['train'].train_test_split(test_size=test_size, seed=seed)
    train_dataset, eval_dataset = split['train'], split['test']
    return train_dataset, eval_dataset, tag_name_dict

In [10]:
# Build the train/eval splits; with num_existing_tokens=0 the language tags are
# assigned starting at <|reserved_special_token_0|>.
train_dataset, eval_dataset, tag_name_dict = get_dataset(num_existing_tokens=0)

Source upos: ClassLabel(names=['NUM', 'ADP', 'INTJ', 'PROPN', 'ADJ', 'V', 'DET', 'ADV', 'CCONJ', 'PRON', 'X', 'PART', 'AUX', 'VERB', 'NOUN', 'PUNCT'], id=None)
Target upos: ClassLabel(names=['NOUN', 'PUNCT', 'ADP', 'NUM', 'SYM', 'SCONJ', 'ADJ', 'PART', 'DET', 'CCONJ', 'PROPN', 'PRON', 'X', '_', 'ADV', 'INTJ', 'VERB', 'AUX'], id=None)


In [11]:
# Preview the first 10 formatted training strings.
train_dataset[:10]

{'inputs': ['<|reserved_special_token_1|>噉 考 唔 考 到 啊 ？\n<|reserved_special_token_200|>cconj verb adv verb part part punct<|end_of_text|>',
  '<|reserved_special_token_1|>誒 ， 一 隻 西施 ， 一 隻 拉薩 。\n<|reserved_special_token_200|>intj punct num noun propn punct num noun propn punct<|end_of_text|>',
  '<|reserved_special_token_1|>我 記得 - 記得 嗰陣時 返 羅師 都 係 - 都 係 唔使 一 個 鐘頭 咋 喎 。 即係 開頭 - 係 開頭 返 返 - 即係 最 開頭 嘅 時候 返學 就 要 最 - 最 多 時間 囖 。 成 一 個 鐘頭 零 三 個 字 ， 但係 返 熟 咗 呢 知道 啲 路 點 行 嚹 嗰啲 呢 …\n<|reserved_special_token_200|>pron verb punct verb pron verb propn adv verb punct adv verb aux num noun noun part part punct cconj adv punct verb adv verb part punct cconj adv adv part noun verb adv verb adv punct adv adj noun part punct adj num noun noun num num noun noun punct cconj verb adj part part verb noun noun pron verb part pron part punct<|end_of_text|>',
  '<|reserved_special_token_1|>唔好 啊 。 哩啲 咁 technical ， 冇 興趣 啊 。\n<|reserved_special_token_200|>aux part punct pron adv adj punct verb noun part punct<|end_of_

In [12]:
# Preview the first 10 formatted evaluation strings.
eval_dataset[:10]

{'inputs': ['<|reserved_special_token_1|>啲 - 啲 sales 嘅 質素 下降 ， 會 唔 會 好似 同 醫管局 嗰啲 …\n<|reserved_special_token_200|>noun punct noun noun part noun verb punct aux adv aux verb adp propn pron punct<|end_of_text|>',
  '<|reserved_special_token_1|>唉 ， 有 乜嘢 辦法 啊 ， 興 吖 嗎 ， 咁 興 啊 。\n<|reserved_special_token_200|>intj punct verb pron noun part punct verb part part punct adv verb part punct<|end_of_text|>',
  '<|reserved_special_token_1|>夠 嚹 。\n<|reserved_special_token_200|>verb part punct<|end_of_text|>',
  '<|reserved_special_token_1|>噉 另外 就 同 佢 太太 之間 呢 亦都 有 個 中年 婚姻 危機 𡃉 。 即係 覺得 太太 ， 冇 理由 我 太太 會 重 - 重 愛 我 吖 。 即係 我 事業 又 唔 得 ， 要 樣 冇 樣 ， 要 錢 冇 錢 噉樣 。 噉 另外 有 個 黑人 嘅 。 就 係 鬍鬚 阿伯 ， 就 成 五十 歲 嘅 。 點解 會 揾 到 佢 ？ 因為 佢 要 揾 個 人 教 跳舞 吖 嗎 。 而 哩個 阿伯 係 識 跳 所有 爵士舞 ， 只不過 係 因為 骨頭 硬 。 即係 跳 起 身 ， 跌落 地下 之後 就 起 唔 到 身 𡃉 嘞 。 但係 之前 嗰 段 係 得 嘅 。\n<|reserved_special_token_200|>cconj cconj adv adp pron noun adv part adv verb noun noun noun noun part punct cconj verb noun punct verb noun pron noun aux adv punct adv verb pron pa

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). This notebook trains for one full epoch (`num_train_epochs=1`) and evaluates every 20% of the training steps (`eval_steps=0.2`); to speed things up for a quick check you can set `max_steps=60` instead. We also support TRL's `DPOTrainer`!

In [13]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Use bf16 when the GPU supports it, otherwise fall back to fp16.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir=f"./results-{new_model}/",
    # Schedule
    num_train_epochs=1,
    learning_rate=5e-05,
    lr_scheduler_type="linear",
    warmup_steps=10,
    # Batching
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    gradient_accumulation_steps=1,
    optim="paged_adamw_8bit",
    # Precision: exactly one of the two flags is enabled
    bf16 = use_bf16,
    fp16 = not use_bf16,
    # Evaluation and logging; a float eval_steps is a fraction of the total steps
    evaluation_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    report_to="wandb",
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field = "inputs",
    dataset_num_proc = 2,
    max_seq_length = max_seq_length,
    packing = False, # Can make training 5x faster for short sequences.
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [14]:
#@title Show current memory stats
bytes_per_gib = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
# Peak reserved memory before training; used later as the baseline for the training report.
reserved_bytes = torch.cuda.max_memory_reserved()
start_gpu_memory = round(reserved_bytes / bytes_per_gib, 3)
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)
for line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
):
    print(line)

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.65 GB.
9.676 GB of memory reserved.


In [15]:
# Run the training loop; trainer_stats.metrics is read by the timing report cell below.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 9,720 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 12 | Gradient Accumulation steps = 1
\        /    Total batch size = 12 | Total steps = 810
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
162,1.9788,2.0304
324,1.7172,1.887105
486,1.9543,1.841993
648,2.2679,1.805602
810,1.6227,1.791716


In [16]:
#@title Show final memory and time stats
train_seconds = trainer_stats.metrics['train_runtime']

# Peak reserved memory over the whole session, and the part above the pre-training baseline.
peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_reserved_bytes / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures expressed as a percentage of the GPU's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

report_lines = [
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
for line in report_lines:
    print(line)

847.2891 seconds used for training.
14.12 minutes used for training.
Peak reserved memory = 22.629 GB.
Peak reserved memory for training = 12.953 GB.
Peak reserved memory % of max memory = 95.683 %.
Peak reserved memory for training % of max memory = 54.77 %.


In [17]:
# Upload the training result (adapter weights and training args) to the Hugging Face Hub.
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AlienKevin/results-Meta-Llama-3-8B-tagllm-pos-1-reserved-unsloth/commit/a22c5be88706d9a9279899c1ed6682fee9544530', commit_message='End of training', commit_description='', oid='a22c5be88706d9a9279899c1ed6682fee9544530', pr_url=None, pr_revision=None, pr_num=None)

<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

In [19]:
# Build the prompt in the same layout as the training strings from get_dataset():
# language tag + space-separated tokens + newline + POS marker token(s).
# The model is expected to continue with the tag sequence.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
sentence = "啲 - 啲 sales 嘅 質素 下降 ， 會 唔 會 好似 同 醫管局 嗰啲 …"
prompt = tag_name_dict["yue"] + sentence + "\n" + ''.join(pos_tokens)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

# NOTE(review): generation runs until EOS or max_new_tokens; if the model keeps
# repeating a tag instead of emitting EOS, the output is padded out to 64 tokens.
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|><|reserved_special_token_1|>啲 - 啲 sales 嘅 質素 下降 ， 會 唔 會 好似 同 醫管局 嗰啲 …\n<|reserved_special_token_200|>noun punct noun noun part noun verb punct aux adv aux verb adp noun pron punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct punct']

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
# Stream the generated tags token by token.
# The prompt follows the training layout from get_dataset(): language tag +
# space-separated tokens + newline + POS marker token(s). (The template's
# `alpaca_prompt` is not defined anywhere in this notebook.)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
sentence = "夠 嚹 。"
prompt = tag_name_dict["yue"] + sentence + "\n" + ''.join(pos_tokens)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
# Write the model to the local "lora_model" directory.
model.save_pretrained("lora_model") # Local saving
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving

Now if you want to load the trained LoRA adapters for inference, run the cell below (it loads `results-{new_model}`, the name of the trainer's `output_dir`, rather than `lora_model`):

In [10]:
# Reload model and tokenizer from `results-{new_model}` (the same name as the trainer's
# output_dir) and run a single generation.
# Relies on new_model, max_seq_length, dtype and load_in_4bit from the model-loading cell.
from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = f'results-{new_model}',
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# NOTE(review): this prompt does not follow the training layout built in get_dataset()
# (language tag, tokens, newline, then the `pos_tokens` marker); it ends with
# <|reserved_special_token_1|> instead. Confirm whether that is intentional.

inputs = tokenizer(
[
    "<|reserved_special_token_0|>This is really amusing, a radio controlled car that can climb on walls.\n<|reserved_special_token_1|>"
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.65 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|><|reserved_special_token_2|>你别那么傻，别让人家欺负你！你要学会保护自己！你要学会保护自己！你要学会保护自己！你要学会保护自己！你要学会保护自己！你要学会保护自己！你要学会保护自己！你要学会保护自己！你']

You can also use Hugging Face's `AutoPeftModelForCausalLM`. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [None]:
# Fallback loader that uses PEFT/Transformers directly instead of Unsloth.
# Disabled by the `if False:` guard; change it to `if True:` to run it.
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer
    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
# Optional exports, all disabled by `if False:`. Flip a single guard to True to run it.
# Each pair is a local save followed by the matching Hub upload; the uploads need a
# real repo id and token in place of "hf/model" and "".

# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

In [None]:
# Optional GGUF exports, all disabled by `if False:`. Flip a single guard to True to run it.
# Each pair is a local save followed by the matching Hub upload; the uploads need a
# real repo id and token in place of "hf/model" and "".

# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in `llama.cpp` or a UI based system like `GPT4All`. You can install GPT4All by going [here](https://gpt4all.io/index.html).