# LLaMA Factory Colab Tutorial

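Please use a **free** Tesla T4 GPU to run this! Note: the cells below use Kaggle paths (`/kaggle/working`, `/kaggle/input`), read the Hugging Face token from Kaggle secrets, and launch two training processes, so they expect a Kaggle session with two T4 GPUs rather than a Colab runtime.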

Project homepage: https://github.com/hiyouga/LLaMA-Factory

In [None]:
# %%capture
!git clone https://github.com/hiyouga/LLaMA-Factory.git
!pip install /kaggle/working/LLaMA-Factory
!pip install /kaggle/working/LLaMA-Factory[bitsandbytes]
!pip install "unsloth[cu121-torch211] @ git+https://github.com/unslothai/unsloth.git"
# !mkdir /kaggle/output
# Temporary fix for https://github.com/huggingface/datasets/issues/6753
# !pip install datasets==2.16.0 fsspec==2023.10.0 gcsfs==2023.10.0

In [2]:
import torch

# Print a hint (without raising) when PyTorch cannot see a CUDA device.
if not torch.cuda.is_available():
  print("Please set up a GPU before using LLaMA Factory: https://medium.com/mlearning-ai/training-yolov4-on-google-colab-316f8fff99c6")

In [3]:
# Authorize huggingface
# Runs in a separate Python process: reads the Kaggle secret named 'hf-token'
# through UserSecretsClient and stores it with HfFolder.save_token, so the token
# never appears in the notebook source or in a kernel variable.
# Presumably this is what authenticates the --push_to_hub upload in the training
# cell; the secret must be attached to the notebook before running this.
!python -c "from kaggle_secrets import UserSecretsClient; from huggingface_hub import HfFolder; hftoken = UserSecretsClient().get_secret('hf-token'); HfFolder().save_token(hftoken)"

## Fine-tune model via Command Line

In [None]:
# gc.collect()
# Ask PyTorch to empty its CUDA cache before launching training.
# Relies on `torch` having been imported by the GPU-check cell above.
torch.cuda.empty_cache()

In [None]:
# Optional cleanup, disabled by default: uncomment to delete the previous
# training output directory (/kaggle/working/mistral) before a fresh run.
# !rm -r /kaggle/working/mistral

In [None]:
import os
import shlex

# Set environment variables
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'  # two devices, matching --num_processes 2 below
os.environ["WANDB_DISABLED"] = "true"

# model and data
model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit"
# model_name = "unsloth/llama-2-7b-bnb-4bit"
# model_name = "unsloth/gemma-7b-bnb-4bit"
dataset_dir = "/kaggle/input/json-arxiv"
# Train set
# dataset_name = "data_json"
# Train set small
# dataset_name = "train_data_25K_json"
# Test set
dataset_name = "test_data_1K_json"
template = "alpaca"

# Saving HF
output_dir = "/kaggle/working/mistral"
logging_dir = "/kaggle/working/mistral/logs"
hub_model_id = "BrijeshGiri/llm-ds-mistral"

# hyperparams
learning_rate = 3e-4
num_epochs = 1000
warmup_ratio = 1/num_epochs  # fraction of training used for warm-up: one epoch's worth
lr_sched = "cosine"
optim = "adamw_bnb_8bit"

# bs
per_device_train_batch_size = 1 # 1 for gemma
per_device_eval_batch_size = 1 # 1 for gemma
gradient_accumulation_steps = 2
cutoff = 2048 # input token cutoff length
dataloader_num_workers = 4
preprocessing_num_workers = 4
# rope_scaling = "dynamic" # {linear,dynamic,None} - for making normal model work with longer context in a simpler way

# steps
eval_steps = 500
save_steps = 500
logging_steps = 100
save_total_limit = 20

#lora
lora_target = "q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj"
lora_rank = 8
lora_alpha = 8

# Build the launch command from an argument list instead of a backslash-continued
# string. The previous string was missing the continuation after
# "--report_to tensorboard", which left a newline inside the command: the shell
# ran `accelerate launch` without any of the later flags and then tried to
# execute "--logging_dir ..." as a separate command.
# NOTE(review): confirm that --use_unsloth is supported together with a
# two-process launch in the installed LLaMA-Factory/unsloth versions.
train_args = [
    "accelerate", "launch",
    "--num_processes", 2,
    "/kaggle/working/LLaMA-Factory/src/train_bash.py",
    "--stage", "sft",
    "--do_train",
    "--model_name_or_path", model_name,
    "--dataset_dir", dataset_dir,
    "--dataset", dataset_name,
    "--template", template,
    "--finetuning_type", "lora",
    "--lora_target", lora_target,
    "--lora_rank", lora_rank,
    "--lora_alpha", lora_alpha,
    "--output_dir", output_dir,
    "--overwrite_output_dir",
    "--overwrite_cache",
    "--cutoff_len", cutoff,
    "--preprocessing_num_workers", preprocessing_num_workers,
    "--dataloader_num_workers", dataloader_num_workers,
    "--per_device_train_batch_size", per_device_train_batch_size,
    "--per_device_eval_batch_size", per_device_eval_batch_size,
    "--gradient_accumulation_steps", gradient_accumulation_steps,
    "--auto_find_batch_size",
    "--lr_scheduler_type", lr_sched,
    "--warmup_ratio", warmup_ratio,
    "--report_to", "tensorboard",
    "--logging_dir", logging_dir,
    "--logging_steps", logging_steps,
    "--save_steps", save_steps,
    "--eval_steps", eval_steps,
    "--save_total_limit", save_total_limit,
    "--evaluation_strategy", "steps",
    "--learning_rate", learning_rate,
    "--optim", optim,
    "--num_train_epochs", num_epochs,
    "--val_size", 0.001,
    "--ddp_timeout", 180000000,
    "--plot_loss",
    "--fp16",
    "--use_unsloth",
    "--quantization_bit", 4,
    "--push_to_hub",
    "--hub_model_id", hub_model_id,
    "--hub_strategy", "all_checkpoints",
    "--load_best_model_at_end",
    "--packing", "True",
]

# Single-line, shell-quoted command string consumed by the execution step below.
command = shlex.join(str(arg) for arg in train_args)

# Execute the command
# IPython substitutes the Python variable `command` into the line and passes the
# resulting text to the system shell; output is streamed into the cell.
!{command}

  warn(
2024-04-16 02:59:21.404759: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-16 02:59:21.404825: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-16 02:59:21.406493: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-16 02:59:21.421546: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-16 02:59:21.421598: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to registe

### Infer the fine-tuned model

In [4]:
import torch

# Report the installed PyTorch build and the CUDA version it was built against.
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Version: {torch.version.cuda}")

PyTorch Version: 2.1.0+cu121
CUDA Version: 12.1


In [None]:
# Disabled template: interactive chat loop against a base model plus LoRA adapter.
# The values below (Qwen base model, "test_identity" adapter, "qwen" template) do
# not match the training cell above, which trains
# unsloth/mistral-7b-instruct-v0.2-bnb-4bit with the "alpaca" template and writes
# to /kaggle/working/mistral; update them before uncommenting.
# Type "exit" to leave the loop and "clear" to reset the conversation history.
# from llmtuner import ChatModel
# chat_model = ChatModel(dict(
#   model_name_or_path="Qwen/Qwen1.5-0.5B-Chat",
#   adapter_name_or_path="test_identity", # output dir of our training
#   finetuning_type="lora",
#   template="qwen", # change to alpaca
# ))
# messages = []
# while True:
#   query = input("\nUser: ")
#   if query.strip() == "exit":
#     break
#   if query.strip() == "clear":
#     messages = []
#     continue

#   messages.append({"role": "user", "content": query})
#   print("Assistant: ", end="", flush=True)
#   response = ""
#   for new_text in chat_model.stream_chat(messages):
#     print(new_text, end="", flush=True)
#     response += new_text
#   print()
#   messages.append({"role": "assistant", "content": response})

### Merge LoRA weights

In [None]:
# Disabled template: merge a LoRA adapter into its base model and export the result.
# The values below (Qwen base model, "test_identity" adapter, "qwen" template) do
# not match the training cell above (mistral base model, "alpaca" template,
# output in /kaggle/working/mistral); update them before uncommenting.
# from llmtuner import export_model
# export_model(dict(
#   model_name_or_path="Qwen/Qwen1.5-0.5B-Chat",
#   adapter_name_or_path="test_identity",
#   finetuning_type="lora",
#   template="qwen",
#   export_dir="test_exported",
#   # export_hub_model_id="your_hf_id/test_identity",
# ))