# Installing Dependencies

In [1]:
%cd /content/
%rm -rf LLaMA-Factory
!git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
%cd LLaMA-Factory
%ls
!pip install -e .[torch,bitsandbytes]

/content
Cloning into 'LLaMA-Factory'...
remote: Enumerating objects: 362, done.
remote: Counting objects: 100% (362/362), done.
remote: Compressing objects: 100% (276/276), done.
remote: Total 362 (delta 79), reused 314 (delta 72), pack-reused 0 (from 0)
Receiving objects: 100% (362/362), 9.95 MiB | 23.48 MiB/s, done.
Resolving deltas: 100% (79/79), done.
/content/LLaMA-Factory
[0m[01;34massets[0m/       [01;34mevaluation[0m/  MANIFEST.in     requirements.txt  [01;34mtests[0m/
CITATION.cff  [01;34mexamples[0m/    pyproject.toml  [01;34mscripts[0m/
[01;34mdata[0m/         LICENSE      README.md       setup.py
[01;34mdocker[0m/       Makefile     README_zh.md    [01;34msrc[0m/
Obtaining file:///content/LLaMA-Factory
  Installing build dependencies ... done
  Checking if build backend supports build_editable ... done
  Getting requirements to build editable ... done
  Preparing editable metadata (pyproject.toml) ... [?25l

# Updating Identity Dataset

In [1]:
import json

%cd /content/LLaMA-Factory/

NAME = "Llama-3"
AUTHOR = "LLaMA Factory"

with open("data/identity.json", "r", encoding="utf-8") as f:
  dataset = json.load(f)

for sample in dataset:
  sample["output"] = sample["output"].replace("{{"+ "name" + "}}", NAME).replace("{{"+ "author" + "}}", AUTHOR)

with open("data/identity.json", "w", encoding="utf-8") as f:
  json.dump(dataset, f, indent=2, ensure_ascii=False)

/content/LLaMA-Factory


# Fine-tuning via Command Line

## LoRA+

In [7]:
import json

args = dict(
  stage="sft",                                               # do supervised fine-tuning
  do_train=True,
  # model_name_or_path="unsloth/llama-3-8b-Instruct-bnb-4bit", # use bnb-4bit-quantized Llama-3-8B-Instruct model
  model_name_or_path="unsloth/llama-3.2-1b",                   # use bnb-4bit-quantized Llama-3.2-1B model
  dataset="identity,alpaca_en_demo",                         # use alpaca and identity datasets
  template="llama3",                                         # use llama3 prompt template
  finetuning_type="lora",                                    # use LoRA adapters to save memory
  lora_target="all",                                         # attach LoRA adapters to all linear layers
  output_dir="llama3.2_lora",                                  # the path to save LoRA adapters
  per_device_train_batch_size=2,                             # the micro batch size
  gradient_accumulation_steps=4,                             # the gradient accumulation steps
  lr_scheduler_type="cosine",                                # use cosine learning rate scheduler
  logging_steps=5,                                           # log every 5 steps
  warmup_ratio=0.1,                                          # use warmup scheduler
  save_steps=1000,                                           # save checkpoint every 1000 steps
  learning_rate=5e-5,                                        # the learning rate
  num_train_epochs=3.0,                                      # the epochs of training
  max_samples=500,                                           # use 500 examples in each dataset
  max_grad_norm=1.0,                                         # clip gradient norm to 1.0
  loraplus_lr_ratio=16.0,                                    # use LoRA+ algorithm with lambda=16.0
  fp16=True,                                                 # use float16 mixed precision training
  report_to="wandb",                                         # enable wandb logging
  run_name="llama_factory_run1"
)

json.dump(args, open("train_llama3.json", "w", encoding="utf-8"), indent=2)

%cd /content/LLaMA-Factory/

!llamafactory-cli train train_llama3.json

/content/LLaMA-Factory
2025-07-10 06:30:04.721685: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752129004.742536    6707 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752129004.748691    6707 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-10 06:30:04.770827: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[INFO|2025-07-10 06:30:12] llamafactory.hparams.parser:410 >> Process rank: 0, world size: 1, device: 

## Freeze Training

In [12]:
import json

args = dict(
  stage="sft",                                               # do supervised fine-tuning
  do_train= True,
  model_name_or_path="unsloth/llama-3.2-1b",                   # use bnb-4bit-quantized Llama-3.2-1B model
  finetuning_type= "freeze",
  freeze_trainable_layers= 2,
  freeze_trainable_modules= "all",
  dataset="identity,alpaca_en_demo",                         # use alpaca and identity datasets
  template="llama3",                                         # use llama3 prompt template
  output_dir= "outputs/freeze",
  per_device_train_batch_size= 2,
  gradient_accumulation_steps=4,                             # the gradient accumulation steps
  num_train_epochs= 3,
  lr_scheduler_type="cosine",                                # use cosine learning rate scheduler
  learning_rate=5e-5,                                        # the learning rate
  warmup_ratio=0.1,                                          # use warmup scheduler
  fp16= True,
  logging_steps= 5,
  save_steps= 1000,
  max_samples=500,                                           # use 500 examples in each dataset
  max_grad_norm=1.0,                                         # clip gradient norm to 1.0
  plot_loss= True,
  overwrite_cache= True,
  report_to= "wandb",
  run_name="llama_factory_run2_freeze"
)

json.dump(args, open("train_llama3_freeze.json", "w", encoding="utf-8"), indent=2)

%cd /content/LLaMA-Factory/

!llamafactory-cli train train_llama3_freeze.json

/content/LLaMA-Factory
2025-07-10 06:53:50.730187: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752130430.751585   12720 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752130430.757907   12720 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-10 06:53:50.779305: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[INFO|2025-07-10 06:53:58] llamafactory.hparams.parser:410 >> Process rank: 0, world size: 1, device: 

# Base Arguments

In [16]:
# Settings shared by every fine-tuning method below; each method-specific
# config unpacks this mapping and adds its own keys on top.
# "output_dir" is deliberately absent: every method supplies its own.
base_args = {
  # task and model
  "stage": "sft",
  "do_train": True,
  "model_name_or_path": "unsloth/llama-3.2-1b",
  # data
  "dataset": "identity,alpaca_en_demo",
  "template": "llama3",
  # batching and schedule
  "per_device_train_batch_size": 2,
  "gradient_accumulation_steps": 4,
  "lr_scheduler_type": "cosine",
  "logging_steps": 5,
  "warmup_ratio": 0.1,
  "save_steps": 1000,
  # optimization
  "learning_rate": 5e-5,
  "num_train_epochs": 3.0,
  "max_samples": 500,
  "max_grad_norm": 1.0,
  "fp16": True,
  # experiment tracking
  "report_to": "wandb",
}

## DoRA

In [19]:
dora_args = dict(
  **base_args,
  finetuning_type="lora",
  lora_target="all",
  use_dora=True,
  output_dir="llama3.2_dora",
  run_name="llama_factory_run3_dora"
)

json.dump(dora_args, open("train_llama3_DoRA.json", "w", encoding="utf-8"), indent=2)

%cd /content/LLaMA-Factory/

!llamafactory-cli train train_llama3_DoRA.json

/content/LLaMA-Factory
2025-07-10 07:09:18.152772: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752131358.172612   16684 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752131358.178610   16684 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-10 07:09:18.198834: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[INFO|2025-07-10 07:09:26] llamafactory.hparams.parser:410 >> Process rank: 0, world size: 1, device: 

## PiSSA

In [22]:
pissa_args = dict(
  **base_args,
  finetuning_type="lora",
  lora_target="all",
  pissa_init=True,
  pissa_iter=16,
  output_dir="llama3.2_pissa",
  run_name="llama_factory_run4_pissa"
)

json.dump(pissa_args, open("train_llama3_PiSSA.json", "w", encoding="utf-8"), indent=2)

%cd /content/LLaMA-Factory/

!llamafactory-cli train train_llama3_PiSSA.json

/content/LLaMA-Factory
2025-07-10 07:29:27.282713: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752132567.302867   21822 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752132567.309008   21822 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-10 07:29:27.329387: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[INFO|2025-07-10 07:29:34] llamafactory.hparams.parser:410 >> Process rank: 0, world size: 1, device: 