## SFT Unsloth


In [3]:
%%capture
# Install Unsloth and its dependencies; %%capture (which must stay the first line
# of the cell) hides the installer output.
import os
# The plain install is used when no environment variable name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab and Kaggle notebooks! Otherwise use pip install unsloth
    # --no-deps tells pip not to install the dependencies of the listed packages,
    # so each required package is named explicitly in the commands below.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
    !pip install --no-deps cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [72]:
from unsloth import FastLanguageModel
import torch
# --- Loader options ---------------------------------------------------------
max_seq_length = 512  # Choose any! RoPE scaling is handled internally by Unsloth.
dtype = None          # None = auto detection. Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
load_in_4bit = False  # True enables 4bit quantization to reduce memory usage.

# Pre-quantized 4bit checkpoints published by Unsloth (faster download, fewer OOMs).
# Kept for reference only: the load call below does not read this list.
# More models at https://huggingface.co/unsloth
fourbit_models = [
    # Llama 3.1
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    # Mistral
    "unsloth/Mistral-Small-Instruct-2409",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # Phi
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    # Gemma 2
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
    # Llama 3.2
    "unsloth/Llama-3.2-1B-bnb-4bit",
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    # Llama 3.3
    "unsloth/Llama-3.3-70B-Instruct-bnb-4bit",
]

# Checkpoint that is actually fine-tuned in this notebook.
base_model_name = "unsloth/Llama-3.2-1B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # needed only for gated models like meta-llama/Llama-2-7b-hf
)


==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.536 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [73]:
# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# NOTE(review): with use_rslora=False, PEFT scales the adapter update by
# lora_alpha / r, i.e. 8 / 256 here -- confirm this small scale is intended.
model = FastLanguageModel.get_peft_model(
    model,
    r=256,                                 # LoRA rank; any number > 0 (upstream suggests 8-128)
    target_modules=lora_target_modules,
    lora_alpha=8,
    lora_dropout=0,                        # any value works, but 0 is the optimized path
    bias="none",                           # any value works, but "none" is the optimized path
    use_gradient_checkpointing="unsloth",  # True or "unsloth" (lower VRAM, for very long context)
    random_state=3407,
    use_rslora=False,                      # rank stabilized LoRA is available but disabled
    loftq_config=None,                     # LoftQ is available but disabled
)

# Collect (size, trainable?) for every parameter tensor once, then derive both totals.
param_counts = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in param_counts)
trainable_params = sum(size for size, is_trainable in param_counts if is_trainable)

trainable_ratio = trainable_params / total_params * 100

print(f"Trainable Parameters: {trainable_params:,}")
print(f"Total Parameters: {total_params:,}")
print(f"Trainable Ratio: {trainable_ratio:.2f}%")

# Debug tip: iterate model.named_parameters() and print the names whose
# requires_grad is True to list exactly which tensors are trainable.

Trainable Parameters: 180,355,072
Total Parameters: 1,416,169,472
Trainable Ratio: 12.74%


In [48]:
import os
import sys
from datasets import load_from_disk
from datasets import Dataset

# --- Experiment configuration ----------------------------------------------
src_lng, tgt_lng = "English", "Luxembourgish"
train_ratio = 0.001  # fraction of each split that is kept (applied to train AND val)
project_root = "/home/snt/projects_lujun/mt_luxembourgish"
# NOTE(review): val_dataset_path is not read by the visible cells; validation rows
# are taken from the "val" split of the training file below.
val_dataset_path = os.path.abspath(os.path.join(project_root, "data/fake_targets/flores_devtest_arrow"))
train_dataset_path = os.path.abspath(os.path.join(project_root, "data/training_dataset/dataset_GPT_split.jsonl"))
sys.path.append(project_root)  # lets modules under project_root be imported

# --- Load -------------------------------------------------------------------
# JSON-lines files go through Dataset.from_json; anything else is treated as a
# dataset directory saved to disk.
dataset = (
    Dataset.from_json(train_dataset_path)
    if train_dataset_path.endswith(".jsonl")
    else load_from_disk(train_dataset_path)
)

# Raw column name -> language-named column expected by the prompt builder.
column_renames = {"input": "Luxembourgish", "translated_text": "English"}


def _take_split(split_name):
    """Return the leading `train_ratio` share of one split with language-named columns."""
    rows = dataset.filter(lambda row: row["split"] == split_name)
    keep = int(len(rows) * train_ratio)  # truncates toward zero
    return rows.select(range(keep)).rename_columns(column_renames)


train_dataset = _take_split("train")
val_dataset = _take_split("val")


def create_prompt(sample, src_lng, tgt_lng, mode="train", tokenizer=None):
    """Render one translation example as a chat-formatted training string.

    Args:
        sample: Mapping that holds the source text under ``src_lng.capitalize()``
            and, optionally, the reference translation under
            ``tgt_lng.capitalize()``.
        src_lng: Source language name. Its capitalized form is used both as the
            column key and inside the translation instruction.
        tgt_lng: Target language name, used the same way as ``src_lng``.
        mode: Accepted for call compatibility; the function does not read it.
        tokenizer: Object exposing ``eos_token`` and ``apply_chat_template``.

    Returns:
        dict: ``{"full_prompt": <templated conversation + 4 EOS tokens>}``.

    Raises:
        ValueError: If ``tokenizer`` is None or has no EOS token.
        KeyError: If ``sample`` has no column for the source language.
    """
    if tokenizer is None or tokenizer.eos_token is None:
        raise ValueError("A tokenizer with a defined EOS token is required.")

    src_key = src_lng.capitalize()
    tgt_key = tgt_lng.capitalize()

    system_message = "You are a helpful AI assistant for translation."
    input_text = sample[src_key].strip()
    # A missing target column yields an empty assistant turn instead of an error.
    response = sample[tgt_key].strip() if tgt_key in sample else ""

    # The instruction names the languages that were actually requested; it used to
    # say "English" -> "Luxembourgish" regardless of src_lng / tgt_lng.
    question = (
        f"Translate the following {src_key} input text into {tgt_key}. "
        "Do not include any additional information or unrelated content."
        f"\n\n{input_text}"
    )

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": question},
        {"role": "assistant", "content": response},
    ]

    full_prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    # Four EOS tokens are appended after the templated conversation.
    # NOTE(review): the decoded training sample in this notebook ends with five
    # <|eot_id|> and starts with two <|begin_of_text|>; presumably the template
    # already closes the assistant turn and the trainer's tokenization adds a
    # second BOS -- confirm both are intended.
    full_prompt += tokenizer.eos_token * 4
    return {"full_prompt": full_prompt}

def _sample_to_prompt(sample):
    """Map one dataset row to a single-column record holding its chat-formatted text."""
    rendered = create_prompt(sample, src_lng=src_lng, tgt_lng=tgt_lng, mode="train", tokenizer=tokenizer)
    return {"full_prompt": rendered["full_prompt"]}


# Both splits get the same treatment: add "full_prompt", then drop every other column.
train_dataset = train_dataset.map(_sample_to_prompt).select_columns(["full_prompt"])
val_dataset = val_dataset.map(_sample_to_prompt).select_columns(["full_prompt"])


Map: 100%|██████████| 203/203 [00:00<00:00, 4633.71 examples/s]
Map: 100%|██████████| 3/3 [00:00<00:00, 583.46 examples/s]


In [41]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    gradient_accumulation_steps=4,  # 5 x 4 = 20 examples per optimizer step on one device
    warmup_steps=5,
    # num_train_epochs=1,  # use instead of max_steps for one full pass over the data
    max_steps=60,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="none",  # set to e.g. "wandb" to enable experiment tracking
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    dataset_text_field="full_prompt",  # column produced by the prompt-building cell
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,  # upstream notes that packing can make training faster for short sequences
    args=training_args,
)

Converting train dataset to ChatML (num_proc=2): 100%|██████████| 203/203 [00:00<00:00, 956.76 examples/s]
Applying chat template to train dataset (num_proc=2): 100%|██████████| 203/203 [00:01<00:00, 151.87 examples/s]
Tokenizing train dataset (num_proc=2): 100%|██████████| 203/203 [00:01<00:00, 142.12 examples/s]
Truncating train dataset (num_proc=2): 100%|██████████| 203/203 [00:00<00:00, 733.98 examples/s]
Converting eval dataset to ChatML (num_proc=2): 100%|██████████| 3/3 [00:00<00:00, 14.05 examples/s]
Applying chat template to eval dataset (num_proc=2): 100%|██████████| 3/3 [00:01<00:00,  2.22 examples/s]
Tokenizing eval dataset (num_proc=2): 100%|██████████| 3/3 [00:01<00:00,  2.24 examples/s]
Truncating eval dataset (num_proc=2): 100%|██████████| 3/3 [00:00<00:00, 14.62 examples/s]


In [42]:
from unsloth.chat_templates import train_on_responses_only
# Header strings that open the user and assistant turns in the rendered chat text.
user_turn_header = "<|start_header_id|>user<|end_header_id|>\n\n"
assistant_turn_header = "<|start_header_id|>assistant<|end_header_id|>\n\n"

# Re-wrap the trainer so that only assistant responses are used as labels.
trainer = train_on_responses_only(
    trainer,
    instruction_part=user_turn_header,
    response_part=assistant_turn_header,
)

Map:   0%|          | 0/203 [00:00<?, ? examples/s]

Map: 100%|██████████| 203/203 [00:00<00:00, 2509.09 examples/s]
Map: 100%|██████████| 3/3 [00:00<00:00, 493.39 examples/s]


In [43]:
# Sanity check: decode one tokenized training example back to text.
sample_input_ids = trainer.train_dataset[5]["input_ids"]
print(tokenizer.decode(sample_input_ids))

<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 01 Mar 2025

You are a helpful AI assistant for translation.<|eot_id|><|start_header_id|>user<|end_header_id|>

Translate the following English input text into Luxembourgish. Do not include any additional information or unrelated content.

More than 20 seconds later, Dario Cologna from Switzerland and Ivan Yakimushkin from Russia arrived as 2nd and 3rd. In the overall standings, the Italian Federico Pellegrino remains in 1st place.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Méi wéi 20 Sekonnen drop sinn den Dario Cologna aus der Schwäiz an den Ivan Yakimushkin aus Russland als 2. an 3. ukomm. Am General steet weiderhin den Italiener Federico Pellegrino op der 1.<|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|>


In [44]:
# Sanity check: show which tokens carry labels. Positions labelled -100 are
# swapped for a space token so the label sequence can be decoded.
space = tokenizer(" ", add_special_tokens=False).input_ids[0]
sample_labels = trainer.train_dataset[5]["labels"]
decodable_labels = [space if label == -100 else label for label in sample_labels]
print(tokenizer.decode(decodable_labels))

                                                                                                                   

Méi wéi 20 Sekonnen drop sinn den Dario Cologna aus der Schwäiz an den Ivan Yakimushkin aus Russland als 2. an 3. ukomm. Am General steet weiderhin den Italiener Federico Pellegrino op der 1.<|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|>


In [38]:
# @title Show current memory stats
# Report the device's total memory and the peak memory reserved so far, both in GiB.
bytes_per_gib = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA RTX A6000. Max memory = 47.536 GB.
8.516 GB of memory reserved.


In [39]:
# Run fine-tuning with the trainer built in the earlier cells; the value returned
# by train() is kept in trainer_stats for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 203 | Num Epochs = 6
O^O/ \_/ \    Batch size per device = 5 | Gradient Accumulation steps = 4
\        /    Total batch size = 20 | Total steps = 60
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1,0.7535
2,0.8733
3,0.8727
4,0.8102
5,0.5407
6,0.2032
7,0.0336
8,0.0164
9,0.012
10,0.0096


KeyboardInterrupt: 