In [None]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "trl<0.9.0" peft accelerate bitsandbytes

In [2]:
import unsloth
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset

In [4]:
model_name = "unsloth/tinyllama-bnb-4bit"
max_seq_length = 2048
dtype = None
load_in_4bit = True

In [5]:
lora_r = 16
lora_alpha = 16
lora_dropout = 0
target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj"
]

In [6]:
output_dir = "./tinyllama-unsloth-instruction"
num_train_epochs = 1
per_device_train_batch_size = 2
gradient_accumulation_steps = 4
learning_rate = 2e-4
max_steps = -1

In [7]:
dataset = load_dataset("yahma/alpaca-cleaned", split="train")

README.md: 0.00B [00:00, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

In [8]:
dataset

Dataset({
    features: ['output', 'input', 'instruction'],
    num_rows: 51760
})

In [9]:
dataset[0]

{'output': '1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.',
 'input': '',
 'instruction': 'Give three tips for staying healthy.'}

In [10]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = "</s>"

In [11]:
def formatting_prompts_func(examples):
    """
    Format dataset examples into Alpaca-style prompts

    Args:
        examples: Batch of examples from dataset

    Returns:
        Dictionary with formatted text
    """
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []

    for instruction, input_text, output in zip(instructions, inputs, outputs):
        # Format with or without input context
        text = alpaca_prompt.format(instruction, input_text, output) + EOS_TOKEN
        texts.append(text)

    return {"text": texts}

formatted_dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=dataset.column_names
)

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

In [12]:
train_dataset = formatted_dataset.shuffle(seed=42).select(range(5000))

In [13]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/762M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/948 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [14]:
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_r,
    target_modules=target_modules,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    max_seq_length=max_seq_length,
)

Unsloth 2025.11.4 patched 22 layers with 22 QKV layers, 22 O layers and 22 MLP layers.


In [15]:
training_args = TrainingArguments(
    # Output and logging
    output_dir=output_dir,
    logging_steps=10,
    logging_strategy="steps",
    save_strategy="epoch",
    save_total_limit=1,

    # Training hyperparameters
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    max_steps=max_steps,

    # Optimization
    learning_rate=learning_rate,
    weight_decay=0.01,
    warmup_steps=100,

    # Performance
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    optim="adamw_8bit",

    # Miscellaneous
    report_to="none",
    seed=3407,
)

In [16]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

Map (num_proc=2):   0%|          | 0/5000 [00:00<?, ? examples/s]

In [17]:
model.config.use_cache = False

In [18]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 5,000 | Num Epochs = 1 | Total steps = 625
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 12,615,680 of 1,112,664,064 (1.13% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,1.9484
20,2.0064
30,2.0617
40,2.0254
50,2.0086
60,1.9872
70,1.9634
80,2.0219
90,2.0436
100,1.9847


Step,Training Loss
10,1.9484
20,2.0064
30,2.0617
40,2.0254
50,2.0086
60,1.9872
70,1.9634
80,2.0219
90,2.0436
100,1.9847


TrainOutput(global_step=625, training_loss=2.0155024551391603, metrics={'train_runtime': 1191.3744, 'train_samples_per_second': 4.197, 'train_steps_per_second': 0.525, 'total_flos': 9973191139540992.0, 'train_loss': 2.0155024551391603, 'epoch': 1.0})

In [19]:
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./tinyllama-unsloth-instruction/tokenizer_config.json',
 './tinyllama-unsloth-instruction/special_tokens_map.json',
 './tinyllama-unsloth-instruction/tokenizer.model',
 './tinyllama-unsloth-instruction/added_tokens.json',
 './tinyllama-unsloth-instruction/tokenizer.json')

In [20]:
model.save_pretrained_merged(
    f"{output_dir}/model_merged_16bit",
    tokenizer,
    save_method="merged_16bit",
)

config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 1/1 [00:30<00:00, 30.27s/it]
Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [00:38<00:00, 38.28s/it]


Unsloth: Merge process complete. Saved to `/content/tinyllama-unsloth-instruction/model_merged_16bit`


In [21]:
FastLanguageModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048, padding_idx=0)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Line

In [22]:
test_instructions = [
    "Explain how photosynthesis works in plants.",
    "Write a short story about a robot learning to paint.",
    "List three benefits of regular exercise."
]

In [23]:
for idx, instruction in enumerate(test_instructions, 1):
    print(f"\n{'─'*60}")
    print(f"TEST {idx}: {instruction}")
    print(f"{'─'*60}")

    # Format prompt
    prompt = alpaca_prompt.format(
        instruction,
    )

    # Tokenize
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length
    ).to("cuda")

    # Generate response
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        use_cache=True,
        repetition_penalty=1.15,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode and extract response
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the response part
    if "### Response:" in generated_text:
        response = generated_text.split("### Response:")[-1].strip()
    else:
        response = generated_text.strip()

    print(f"\n✅ RESPONSE:\n{response}\n")


────────────────────────────────────────────────────────────
TEST 1: Explain how photosynthesis works in plants.
────────────────────────────────────────────────────────────

✅ RESPONSE:
Plants are able to produce their own food through the process of photosynthesis (a.k.a. photosystems). The process of photosynthesis requires two enzymes: chlorophyll and carbon dioxide. Chlorophyll is essential for the process because it allows light energy to be absorbed by chloroplasts, which are the plant's cells responsible for photosynthetic activity. The carbon dioxide that enters the chloroplasts is then split into water and oxygen as part of the process of photosynthesis. The resultant reaction is a reaction that involves a reaction catalyst called Rubisco, which is a protein molecule. The reaction is similar to a chemical reaction and requires the presence of ATP to continue the chain of reactions that take place during this process.


────────────────────────────────────────────────────────