In [None]:
!pip install -qqq "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" --progress-bar off
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install -qqq --no-deps {xformers} trl peft accelerate bitsandbytes triton --progress-bar off

import torch
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
grpcio-status 1.71.0 requires protobuf<6.0dev,>=5.26.1, but you have protobuf 3.20.3 which is incompatible.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2024.12.0 which is incompatible.
tensorflow-metadata 1.16.1 requires protobuf<6.0.0dev,>=4.25.2; python_version >= "3.11", but you have protobuf 3.20.3 which is incompatible.[0m[31m
[0m


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth.chat_templates import get_chat_template


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
# Load the pre-quantized base model together with its tokenizer.
MODEL_NAME = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"  # 4-bit (bitsandbytes) build of Llama 3.1 8B
max_seq_length = 2048  # Context length in tokens; also passed to the trainer config later

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    dtype=None,                     # None leaves the compute dtype choice to the library
    load_in_4bit=True,              # Keep the weights 4-bit quantized to reduce GPU memory use
    max_seq_length=max_seq_length,
)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Wrap the base model with LoRA adapters (parameter-efficient fine-tuning).
# Adapters are attached to every attention projection (q/k/v/o) and to the
# feed-forward projections (gate/up/down).
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "up_proj",
    "down_proj",
    "o_proj",
    "gate_proj",
]

model = FastLanguageModel.get_peft_model(
    model,
    r=16,                                  # Rank of the low-rank update matrices
    lora_alpha=16,                         # Numerator of the LoRA scaling factor
    lora_dropout=0,                        # No dropout on the adapter inputs
    target_modules=lora_target_modules,
    # Rank-stabilized LoRA: scale the adapter by lora_alpha / sqrt(r)
    # rather than the default lora_alpha / r.
    use_rslora=True,
    # Unsloth's gradient checkpointing: recompute activations in the backward
    # pass instead of storing them, trading compute for memory.
    use_gradient_checkpointing="unsloth",
)

In [None]:
# The conversations use "from"/"value" keys with "human"/"gpt" speakers
# (ShareGPT-style naming), so tell the template how those correspond to the
# role/content/user/assistant names it works with.
conversation_key_mapping = {
    "role": "from",
    "content": "value",
    "user": "human",
    "assistant": "gpt",
}

# ChatML wraps every turn in <|im_start|>{role} ... <|im_end|> markers.
tokenizer = get_chat_template(
    tokenizer,
    mapping=conversation_key_mapping,
    chat_template="chatml",
)

Unsloth: Will map <|im_end|> to EOS = <|end_of_text|>.


In [None]:
# Number of leading rows of the train split to fine-tune on.
# Lower this for a quick smoke test; raise it for a longer run.
NUM_TRAIN_EXAMPLES = 10_000

dataset = load_dataset(
    "mlabonne/FineTome-100k",                 # Dataset identifier on the Hugging Face Hub
    split=f"train[:{NUM_TRAIN_EXAMPLES}]",    # Slice syntax: only the first NUM_TRAIN_EXAMPLES rows
)

In [None]:
def apply_template(examples):
    """Render a batch of conversations into chat-template strings.

    Args:
        examples: Batch mapping whose "conversations" entry is a list of
            conversations (each one a list of message dicts).

    Returns:
        ``{"text": [...]}`` with one rendered string per conversation, in the
        same order. Nothing is tokenized and no generation prompt is appended,
        because the text is used for training rather than for prompting.
    """
    rendered = []
    for conversation in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Run apply_template over the dataset in batches; the "text" entry it returns
# holds the chat-formatted string for each conversation.
# NOTE: this rebinds `dataset` to the mapped result.
dataset = dataset.map(apply_template, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Decide the mixed-precision mode once: bf16 where the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_config = SFTConfig(
    # --- optimisation ---
    learning_rate=3e-4,
    lr_scheduler_type="linear",        # Linear decay after warmup
    warmup_steps=10,                   # Ramp the learning rate up over the first 10 steps
    optim="adamw_8bit",                # AdamW with 8-bit optimizer state to save memory
    weight_decay=0.01,
    # --- batching: 4 samples x 4 accumulation steps = 16 samples per optimizer step per GPU ---
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    # --- precision ---
    fp16=not use_bf16,
    bf16=use_bf16,
    # --- bookkeeping ---
    logging_steps=1,                   # Report the training loss after every step
    output_dir="output",               # Where checkpoints are written
    seed=0,
    # --- dataset handling ---
    max_seq_length=max_seq_length,
    dataset_num_proc=2,                # Worker processes used when tokenizing the dataset
    packing=True,                      # Requests sequence packing; the captured output shows Unsloth disabling it in this run
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),  # Collates tokenized examples into batches
    args=training_config,
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/10000 [00:00<?, ? examples/s]

Unsloth: Hugging Face's packing is currently buggy - we're disabling it for now!


In [None]:
# Run fine-tuning with the trainer configured above; with logging_steps=1 the
# training loss is reported after every step.
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,000 | Num Epochs = 1 | Total steps = 625
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Step,Training Loss
1,0.9088
2,0.7918
3,0.8917
4,0.6795
5,0.769


In [None]:
# Switch the fine-tuned model into Unsloth's inference mode. The call is made
# for its side effect on `model`, so `model` is deliberately not rebound to the
# return value. NOTE(review): some Unsloth releases reportedly return None from
# this call, which would break the rebinding form - confirm against the
# installed version.
FastLanguageModel.for_inference(model)

# One user turn, written with the same "from"/"value" keys as the training
# conversations (the tokenizer's chat-template mapping translates them).
messages = [
    {"from": "human", "value": "Is 9.11 larger than 9.9?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,                 # Return token IDs rather than a string
    add_generation_prompt=True,    # End the prompt where the assistant's reply should begin
    return_tensors="pt",           # PyTorch tensor output
).to("cuda")                       # Place the prompt on the GPU

## 4. Inference

We run the fine-tuned model on a toy prompt as a quick sanity check that there are no obvious errors.

In [None]:
text_streamer = TextStreamer(tokenizer)  # Prints tokens as they are generated

_ = model.generate(
    input_ids=inputs,
    # `inputs` holds a single, unpadded prompt, so every position is a real
    # token and the mask is all ones. Passing it explicitly avoids the
    # "attention mask is not set" warning, which appears because the pad token
    # is the same as the EOS token and the mask cannot be inferred.
    attention_mask=torch.ones_like(inputs),
    streamer=text_streamer,
    max_new_tokens=128,          # Upper bound on the length of the reply
    use_cache=True,              # Reuse the KV cache between decoding steps
)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|im_start|>user
Is 9.11 larger than 9.9?<|im_end|>
<|im_start|>assistant
Yes, 9.11 is larger than 9.9. To compare two numbers, we can use the greater than symbol (>), which means "is greater than." Since 9.11 is greater than 9.9, we can say that 9.11 is larger than 9.9.<|im_end|>


In [None]:
# Destination repository on the Hugging Face Hub.
hub_repo_id = "EdMarcavage/MyFirst-Llama-3.1-8B"

# Upload the model with the LoRA weights merged into the base weights,
# stored in 16-bit precision, together with the tokenizer.
model.push_to_hub_merged(
    hub_repo_id,
    tokenizer,
    save_method="merged_16bit",
)