In [1]:
import torch
torch.cuda.is_available()

True

In [2]:
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from peft import get_peft_model, LoraConfig
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
model_path_python_trained="../models/finetuned-python"
resume_checkpoint="../models/finetuned-python/checkpoint-9834"

In [4]:
dataset = load_dataset("json", data_files="../processed-data/ds_coder_instruct.jsonl", split="train")

In [5]:
model, tokenizer=FastLanguageModel.from_pretrained(
    model_name=resume_checkpoint,
    max_seq_length=2048,
    dtype=torch.float16,
    load_in_4bit=True,
)

  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"{DEVICE_TYPE}:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.6.2: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    NVIDIA GeForce RTX 4060 Laptop GPU. Num GPUs = 1. Max memory: 7.996 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 3/3 [00:16<00:00,  5.44s/it]


../models/base_model does not have a padding token! Will use pad_token = <unk>.


Unsloth 2025.6.2 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [6]:
tokenizer.pad_token=tokenizer.eos_token

In [7]:
print(tokenizer)

LlamaTokenizerFast(name_or_path='../models/finetuned-python/checkpoint-9834', vocab_size=32000, model_max_length=4096, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


In [8]:
model=FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

Unsloth: Already have LoRA adapters! We shall skip this step.


In [13]:
train_test = dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = train_test["train"]
eval_dataset = train_test["test"]

In [10]:
training_args=TrainingArguments(
    output_dir="../models/finetuned-python-datascience",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    warmup_steps=20,
    logging_steps=10,
    fp16=True,
    bf16=False,
    optim="paged_adamw_8bit",
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="../logs",
    report_to="none",
    lr_scheduler_type="cosine",
    eval_strategy="epoch",
    resume_from_checkpoint=resume_checkpoint
)

In [11]:
def formatting_func(example):
    return example["prompt"]+example["response"]

In [14]:
def format_dataset(example):
    return {"text": example["prompt"] + example["response"]}

train_dataset = train_dataset.map(format_dataset)
eval_dataset=eval_dataset.map(format_dataset)

In [15]:
trainer=SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=training_args
)

In [16]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 16,445 | Num Epochs = 3 | Total steps = 12,336
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 39,976,960/3,540,389,888 (1.13% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss
1,0.7545,0.678653
2,0.6548,0.665377
3,0.5466,0.689359


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


TrainOutput(global_step=12336, training_loss=0.6201682481359105, metrics={'train_runtime': 75544.1294, 'train_samples_per_second': 0.653, 'train_steps_per_second': 0.163, 'total_flos': 1.1255287869277471e+18, 'train_loss': 0.6201682481359105})