In [1]:
import os
import json
import torch

In [3]:
print(f"Torch version :{torch.__version__}")
print(f"CUDA available :{torch.cuda.is_available}")

Torch version :2.6.0+cu124
CUDA available :<function is_available at 0x78c6d09c9c60>


In [4]:
dataset = [
    {
        "instruction": "Translate the sentence into formal Spanish.",
        "input": "Can you please help me with this task?",
        "Output": "¿Podría ayudarme con esta tarea, por favor?"
    },
    {
        "instruction": "Write a motivational quote for someone feeling lost.",
        "input": "",
        "Output": "Even the darkest night will end, and the sun will rise again. Keep moving forward."
    },
    {
        "instruction": "Rewrite this sentence to sound more professional.",
        "input": "I can't attend the meeting because I have another thing to do.",
        "Output": "I regret to inform you that I won’t be able to attend the meeting due to a prior commitment."
    },
    {
        "instruction": "Give a concise summary of the following text.",
        "input": "The rapid advancement of artificial intelligence has raised both excitement and concerns across industries, especially in automation and ethics.",
        "Output": "AI is advancing rapidly, sparking both enthusiasm and ethical concerns."
    }
]


In [5]:
os.makedirs("data", exist_ok = True)
with open("data/sample.jsonl", "w") as f:
  for ex in dataset:
    f.write(json.dumps(ex) + "\n")

print("Sample Dataset Saved......")

Sample Dataset Saved......


In [1]:
! pip install unsloth



In [2]:
import unsloth
print(unsloth.__version__)
from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
2025.7.2


In [3]:
from unsloth import FastLanguageModel
from transformers import TrainingArguments, Trainer
from datasets import load_dataset

In [4]:
model_path = "unsloth/mistral-7b-bnb-4bit"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_path,
    load_in_4bit = True,
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 8,
    lora_alpha = 32,
    target_modules = ["q_proj", "v_proj"],
    lora_dropout = 0.05,
    bias = "none",
    use_gradient_checkpointing = True,
)


==((====))==  Unsloth 2025.7.2: Fast Mistral patching. Transformers: 4.53.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.7.2 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [5]:
dataset = load_dataset("json", data_files = "data/sample.jsonl", split = "train")

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
def tokenize_fn(example):
  prompt = f"### instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n{example['Output']}"
  tokenized = tokenizer(
      prompt,
      truncation = True,
      padding = "max_length",
      max_length = 256,
  )
  tokenized["labels"] = tokenized["input_ids"].copy()
  return tokenized

In [7]:
tokenized_dataset = dataset.map(tokenize_fn)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [9]:
! pip install torch



In [11]:
import torch

training_args = TrainingArguments(
    output_dir = "finetuned_model",
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 2,
    learning_rate = 2e-4,
    logging_steps = 1,
    num_train_epochs = 1,
    bf16 = torch.cuda.is_bf16_supported(),
    fp16 = not torch.cuda.is_bf16_supported(),
    optim = "adamw_torch",
    lr_scheduler_type = "cosine",
    report_to = "none",
)

In [12]:
from transformers import DataCollatorForLanguageModeling


data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer,
    mlm = False,
)

In [13]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset,
    data_collator = data_collator,
)

In [14]:
trainer.train()
print("Training Completed ............. !")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4 | Num Epochs = 1 | Total steps = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 2 x 1) = 4
 "-____-"     Trainable parameters = 3,407,872 of 7,245,139,968 (0.05% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.896


Training Completed ............. !
