In [None]:
pip install unsloth transformers trl

In [2]:
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth.chat_templates import get_chat_template, standardize_sharegpt

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name='unsloth/Llama-3.2-3B-Instruct',
    max_seq_length=2048,
    load_in_4bit=True
)

==((====))==  Unsloth 2025.7.11: Fast Llama patching. Transformers: 4.54.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    use_gradient_checkpointing = False
)

Unsloth 2025.7.11 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [5]:
tokenizer = get_chat_template(tokenizer=tokenizer, chat_template="llama-3.1")

In [6]:
dataset = load_dataset("mlabonne/FineTome-100k", split="train")

In [7]:
dataset = standardize_sharegpt(dataset)

In [8]:
dataset

Dataset({
    features: ['conversations', 'source', 'score'],
    num_rows: 100000
})

In [9]:
dataset[0]

{'conversations': [{'content': 'Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. \n\nFurthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.\n\nFinally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code that handles cases where truthiness and falsiness are implemented differently across different programming languages.',
   'role': 'user'},
  {'content': 

In [10]:
dataset = dataset.map(
    lambda examples: {
        "text": [
            tokenizer.apply_chat_template(convo, tokenize=False)
            for convo in examples["conversations"]
        ]
    },
    batched=True,
)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [11]:
dataset

Dataset({
    features: ['conversations', 'source', 'score', 'text'],
    num_rows: 100000
})

In [12]:
training_args = TrainingArguments(
    per_device_train_batch_size=12,
    gradient_accumulation_steps=4,
    max_steps=60,
    warmup_steps=5,
    learning_rate=2e-4,
    fp16 = False,
    bf16=torch.cuda.is_bf16_supported(),
    logging_steps=1,
    output_dir="output",
    gradient_checkpointing=False, # Disable gradient checkpointing
)

In [13]:
trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = 2048,
    args=training_args
)

Unsloth: Tokenizing ["text"]:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [None]:
!pip install triton==3.2.0 --force-reinstall

In [14]:
import os
os.environ["TRITON_DISABLE_LINE_INFO"] = "1"

In [15]:
!pip freeze | grep triton

triton==3.2.0


In [16]:
!pip freeze | grep torch

torch==2.7.1
torchao==0.10.0
torchaudio @ https://download.pytorch.org/whl/cu124/torchaudio-2.6.0%2Bcu124-cp311-cp311-linux_x86_64.whl
torchdata==0.11.0
torchsummary==1.5.1
torchtune==0.6.1
torchvision==0.22.1


In [17]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 12 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (12 x 4 x 1) = 48
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)
[34m[1mwandb[0m: Currently logged in as: [33mabeshith[0m ([33mabeshith-dr-m-g-r-educational-and-research-institute[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.5472
2,1.4808
3,1.4807
4,1.4296
5,1.2147
6,1.3689
7,1.2348
8,1.1377
9,1.0726
10,1.0701


TrainOutput(global_step=60, training_loss=1.0040059347947439, metrics={'train_runtime': 3130.6096, 'train_samples_per_second': 0.92, 'train_steps_per_second': 0.019, 'total_flos': 4.889289759451546e+16, 'train_loss': 1.0040059347947439})

In [18]:
model.save_pretrained('finetuned_model')

In [19]:
inference_model, inference_tokenizer = FastLanguageModel.from_pretrained(
    model_name = "./finetuned_model",
    max_seq_length = 2048,
    load_in_4bit = True
)

==((====))==  Unsloth 2025.7.11: Fast Llama patching. Transformers: 4.54.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [20]:
text_prompts = [
    "What are the key principles of investments?"
]

for prompt in text_prompts:
    format_prompt = inference_tokenizer.apply_chat_template([{
        "role": "user",
        "content": prompt
        }], tokenize=False)
    model_inputs = inference_tokenizer(format_prompt, return_tensors="pt").to('cuda')

    generated_ids = inference_model.generate(
        **model_inputs,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
        pad_token_id=inference_tokenizer.pad_token_id,
    )


    response = inference_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(response)

system

Cutting Knowledge Date: December 2023
Today Date: 30 Jul 2025

user

What are the key principles of investments?assistant

The key principles of investments are:

1. Diversification: Investing in a variety of assets to reduce risk and increase potential returns.
2. Risk management: Assessing and managing the risks associated with investments to minimize potential losses.
3. Time horizon: Considering the length of time an investment will be held to determine the best strategy.
4. Diversification: Investing in a variety of assets to reduce risk and increase potential returns.
5. Risk management: Assessing and managing the risks associated with investments to minimize potential losses.
6. Time horizon: Considering the length of time an investment will be held to determine the best strategy.
7. Liquidity: Considering the ease of selling an investment when needed.
8. Fees: Considering the costs associated with managing an investment, such as management fees and trading fees.
9. Taxa