In [1]:
# Example: Baseline inference with a 1B instruct model (zero-shot)
!pip install unsloth transformers datasets -q
from datasets import load_dataset
from unsloth import FastLanguageModel
import torch

# Load data (take 75%-agreement subset) and split
dataset = load_dataset("takala/financial_phrasebank", "sentences_75agree")
data = dataset["train"]
data = data.train_test_split(test_size=0.2, seed=42)
train_data = data["train"]
test_data = data["test"]

# Load base model in 4-bit (QLoRA) for speed
model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
model, tokenizer = FastLanguageModel.from_pretrained(model_name, max_seq_length=2048, load_in_4bit=True)
FastLanguageModel.for_inference(model)  # prepare for faster generation

# Example inference on one sample
sample = test_data[0]["sentence"]
prompt = f"{sample} Sentiment:"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outs = model.generate(input_ids=inputs['input_ids'], max_new_tokens=3, temperature=0.2)
label = tokenizer.decode(outs[0], skip_special_tokens=True)
print("Sentence:", sample)
print("Predicted sentiment:", label)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.7/311.7 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.9/511.9 kB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.8/184.8 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.2/117.2 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.0/130.0 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

README.md: 0.00B [00:00, ?B/s]

financial_phrasebank.py: 0.00B [00:00, ?B/s]

The repository for takala/financial_phrasebank contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/takala/financial_phrasebank.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


FinancialPhraseBank-v1.0.zip:   0%|          | 0.00/682k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3453 [00:00<?, ? examples/s]

==((====))==  Unsloth 2025.8.9: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Sentence: It estimates the operating profit to further improve from the third quarter .
Predicted sentiment: It estimates the operating profit to further improve from the third quarter. Sentiment: positive. The


In [29]:
print(test_data[3])

{'sentence': 'Our tools are specifically designed with the needs of both the business users and ICT experts in mind .', 'label': 1}


In [5]:
!pip install scikit-learn -q
from sklearn.metrics import accuracy_score, f1_score

# Label mapping
label_map = {"negative": 0, "neutral": 1, "positive": 2}

def normalize_pred(text):
    text = text.lower()
    if "pos" in text:
        return 2   # positive
    elif "neg" in text:
        return 0   # negative
    elif "neu" in text:
        return 1   # neutral
    else:
        return 1   # fallback to neutral

y_true = []
y_pred = []

for sample in test_data:
    sent = sample["sentence"]
    gold = sample["label"]  # already int (0/1/2)

    prompt = f"Sentence: {sent}\nSentiment:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=3,
        temperature=0.2,
    )
    pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred = normalize_pred(pred_text)

    y_true.append(gold)
    y_pred.append(pred)

# Compute metrics
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average="macro")
print(f"Baseline Accuracy: {acc*100:.2f}%")
print(f"Baseline F1 (macro): {f1*100:.2f}%")


Baseline Accuracy: 61.79%
Baseline F1 (macro): 30.51%


In [3]:
from unsloth import FastLanguageModel

# Load base model (16-bit for LoRA training)
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/Llama-3.2-1B",  # use non-bnb if you want float16 training
    max_seq_length=2048,
    load_in_4bit=False,
)

# Wrap with LoRA
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha=16,
    lora_dropout=0.0,
    bias="none",
    use_gradient_checkpointing="unsloth",
)

model.print_trainable_parameters()


==((====))==  Unsloth 2025.8.9: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Unsloth 2025.8.9 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


trainable params: 11,272,192 || all params: 1,247,086,592 || trainable%: 0.9039


In [23]:
id2label = {0: "negative", 1: "neutral", 2: "positive"}

def formatting_func(example):
    texts = []
    sentences = example["sentence"]
    labels = example["label"]

    # Case 1: batch (lists)
    if isinstance(sentences, list):
        for sent, lab in zip(sentences, labels):
            texts.append(f"Sentence: {sent}\nSentiment: {id2label[lab]}")
    else:
        # Case 2: single sample
        texts.append(f"Sentence: {sentences}\nSentiment: {id2label[labels]}")
    return texts


In [24]:
from transformers import TrainingArguments
from unsloth.trainer import SFTTrainer

args = TrainingArguments(
    output_dir="outputs_lora_1b",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=50,
    save_strategy="no",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=test_data,
    formatting_func=formatting_func,
    max_seq_length=512,
    args=args,
)


trainer.train()


Unsloth: Tokenizing ["text"]:   0%|          | 0/2762 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"]:   0%|          | 0/691 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,762 | Num Epochs = 3 | Total steps = 1,038
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maravindyuvraj007[0m ([33maravindyuvraj007-vnrvjietofficial[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,2.7565
100,2.3333
150,2.2379
200,2.2605
250,2.1801
300,2.2133
350,2.137


Step,Training Loss
50,2.7565
100,2.3333
150,2.2379
200,2.2605
250,2.1801
300,2.2133
350,2.137
400,1.8883
450,1.9047
500,1.917


TrainOutput(global_step=1038, training_loss=1.9198419831851083, metrics={'train_runtime': 820.7143, 'train_samples_per_second': 10.096, 'train_steps_per_second': 1.265, 'total_flos': 2145740514902016.0, 'train_loss': 1.9198419831851083})

In [25]:
from sklearn.metrics import accuracy_score, f1_score

y_true, y_pred = [], []

id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {v:k for k,v in id2label.items()}

for ex in test_data:
    prompt = f"Sentence: {ex['sentence']}\nSentiment:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=5)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # extract last token as sentiment guess
    guess = None
    for cand in id2label.values():
        if cand in pred.lower():
            guess = cand
            break
    if guess is None:
        guess = "neutral"   # fallback

    y_true.append(ex["label"])
    y_pred.append(label2id[guess])

# Compute metrics
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average="macro")
print(f"Fine-tuned LoRA Accuracy: {acc*100:.2f}%")
print(f"Fine-tuned LoRA F1 (macro): {f1*100:.2f}%")


Fine-tuned LoRA Accuracy: 90.74%
Fine-tuned LoRA F1 (macro): 89.05%


In [36]:
# Test the fine-tuned model with a new prompt
new_prompt = "Sentence: I am not feeling good, I am feeling very not good and not good heart attack.\nSentiment:"
inputs = tokenizer(new_prompt, return_tensors="pt").to(model.device)

outputs = model.generate(**inputs, max_new_tokens=5)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Sentiment:", generated_text)

Generated Sentiment: Sentence: I am not feeling good, I am feeling very not good and not good heart attack.
Sentiment: negative - negative Sentiment


# Task
Set up the code to fine-tune a language model using QLoRA with the `FastLanguageModel` library. Load the model in 4-bit precision, wrap it with LoRA, define training arguments, and initialize the `SFTTrainer`. Do not run the training.

## Load the model in 4-bit

### Subtask:
Load the base model in 4-bit precision using `FastLanguageModel.from_pretrained`.


**Reasoning**:
The subtask is to load the base model in 4-bit precision. This requires using the `FastLanguageModel.from_pretrained` method with the specified parameters.



In [48]:
from unsloth import FastLanguageModel

# Load base model in 4-bit for QLoRA fine-tuning
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/Llama-3.2-3B-bnb-4bit",
    max_seq_length=2048,
    load_in_4bit=True,
)

peft_model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha=16,
    lora_dropout=0.0,
    bias="none",
    use_gradient_checkpointing="unsloth"
)

peft_model.print_trainable_parameters()


==((====))==  Unsloth 2025.8.9: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.8.9 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


trainable params: 24,313,856 || all params: 3,237,063,680 || trainable%: 0.7511


In [49]:
from transformers import TrainingArguments
from unsloth.trainer import SFTTrainer

args = TrainingArguments(
    output_dir="outputs_lora_1b",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=50,
    save_strategy="no",
)

trainer = SFTTrainer(
    model=peft_model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=test_data,
    formatting_func=formatting_func,
    max_seq_length=512,
    args=args,
)


trainer.train()


Unsloth: Tokenizing ["text"]:   0%|          | 0/2762 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"]:   0%|          | 0/691 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,762 | Num Epochs = 3 | Total steps = 1,038
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Step,Training Loss
50,2.5037
100,2.2196
150,2.1185
200,2.1341
250,2.057
300,2.0913
350,2.0226
400,1.7719
450,1.7802
500,1.7904


TrainOutput(global_step=1038, training_loss=1.7802981264559061, metrics={'train_runtime': 1759.1777, 'train_samples_per_second': 4.71, 'train_steps_per_second': 0.59, 'total_flos': 6197031974510592.0, 'train_loss': 1.7802981264559061})

In [50]:
from sklearn.metrics import accuracy_score, f1_score

y_true, y_pred = [], []

id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {v:k for k,v in id2label.items()}

for ex in test_data:
    prompt = f"Sentence: {ex['sentence']}\nSentiment:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = peft_model.generate(**inputs, max_new_tokens=5)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # extract last token as sentiment guess
    guess = None
    for cand in id2label.values():
        if cand in pred.lower():
            guess = cand
            break
    if guess is None:
        guess = "neutral"   # fallback

    y_true.append(ex["label"])
    y_pred.append(label2id[guess])

# Compute metrics
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average="macro")
print(f"Fine-tuned LoRA Accuracy: {acc*100:.2f}%")
print(f"Fine-tuned LoRA F1 (macro): {f1*100:.2f}%")


Fine-tuned LoRA Accuracy: 95.66%
Fine-tuned LoRA F1 (macro): 94.98%


In [51]:
from peft import PromptTuningConfig, get_peft_model, TaskType, PromptTuningInit

prompt_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.RANDOM,
    num_virtual_tokens=16,
    tokenizer_name_or_path=model_name
)
prompt_model = get_peft_model(model, prompt_config)
print(prompt_model.print_trainable_parameters())


trainable params: 49,152 || all params: 3,237,112,832 || trainable%: 0.0015
None




Unsloth: Tokenizing ["text"]:   0%|          | 0/691 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,762 | Num Epochs = 3 | Total steps = 1,038
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 49,152 of 3,237,112,832 (0.00% trained)


AssertionError: No inf checks were recorded prior to update.