# INSTALL DEPENDENCIES

In [1]:
%%capture

!pip install unsloth # install unsloth
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git # Also get the latest version Unsloth!

# IMPORT MODULES

In [2]:
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
import torch
from trl import SFTTrainer

from huggingface_hub import login
from transformers import TrainingArguments
from datasets import load_dataset
import wandb
from kaggle_secrets import UserSecretsClient


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# LOGIN

In [4]:
# Read the API tokens from Kaggle Secrets so no credential is hard-coded in the notebook.
# (UserSecretsClient, login and wandb are imported in the imports cell.)
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HF_TOKEN")     # passed to huggingface_hub.login below
secret_value_1 = user_secrets.get_secret("wandb_TOKEN")  # passed to wandb.login below

# Authenticate with the Hugging Face Hub.
login(secret_value_0)

# Authenticate with Weights & Biases and open the run for this session.
wandb.login(key=secret_value_1)
run = wandb.init(
    project='Sentiment_fine-tuning',
    job_type="training",
    anonymous="allow",
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33maadityavaid2004[0m ([33maadityavaid2004-indian-institute-of-technology-kanpur[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# LOAD MODEL AND TOKENIZER

In [5]:
# Checkpoint and loading options for the base model.
model_name = "meta-llama/Llama-3.2-3B"
max_seq_length = 2048  # reused when the trainer is built
dtype = None           # presumably lets Unsloth choose the dtype — TODO confirm
load_in_4bit = True    # load the weights 4-bit quantized

# Load the base model together with its tokenizer, authenticating with the HF token.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name,
    token=secret_value_0,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    max_seq_length=max_seq_length,
)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 6.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

# PROMPT TEMPLATE FOR INFERENCE

In [6]:
# Instruction-style prompt shared by inference and fine-tuning.
# The instruction text is fixed; the template has two positional `{}` slots:
#   1st {} -> the review text (under "### Question:")
#   2nd {} -> the response; filled with "" at inference time and with the
#             sentiment label when training examples are formatted
prompt_temp = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.

### Instruction:
Classify the following product review as positive or negative.

### Question:
{}

### Response:
{}"""

In [7]:
question = """Not a bad one."""
FastLanguageModel.for_inference(model)  # Unsloth has 2x faster inference!

# Fill the prompt template (`prompt_temp`) with the review, leaving the response slot empty,
# then tokenize it and move the tensors to the GPU.
inputs = tokenizer([prompt_temp.format(question, "")], return_tensors="pt").to("cuda")

# Generate a completion for the prompt.
outputs = model.generate(
    input_ids=inputs.input_ids,            # tokenized prompt
    attention_mask=inputs.attention_mask,  # mask produced by the tokenizer
    max_new_tokens=1200,                   # upper bound on the number of generated tokens
    use_cache=True,                        # enable caching for faster inference
)

# Decode the full sequence (prompt + completion) back to text.
response = tokenizer.batch_decode(outputs)

# Print everything after the first "### Response:" marker. maxsplit=1 keeps the whole
# completion intact even if the model emits the marker again in its own output.
print(response[0].split("### Response:", 1)[1])


This is a positive review. It is a good product and the reviewer is satisfied with the product.<|end_of_text|>


In [8]:
question = """No battery life, battery got discharged in 1 hour whereas the company claims it can run for 4 hours straight."""
FastLanguageModel.for_inference(model)  # Unsloth has 2x faster inference!

# Fill the prompt template (`prompt_temp`) with the review, leaving the response slot empty,
# then tokenize it and move the tensors to the GPU.
inputs = tokenizer([prompt_temp.format(question, "")], return_tensors="pt").to("cuda")

# Generate a completion for the prompt.
outputs = model.generate(
    input_ids=inputs.input_ids,            # tokenized prompt
    attention_mask=inputs.attention_mask,  # mask produced by the tokenizer
    max_new_tokens=1200,                   # upper bound on the number of generated tokens
    use_cache=True,                        # enable caching for faster inference
)

# Decode the full sequence (prompt + completion) back to text.
response = tokenizer.batch_decode(outputs)

# Print everything after the first "### Response:" marker. maxsplit=1 keeps the whole
# completion intact even if the model emits the marker again in its own output.
print(response[0].split("### Response:", 1)[1])


Negative

### Explanation:
The battery life is too short. The product is not reliable.<|end_of_text|>


# DATASET PREPARATION FOR FINE-TUNING

In [20]:
# Take the first 10,000 rows of the train split from the Hub dataset below.
dataset_path = "Q-b1t/IMDB-Dataset-of-50K-Movie-Reviews-Backup"
dataset = load_dataset(
    path=dataset_path,
    split="train[:10000]",
    # NOTE(review): this permits code shipped with the dataset repo to run — confirm it is required.
    trust_remote_code=True,
)

In [21]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['review', 'sentiment'],
    num_rows: 10000
})

# EOS TOKEN

In [11]:
# End-of-sequence token of the loaded tokenizer; formatting_prompt_func appends it
# to every formatted training example.
EOS_TOKEN = tokenizer.eos_token  
# Echo the token so its literal value is visible in the cell output.
EOS_TOKEN

'<|end_of_text|>'

# FORMATTING PROMPT FUNCTION

In [14]:
def formatting_prompt_func(examples):
    """Render a batch of reviews into training prompts.

    Args:
        examples: Batch mapping that provides parallel "review" and
            "sentiment" sequences.

    Returns:
        A dict with the single key "texts": one string per example, produced
        by filling `prompt_temp` with the review and its sentiment and then
        appending `EOS_TOKEN`.
    """
    review_label_pairs = zip(examples["review"], examples["sentiment"])
    return {
        "texts": [
            prompt_temp.format(review, label) + EOS_TOKEN
            for review, label in review_label_pairs
        ]
    }
    

In [22]:
# Add a "texts" column holding the formatted prompt for every row (processed in batches).
dataset_finetune = dataset.map(formatting_prompt_func, batched=True)

# Preview the first formatted example.
dataset_finetune[0]["texts"]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

"Below is an instruction that describes a task, paired with an input that provides further context.\nWrite a response that appropriately completes the request.\n\n### Instruction:\nClassify the following product review as positive or negative.\n\n### Question:\nOne of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on th

# LORA CONFIGURATIONS

In [16]:
# Projection layers that receive LoRA adapters.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

# Wrap the base model with LoRA adapters (rank 16, alpha 16, no dropout, no bias terms).
lora_model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=attention_projections + mlp_projections,
    use_gradient_checkpointing="unsloth",
    use_rslora=False,
    loftq_config=None,
    random_state=3407,
)

Unsloth 2025.3.19 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [17]:
# Report how many parameters are trainable versus the total parameter count.
lora_model.print_trainable_parameters()

trainable params: 24,313,856 || all params: 3,237,063,680 || trainable%: 0.7511


# TRAINER AND TRAINING

In [23]:
# Decide the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation hyper-parameters for a single epoch of fine-tuning.
training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 examples per optimizer step on one device
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=5,
    weight_decay=0.01,
    optim="adamw_8bit",
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=200,
)

# Supervised fine-tuning trainer over the formatted "texts" column.
trainer = SFTTrainer(
    model=lora_model,
    tokenizer=tokenizer,
    train_dataset=dataset_finetune,
    dataset_text_field="texts",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,  # number of worker processes for dataset preprocessing
    args=training_args,
)

Unsloth: Tokenizing ["texts"] (num_proc=2):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [24]:
# Run the fine-tuning loop and keep the value returned by train() for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,000 | Num Epochs = 1 | Total steps = 1,250
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856/3,000,000,000 (0.81% trained)


Step,Training Loss
200,2.3815
400,2.3593
600,2.3436
800,2.3534
1000,2.3416
1200,2.3431


In [25]:
# Close the Weights & Biases run opened with wandb.init(); this does not save the model.
wandb.finish()

0,1
train/epoch,▁▂▄▅▆██
train/global_step,▁▂▄▅▆██
train/grad_norm,█▁▇▂▁▅
train/learning_rate,█▇▅▄▂▁
train/loss,█▄▁▃▁▁

0,1
total_flos,7.663990168355635e+16
train/epoch,1.0
train/global_step,1250.0
train/grad_norm,0.25309
train/learning_rate,1e-05
train/loss,2.3431
train_loss,2.35275
train_runtime,14538.6448
train_samples_per_second,0.688
train_steps_per_second,0.086


In [26]:
question = """No battery life, battery got discharged in 1 hour whereas the company claims it can run for 4 hours straight."""
FastLanguageModel.for_inference(lora_model)  # Unsloth has 2x faster inference!

# Fill the prompt template (`prompt_temp`) with the review, leaving the response slot empty,
# then tokenize it and move the tensors to the GPU.
inputs = tokenizer([prompt_temp.format(question, "")], return_tensors="pt").to("cuda")

# Generate a completion with the fine-tuned (LoRA) model.
outputs = lora_model.generate(
    input_ids=inputs.input_ids,            # tokenized prompt
    attention_mask=inputs.attention_mask,  # mask produced by the tokenizer
    max_new_tokens=1200,                   # upper bound on the number of generated tokens
    use_cache=True,                        # enable caching for faster inference
)

# Decode the full sequence (prompt + completion) back to text.
response = tokenizer.batch_decode(outputs)

# Print everything after the first "### Response:" marker. maxsplit=1 keeps the whole
# completion intact even if the model emits the marker again in its own output.
print(response[0].split("### Response:", 1)[1])


negative<|end_of_text|>


In [27]:
# Upload the fine-tuned LoRA model to the Hugging Face Hub under the given repo id.
lora_model.push_to_hub("aaditya-vaid/Llama-3.2-3B-Fine-Tuned-IMDB10K")

README.md:   0%|          | 0.00/599 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

Saved model to https://huggingface.co/aaditya-vaid/Llama-3.2-3B-Fine-Tuned-IMDB10K
