# Requirements

In [1]:
pip install unsloth transformers datasets trl bitsandbytes accelerate --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.8/61.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m353.0/353.0 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m283.5/283.5 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/122.9 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

# Finetuning Script

In [12]:
import os
import re
import torch
from datasets import load_dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth.chat_templates import get_chat_template

# ============================================================
# 1. Load Model
# ============================================================
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=2048,
    load_in_4bit=True,
)

# ============================================================
# 2. LoRA setup (robust params)
# ============================================================
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=32,
    lora_dropout=0,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# ============================================================
# 3. Use ChatML
# ============================================================
tokenizer = get_chat_template(tokenizer, chat_template="chatml")
tokenizer.padding_side = "right"
tokenizer.truncation_side = "right"

# ============================================================
# 4. Load Dataset
# ============================================================
dataset = load_dataset("mlabonne/FineTome-100k", split="train")

# ============================================================
# 5. Strip BAD TOKENS found in FineTome
# ============================================================
BAD_TOKENS = [
    r"<\|begin_of_text\|>",
    r"<\|end_of_text\|>",
    r"<\|start_header_id\|>",
    r"<\|end_header_id\|>",
    r"<\|eot_id\|>",
    r"<\|assistant\|>",
    r"<\|user\|>",
    r"<\|system\|>",
    r"<\|unspecified\|>",
    r"<\|im_start\|>",
    r"<\|im_end\|>",
]

bad_regex = re.compile("|".join(BAD_TOKENS))

def strip_bad_tokens(example):
    cleaned = []
    for msg in example["conversations"]:
        if isinstance(msg, dict) and "content" in msg:
            msg["content"] = bad_regex.sub("", msg["content"])
        cleaned.append(msg)
    example["conversations"] = cleaned
    return example

dataset = dataset.map(strip_bad_tokens)

# ============================================================
# 6. Unicode + Whitespace Cleaner
# ============================================================
def clean_unicode(example):
    for msg in example["conversations"]:
        if isinstance(msg, dict) and "content" in msg:
            msg["content"] = (
                msg["content"]
                .replace("\x00", "")
                .replace("\u0000", "")
                .replace("\u200b", "")  # zero-width spaces
                .replace("\u200c", "")
                .replace("\u200d", "")
                .replace("\uFEFF", "")
            )
    return example

dataset = dataset.map(clean_unicode)

# ============================================================
# 7. Normalize fine-tome formats (core fix)
# ============================================================
def normalize_messages(conversations):
    msgs = []
    for msg in conversations:

        # role + content (standard)
        if isinstance(msg, dict) and "role" in msg and "content" in msg:
            msgs.append({"role": msg["role"], "content": msg["content"]})

        # from + value (ShareGPT style)
        elif isinstance(msg, dict) and "from" in msg and "value" in msg:
            msgs.append({
                "role": "assistant" if msg["from"] != "human" else "user",
                "content": msg["value"],
            })

        # pure strings
        elif isinstance(msg, str):
            msgs.append({"role": "user", "content": msg})

    return msgs if len(msgs) > 0 else None

# ============================================================
# 8. Format dataset into ChatML strings
# ============================================================
def format_batch(batch):
    out_texts = []

    for conv in batch["conversations"]:
        msgs = normalize_messages(conv)
        if msgs is None:
            out_texts.append("")
            continue

        rendered = tokenizer.apply_chat_template(
            msgs,
            tokenize=False,
            add_generation_prompt=True,
        )
        out_texts.append(rendered)

    return {"text": out_texts}

dataset = dataset.map(format_batch, batched=True)

# ============================================================
# 9. Remove invalid rows proactively
# ============================================================
dataset = dataset.filter(lambda x: x["text"] is not None and x["text"].strip() != "")

# Keep only required column
dataset = dataset.remove_columns([c for c in dataset.column_names if c != "text"])

# LAST SAFETY CHECK
print("Final dataset size:", len(dataset))

# ============================================================
# 10. Training
# ============================================================

os.environ["UNSLOTH_DISABLE_TOKEN_CHECKS"] = "1"
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer, # Added tokenizer here
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        max_steps=60,
        warmup_steps=5,
        learning_rate=2e-4,
        logging_steps=1,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        output_dir="outputs",
    ),
)

trainer.train()

# ============================================================
# 11. Save LoRA
# ============================================================
model.save_pretrained("finetuned_model")
print("Training finished successfully.")

==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100000 [00:00<?, ? examples/s]

Final dataset size: 100000


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/100000 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33makshatm22102[0m ([33makshatm22102-iiit-naya-raipur[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.2944
2,1.6342
3,1.1676
4,1.2192
5,1.16
6,1.2241
7,0.7494
8,1.3871
9,1.0956
10,1.135


Training finished successfully.


# SAVING THE MODEL IN GDRIVE

In [13]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [15]:
!cp -r finetuned_model /content/drive/MyDrive/llama32_finetuned/


# LOAD MODEL FOR INFERENCE

In [16]:
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    "/content/drive/MyDrive/llama32_finetuned",
    max_seq_length=2048,
    load_in_4bit=True,
)

tokenizer.padding_side = "left"

prompt = "Explain quantum entanglement in simple terms."

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Explain quantum entanglement in simple terms. What are the benefits and limitations of entanglement in various fields, and how do researchers explore and harness entanglement? (Approx. 500 words) - Step 1: Define Quantum Entanglement
Quantum entanglement is a phenomenon in which two or more particles become connected in such a way that the state of one particle is instantaneously affected by the state of the other, regardless of the distance between them. This connection is not physical but rather a quantum mechanical one, governed by th

In [None]:
messages = [
    {"role": "user", "content": "Explain transformers in one paragraph."}
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False)

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [18]:
messages = [
    {"role": "user", "content": "Explain transformers in one paragraph."}
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False)

inputs = tokenizer(
    prompt,
    return_tensors="pt",
    padding=True,
    truncation=True
).to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_p=0.9
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


system

Cutting Knowledge Date: December 2023
Today Date: 22 Nov 2025

user

Explain transformers in one paragraph.
Transformers are a type of artificial neural network (ANN) used for image and video processing. The term "transformer" was coined by Vasudeva Vasu in 2017. A transformer is a neural network model that can handle long-range dependencies in sequences, unlike recurrent neural networks (RNNs), which are limited to handling shorter sequences. The transformer model consists of an encoder and a decoder. The encoder takes in a sequence (e.g., an image or a video frame) and outputs a sequence of vectors. The decoder then takes the output vectors and generates a new sequence (e.g., the transformed image). The key innovation of the transformer is the self-attention mechanism, which allows the model to weigh the importance of different input vectors when generating the output. This allows the model to capture long-range dependencies in the input sequence more effectively.


# miscellanous testing before finetuning

In [4]:
print("Tokenizer type:", type(tokenizer))
print("Tokenizer is None?", tokenizer is None)


Tokenizer type: <class 'transformers.tokenization_utils_fast.PreTrainedTokenizerFast'>
Tokenizer is None? False


In [5]:
import random
sample = dataset[random.randint(0, len(dataset)-1)]
print(sample)


{'conversations': [{'content': 'How does coral reef bleaching affect the population dynamics of marine organisms living in and around the reef?', 'role': 'user'}, {'content': "Coral reef bleaching has significant impacts on the population dynamics of marine organisms living in and around the reef. Coral bleaching occurs when corals expel the symbiotic algae (zooxanthellae) living within their tissues due to stress factors such as increased water temperature, pollution, and ocean acidification. This expulsion leads to the loss of the coral's vibrant colors and can eventually result in the death of the coral if the stress persists. The effects of coral bleaching on marine organisms can be categorized into several aspects:\n\n1. Loss of habitat: Coral reefs provide essential habitat, shelter, and breeding grounds for a diverse array of marine organisms, including fish, invertebrates, and other marine life. When coral bleaching occurs, the structural complexity of the reef declines, leadin

In [6]:
print(type(sample["conversations"]))
print(sample["conversations"])


<class 'list'>
[{'content': 'How does coral reef bleaching affect the population dynamics of marine organisms living in and around the reef?', 'role': 'user'}, {'content': "Coral reef bleaching has significant impacts on the population dynamics of marine organisms living in and around the reef. Coral bleaching occurs when corals expel the symbiotic algae (zooxanthellae) living within their tissues due to stress factors such as increased water temperature, pollution, and ocean acidification. This expulsion leads to the loss of the coral's vibrant colors and can eventually result in the death of the coral if the stress persists. The effects of coral bleaching on marine organisms can be categorized into several aspects:\n\n1. Loss of habitat: Coral reefs provide essential habitat, shelter, and breeding grounds for a diverse array of marine organisms, including fish, invertebrates, and other marine life. When coral bleaching occurs, the structural complexity of the reef declines, leading t

In [9]:
bad = dataset.filter(lambda x: x["text"] is None or x["text"].strip() == "")
print(len(bad))


Filter:   0%|          | 0/100000 [00:00<?, ? examples/s]

0
