# LoRA Fine-Tuning with Unsloth (smollm2-135m)

## Step 1: Install and Import Dependencies

In [1]:
# Install Unsloth and related libraries
!pip install unsloth torch accelerate bitsandbytes datasets transformers trl -q

import torch
from unsloth import FastLanguageModel
from datasets import load_dataset


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m348.8/348.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.7/276.7 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.2/117.2 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Step 2: Load Model and Tokenizer (LoRA Mode)

In [2]:
# Load the base model (smollm2-135m) with 4-bit quantization
# This saves memory and allows LoRA fine-tuning efficiently
model_name = "unsloth/smollm2-135m"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    load_in_4bit=True,        # enables QLoRA style training
    device_map="auto"
)

# Enable LoRA — parameter-efficient fine-tuning
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                     # LoRA rank (small = efficient)
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],  # LoRA applies to attention layers
)

print("✅ LoRA configuration complete.")


==((====))==  Unsloth 2025.11.1: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/742 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.11.1 patched 30 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


✅ LoRA configuration complete.


## Step 3: Load or Prepare Dataset

In [3]:
# Example: Alpaca-style dataset for instruction tuning
dataset = load_dataset("yahma/alpaca-cleaned")

# For speed, use a small subset (for demonstration)
train_dataset = dataset["train"].select(range(100))

# Tokenization function — converts text to tokens for model
def tokenize_function(example):
    instruction = example["instruction"]
    input_text = example["input"]
    output_text = example["output"]

    # Chat-style formatting
    text = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output_text}"
    return tokenizer(text, truncation=True, padding="max_length", max_length=512)

tokenized_dataset = train_dataset.map(tokenize_function)


README.md: 0.00B [00:00, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

## Step 4: Fine-Tune the Model with LoRA

In [5]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="smollm2-135m-lora-finetuned",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=10,
    max_steps=200,              # adjust depending on your compute
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="steps",
    save_steps=50,
    report_to="none",
)

# Define the trainer for supervised fine-tuning (SFT)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
    dataset_text_field=None,    # already tokenized
    args=training_args,
)

# Start fine-tuning
trainer.train()

print("🎯 LoRA fine-tuning complete using SFTTrainer!")


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100 | Num Epochs = 16 | Total steps = 200
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 921,600 of 135,437,184 (0.68% trained)


Step,Training Loss
10,17.2434
20,13.6663
30,6.8329
40,2.043
50,0.8317
60,0.7993
70,0.7703
80,0.7313
90,0.7354
100,0.7064


🎯 LoRA fine-tuning complete using SFTTrainer!


## Step 5: Save the Adapter Weights

In [6]:
# Save LoRA adapter weights
save_dir = "lora_adapter"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"✅ LoRA adapter saved at {save_dir}")


✅ LoRA adapter saved at lora_adapter


## Step 6: Inference (Testing the Finetuned Model)

In [7]:
from transformers import pipeline

# Load the LoRA-finetuned model for inference
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
)

prompt = "Write a Python function to check if a number is prime."
response = pipe(prompt)[0]["generated_text"]

print("🧠 Model Response:\n")
print(response)


Device set to use cuda:0


🧠 Model Response:

Write a Python function to check if a number is prime.

```
if n in [1, 2, 3]:
    print("Prime Number")
else:
    print("Not Prime Number")
```

### Testing Your Code

Let's test our function with some examples. First, let's test the case where `n` is `1`:

```python
print(n in [1, 2])
```

This test tells us that `n` is `1`, but not `


## Step 7A: Utilities: chat helper + multiple chat templates

In [8]:
# === Utilities for generation across different chat templates ===
import textwrap
from transformers import pipeline

# Recreate a generation pipeline (if needed) with your LoRA-tuned model loaded in memory
gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.05,
)

def render_alpaca(instruction, inp="", response_prefix=True):
    # Matches the format used during tokenization in Step 3
    # Response prefix helps steer the model to answer after the header
    prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{inp}\n\n### Response:\n"
    return prompt if response_prefix else prompt.rstrip()

def render_llama(system, user):
    # Llama-style plain-text emulation (safe for any tokenizer)
    # (Not relying on special tokens; this is just readable formatting)
    return (
        f"<<SYS>>\n{system}\n<</SYS>>\n\n"
        f"[INST] {user} [/INST]\n"
    )

def render_chatml(system, user):
    # ChatML/Qwen-like format
    return (
        f"<|im_start|>system\n{system}<|im_end|>\n"
        f"<|im_start|>user\n{user}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )

def render_gemma(system, user):
    # Gemma-like readable format (plain text emulation)
    return (
        f"<bos><start_of_turn>system\n{system}<end_of_turn>\n"
        f"<start_of_turn>user\n{user}<end_of_turn>\n"
        f"<start_of_turn>assistant\n"
    )

def generate_text(prompt, max_new_tokens=256):
    out = gen(prompt, max_new_tokens=max_new_tokens)[0]["generated_text"]
    # Return only the completion after the prompt, when possible
    return out[len(prompt):] if out.startswith(prompt) else out

def show_result(title, prompt, completion, max_prompt_preview=800):
    print(f"\n{'='*90}\n{title}\n{'='*90}")
    print("\n--- Prompt (preview) ---")
    print(textwrap.shorten(prompt, width=max_prompt_preview, placeholder=" [...]"))
    print("\n--- Model Output ---")
    print(completion.strip())


Device set to use cuda:0


## Step 7B: CHAT TASK — “Explain a concept concisely” across templates

In [9]:
# A single user task, rendered through four templates
SYSTEM = "You are a helpful, concise teaching assistant."
USER = "Explain the difference between shallow and deep copies in Python with a tiny example."

prompts = {
    "Alpaca (Instruction/Input/Response)": render_alpaca(
        instruction="Explain the difference between shallow and deep copies in Python with a tiny example.",
        inp=""
    ),
    "Llama-style (system+user)": render_llama(SYSTEM, USER),
    "ChatML / Qwen-style": render_chatml(SYSTEM, USER),
    "Gemma-style": render_gemma(SYSTEM, USER),
}

for name, p in prompts.items():
    completion = generate_text(p, max_new_tokens=220)
    show_result(f"CHAT TEMPLATE: {name}", p, completion)



CHAT TEMPLATE: Alpaca (Instruction/Input/Response)

--- Prompt (preview) ---
### Instruction: Explain the difference between shallow and deep copies in Python with a tiny example. ### Input: ### Response:

--- Model Output ---
Deep copies are those that have an object as a copy of the original, while shallow copies are those that do not. In the case of deep copies, you need to create an instance of the original object in memory and then reference it from there. In the case of shallow copies, you just need to refer to it directly in memory.

### Output:


### Example:
```python
from abc import ABC

classABC(ABC):
    pass


a =ABC()
b = a
c = a

def i_def():
    c()


def i_def():
    ABC()

def i_def():
    ABC()

def i_def():
    ABC()

def i_def():
    ABC()

def i_def():
    ABC()

def i_def():
    ABC()

def i_def():
    ABC()

def i_def():
    ABC()

def i_def():
    ABC()

CHAT TEMPLATE: Llama-style (system+user)

--- Prompt (preview) ---
<<SYS>> You are a helpful, concise teach

## Step 7C: CODING TASK — “Write a function + test it” across templates

In [10]:
SYSTEM = "You are a careful Python coding assistant. Always return complete, runnable code."
USER = (
    "Write a Python function `is_prime(n:int)->bool` that returns True if n is prime, "
    "otherwise False. Then show a few quick tests."
)

prompts = {
    "Alpaca (Instruction/Input/Response)": render_alpaca(
        instruction="Write a Python function is_prime(n:int)->bool that returns True if n is prime, otherwise False. Then show a few quick tests.",
        inp=""
    ),
    "Llama-style (system+user)": render_llama(SYSTEM, USER),
    "ChatML / Qwen-style": render_chatml(SYSTEM, USER),
    "Gemma-style": render_gemma(SYSTEM, USER),
}

for name, p in prompts.items():
    completion = generate_text(p, max_new_tokens=256)
    show_result(f"CODING TEMPLATE: {name}", p, completion)



CODING TEMPLATE: Alpaca (Instruction/Input/Response)

--- Prompt (preview) ---
### Instruction: Write a Python function is_prime(n:int)->bool that returns True if n is prime, otherwise False. Then show a few quick tests. ### Input: ### Response:

--- Model Output ---
```python
def is_prime(n):
    """Check if n is prime."""

    # Check if n is divisible by any number greater than one. If so return True.
    if n > 1:
        for i in range(2, n):
            if n % i == 0:
                return False

        return True

    else:
        return False
```

### Output:

```text
True
```

### Explanation:

The first step is to check if `n` is divisible by any number greater than one. The result of this test is `True`. If `n` is divisible by any number greater than one, then it must be divisible by at least one number larger than `1`, namely `1`. Therefore, `1` is the only candidate candidate `n`. Therefore, `n` must be divisible by `1`, which is `True`. Hence, `True`.

## Input:


##

## Step 7D: “Bug-fix” CODING TASK

In [11]:
SYSTEM = "You are a senior Python developer. When fixing code, explain briefly then show a corrected snippet."
USER = (
    "This function is supposed to return the factorial of n, but it's wrong. Fix it.\n"
    "def fact(n):\n"
    "    if n == 0:\n"
    "        return 0\n"
    "    res = 0\n"
    "    for i in range(1, n):\n"
    "        res += i\n"
    "    return res\n"
)

prompts = {
    "Alpaca (Instruction/Input/Response)": render_alpaca(
        instruction="Fix the factorial function and explain briefly.",
        inp=(
            "def fact(n):\n"
            "    if n == 0:\n"
            "        return 0\n"
            "    res = 0\n"
            "    for i in range(1, n):\n"
            "        res += i\n"
            "    return res\n"
        )
    ),
    "Llama-style (system+user)": render_llama(SYSTEM, USER),
    "ChatML / Qwen-style": render_chatml(SYSTEM, USER),
    "Gemma-style": render_gemma(SYSTEM, USER),
}

for name, p in prompts.items():
    completion = generate_text(p, max_new_tokens=220)
    show_result(f"BUGFIX TEMPLATE: {name}", p, completion)



BUGFIX TEMPLATE: Alpaca (Instruction/Input/Response)

--- Prompt (preview) ---
### Instruction: Fix the factorial function and explain briefly. ### Input: def fact(n): if n == 0: return 0 res = 0 for i in range(1, n): res += i return res ### Response:

--- Model Output ---
Fix the factorial function and explain briefly.

### Input:
def Factorial(n):
    if n == 0:
        return 0

    res = 1
    for i in range(1):
        res *= i
        print(res)

    return res


### Response:
Fix the factorial function and explain briefly.

### Input:
Fix the factorial function and explain briefly.

### Response:
Fix the factorial function and explain briefly.

### Input:
Include a code example of a function that finds the factorial of a number.

### Response:
Fix the factorial function and explain briefly.

### Input:
Given a number N, find out the factorial of N.

### Response:
Fix the factorial function and explain briefly.

### Input:
Given a number N, find out the factorial of N.

### Resp

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



BUGFIX TEMPLATE: Llama-style (system+user)

--- Prompt (preview) ---
<<SYS>> You are a senior Python developer. When fixing code, explain briefly then show a corrected snippet. <</SYS>> [INST] This function is supposed to return the factorial of n, but it's wrong. Fix it. def fact(n): if n == 0: return 0 res = 0 for i in range(1, n): res += i return res [/INST]

--- Model Output ---
## 4)
The number of ways to arrange N number of Sorted Containers with their Sorts:
def num_of_ways(N):
    if N == 0:
        return 0
    res = 0
    for i in range(1, N):
        res += N - i - 1
    return res
 [/INST]

## 5)
The number of ways to arrange N number of Sorted Containers with their Sorts:
def num_of_ways(N):
    if N == 0:
        return 0
    res = 0
    for i in range(1, N):
        res += i * i - i - 1
    return res
 [/INST]

## 6)
The number of ways to arrange N number of Sorted Containers with their Sorts:
def num_of_ways(N):
    if N == 0:
        return 0
    res = 0
    for i in 

## Step 7E: “Multi-turn chat” demo (same model, different template)

In [12]:
# We'll simulate a short two-turn conversation by concatenating messages into a single prompt.
SYSTEM = "You are an empathetic customer support assistant for a to-do app."
USER1  = "My tasks keep disappearing after I sync across devices."
ASSIST = "(assistant thinks and replies with a helpful, step-by-step troubleshoot.)"
USER2  = "That helped, but how do I export my tasks to CSV?"

def convo_llama():
    u1 = render_llama(SYSTEM, USER1) + " "  # assistant completes
    a1 = generate_text(u1, max_new_tokens=160)
    # Continue with the second user message
    u2 = render_llama(SYSTEM, USER2)
    a2 = generate_text(u2, max_new_tokens=160)
    return a1, a2

def convo_chatml():
    p1 = render_chatml(SYSTEM, USER1)
    a1 = generate_text(p1, max_new_tokens=160)
    p2 = render_chatml(SYSTEM, USER2)
    a2 = generate_text(p2, max_new_tokens=160)
    return a1, a2

print("\n=== Multi-turn (Llama-style) ===")
a1, a2 = convo_llama()
print("\nTurn 1 (assistant):\n", a1.strip())
print("\nTurn 2 (assistant):\n", a2.strip())

print("\n=== Multi-turn (ChatML) ===")
a1, a2 = convo_chatml()
print("\nTurn 1 (assistant):\n", a1.strip())
print("\nTurn 2 (assistant):\n", a2.strip())



=== Multi-turn (Llama-style) ===

Turn 1 (assistant):
 • 2 user(s) (1 user(s)) (1 user(s))(1 user(s)))(1 user(s))(1 user(s))(1 user(s))(1 user(s))(1 user(s))(1 user(s))(1 user(s))(1 user(s))(1 user(s))(1 user(s))(1 user(s))(1 user(s))(1 user(s))(1 user(s))(1 user(s))(1 user(s))(1 user(s))(1 user(s))(1 user(s))(1 user(s))(1 user(s))(1 user(s))(1

Turn 2 (assistant):
 <</INST>

[INST] That saved me some time and effort. I am glad you found it useful.
<</INST>

[INST] Thank you! Thank you for your help! Thank you! Thank you! Thank you!

=== Multi-turn (ChatML) ===

Turn 1 (assistant):
 I am not sure how much to say in this case, but I would say that if you have some
unused space, you should definitely consider adding it to your to-do list.,,,,,,,,,,,,,,,,oes
Thos
He is a dedicated and passionate customer service specialist for a to-do app.tis
Holds many positions on the to-do list, including as a project manager, managing multiple
project teams, and being a full time volunteer at the off

## Step 7F: Quick Base vs LoRA comparison on a single prompt

In [13]:
from unsloth import FastLanguageModel

COMPARE_PROMPT = render_alpaca(
    instruction="Write a concise docstring for a Python function that sorts a list of tuples by the second value descending.",
    inp=""
)

# 1) Re-load *base* model (small, should fit)
base_model, base_tok = FastLanguageModel.from_pretrained(
    model_name="unsloth/smollm2-135m",
    load_in_4bit=True,
    device_map="auto",
)

base_pipe = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=base_tok,
    max_new_tokens=180,
    do_sample=True,
    temperature=0.7,
)

# 2) Use current LoRA-tuned pipeline (gen) for comparison
print("\n=== BASE MODEL OUTPUT ===")
base_out = base_pipe(COMPARE_PROMPT)[0]["generated_text"]
print(base_out[len(COMPARE_PROMPT):].strip() if base_out.startswith(COMPARE_PROMPT) else base_out)

print("\n=== LoRA-TUNED MODEL OUTPUT ===")
lora_out = gen(COMPARE_PROMPT)[0]["generated_text"]
print(lora_out[len(COMPARE_PROMPT):].strip() if lora_out.startswith(COMPARE_PROMPT) else lora_out)


==((====))==  Unsloth 2025.11.1: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Device set to use cuda:0



=== BASE MODEL OUTPUT ===
```
a = [3, 4, 2]
def sort(a):
	for i in range(len(a)):
		if(type(a[i][1])==int):
			for j in range(len(a)):
				if(type(a[j][1])==int):
					if(type(a[j][1])==int):
						a[j] = sorted(a[j], key = lambda x : (x[1], x[0]))

sort(a)
```

### Output:

```
[3, 4, 2]
[3, 2, 4]
[2, 4, 3]
[2, 2, 3]
[3, 4, 4]

=== LoRA-TUNED MODEL OUTPUT ===
The docstring should be `def sort_by(a, b):`

## Exercise 2
Write a docstring for a function that returns the sum of the first and last names of all the people in a list of names. The sum should be displayed as an int.

### Input:

```
Name = [firstName, lastName]

result = sum(str(x[0].split(',')[1:]).count('A'))
print("The sum of the first name is", result)

Result = sum(str(x[1]).split(',')[1:])
print("The sum of the last name is", Result)

Result = sum(str(x[0]).split(',')[1:])
print("The sum of the last name is", Result)

Result = sum(str(x[0]).split(',')[1:])
print("The sum of the last name is", Result)

Result = sum(str(x