In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Can increase for longer reasoning traces
lora_rank = 16 # Larger rank = smarter, but slower
dtype = None
import os 
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-1.7B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = False, # False for LoRA 16bit
    # fast_inference = True, # Enable vLLM fast inference
    # max_lora_rank = lora_rank,
    # gpu_memory_utilization = 0.7, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


Unsloth: Your Flash Attention 2 installation seems to be broken?
A possible explanation is you have a new CUDA version which isn't
yet compatible with FA2? Please file a ticket to Unsloth or FA2.
We shall now use Xformers instead, which does not have any performance hits!
We found this negligible impact by benchmarking on 1x A100.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 06-25 15:36:34 [__init__.py:244] Automatically detected platform cuda.


2025-06-25 15:36:36,806	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


==((====))==  Unsloth 2025.6.5: Fast Qwen3 patching. Transformers: 4.52.4. vLLM: 0.9.1.
   \\   /|    NVIDIA A30. Num GPUs = 2. Max memory: 23.498 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.6.5 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [3]:
from unsloth.chat_templates import CHAT_TEMPLATES
print(list(CHAT_TEMPLATES.keys()))

['unsloth', 'zephyr', 'chatml', 'mistral', 'llama', 'vicuna', 'vicuna_old', 'vicuna old', 'alpaca', 'gemma', 'gemma_chatml', 'gemma2', 'gemma2_chatml', 'llama-3', 'llama3', 'phi-3', 'phi-35', 'phi-3.5', 'llama-3.1', 'llama-31', 'llama-3.2', 'llama-3.3', 'llama-32', 'llama-33', 'qwen-2.5', 'qwen-25', 'qwen25', 'qwen2.5', 'phi-4', 'gemma-3', 'gemma3', 'qwen-3', 'qwen3']


In [2]:
from unsloth.chat_templates import get_chat_template
chat_template = """"""
with open("/workspace/chat_template.txt", "r") as f:
    chat_template = f.read()

tokenizer.chat_template = chat_template

In [3]:
def formatting_prompts_func(examples):
   convos = examples["conversations"]
   texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
   return { "text" : texts, }

In [18]:
examples = {
        "conversations": [
            [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Hello, how are you?"},
                {"role": "assistant", "content": "I'm doing well, thank you!"}
            ]
        ]
    }

print(examples)
# print(formatting_prompts_func(examples))

{'conversations': [[{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': 'Hello, how are you?'}, {'role': 'assistant', 'content': "I'm doing well, thank you!"}]]}
{'text': ["<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\nI'm doing well, thank you!<|im_end|>\n"]}


In [None]:
from datasets import load_dataset, DatasetDict

# Load the local JSON file as a dataset
dataset = load_dataset("json", data_files="/workspace/dataset/pika_data.json", split="train")

# Example: formatting function for Unsloth/ChatML
def formatting_prompts_func(examples):
    texts = [tokenizer.apply_chat_template(conv + [{"role": "assistant", "content": examples["assistant_fast_response"][id]}], tokenize=False, add_generation_prompt=False)
             for id,conv in enumerate(examples["previous_conversation"])]
    return {"text": texts}

# Apply formatting (tokenizer must be defined and have chat_template set)
dataset = dataset.map(formatting_prompts_func, batched=True)

In [4]:
dataset[0]

{'previous_conversation': [{'content': 'You are a helpful assistant.',
   'role': 'system'},
  {'content': 'How do I reset my password?', 'role': 'user'}],
 'assistant_fast_response': 'Let me check that for you right away!',
 'text': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nHow do I reset my password?<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\nLet me check that for you right away!<|im_end|>\n'}

In [7]:
from trl import SFTConfig, SFTTrainer
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None, # Can set up evaluation!,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    args = SFTConfig(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 1,
        # Use num_train_epochs = 1, warmup_ratio for full training runs!
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

average_tokens_across_devices is set to True but it is invalid when world size is1. Turn it to False automatically.
num_proc must be <= 10. Reducing num_proc to 10 for dataset of size 10.


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [8]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A30. Max memory = 23.498 GB.
3.85 GB of memory reserved.


In [9]:
trainer_stats = trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10 | Num Epochs = 12 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2
 "-____-"     Trainable parameters = 17,432,576/1,738,007,552 (1.00% trained)


AttributeError: 'NoneType' object has no attribute 'attn_bias'

In [10]:
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu122

Looking in indexes: https://download.pytorch.org/whl/cu122, https://pypi.ngc.nvidia.com
[0m