In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm
    PyTorch 2.6.0+cu124 with CUDA 1204 (you have 2.7.0+cu126)
    Python  3.11.11 (you have 3.11.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "./Llama-3.2-3B", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2025.5.6: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 2080 Ti. Num GPUs = 2. Max memory: 10.747 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [39]:
# Set a distinct pad token
if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
    tokenizer.add_special_tokens({'pad_token': '<|pad_token|>'})
    model.config.pad_token_id = tokenizer.pad_token_id
    model.resize_token_embeddings(len(tokenizer))
print(f"Pad token: {tokenizer.pad_token}, EOS token: {tokenizer.eos_token}")

Pad token: <|finetune_right_pad_id|>, EOS token: <|eot_id|>


In [21]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth: Already have LoRA adapters! We shall skip this step.


In [43]:
# Load and prepare the music dataset
from datasets import load_dataset
try:
    dataset = load_dataset("csv", data_files="music.csv", split="train")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# Verify dataset
print(f"Dataset size: {len(dataset)}")
print("First 3 dataset entries:")
for i in range(min(3, len(dataset))):
    print(dataset[i])

Dataset size: 938
First 3 dataset entries:
{'Category': 'Music', 'Subcategory': 'Playback', 'Action': 'Play', 'Sentence': 'Play jazz music in the living room', 'Response': 'Jazz music is now playing in the living room.'}
{'Category': 'Music', 'Subcategory': 'Playback', 'Action': 'Pause', 'Sentence': 'Pause the music', 'Response': 'Music paused.'}
{'Category': 'Music', 'Subcategory': 'Playback', 'Action': 'Resume', 'Sentence': 'Resume the music', 'Response': 'Music resumed.'}


In [44]:
# Standardize dataset to conversation format
def format_music_conversations(examples):
    conversations = []
    for sentence, response in zip(examples["Sentence"], examples["Response"]):
        # Ensure sentence and response are valid strings
        if isinstance(sentence, str) and isinstance(response, str) and sentence.strip() and response.strip():
            convo = [
                {"role": "user", "content": sentence.strip()},
                {"role": "assistant", "content": response.strip()}
            ]
            conversations.append(convo)
        else:
            print(f"Skipping invalid entry: sentence={sentence}, response={response}")
    return {"conversations": conversations}

try:
    dataset = dataset.map(format_music_conversations, batched=True)
except Exception as e:
    print(f"Error formatting conversations: {e}")
    raise

# Verify conversation formatting
print("Sample conversation:", dataset[0]["conversations"])

Sample conversation: [{'content': 'Play jazz music in the living room', 'role': 'user'}, {'content': 'Jazz music is now playing in the living room.', 'role': 'assistant'}]


In [45]:
# Apply formatting to create conversations
dataset = dataset.map(format_music_conversations, batched=True)

Map: 100%|██████████| 938/938 [00:00<00:00, 36609.66 examples/s]


In [46]:
# Apply Llama-3.1 chat template
from unsloth.chat_templates import get_chat_template, standardize_sharegpt
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

In [47]:
def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = []
    for convo in convos:
        try:
            text = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
            texts.append(text)
        except Exception as e:
            print(f"Error formatting conversation: {convo}, Error: {e}")
    return {"text": texts}

try:
    dataset = dataset.map(formatting_prompts_func, batched=True)
except Exception as e:
    print(f"Error applying chat template: {e}")
    raise

# Verify formatted dataset
print("Sample formatted text:", dataset[0]["text"])
dataset = dataset.map(formatting_prompts_func, batched=True)

Map: 100%|██████████| 938/938 [00:00<00:00, 19188.97 examples/s]


Sample formatted text: <|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Play jazz music in the living room<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Jazz music is now playing in the living room.<|eot_id|>


Map: 100%|██████████| 938/938 [00:00<00:00, 18982.41 examples/s]


In [48]:
# Train only on assistant responses
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
# Train the model
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=10,
        num_train_epochs=5,  # Increased to ensure learning
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)
# trainer = train_on_responses_only(
#     trainer,
#     instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
#     response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
# )

Unsloth: Tokenizing ["text"] (num_proc=2): 100%|██████████| 938/938 [00:01<00:00, 725.92 examples/s]


In [49]:
# Start training
try:
    trainer_stats = trainer.train()
except Exception as e:
    print(f"Training failed: {e}")
    raise

# Show training stats
print(f"Training completed in {trainer_stats.metrics['train_runtime']} seconds")
print(f"Final training loss: {trainer_stats.metrics.get('train_loss', 'N/A')}")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 938 | Num Epochs = 5 | Total steps = 290
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 24,313,856/3,000,000,000 (0.81% trained)


Step,Training Loss
10,0.0
20,0.0
30,0.0
40,0.0
50,0.0
60,0.0
70,0.0
80,0.0
90,0.0
100,0.0


Training completed in 301.8061 seconds
Final training loss: 1.0869054614008276e-06


In [50]:
# Show memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")

GPU = NVIDIA GeForce RTX 2080 Ti. Max memory = 10.747 GB.
3.441 GB of memory reserved.
Peak reserved memory = 3.441 GB.
Peak reserved memory for training = 0.0 GB.


In [38]:
# Inference
from transformers import TextStreamer

FastLanguageModel.for_inference(model)
convo = [
    {"role": "user", "content": message["content"]},
    {"role": "assistant", "content": ""}  # Empty assistant message triggers generation
]

inputs = tokenizer.apply_chat_template(
        convo,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        padding=True,
        return_dict=True
    ).to("cuda")

text_streamer = TextStreamer(tokenizer, skip_prompt=True)
outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        streamer=text_streamer,
        max_new_tokens=64,
        use_cache=True,
        temperature=0.7,  # Lowered for consistency
        min_p=0.1
    )
print(f"Raw output tokens: {tokenizer.batch_decode(outputs)}")

    # Save LoRA adapters
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

<|eot_id|>
Raw output tokens: ['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSkip to the next song<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n<|eot_id|>']


('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

In [None]:
convo = [
    {"role": "user", "content": message["content"]},
    {"role": "assistant", "content": ""}  # Empty assistant message triggers generation
]
inputs = tokenizer.apply_chat_template(
    convo,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    padding=True,
    return_dict=True
).to("cuda")

In [32]:
dataset[5]["conversations"]

[{'content': 'Play the previous song', 'role': 'user'},
 {'content': 'Playing the previous song.', 'role': 'assistant'}]

In [19]:
dataset[5]["text"]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nPlay the previous song<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nPlaying the previous song.<|eot_id|>'