In [1]:
from unsloth import FastLanguageModel
import torch
import pandas as pd
# Sanity report of the CUDA runtime that PyTorch can see.
# The availability probe is evaluated once and reused for the device-name guard.
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU count: {torch.cuda.device_count()}")
# Only query device 0 when CUDA is usable; otherwise report a placeholder.
first_gpu_name = torch.cuda.get_device_name(0) if cuda_ready else 'N/A'
print(f"GPU name: {first_gpu_name}")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
CUDA available: True
CUDA version: 12.1
GPU count: 1
GPU name: NVIDIA GeForce RTX 2070


In [2]:
# --- Model loading configuration -------------------------------------------
# max_seq_length is reused later when the trainer is built.
max_seq_length = 2048
# NOTE(review): None presumably lets Unsloth choose the compute dtype — confirm.
dtype = None
# Ask for the 4-bit loading path.
load_in_4bit = True

# Hub identifier of the base checkpoint to fine-tune.
base_model_name = "unsloth/Meta-Llama-3.1-8B"

# Returns both the model and its matching tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA GeForce RTX 2070. Max memory: 8.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


  self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False)


In [3]:
# Projection layers that receive LoRA adapters: the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down).
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Wrap the loaded model with LoRA adapters. `model` is rebound to the wrapped
# model, so this cell expects the freshly loaded base model as its input.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

Unsloth 2025.2.15 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [4]:
# Alpaca-style prompt template with three named str.format placeholders:
# {instruction}, {input} and {response}.
# The template ends with a newline after {response}, so anything appended to
# the formatted text (e.g. the EOS token) comes after that newline.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
{response}
"""

In [5]:
# End-of-sequence token string of the loaded tokenizer; formatting_prompts_func
# appends it to every rendered training example.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of rows into full training prompts.

    Args:
        examples: A batch mapping with the keys 'instruction', 'input' and
            'output', each holding a sequence with one entry per row
            (the form a batched ``Dataset.map`` call passes in).

    Returns:
        A dict with a single key, "text", holding one string per row: the
        module-level ``alpaca_prompt`` template filled in for that row,
        followed by the module-level ``EOS_TOKEN``.
    """
    # Distinct names for the column sequences and the per-row values: nothing
    # shadows the builtin `input`, and the zipped names are not rebound
    # inside the loop over them.
    instructions = examples['instruction']  # prompts
    contexts = examples['input']            # context
    responses = examples['output']          # target

    texts = [
        alpaca_prompt.format(instruction=instruction, input=context, response=response) + EOS_TOKEN
        for instruction, context, response in zip(instructions, contexts, responses)
    ]
    return { "text" : texts, }

from datasets import load_dataset
# Location of the JSON file on the Hugging Face Hub.
dataset_json_path = "hf://datasets/Vezora/Tested-143k-Python-Alpaca/143k-Tested-Python-Alpaca-Vezora.json"

# Load the train split, then add the rendered "text" column by running
# formatting_prompts_func over the rows in batches.
dataset = load_dataset("json", data_files=dataset_json_path, split="train").map(
    formatting_prompts_func,
    batched=True,
)

Map: 100%|██████████| 143327/143327 [00:02<00:00, 64985.50 examples/s]


In [6]:
# Preview a few rendered examples only. Displaying the full "text" column would
# dump all 143,327 prompts into the cell output and bloat the notebook.
dataset[:3]["text"]

['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nHow can I add a function to find the maximum number among a given set of numbers?\n\n### Input:\n\n\n### Response:\nYou can add the following function in your code to achieve this:\n\n```python\ndef find_max(numbers):\n    max_number = max(numbers)\n    return max_number\n```\n\nThis function takes a list of numbers as input and utilizes the `max()` function in Python to find the maximum among them. It then returns the maximum number.\n\nYou can call this function by passing the numbers you want to evaluate as a list. For example:\n\n```python\nnumbers = [10, 5, 23, 8, 1]\nmaximum = find_max(numbers)\nprint("The maximum number is:", maximum)\n```\n\nIn the above example, the `numbers` list contains five integers. By calling the `find_max()` function with this list as the argument, it will return the maximum 

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bf16_supported

# Probe mixed-precision support once: bf16 where available, fp16 otherwise.
use_bf16 = is_bf16_supported()

# Optimisation settings for a short demonstration run.
training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    # 2 sequences per device x 4 accumulation steps = 8 sequences per optimiser step.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # num_train_epochs = 1, # set this for 1 full training run.
    max_steps=60,
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type='linear',
    optim="adamw_8bit",
    weight_decay=0.01,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
)

# Supervised fine-tuning trainer over the pre-rendered "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=1,
    packing=False,  # can make 5x training faster for short sequences
    args=training_args,
)

Converting train dataset to ChatML: 100%|██████████| 143327/143327 [00:08<00:00, 16996.66 examples/s]
Applying chat template to train dataset: 100%|██████████| 143327/143327 [00:09<00:00, 15014.01 examples/s]
Tokenizing train dataset: 100%|██████████| 143327/143327 [03:35<00:00, 664.84 examples/s]
Truncating train dataset: 100%|██████████| 143327/143327 [01:37<00:00, 1463.47 examples/s]


In [8]:
# Run training with the trainer built in the previous cell; the return value of
# train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 143,327 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,0.3071
2,0.315
3,0.3177
4,0.2846
5,0.0964
6,0.1422
7,0.059
8,0.0301
9,0.0236
10,0.0227


In [11]:
# Save the full model configuration including adapter weights
from peft import AutoPeftModelForCausalLM

# Merge the LoRA adapter into the base weights and write a standalone 16-bit
# checkpoint to ./model.
#
# The base model in this notebook is loaded with load_in_4bit=True. PEFT's
# merge_and_unload() on such a model merges into the 4-bit quantised layers
# (lossy) and modifies `model` itself, and the checkpoint saved from it is not
# the plain 16-bit Hugging Face layout that llama.cpp's converter reads.
# Unsloth's save_pretrained_merged(..., save_method="merged_16bit") dequantises
# to 16-bit before merging and saving.
#
# Note: no in-memory `merged_model` object is created here; reload it from
# "model" if one is needed.
model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")

# Written explicitly so the tokenizer files are always present next to the weights.
tokenizer.save_pretrained("model")

# To convert to GGUF, get the llama.cpp repository. The converter script ships
# with the repository, not with the llama-cpp-python package:
# git clone https://github.com/ggml-org/llama.cpp
# pip install -r llama.cpp/requirements.txt

# Convert using llama.cpp's converter
# python llama.cpp/convert_hf_to_gguf.py model/ --outfile model/unsloth.F16.gguf --outtype f16



('model\\tokenizer_config.json',
 'model\\special_tokens_map.json',
 'model\\tokenizer.json')