In [None]:
%%capture

!pip install unsloth

Fine-tune the model with Unsloth on Google Colab. Note that Unsloth requires a different environment, so this notebook does not work in the current environment.

In [None]:
import os
import torch
from trl import SFTTrainer
from datasets import load_dataset, concatenate_datasets
from transformers import TrainingArguments, TextStreamer
from unsloth import FastLanguageModel, is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub without embedding a secret in the
# notebook. The token is read from the HF_TOKEN environment variable (`os` is
# imported in the notebook's import cell); when the variable is unset or empty,
# None is passed so that login() falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
# Sequence-length limit passed to the model loader below; the trainer cell
# reuses the same value.
max_seq_length = 2048
# Load the "unsloth/Llama-3.2-1B-Instruct" checkpoint together with its
# tokenizer through Unsloth's loader.
# NOTE(review): load_in_4bit=False presumably loads the weights without 4-bit
# quantization -- confirm against the Unsloth documentation.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct",
    max_seq_length=max_seq_length,
    load_in_4bit=False,
)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.9. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [None]:
# Wrap the loaded model with LoRA adapters via Unsloth's PEFT helper.
# Rank and alpha are both 32, adapter dropout is 0, and adapters are requested
# on the q/k/v/o projections and the gate/up/down projections.
# NOTE(review): this rebinds `model`; re-running the cell would presumably wrap
# an already-wrapped model, so re-run the loading cell first -- confirm.
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    lora_alpha=32,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
)

Unsloth 2024.12.4 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [None]:
# Fetch the "1312354o/llama-ros2" dataset; the recorded output shows a train
# split (1794 rows) and a test split (200 rows).
dataset = load_dataset("1312354o/llama-ros2")

README.md:   0%|          | 0.00/437 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/655k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/78.3k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1794 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Alpaca-style prompt with two positional slots: the instruction first, then the
# response. The pieces are joined by newlines only, with no trailing newline.
alpaca_template = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n"
    "### Instruction:\n"
    "{}\n"
    "### Response:\n"
    "{}"
)

# End-of-sequence marker taken from the tokenizer; it is appended to every
# formatted training sample.
EOS_TOKEN = tokenizer.eos_token

def format_samples_sft(examples):
  text = []
  for instruction, output in zip(examples["instruction"], examples["output"], strict=False):
    message = alpaca_template.format(instruction, output) + EOS_TOKEN
    text.append(message)

  return {"text": text}

# Run the formatter over the dataset in batches; the result gains a "text"
# column. Note that this rebinds `dataset` to the mapped version.
dataset = dataset.map(format_samples_sft, batched=True)

Map:   0%|          | 0/1794 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Display the training split to check that the "text" column was added.
dataset["train"]

Dataset({
    features: ['instruction', 'output', 'text'],
    num_rows: 1794
})

In [None]:
# Supervised fine-tuning of the LoRA-wrapped model on the formatted "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    # --- data ---
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    # NOTE(review): packing presumably concatenates short samples into full-length
    # sequences (the recorded run reports 132 examples from 1794 rows) -- confirm.
    packing=True,
    args=TrainingArguments(
        # --- bookkeeping ---
        output_dir="output",
        report_to="none",
        logging_steps=1,
        seed=0,
        # --- schedule: 2 samples x 8 accumulation steps = 16 per optimizer step ---
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        # --- optimizer ---
        optim="adamw_8bit",
        learning_rate=3e-4,
        lr_scheduler_type="linear",
        warmup_steps=10,
        weight_decay=0.01,
        # --- precision: bf16 when the GPU supports it, otherwise fp16 ---
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
    ),
)
# Kept as the last expression so the cell displays the returned training summary.
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 132 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 24
 "-____-"     Number of trainable parameters = 22,544,384


Step,Training Loss
1,2.2505
2,2.2472
3,2.2577
4,2.2376
5,2.2044
6,2.0609
7,2.0541
8,2.0284
9,3.41
10,1.8925


TrainOutput(global_step=24, training_loss=2.0064885119597116, metrics={'train_runtime': 199.9901, 'train_samples_per_second': 1.98, 'train_steps_per_second': 0.12, 'total_flos': 4698257208901632.0, 'train_loss': 2.0064885119597116, 'epoch': 2.909090909090909})

In [None]:
# Switch the model over for generation via Unsloth's for_inference helper.
FastLanguageModel.for_inference(model)
# Build the prompt from the same template used for training, leaving the
# response slot empty so the model writes the answer.
message = alpaca_template.format("Write a paragraph to introduce ROS2.", "")


In [None]:
# Tokenize the prompt as a batch of one and move the resulting tensors to the GPU.
inputs = tokenizer([message], return_tensors="pt")
inputs = inputs.to("cuda")

# Stream decoded tokens to the cell output while they are being generated.
text_streamer = TextStreamer(tokenizer)

# The return value is discarded: the streamer has already printed the text.
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=256,
    use_cache=True,
)

<|begin_of_text|>Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Write a paragraph to introduce ROS2.
### Response:
ROS 2 is a modern and powerful alternative to ROS 1, designed to take advantage of the latest advancements in computer science. It offers a more efficient and scalable architecture, allowing for the development of complex applications that require the integration of multiple services. By leveraging the ROS 2 framework, developers can focus on the core functionality of their robot or vehicle, while also benefiting from the enhanced capabilities and features that come with the ROS 2 ecosystem. This not only enhances the user experience but also provides a robust foundation for building and deploying advanced robotic systems.<|eot_id|>


In [None]:
# Merge the LoRA weights into the model and save a 16-bit copy (with the
# tokenizer) under the local path "model".
model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")
# Push a merged 16-bit copy to the "1312354o/llama-tune" Hub repository. The
# recorded log shows that this call performs the merge again.
model.push_to_hub_merged("1312354o/llama-tune", tokenizer, save_method="merged_16bit")


Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.5G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 32.45 out of 52.96 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 16/16 [00:00<00:00, 51.41it/s]


Unsloth: Saving tokenizer... Done.
Done.


Unsloth: You are pushing to hub, but you passed your HF username = 1312354o.
We shall truncate 1312354o/llama-tune to llama-tune


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 32.39 out of 52.96 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 16/16 [00:00<00:00, 70.95it/s]


Unsloth: Saving tokenizer...

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

 Done.


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/1312354o/llama-tune


In [None]:
# Convert the model to GGUF with the "f16" quantization method and upload it to
# the "1312354o/llama-tune" Hub repository.
# The access token is read from the HF_TOKEN environment variable rather than
# being written into the notebook (`os` is imported in the notebook's import cell).
# NOTE(review): when HF_TOKEN is unset, None is passed; presumably the credentials
# cached by the earlier login are then used -- confirm.
model.push_to_hub_gguf(
    "1312354o/llama-tune",
    tokenizer,
    quantization_method="f16",
    token=os.environ.get("HF_TOKEN") or None,
)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 36.37 out of 52.96 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 16/16 [00:00<00:00, 67.35it/s]


Unsloth: Saving tokenizer... Done.
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['f16'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at 1312354o/llama-tune into f16 GGUF format.
The output location will be /content/1312354o/llama-tune/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: llama-tune
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {32}
INFO:hf-to-gguf:gguf: loading model part 'model.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.bfloat16 --> F1

unsloth.F16.gguf:   0%|          | 0.00/2.48G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/1312354o/llama-tune
