# Install Packages


In [None]:
%%capture
# `%%capture` (which must stay on the first line of the cell) suppresses pip's output.
# Install Unsloth with its `colab-new` extra straight from the GitHub repository.
# NOTE(review): this tracks the repository head and trl/peft/accelerate/bitsandbytes
# are unpinned, so a fresh run may resolve different versions than the recorded outputs.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# `--no-deps` installs these packages without pulling in their own dependencies.
!pip install --no-deps "xformers==0.0.27" trl peft accelerate bitsandbytes

In [None]:
# Install Triton (unpinned; the recorded run resolved triton 3.0.0).
!pip install triton

Collecting triton
  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.4/209.4 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: triton
Successfully installed triton-3.0.0


In [None]:
# Remove the xformers build installed with `--no-deps` in the first cell, then delete
# any leftover package directory so the reinstall in the next cell starts clean.
# NOTE(review): the path hard-codes the Python 3.10 dist-packages location.
!pip uninstall -y xformers
!rm -rf /usr/local/lib/python3.10/dist-packages/xformers

Found existing installation: xformers 0.0.27
Uninstalling xformers-0.0.27:
  Successfully uninstalled xformers-0.0.27


In [None]:
# Reinstall xformers 0.0.27, this time letting pip resolve its dependencies
# (in the recorded run this pulled in torch 2.3.1 and the matching CUDA 12.1 wheels).
!pip install xformers==0.0.27

Collecting xformers==0.0.27
  Using cached xformers-0.0.27-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.0 kB)
Collecting torch==2.3.1 (from xformers==0.0.27)
  Downloading torch-2.3.1-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.1->xformers==0.0.27)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.1->xformers==0.0.27)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.3.1->xformers==0.0.27)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.3.1->xformers==0.0.27)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.3.1-

In [None]:
# Sanity check: print the xformers build report listing which kernels are available.
!python -m xformers.info

xFormers 0.0.27
memory_efficient_attention.ckF:                    unavailable
memory_efficient_attention.ckB:                    unavailable
memory_efficient_attention.ck_decoderF:            unavailable
memory_efficient_attention.ck_splitKF:             unavailable
memory_efficient_attention.cutlassF:               available
memory_efficient_attention.cutlassB:               available
memory_efficient_attention.decoderF:               available
memory_efficient_attention.flshattF@v2.5.7:        available
memory_efficient_attention.flshattB@v2.5.7:        available
memory_efficient_attention.smallkF:                available
memory_efficient_attention.smallkB:                available
memory_efficient_attention.triton_splitKF:         unavailable
indexing.scaled_index_addF:                        unavailable
indexing.scaled_index_addB:                        unavailable
indexing.index_select:                             unavailable
sequence_parallel_fused.write_values:              av

In [None]:
from unsloth import FastLanguageModel
import torch

# Loader settings. `max_seq_length` is reused later when the trainer is built.
max_seq_length = 2048
# NOTE(review): None presumably lets Unsloth choose the dtype for the GPU — confirm
# against the Unsloth documentation.
dtype = None
# Ask the loader for 4-bit weights.
load_in_4bit = True


# Load the Phi-3.5-mini-instruct base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "microsoft/Phi-3.5-mini-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9.post3: Fast Llama patching. Transformers = 4.45.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/140 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.37k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

# Add LoRA Adapters

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# LoRA hyperparameters, gathered in one place so they are easy to tweak.
lora_settings = dict(
    r = 16,
    # Adapters are attached to the attention projections and the MLP projections.
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

# Wrap the loaded base model with the LoRA adapters described above.
# Note: `model` is rebound here, so re-running this cell wraps the already-wrapped model.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth 2024.9.post3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


<a name="Data"></a>
# Data Prep
We now use the `Phi-3` format for conversation style finetunes. We use the [GSM8K](https://huggingface.co/datasets/openai/gsm8k) grade-school math dataset, converting each question/answer pair into a two-turn conversation in ShareGPT style. Phi-3 renders multi-turn conversations like below:

```
<|user|>
Hi!<|end|>
<|assistant|>
Hello! How are you?<|end|>
<|user|>
I'm doing great! And you?<|end|>

```

In [None]:
from unsloth.chat_templates import get_chat_template

# ShareGPT-style records use "from"/"value" keys with "human"/"gpt" speakers; this
# mapping tells the template which of those play the role/content/user/assistant parts.
sharegpt_mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}

# Attach the "phi-3" chat template to the tokenizer.
tokenizer = get_chat_template(tokenizer, chat_template = "phi-3", mapping = sharegpt_mapping)

def formatting_prompts_func(examples):
    """Convert a batch of question/answer rows into conversations and chat text.

    Args:
        examples: batch mapping holding parallel "question" and "answer" lists.

    Returns:
        A dict with two lists, one entry per row:
        "conversations" - a two-turn ShareGPT-style list (human question, gpt answer);
        "text" - that conversation rendered through the module-level ``tokenizer``'s
        chat template, untokenized and without a trailing generation prompt.
    """
    questions = examples["question"]
    solutions = examples["answer"]

    # Pair every question with the answer at the same position.
    conversations = [
        [
            {"from": "human", "value": question},
            {"from": "gpt", "value": solutions[row]},
        ]
        for row, question in enumerate(questions)
    ]

    rendered = []
    for conversation in conversations:
        rendered.append(
            tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)
        )

    return {"conversations": conversations, "text": rendered}


from datasets import load_dataset

# Load the "main" configuration of GSM8K (train and test splits).
ds = load_dataset("openai/gsm8k", "main")
# Apply the formatter batch-wise to every split; this adds the "conversations" and
# "text" columns next to the original "question" and "answer" columns.
dataset = ds.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [None]:
# Display the mapped splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'conversations', 'text'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer', 'conversations', 'text'],
        num_rows: 1319
    })
})

In [None]:
from datasets import DatasetDict

# Keep a reproducible 3000-example subset of the training split (fixed shuffle seed).
# `dataset` is rebound from the DatasetDict to a single split here, so re-running the
# bare one-liner would fail on `dataset['train']`; the guard makes the cell idempotent.
if isinstance(dataset, DatasetDict):
    dataset = dataset['train'].shuffle(seed=42).select(range(3000))

In [None]:
# Inspect one example in its structured ShareGPT-style form.
dataset[5]["conversations"]

[{'from': 'human',
  'value': "James decides to build a tin house by collecting 500 tins in a week. On the first day, he collects 50 tins. On the second day, he manages to collect 3 times that number. On the third day, he collects 50 tins fewer than the number he collected on the second day. If he collects an equal number of tins on the remaining days of the week, what's the number of tins he collected each day for the rest of the week?"},
 {'from': 'gpt',
  'value': "On the second day, he collected 3 times the number of tins he collected on the first day, which is 3*50 = <<3*50=150>>150 tins.\nOn the third day, he collected 50 tins fewer than the second day, which is 150-50 = <<150-50=100>>100 tins\nThe total for the three days is 150+100+50 = <<150+100+50=300>>300 tins.\nTo reach his goal, he still needs 500-300 = <<500-300=200>>200 tins.\nSince the total number of days left in the week is 4, he'll need to collect 200/4 = <<200/4=50>>50 tins per day to reach his goal\n#### 50"}]

In [None]:
# Same example after the chat template was applied; print() renders the newlines.
print(dataset[5]["text"])

<|user|>
James decides to build a tin house by collecting 500 tins in a week. On the first day, he collects 50 tins. On the second day, he manages to collect 3 times that number. On the third day, he collects 50 tins fewer than the number he collected on the second day. If he collects an equal number of tins on the remaining days of the week, what's the number of tins he collected each day for the rest of the week?<|end|>
<|assistant|>
On the second day, he collected 3 times the number of tins he collected on the first day, which is 3*50 = <<3*50=150>>150 tins.
On the third day, he collected 50 tins fewer than the second day, which is 150-50 = <<150-50=100>>100 tins
The total for the three days is 150+100+50 = <<150+100+50=300>>300 tins.
To reach his goal, he still needs 500-300 = <<500-300=200>>200 tins.
Since the total number of days left in the week is 4, he'll need to collect 200/4 = <<200/4=50>>50 tins per day to reach his goal
#### 50<|end|>



In [None]:
# Example of a hand-written Jinja chat template. It is NOT applied: the call that would
# install it is guarded by `if False` below. The template emits the BOS token and a
# fixed system line, then one ">>> User:" / ">>> Assistant:" line per message, appending
# the EOS token after each assistant reply, and optionally a trailing assistant prompt.
unsloth_template = \
    "{{ bos_token }}"\
    "{{ 'You are a helpful assistant to the user\n' }}"\
    "{% for message in messages %}"\
        "{% if message['role'] == 'user' %}"\
            "{{ '>>> User: ' + message['content'] + '\n' }}"\
        "{% elif message['role'] == 'assistant' %}"\
            "{{ '>>> Assistant: ' + message['content'] + eos_token + '\n' }}"\
        "{% endif %}"\
    "{% endfor %}"\
    "{% if add_generation_prompt %}"\
        "{{ '>>> Assistant: ' }}"\
    "{% endif %}"
# NOTE(review): this is the literal string "eos_token", presumably a placeholder meaning
# "use the tokenizer's own EOS token" — confirm how get_chat_template interprets it.
unsloth_eos_token = "eos_token"


# Change to `if True:` to replace the built-in "phi-3" template with the custom one.
if False:
    tokenizer = get_chat_template(
        tokenizer,
        chat_template = (unsloth_template, unsloth_eos_token,), # You must provide a template and EOS token
        mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
        map_eos_token = True, # Maps <|im_end|> to </s> instead (NOTE(review): wording looks inherited from a ChatML example; the template above contains no <|im_end|> — confirm)
    )

<a name="Train"></a>
# Train the model


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Exactly one of fp16 / bf16 is enabled: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation settings: 2 samples per device x 4 accumulation steps = 8 per update,
# for a short run of 60 optimiser steps.
training_args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    max_steps = 60,
    learning_rate = 2e-4,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
)

# Supervised fine-tuning on the pre-rendered "text" column, one example per sequence
# (packing disabled).
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = training_args,
)

Map (num_proc=2):   0%|          | 0/3000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Baseline snapshot taken before training: total capacity of GPU 0 and the peak
# memory reserved so far, both expressed in GiB and rounded to three decimals.
BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
2.285 GB of memory reserved.


In [None]:
# Run the fine-tuning loop; the returned object's `metrics` are read by the stats cell below.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 29,884,416


Step,Training Loss
1,1.0338
2,0.9883
3,1.0734
4,1.0281
5,0.9948
6,0.8425
7,0.7852
8,0.6881
9,0.7268
10,0.62


In [None]:
#@title Show final memory and time stats
# Peak reserved memory after training, in GiB, and the share attributable to training
# (peak minus the baseline recorded before `trainer.train()`).
peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_reserved_bytes / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures as a percentage of the GPU's total memory.
used_percentage, lora_percentage = [
    round(gigabytes / max_memory * 100, 3)
    for gigabytes in (used_memory, used_memory_for_lora)
]

for report_line in (
    f"{trainer_stats.metrics['train_runtime']} seconds used for training.",
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
):
    print(report_line)

244.9539 seconds used for training.
4.08 minutes used for training.
Peak reserved memory = 2.73 GB.
Peak reserved memory for training = 0.445 GB.
Peak reserved memory % of max memory = 18.511 %.
Peak reserved memory for training % of max memory = 3.017 %.


<a name="Inference"></a>
# Inference



In [None]:
from unsloth.chat_templates import get_chat_template

# Re-attach the "phi-3" chat template with the ShareGPT-style key mapping.
sharegpt_mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}
tokenizer = get_chat_template(tokenizer, chat_template = "phi-3", mapping = sharegpt_mapping)

# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# A single-turn conversation holding one GSM8K-style word problem.
question = "Artie has a flower stand at the Farmers Market. He sells three kinds of flowers: marigolds, petunias and begonias. He usually sells marigolds for $2.74 per pot, petunias for $1.87 per pot and begonias for $2.12 per pot. Artie has no change today, so he has decided to round all his prices to the nearest dollar. If Artie sells 12 pots of marigolds, 9 pots of petunias and 17 pots of begonias, how much will he make?"
messages = [{"from": "human", "value": question}]

# Render and tokenize the prompt, ending with the assistant header so the model
# continues with its answer; then move the token ids to the GPU.
prompt_ids = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
)
inputs = prompt_ids.to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 256, use_cache = True)
# Decoded output includes the prompt and the special tokens.
tokenizer.batch_decode(outputs)

['<|user|> Artie has a flower stand at the Farmers Market. He sells three kinds of flowers: marigolds, petunias and begonias. He usually sells marigolds for $2.74 per pot, petunias for $1.87 per pot and begonias for $2.12 per pot. Artie has no change today, so he has decided to round all his prices to the nearest dollar. If Artie sells 12 pots of marigolds, 9 pots of petunias and 17 pots of begonias, how much will he make?<|end|><|assistant|> The rounded price for marigolds is $3 per pot.\nThe rounded price for petunias is $2 per pot.\nThe rounded price for begonias is $2 per pot.\nArtie will make $3 x 12 = $<<3*12=36>>36 from marigolds.\nArtie will make $2 x 9 = $<<2*9=18>>18 from petunias.\nArtie will make $2 x 17 = $<<2*17=34>>34 from begonias.\nArtie will make $36 + $18 + $34 = $<<36+18+34=88>>88.\n#### 88<|end|><|endoftext|>']

You can also use a `TextStreamer` to stream the output, so you can watch the generation token by token instead of waiting for the full response.

In [None]:
from transformers import TextStreamer

# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Same single-turn word problem as in the previous inference cell.
question = "Artie has a flower stand at the Farmers Market. He sells three kinds of flowers: marigolds, petunias and begonias. He usually sells marigolds for $2.74 per pot, petunias for $1.87 per pot and begonias for $2.12 per pot. Artie has no change today, so he has decided to round all his prices to the nearest dollar. If Artie sells 12 pots of marigolds, 9 pots of petunias and 17 pots of begonias, how much will he make?"
messages = [{"from": "human", "value": question}]

prompt_ids = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
)
inputs = prompt_ids.to("cuda")

# Stream decoded tokens to stdout as they are produced; `skip_prompt` hides the echoed
# prompt. The return value of generate() is not needed here.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 256, use_cache = True)

The rounded price for marigolds is $3 per pot.
The rounded price for petunias is $2 per pot.
The rounded price for begonias is $2 per pot.
Artie will make $3 x 12 = $<<3*12=36>>36 from marigolds.
Artie will make $2 x 9 = $<<2*9=18>>18 from petunias.
Artie will make $2 x 17 = $<<2*17=34>>34 from begonias.
Artie will make $36 + $18 + $34 = $<<36+18+34=88>>88.
#### 88<|end|><|endoftext|>


<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Hugging Face's `push_to_hub` for an online save or `save_pretrained` for a local save.

In [None]:
# Save the LoRA adapter weights and the tokenizer files into the local "lora_model" directory.
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.model',
 'lora_model/added_tokens.json',
 'lora_model/tokenizer.json')

### Saving to float16 for vLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
# Every call in this cell is disabled by `if False`; change a line to `if True` to run it.
# `save_pretrained_merged` writes to the local "model" directory; `push_to_hub_merged`
# uploads to the "hf/model" repository. `token = ""` is a placeholder.
# NOTE(review): prefer reading the token from an environment variable or a prompt rather
# than pasting it into the notebook.

# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

In [None]:
# Every call in this cell is disabled by `if False`; change a line to `if True` to run it.
# `save_pretrained_gguf` saves locally to "model"; `push_to_hub_gguf` uploads to "hf/model".
# `token = ""` is a placeholder.
# NOTE(review): avoid committing a real token in the notebook.

# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# Save to multiple GGUF options - much faster if you want multiple!
# Here `quantization_method` is given a list, so several formats are requested in one call.
if False:
    model.push_to_hub_gguf(
        "hf/model", # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = "", # Get a token at https://huggingface.co/settings/tokens
    )

In [None]:
# Archive the saved LoRA adapter directory for download.
# NOTE(review): assumes the notebook's working directory is /content, so that the
# relative "lora_model" save path resolves to /content/lora_model.
!zip -r /content/lora_model.zip /content/lora_model

  adding: content/lora_model/ (stored 0%)
  adding: content/lora_model/adapter_model.safetensors (deflated 8%)
  adding: content/lora_model/special_tokens_map.json (deflated 76%)
  adding: content/lora_model/tokenizer.json (deflated 85%)
  adding: content/lora_model/tokenizer.model (deflated 55%)
  adding: content/lora_model/tokenizer_config.json (deflated 84%)
  adding: content/lora_model/added_tokens.json (deflated 62%)
  adding: content/lora_model/adapter_config.json (deflated 54%)
  adding: content/lora_model/README.md (deflated 66%)


In [None]:
# Archive the trainer's output directory (TensorBoard run logs and the step-60 checkpoint).
!zip -r /content/outputs.zip /content/outputs

  adding: content/outputs/ (stored 0%)
  adding: content/outputs/runs/ (stored 0%)
  adding: content/outputs/runs/Sep30_07-00-18_1751fed836f8/ (stored 0%)
  adding: content/outputs/runs/Sep30_07-00-18_1751fed836f8/events.out.tfevents.1727679707.1751fed836f8.688.0 (deflated 66%)
  adding: content/outputs/checkpoint-60/ (stored 0%)
  adding: content/outputs/checkpoint-60/adapter_model.safetensors (deflated 8%)
  adding: content/outputs/checkpoint-60/optimizer.pt (deflated 10%)
  adding: content/outputs/checkpoint-60/trainer_state.json (deflated 81%)
  adding: content/outputs/checkpoint-60/special_tokens_map.json (deflated 76%)
  adding: content/outputs/checkpoint-60/training_args.bin (deflated 51%)
  adding: content/outputs/checkpoint-60/tokenizer.json (deflated 85%)
  adding: content/outputs/checkpoint-60/tokenizer.model (deflated 55%)
  adding: content/outputs/checkpoint-60/rng_state.pth (deflated 25%)
  adding: content/outputs/checkpoint-60/tokenizer_config.json (deflated 84%)
  addin

In [None]:
# Archive the Hugging Face cache directory; in the recorded run it holds files for
# unsloth/phi-3.5-mini-instruct-bnb-4bit.
# NOTE(review): no code visible in this notebook creates this directory explicitly.
!zip -r /content/huggingface_tokenizers_cache.zip /content/huggingface_tokenizers_cache

  adding: content/huggingface_tokenizers_cache/ (stored 0%)
  adding: content/huggingface_tokenizers_cache/models--unsloth--phi-3.5-mini-instruct-bnb-4bit/ (stored 0%)
  adding: content/huggingface_tokenizers_cache/models--unsloth--phi-3.5-mini-instruct-bnb-4bit/refs/ (stored 0%)
  adding: content/huggingface_tokenizers_cache/models--unsloth--phi-3.5-mini-instruct-bnb-4bit/refs/main (deflated 5%)
  adding: content/huggingface_tokenizers_cache/models--unsloth--phi-3.5-mini-instruct-bnb-4bit/blobs/ (stored 0%)
  adding: content/huggingface_tokenizers_cache/models--unsloth--phi-3.5-mini-instruct-bnb-4bit/blobs/2f4cf1e18cb543d31aedc307a6b5968a201569bc (deflated 74%)
  adding: content/huggingface_tokenizers_cache/models--unsloth--phi-3.5-mini-instruct-bnb-4bit/blobs/9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 (deflated 55%)
  adding: content/huggingface_tokenizers_cache/models--unsloth--phi-3.5-mini-instruct-bnb-4bit/blobs/72dafda7008a52e087bec2c5f534eda3cfd33b27 (defla