# Finetune

In [None]:
# Authentication helpers: Colab's userdata accessor and the Hub login widget.
from google.colab import userdata
from huggingface_hub import notebook_login

# Token stored under the key 'HF_TOKEN'; reused by later cells for Hub downloads/uploads.
HF_TOKEN = userdata.get('HF_TOKEN')

# Interactive login for the Hugging Face Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# The %%capture magic on the first line hides the pip output of this cell.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps: install only the listed packages, without resolving their dependencies.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

# Install Flash Attention 2 for softcapping support
# Only attempted when the GPU's major CUDA compute capability is 8 or higher.
import torch
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading configuration for the base model; `max_seq_length` is reused by the trainer cell.
max_seq_length = 2048
dtype = None          # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True   # Use 4bit quantization to reduce memory usage. Can be False.

# Collect the loader arguments in one place, then unpack them into the call.
base_model_kwargs = dict(
    model_name="unsloth/gemma-2-2b",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=HF_TOKEN,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Gemma2 patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.22G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the base model with LoRA adapters (rank 16, alpha 16, no dropout, no bias terms).
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    # True is what this notebook uses. The alternative value "unsloth" is
    # advertised as using 30% less VRAM and fitting 2x larger batch sizes.
    use_gradient_checkpointing=True,
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

Unsloth 2024.8 patched 26 layers with 26 QKV layers, 26 O layers and 26 MLP layers.


<a name="Data"></a>
### Data Prep

In [None]:
from unsloth.chat_templates import get_chat_template

# ShareGPT style: messages use "from"/"value" keys with "human"/"gpt" as role names.
sharegpt_mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}

# Switch the tokenizer to the ChatML template using the key mapping above.
tokenizer = get_chat_template(tokenizer, chat_template="chatml", mapping=sharegpt_mapping)

# End-of-sequence marker of the templated tokenizer, used by the formatting function.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render each conversation of a batch into a single training string.

    Args:
        examples: A batch (dict of columns) whose "dialog" column holds one
            conversation per row, e.g. ``ds['train']['dialog'][0]``.

    Returns:
        A dict with a single "text" column containing one rendered string per
        conversation, each terminated by exactly one ``EOS_TOKEN``.

    Relies on the module-level ``tokenizer`` and ``EOS_TOKEN``.
    """
    texts = []
    for convo in examples["dialog"]:
        text = tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False)
        # The chat template may already close the last turn with the EOS marker
        # (possibly followed by a newline). Appending it unconditionally produced
        # a duplicated end marker in every sample, so add it only when missing.
        if not text.rstrip().endswith(EOS_TOKEN):
            text += EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }

from datasets import load_dataset
# Load the training split and add the rendered "text" column in one chained expression.
dataset = (
    load_dataset("zeref713/gsm8k_phi3Form", split="train")
    .map(formatting_prompts_func, batched=True)
)

<|im_start|> is already a token. Skipping.
<|im_end|> is already a token. Skipping.


Downloading readme:   0%|          | 0.00/514 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.48M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/813k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

In [None]:
# Show the raw conversation (list of "from"/"value" messages) of training row 24.
dataset[24]["dialog"]

[{'from': 'human',
  'value': 'Mary does her grocery shopping on Saturday. She does her shopping only at a specific store where she is allowed a credit of $100, which must be paid in full before her next shopping trip. That week she spent the full credit limit and paid $15 of it on Tuesday and $23 of it on Thursday. How much credit will Mary need to pay before her next shopping trip?'},
 {'from': 'gpt',
  'value': 'So far, Mary has paid back $15 +$23=$<<15+23=38>>38 of the credit.\nSo she still needs to pay $100-$38=$<<100-38=62>>62\n#### 62'}]

In [None]:
# Show the same row after templating, i.e. the string produced by formatting_prompts_func.
print(dataset[24]["text"])

<bos><|im_start|>user
Mary does her grocery shopping on Saturday. She does her shopping only at a specific store where she is allowed a credit of $100, which must be paid in full before her next shopping trip. That week she spent the full credit limit and paid $15 of it on Tuesday and $23 of it on Thursday. How much credit will Mary need to pay before her next shopping trip?<|im_end|>
<|im_start|>assistant
So far, Mary has paid back $15 +$23=$<<15+23=38>>38 of the credit.
So she still needs to pay $100-$38=$<<100-38=62>>62
#### 62<|im_end|>
<|im_end|>


<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer).

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Pick the mixed-precision mode once: bf16 when supported, otherwise fp16.
use_bf16 = is_bfloat16_supported()

# Optimisation and logging settings for the run.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    warmup_steps=5,
    num_train_epochs=1,  # Set this for 1 full training run.
    # max_steps = 60,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning on the pre-rendered "text" column of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)

Map (num_proc=2):   0%|          | 0/7473 [00:00<?, ? examples/s]

In [None]:
#@title Show current memory stats
# Byte counts are divided by 1024**3 and rounded to three decimals.
bytes_per_gb = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)

# Total memory of device 0.
max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)
# Peak reserved memory so far; used later as the pre-training baseline.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
2.697 GB of memory reserved.


In [None]:
# Run fine-tuning; the result is kept because later cells read `trainer_stats.metrics`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 934
 "-____-"     Number of trainable parameters = 20,766,720


Step,Training Loss
1,1.7234
2,1.8236
3,1.9061
4,1.8662
5,1.5778
6,1.4071
7,1.435
8,1.2467
9,1.3079
10,1.2479


In [None]:
# Display the metrics dictionary recorded for the training run.
trainer_stats.metrics

{'train_runtime': 2821.4389,
 'train_samples_per_second': 2.649,
 'train_steps_per_second': 0.331,
 'total_flos': 2.728798313492275e+16,
 'train_loss': 0.8412889753342696,
 'epoch': 0.9997324056729997}

In [None]:
#@title Show final memory and time stats
# Byte counts are divided by 1024**3 and rounded to three decimals.
bytes_per_gb = 1024 ** 3

# Peak reserved memory after training, and its growth over the pre-training baseline.
used_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures expressed as a share of the device's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

2821.4389 seconds used for training.
47.02 minutes used for training.
Peak reserved memory = 9.787 GB.
Peak reserved memory for training = 7.09 GB.
Peak reserved memory % of max memory = 66.362 %.
Peak reserved memory for training % of max memory = 48.074 %.


<a name="Inference"></a>
### Inference

In [None]:
from unsloth.chat_templates import get_chat_template

# Make sure the tokenizer uses the ChatML template with ShareGPT-style keys.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml",
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"from": "human", "value": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# The prompt is a single unpadded sequence, so every position is a real token.
# Passing an explicit all-ones mask avoids the "attention mask is not set"
# warning, which is raised because the pad token equals the eos token.
attention_mask = torch.ones_like(inputs)

outputs = model.generate(
    input_ids = inputs,
    attention_mask = attention_mask,
    max_new_tokens = 512,
    use_cache = True,
)
tokenizer.batch_decode(outputs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|im_start|> is already a token. Skipping.
<|im_end|> is already a token. Skipping.


['<bos><|im_start|>user\nContinue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,<|im_end|>\n<|im_start|>assistant\nThe next number is 8+3=<<8+3=11>>11\nThe next number is 11+5=<<11+5=16>>16\nThe next number is 16+8=<<16+8=24>>24\n#### 24<|im_end|>']

In [None]:
from unsloth.chat_templates import get_chat_template

# Make sure the tokenizer uses the ChatML template with ShareGPT-style keys.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml",
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"from": "human", "value": "Christina is planning a birthday party and needs .75 gift bags per invited guest, because 1/4 of attendees don't show up. She invited 16 friends. Gift bags are $2 each. How much will she spend?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# The prompt is a single unpadded sequence, so every position is a real token.
# Passing an explicit all-ones mask avoids the "attention mask is not set"
# warning, which is raised because the pad token equals the eos token.
attention_mask = torch.ones_like(inputs)

outputs = model.generate(
    input_ids = inputs,
    attention_mask = attention_mask,
    max_new_tokens = 512,
    use_cache = True,
)
tokenizer.batch_decode(outputs)

<|im_start|> is already a token. Skipping.
<|im_end|> is already a token. Skipping.


["<bos><|im_start|>user\nChristina is planning a birthday party and needs .75 gift bags per invited guest, because 1/4 of attendees don't show up. She invited 16 friends. Gift bags are $2 each. How much will she spend?<|im_end|>\n<|im_start|>assistant\nShe needs 16 / 4 = <<16/4=4>>4 gift bags for the guests who don't show up.\nSo she needs 16 + 4 = <<16+4=20>>20 gift bags.\nShe will spend 20 * $2 = $<<20*2=40>>40 on gift bags.\n#### 40<|im_end|>"]

In [None]:
from unsloth.chat_templates import get_chat_template

# Make sure the tokenizer uses the ChatML template with ShareGPT-style keys.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"from": "human", "value": "Josh decides to try flipping a house. He buys a house for $80,000 and then puts in $50,000 in repairs. This increased the value of the house by 150%. How much profit did he make?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# The prompt is a single unpadded sequence, so every position is a real token.
# Passing an explicit all-ones mask avoids the "attention mask is not set"
# warning, which is raised because the pad token equals the eos token.
attention_mask = torch.ones_like(inputs)

# Decoding is greedy here. The previous `temperature=1.0` argument was dropped
# because temperature only applies when sampling is enabled; to sample, pass
# `do_sample = True` together with a temperature.
outputs = model.generate(
    input_ids = inputs,
    attention_mask = attention_mask,
    max_new_tokens = 512,
    use_cache = True,
)
tokenizer.batch_decode(outputs)

<|im_start|> is already a token. Skipping.
<|im_end|> is already a token. Skipping.


['<bos><|im_start|>user\nJosh decides to try flipping a house. He buys a house for $80,000 and then puts in $50,000 in repairs. This increased the value of the house by 150%. How much profit did he make?<|im_end|>\n<|im_start|>assistant\nThe house was worth 80,000+50,000=$<<80000+50000=130000>>130,000 after repairs.\nThe house was worth 130,000*1.5=$<<130000*1.5=195000>>195,000 after repairs.\nHe made a profit of 195,000-130,000=$<<195000-130000=65000>>65,000.\n#### 65000<|im_end|>']

In [None]:
from unsloth.chat_templates import get_chat_template

# Make sure the tokenizer uses the ChatML template with ShareGPT-style keys.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml",
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# NOTE(review): the prompt compares 9.11 with itself; presumably two different
# numbers (e.g. 9.11 vs 9.9) were intended - confirm before relying on this check.
messages = [
    {"from": "human", "value": "Which is greater: 9.11 or 9.11? Give your reasons."},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# The prompt is a single unpadded sequence, so every position is a real token.
# Passing an explicit all-ones mask avoids the "attention mask is not set"
# warning, which is raised because the pad token equals the eos token.
attention_mask = torch.ones_like(inputs)

# Decoding is greedy here. The previous `temperature=0.1` argument was dropped
# because temperature only applies when sampling is enabled; to sample, pass
# `do_sample = True` together with a temperature.
outputs = model.generate(
    input_ids = inputs,
    attention_mask = attention_mask,
    max_new_tokens = 512,
    use_cache = True,
)
tokenizer.batch_decode(outputs)

<|im_start|> is already a token. Skipping.
<|im_end|> is already a token. Skipping.


['<bos><|im_start|>user\nWhich is greater: 9.11 or 9.11? Give your reasons.<|im_end|>\n<|im_start|>assistant\n9.11 is greater because 1 is greater than 0.1.\n#### 9.11<|im_end|>']

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
# Upload the model (LoRA adapters only, per the note above) and then the tokenizer
# to the first repository.
adapter_repo = "zeref713/gsm8k_2b_gemma2_1"
for hub_artifact in (model, tokenizer):
    hub_artifact.push_to_hub(adapter_repo, token=HF_TOKEN)

# Second upload via push_to_hub_merged with save_method "lora", to a separate repository.
model.push_to_hub_merged(
    "zeref713/gsm8k_2b_gemma2_2",
    tokenizer,
    save_method="lora",
    token=HF_TOKEN,
)

README.md:   0%|          | 0.00/577 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.1M [00:00<?, ?B/s]

Saved model to https://huggingface.co/zeref713/gsm8k_2b_gemma2_1


tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Unsloth: Saving LoRA adapters. Please wait...


README.md:   0%|          | 0.00/577 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.1M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Saved lora model to https://huggingface.co/zeref713/gsm8k_2b_gemma2_2


### Saving to float16 for VLLM

Unsloth supports saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4; `lora` adapters are also available as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account!

In [None]:
# NOTE(review): as recorded in this cell's output, save_method = "merged_4bit" raises a
# RuntimeError. Unsloth asks for "merged_4bit_forced" to confirm a 4bit merge, and warns
# that merging to 4bit loses accuracy if the model is converted to GGUF afterwards, so it
# should be the final save. GGUF exports follow below, so decide on the order before
# changing the save method.
model.push_to_hub_merged("zeref713/gsm8k_2b_gemma2_merged4bit", tokenizer, save_method = "merged_4bit", token = HF_TOKEN)

RuntimeError: Unsloth: Merging into 4bit will cause your model to lose accuracy if you plan
to merge to GGUF or others later on. I suggest you to do this as a final step
if you're planning to do multiple saves.
If you are certain, change `save_method` to `merged_4bit_forced`.

### GGUF / llama.cpp Conversion

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

In [None]:
# Convert the model to GGUF with q4_k_m quantization and upload it to the Hub.
# NOTE(review): the recorded run failed at the quantization step with a RuntimeError
# asking for llama.cpp to be compiled manually (see the build cell that follows).
model.push_to_hub_gguf("zeref713/gsm8k_2b_gemma2_gguf_q4km", tokenizer, quantization_method = "q4_k_m", token = HF_TOKEN)

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.79 out of 12.67 RAM for saving.


100%|██████████| 26/26 [00:00<00:00, 31.32it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving zeref713/gsm8k_2b_gemma2_gguf_q4km/pytorch_model-00001-of-00002.bin...
Unsloth: Saving zeref713/gsm8k_2b_gemma2_gguf_q4km/pytorch_model-00002-of-00002.bin...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at zeref713/gsm8k_2b_gemma2_gguf_q4km into f16 GGUF format.
The output location will be ./zeref713/gsm8k_2b_gemma2_gguf_q4km/unsloth.F16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: gsm8k_2b_gemma2_gguf_q4km
INFO:gguf.gguf_writer:gguf: This GGUF file is for Littl

RuntimeError: Unsloth: Quantization failed for ./zeref713/gsm8k_2b_gemma2_gguf_q4km/unsloth.F16.gguf
You might have to compile llama.cpp yourself, then run this again.
You do not need to close this Python program. Run the following commands in a new terminal:
You must run this in the same folder as you're saving your model.
git clone --recursive https://github.com/ggerganov/llama.cpp
cd llama.cpp && make clean && make all -j
Once that's done, redo the quantization.

In [None]:
# Build llama.cpp manually, using the commands given in the Unsloth quantization error.
# NOTE(review): in the recorded output the clone fails because ./llama.cpp already
# exists; the `make` step below still runs against the existing checkout.
!git clone --recursive https://github.com/ggerganov/llama.cpp
!cd llama.cpp && make clean && make all -j

fatal: destination path 'llama.cpp' already exists and is not an empty directory.
I ccache not found. Consider installing it for faster compilation.
I llama.cpp build info: 
I UNAME_S:   Linux
I UNAME_P:   x86_64
I UNAME_M:   x86_64
I CFLAGS:    -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion 
I CXXFLAGS:  -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPEN

In [None]:
# The `if False:` lines below are disabled templates kept for reference;
# only the multi-quantization upload at the bottom actually runs.

# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# Save to multiple GGUF options - much faster if you want multiple!
gguf_quantization_methods = ["q4_k_m", "q8_0", "q5_k_m"]
model.push_to_hub_gguf(
    "zeref713/gsm8k_2b_gemma2_gguf",
    tokenizer,
    quantization_method=gguf_quantization_methods,
    token=HF_TOKEN,
)

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.94 out of 12.67 RAM for saving.


100%|██████████| 26/26 [00:00<00:00, 37.29it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving zeref713/gsm8k_2b_gemma2_gguf/pytorch_model-00001-of-00002.bin...
Unsloth: Saving zeref713/gsm8k_2b_gemma2_gguf/pytorch_model-00002-of-00002.bin...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m', 'q8_0', 'q5_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at zeref713/gsm8k_2b_gemma2_gguf into f16 GGUF format.
The output location will be ./zeref713/gsm8k_2b_gemma2_gguf/unsloth.F16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: gsm8k_2b_gemma2_gguf
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endia

RuntimeError: Unsloth: Quantization failed for ./zeref713/gsm8k_2b_gemma2_gguf/unsloth.F16.gguf
You might have to compile llama.cpp yourself, then run this again.
You do not need to close this Python program. Run the following commands in a new terminal:
You must run this in the same folder as you're saving your model.
git clone --recursive https://github.com/ggerganov/llama.cpp
cd llama.cpp && make clean && make all -j
Once that's done, redo the quantization.

# EVALUATION
- not done yet

## Test sets inference

In [None]:
# Fetch the value stored under 'HF_TOKEN' through Colab's userdata helper;
# it is used when pushing the evaluation dataset to the Hub.
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')

# Interactive Hub login is disabled in this section.
# from huggingface_hub import notebook_login
# notebook_login()

In [None]:
%%capture
# Install the inference dependencies; %%capture hides the pip output of this cell.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps: install only the listed packages, without resolving their dependencies.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
!pip install datasets

In [None]:
from unsloth import FastLanguageModel
import torch
from transformers import TextStreamer
from datasets import load_dataset, Dataset

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [None]:
# Load the model and tokenizer
from unsloth.chat_templates import get_chat_template

# Load the model and tokenizer
finetuned_repo = "zeref713/gsm8k_2b_gemma2_2"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=finetuned_repo,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# Apply chat template
# ShareGPT style: messages use "from"/"value" keys with "human"/"gpt" as role names.
sharegpt_mapping = {"role": "from", "content": "value", "user": "human", "assistant": "gpt"}
tokenizer = get_chat_template(tokenizer, chat_template="chatml", mapping=sharegpt_mapping)

==((====))==  Unsloth 2024.8: Fast Gemma2 patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.22G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.1M [00:00<?, ?B/s]

Unsloth 2024.8 patched 26 layers with 26 QKV layers, 26 O layers and 26 MLP layers.


In [None]:
FastLanguageModel.for_inference(model)
# Function to generate response
def generate_response(value, max_new_tokens=512, include_prompt=True):
    """Generate the model's reply to a single user message.

    Args:
        value: Text of the user ("human") message.
        max_new_tokens: Upper bound on the number of generated tokens.
        include_prompt: When True (the default, matching the original
            behaviour) the decoded string contains the prompt followed by
            the reply. When False only the newly generated tokens are decoded.

    Returns:
        The decoded text of the first (and only) sequence, with special
        tokens removed.

    Relies on the module-level ``model`` and ``tokenizer``.
    """
    FastLanguageModel.for_inference(model)
    messages = [{"from": "human", "value": value}]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    # Single unpadded prompt: every position is a real token. Passing the mask
    # explicitly avoids the "attention mask is not set" warning, which is raised
    # because the pad token equals the eos token.
    attention_mask = torch.ones_like(inputs)

    outputs = model.generate(
        input_ids=inputs,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        use_cache=True,
    )
    if not include_prompt:
        # The generated sequence starts with the prompt tokens; drop them.
        outputs = outputs[:, inputs.shape[1]:]
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return response[0]

In [None]:
# Smoke test: run generate_response once on a single word problem before mapping it
# over the whole test split.
generate_response("Josh decides to try flipping a house. He buys a house for $80,000 and then puts in $50,000 in repairs. This increased the value of the house by 150%. How much profit did he make?")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


'user\nJosh decides to try flipping a house. He buys a house for $80,000 and then puts in $50,000 in repairs. This increased the value of the house by 150%. How much profit did he make?\nassistant\nThe house was worth 80,000+50,000=$<<80000+50000=130000>>130,000 after repairs.\nThe house was worth 130,000*1.5=$<<130000*1.5=195000>>195,000 after repairs.\nHe made a profit of 195,000-130,000=$<<195000-130000=65000>>65,000.\n#### 65000'

In [None]:
# Function to generate response for each question in the test dataset
def add_gemma2_responses(batch):
    """Add model responses for the question(s) in ``batch``.

    Works with both mapping modes of ``Dataset.map``:

    * ``batched=True``: ``batch["question"]`` is a list of questions and the
      returned column is a list with one response per question.
    * ``batched=False``: ``batch["question"]`` is a single string and the
      returned column is a single response string. Without this case the
      loop would iterate over the characters of the question.

    Args:
        batch: A dict-like example or batch with a "question" column.

    Returns:
        A dict with the new "gemma2_responses" column.
    """
    questions = batch["question"]
    is_single_example = isinstance(questions, str)
    if is_single_example:
        questions = [questions]

    responses = []
    for question in questions:
        response = generate_response(question)
        responses.append(response)
        print("RESPONSE: ", response)
        print("--" * 50)

    return {"gemma2_responses": responses[0] if is_single_example else responses}

In [None]:
ds = load_dataset("zeref713/gsm8k_phi3Form")
# Map in batched mode so that batch["question"] is a list of questions.
# With batched=False it would be a single string, and iterating over it
# yields individual characters instead of questions.
test_ds = ds["test"].map(add_gemma2_responses, batched=True, batch_size=2)

Downloading readme:   0%|          | 0.00/514 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.48M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/813k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [None]:
# Spot-check two test rows: print each question followed by the stored model response.
for sample_idx in (5, 24):
    sample = test_ds[sample_idx]
    print(sample["question"])
    print(sample["gemma2_responses"])

In [None]:
# Now try pushing the dataset again
# Uploads the test split, including the added "gemma2_responses" column, to the Hub.
test_ds.push_to_hub("zeref713/gsm8k_finetuned_gemma2_with_responses_1", token= HF_TOKEN)

In [None]:
# Keep a local copy of the same dataset in the "gsm8k_gemma2_with_responses" directory.
test_ds.save_to_disk("gsm8k_gemma2_with_responses")

## EVALUATION
- not done yet