In [None]:
# Remove the xformers build already present in the runtime; the next cell installs it again.
!pip uninstall -y xformers


Found existing installation: xformers 0.0.28.post3
Uninstalling xformers-0.0.28.post3:
  Successfully uninstalled xformers-0.0.28.post3


In [None]:
!pip install "xformers[cuda]" --extra-index-url https://download.pytorch.org/whl/cu122


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu122


In [None]:
# Print the installed xformers version and which attention kernels are available in this runtime.
!python -m xformers.info


xFormers 0.0.28.post3
memory_efficient_attention.ckF:                    unavailable
memory_efficient_attention.ckB:                    unavailable
memory_efficient_attention.ck_decoderF:            unavailable
memory_efficient_attention.ck_splitKF:             unavailable
memory_efficient_attention.cutlassF-pt:            available
memory_efficient_attention.cutlassB-pt:            available
memory_efficient_attention.fa2F@v2.5.7-pt:         available
memory_efficient_attention.fa2B@v2.5.7-pt:         available
memory_efficient_attention.fa3F@0.0.0:             unavailable
memory_efficient_attention.fa3B@0.0.0:             unavailable
memory_efficient_attention.triton_splitKF:         available
indexing.scaled_index_addF:                        unavailable
indexing.scaled_index_addB:                        unavailable
indexing.index_select:                             unavailable
sequence_parallel_fused.write_values:              available
sequence_parallel_fused.wait_values:         

# APRIL ONLY OFFER 🎁

First we check the GPU version available in the environment and install specific dependencies that are compatible with the detected GPU to prevent version conflicts.

In [None]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

Next we load the base model. Unsloth publishes a range of pre-quantized 4-bit language models, including the new Llama-3 model trained on 15 trillion tokens; in this notebook we load `MLP-KTLim/llama-3-Korean-Bllossom-8B` with 4-bit quantization for memory efficiency.


In [None]:
from unsloth import FastLanguageModel
import torch

# Loading options. These three globals are read again by later cells
# (trainer setup and the optional LoRA reload), so their names must stay stable.
max_seq_length = 2048  # Choose any! Llama 3 is up to 8k
dtype = None           # NOTE(review): presumably lets Unsloth pick the dtype - confirm
load_in_4bit = True    # 4-bit quantization to reduce memory usage. Can be False.

# Reference list of pre-quantized 4-bit checkpoints; it is not used by the load call below.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit",
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
]

# Keyword arguments for the load call, gathered in one place.
# For gated models such as meta-llama/Llama-2-7b-hf, also pass token = "hf_...".
load_options = dict(
    model_name = "MLP-KTLim/llama-3-Korean-Bllossom-8B",  # change the name to load another model
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.10.7: Fast Llama patching. Transformers = 4.46.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# Colab-only: upload the training data file into the runtime.
# The later cells expect it at /content/flattened_all_scenarios2.json.
from google.colab import files
uploaded = files.upload()


Saving flattened_all_scenarios2.json to flattened_all_scenarios2.json




---



Next, we integrate LoRA adapters into our model, which allows us to efficiently update just a fraction of the model's parameters, enhancing training speed and reducing computational load.

In [None]:
# LoRA settings, collected in one dict so they can be read (and tuned) at a glance.
lora_settings = dict(
    r = 16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
        "gate_proj", "up_proj", "down_proj",      # MLP projections
    ],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

# Wrap the loaded base model with LoRA adapters; `model` now refers to the adapted model.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

NameError: name 'FastLanguageModel' is not defined

In [None]:
import json

# The file is rewritten in place because the dataset-loading cell reads this exact path.
DATASET_PATH = '/content/flattened_all_scenarios2.json'


def flatten_scenarios(data):
    """Collect every dict record found in ``data`` into one flat list.

    ``data`` is expected to be a list whose items are either records (dicts)
    or lists of records, nested to any depth. Items of any other type are
    ignored. Order is preserved.

    Accepting already-flat input makes this cell idempotent: the previous
    version assumed exactly one level of nesting, so running it a second time
    (or uploading an already-flat file) iterated over dict keys, produced an
    empty list, and overwrote the dataset with ``[]``.
    """
    flat = []
    for item in data:
        if isinstance(item, dict):
            flat.append(item)
        elif isinstance(item, list):
            flat.extend(flatten_scenarios(item))
    return flat


# Load the uploaded JSON file (nested or already flat)
with open(DATASET_PATH, 'r', encoding='utf-8') as file:
    nested_data = json.load(file)

# Flatten the nested structure
flattened_data = flatten_scenarios(nested_data)

# Never replace the dataset with an empty file: fail loudly instead.
if not flattened_data:
    raise ValueError(f"No scenario records found in {DATASET_PATH}; file left unchanged.")

# Ensure consistency in the fields: every record gets 'input' and 'output' keys
for entry in flattened_data:
    entry.setdefault('input', "")
    entry.setdefault('output', "")

# Save the flattened and cleaned data
with open(DATASET_PATH, 'w', encoding='utf-8') as file:
    json.dump(flattened_data, file, ensure_ascii=False, indent=2)

print("Flattened JSON saved successfully.")


Flattened JSON saved successfully.


<a name="Data"></a>
### Data Prep
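The original template for this notebook used the Alpaca dataset from [yahma](https://huggingface.co/datasets/yahma/alpaca-cleaned), which is a filtered version of 52K of the original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html). Here that code section is replaced with our own data prep: the training data is the coffee-kiosk order dataset uploaded above (`flattened_all_scenarios2.json`), where each record has an `input` and an `output` field.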

Then, we define a system prompt that formats tasks into instructions, inputs, and responses, and apply it to a dataset to prepare our inputs and outputs for the model, with an EOS token to signal completion.


In [None]:
# Take the end-of-sequence token from the loaded tokenizer instead of hardcoding it.
# The previous literal "<|endoftext|>" is not this tokenizer's EOS: the generation logs in this
# notebook report eos_token_id 128001 and print "<|end_of_text|>". Appending a string the
# tokenizer does not treat as EOS would not teach the model where to stop.
EOS_TOKEN = tokenizer.eos_token
base_prompt = """
You are operating a virtual coffee kiosk that receives STT (speech-to-text) inputs from customers placing coffee orders. Your task is to process these inputs, respond in Korean, and generate a JSON output for backend processing.

**Menu Items**:
- Hot Drinks: ÌóàÎ∏åÌã∞ (always hot)
- Iced Only Drinks: ÌÜ†ÎßàÌÜ†Ï£ºÏä§, ÌÇ§ÏúÑÏ£ºÏä§, ÎßùÍ≥†Ïä§Î¨¥Îîî, Îî∏Í∏∞Ïä§Î¨¥Îîî, Î†àÎ™¨ÏóêÏù¥Îìú, Î≥µÏà≠ÏïÑÏïÑÏù¥Ïä§Ìã∞ (always iced)
- Hot/Iced Coffee: ÏïÑÎ©îÎ¶¨Ïπ¥ÎÖ∏, ÎùºÎñº, Ïπ¥Ìë∏ÏπòÎÖ∏, Ïπ¥ÌéòÎ™®Ïπ¥, Î∞îÎãêÎùºÎùºÎñº, ÏóêÏä§ÌîÑÎ†àÏÜå, Ïπ¥ÎùºÎ©úÎßàÎÅºÏïÑÎòê
- Specialty: Ï¥àÏΩúÎ¶øÎùºÎñº (hot or iced)
- Available sizes: ÎØ∏ÎîîÏõÄ ,ÎùºÏßÄ ,ÏóëÏä§ÎùºÏßÄ
- add_oms(options):ÌúòÌïëÌÅ¨Î¶º,Î∞îÎãêÎùºÏãúÎüΩ,Ïπ¥ÎùºÎ©úÏãúÎüΩ,ÏÉ∑

**Default Values**:
- Use default size "ÎØ∏ÎîîÏõÄ" and temperature "Ìï´" if unspecified.
- Do not override explicitly given size or temperature.
**Unavailable Items**:
If the customer requests an item not on the menu, respond politely with "Ï£ÑÏÜ°Ìï©ÎãàÎã§, Ìï¥Îãπ Î©îÎâ¥Îäî ÏóÜÏäµÎãàÎã§."

**Response Requirements**:
1. **Natural Language Response**: Confirm each item in Korean, e.g., "[Drink] [quantity] Ï£ºÎ¨∏ÎêòÏóàÏäµÎãàÎã§.", followed by a full summary of all ordered items, starting with "ÏßÄÍ∏àÍπåÏßÄ Ï£ºÎ¨∏ÌïòÏã† ÎÇ¥Ïö©ÏùÄ Îã§ÏùåÍ≥º Í∞ôÏäµÎãàÎã§:".
2. **JSON Output**: Only include items from the latest input in the structured JSON format below:
   ```json
   {{
       "action": "[action_type]",
       "order_items": [
           {{
               "drink": "[Drink Name]",
               "size": "[Size]",
               "temperature": "[Temperature]",
               "quantity": [Quantity],
               "add_ons": [List of add-ons],
               "extra_shots": [Number of extra shots]
           }}
       ]
   }}
**Available Actions for JSON Output**:
- **create_order**: For new drink orders.
- **add_item**: For adding a new item to the current order.
- **modify_order**: For changing an existing item (e.g., modifying size or temperature).
- **cancel_order**: To remove an order item or reset the order.
- **recommend_closest_item**: If a requested item is unavailable, recommend the closest item.
- **show_order_summary**: Display a summary of all items ordered so far.
- **complete_order**: Finalize the order after confirmation.

**Key Scenarios**:

- New Order: Confirm with a natural response and JSON output for each new drink.
- Modification: Confirm changes and modify JSON.
- Summary Request: Provide a summary without a JSON output.
- Unavailable Items: Recommend a similar item.
- Order Completion: Confirm completion and provide a summary.

if Current Conversation History: {}
What would be the correct Response: ?"""

# Formatting function to adapt dataset examples for training
def formatting_prompts_func(examples):
    """Build the training text for a batch of dataset examples.

    Args:
        examples: batch mapping with parallel lists under "input" and "output".

    Returns:
        {"text": [...]} with one string per example: ``base_prompt`` filled with
        the example's input, followed by its output and ``EOS_TOKEN``.
    """
    return {
        "text": [
            base_prompt.format(history) + reply + EOS_TOKEN
            for history, reply in zip(examples["input"], examples["output"])
        ]
    }

# Read the flattened JSON as a "train" split and add the formatted "text" column in one chain.
from datasets import load_dataset

dataset = load_dataset(
    "json",
    data_files="/content/flattened_all_scenarios2.json",
    split="train",
).map(formatting_prompts_func, batched=True)

<a name="Train"></a>
### Train the model
- We do 150 steps to speed things up, but you can set `num_train_epochs=1` for a full run and remove `max_steps` (when `max_steps` is set, it overrides `num_train_epochs`).
- At this stage, we're configuring our model's training setup, where we define things like batch size and learning rate, to teach our model effectively with the data we have prepared.

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Supervised fine-tuning on the "text" column produced by the data-prep cell.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # effective batch size = 2 * 4 = 8
        warmup_steps = 5,
        # Training length is controlled by max_steps alone. `num_train_epochs=4` used to be
        # passed as well, but Trainer warns that max_steps overrides it, so it never took
        # effect; it was removed to keep the configuration unambiguous.
        max_steps = 150, # increase this to make the model learn "better"
        learning_rate = 2e-4,
        # Enable exactly one mixed-precision mode: bf16 when the GPU supports it, otherwise fp16.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/4542 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
#@title Show current memory stats
# Baseline GPU figures taken before training; the final-stats cell subtracts them later.
gpu_stats = torch.cuda.get_device_properties(0)

# Bytes -> GB (2**30 bytes), rounded to three decimals.
max_memory = round(gpu_stats.total_memory / 2**30, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)

print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

GPU = Tesla T4. Max memory = 14.748 GB.
5.605 GB of memory reserved.


In [None]:
# Run the fine-tuning loop configured above. The returned object is kept in `trainer_stats`
# because the final-stats cell reads trainer_stats.metrics['train_runtime'] from it.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 4,542 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 150
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.0803
2,2.0376
3,2.0772
4,1.8505
5,1.6343
6,1.4951
7,1.2274
8,1.0114
9,0.7936
10,0.6084


In [None]:
#@title Show final memory and time stats
# Peak reserved GPU memory after training, in GB (2**30 bytes), rounded to three decimals.
used_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)
# Growth relative to the baseline captured before training started.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures as a share of the GPU's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

report_lines = [
    f"{trainer_stats.metrics['train_runtime']} seconds used for training.",
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
print("\n".join(report_lines))

2951.1903 seconds used for training.
49.19 minutes used for training.
Peak reserved memory = 14.219 GB.
Peak reserved memory for training = 8.614 GB.
Peak reserved memory % of max memory = 96.413 %.
Peak reserved memory for training % of max memory = 58.408 %.


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

In [None]:
# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# base_prompt has a single "{}" placeholder (the conversation history). The empty second
# argument that used to be passed here was a leftover from a three-field template;
# str.format silently ignored it, so it was removed.
inputs = tokenizer(
[
    base_prompt.format(
        "Customer's 1 Input: ÏïÑÏù¥Ïä§ Ïπ¥ÌéòÎùºÎñº ÎùºÏßÄ ÌïúÏûîÏ£ºÏÑ∏Ïöî Customer's 2 Input:ÏïÑÏù¥Ïä§ Ïπ¥ÌéòÎùºÎñº ÎùºÏßÄ ÌïúÏûî ÎåÄÏã† ÏïÑÏù¥Ïä§ ÏïÑÎ©îÎ¶¨Ïπ¥ÎÖ∏ 3Ïûî ÏóëÏä§ÎùºÏßÄ ÏÇ¨Ïù¥Ï¶àÎ°ú Î∞îÍøîÏ£ºÏÑ∏Ïöî", # conversation history
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 150, use_cache = True)
# Last expression of the cell: shows the decoded prompt plus generated response.
tokenizer.batch_decode(outputs)


NameError: name 'FastLanguageModel' is not defined

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Use base_prompt, the template the model was fine-tuned with. This cell previously
# referenced `alpaca_prompt`, which is never defined in this notebook and raised a
# NameError on a fresh kernel.
inputs = tokenizer(
[
    base_prompt.format(
        "Customer's 1 Input: ÏïÑÏù¥Ïä§ Ïπ¥ÌéòÎùºÎñº ÎùºÏßÄ ÌïúÏûîÏ£ºÏÑ∏Ïöî", # conversation history
    )
], return_tensors = "pt").to("cuda")

# Stream tokens to the cell output as they are generated instead of waiting for the full text.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Convert these binary numbers to decimal.

### Input:
1010, 1101, 1111

### Response:
The decimal equivalent of 1010 is 10. The decimal equivalent of 1101 is 13. The decimal equivalent of 1111 is 15.<|end_of_text|>


<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
import os
from getpass import getpass

model.save_pretrained("lora_model") # Local saving
# Save the tokenizer next to the adapters: the reload cells further down load the
# tokenizer from "lora_model" as well.
tokenizer.save_pretrained("lora_model")

# Read the Hugging Face write token from the environment (or prompt for it) instead of
# the "..." placeholder, which could never authenticate.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")
model.push_to_hub("wolf010/lora_model", token = hf_token) # Online saving

In [None]:
# Save the merged full model to 16-bit precision locally.
# The target directory is the one the GGUF conversion and reload cells below read
# (4TH_fine_tuned_Llama-3.2-3B-Instruct). It used to be "forth fine_tuned Llama-3.2-3B-Instruct",
# a misspelled name containing spaces that no later cell referenced.
model.save_pretrained_merged("4TH_fine_tuned_Llama-3.2-3B-Instruct", tokenizer, save_method="merged_16bit")


Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 16.1G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 13.52 out of 29.38 RAM for saving.


 34%|‚ñà‚ñà‚ñà‚ñç      | 11/32 [00:00<00:01, 14.26it/s]We will save to Disk and not RAM now.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 32/32 [00:26<00:00,  1.21it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


In [None]:
import os
from getpass import getpass

# Push merged full model to Hugging Face Hub (16-bit).
# SECURITY: the access token is read from the HF_TOKEN environment variable (or prompted for)
# and must never be written into the notebook. The token that was previously hardcoded in
# this cell has been exposed and should be revoked.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")
model.push_to_hub_merged("wolf010/4TH_fine_tuned_Llama-3.2-3B-Instruct", tokenizer, save_method="merged_16bit", token=hf_token)


Unsloth: You are pushing to hub, but you passed your HF username = wolf010.
We shall truncate wolf010/4TH_fine_tuned_Llama-3.2-3B-Instruct to 4TH_fine_tuned_Llama-3.2-3B-Instruct


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 13.35 out of 29.38 RAM for saving.


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 32/32 [00:38<00:00,  1.19s/it]


Unsloth: Saving tokenizer...

  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

 Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...


README.md:   0%|          | 0.00/592 [00:00<?, ?B/s]

  0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/wolf010/4TH_fine_tuned_Llama-3.2-3B-Instruct


In [None]:
!git clone https://github.com/ggerganov/llama.cpp



Cloning into 'llama.cpp'...
remote: Enumerating objects: 36466, done.[K
remote: Counting objects: 100% (8144/8144), done.[K
remote: Compressing objects: 100% (309/309), done.[K
remote: Total 36466 (delta 7977), reused 7841 (delta 7835), pack-reused 28322 (from 1)[K
Receiving objects: 100% (36466/36466), 58.24 MiB | 16.27 MiB/s, done.
Resolving deltas: 100% (26629/26629), done.


In [None]:
# Install the Python packages listed in llama.cpp's requirements.txt (cloned in the previous cell).
!pip install -r llama.cpp/requirements.txt


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu


In [None]:
# Sanity check: list the cloned repository to see which conversion scripts are available.
!ls /content/llama.cpp


AUTHORS			       convert_lora_to_gguf.py	Makefile	    README.md
ci			       docs			media		    requirements
cmake			       examples			models		    requirements.txt
CMakeLists.txt		       flake.lock		mypy.ini	    scripts
CMakePresets.json	       flake.nix		Package.swift	    SECURITY.md
common			       ggml			pocs		    spm-headers
CONTRIBUTING.md		       gguf-py			poetry.lock	    src
convert_hf_to_gguf.py	       grammars			prompts		    tests
convert_hf_to_gguf_update.py   include			pyproject.toml
convert_llama_ggml_to_gguf.py  LICENSE			pyrightconfig.json


In [None]:
!python /content/llama.cpp/convert_hf_to_gguf.py /content/4TH_fine_tuned_Llama-3.2-3B-Instruct \
  --outfile /4TH_fine_tuned_llama-3.2-Korean-Bllossom-3B.q4_0.gguf \
  --outtype q8_0

INFO:hf-to-gguf:Loading model: 4TH_fine_tuned_Llama-3.2-3B-Instruct
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00004.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> Q8_0, shape = {4096, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.float16 --> F32, shape = {4096}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.float16 --> Q8_0, shape = {14336, 4096}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.float16 --> Q8_0, shape = {4096, 14336}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.float16 --> Q8_0, shape = {4096, 14336}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.float16 --> F32, shape = {4096}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.float16 --> Q8_0, shape = {4096, 1024}
INFO:hf-to-gguf:blk.0.attn_output.weight,    tor

In [None]:
import os
from getpass import getpass

# SECURITY: never store the Hugging Face token in the notebook. Read it from the HF_TOKEN
# environment variable, or prompt for it without echoing. The token that was previously
# hardcoded here has been exposed and should be revoked.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

In [None]:
import os
from getpass import getpass
from huggingface_hub import HfApi

api = HfApi()

model_id = "wolf010/2nd_fine_tuned_Llama-3.2-3B-Instruct-gguf"
gguf_path = "/content/2nd_fine_tuned_Llama-3.2-3B-Instruct.gguf"

# SECURITY: read the token from the environment (or prompt for it); the value that was
# hardcoded here has been exposed and should be revoked.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

# Fail before touching the Hub if the file to upload is missing.
if not os.path.isfile(gguf_path):
    raise FileNotFoundError(f"GGUF file not found: {gguf_path}")

# Create a repository on Hugging Face (no error if it already exists)
api.create_repo(model_id, exist_ok=True, repo_type="model", token=token)

# Upload the model file to the repository, keeping its file name.
# Last expression of the cell: displays the resulting CommitInfo.
api.upload_file(
    path_or_fileobj=gguf_path,
    path_in_repo=os.path.basename(gguf_path),
    repo_id=model_id,
    token=token,
)


2nd_fine_tuned_Llama-3.2-3B-Instruct.gguf:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/wolf010/2nd_fine_tuned_Llama-3.2-3B-Instruct-gguf/commit/7e2d57210a63612e6388ff3ce3fd70dd93f866d8', commit_message='Upload 2nd_fine_tuned_Llama-3.2-3B-Instruct.gguf with huggingface_hub', commit_description='', oid='7e2d57210a63612e6388ff3ce3fd70dd93f866d8', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from transformers import LlamaForCausalLM, LlamaTokenizer

# Reload the merged checkpoint with plain transformers.
# - Bound to `merged_model`, not `model`: overwriting `model` replaced the Unsloth model that
#   the later inference and export cells still use.
# - torch_dtype="auto" keeps the dtype stored in the checkpoint (the conversion log shows
#   float16 weights) instead of loading in the float32 default, which needs about twice the RAM.
merged_model = LlamaForCausalLM.from_pretrained(
    "/content/4TH_fine_tuned_Llama-3.2-3B-Instruct",
    from_tf=False,
    torch_dtype="auto",
)

# Save in HuggingFace format
merged_model.save_pretrained("/content/hf_model")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
!python3 llama.cpp/convert-pth-to-ggml.py --model_path /content/4TH_fine_tuned_Llama-3.2-3B-Instruct --output_file /content/4TH_fine_tuned_Llama-3.2-3B-Instruct.gguf


python3: can't open file '/content/llama.cpp/convert-pth-to-ggml.py': [Errno 2] No such file or directory


In [None]:
# Sanity check: list the working directory to see which model folders and data files exist.
!ls /content/


 1st_fine_tuned_Llama-3.2-3B-Instruct	  'forth fine_tuned Llama-3.2-3B-Instruct'
 2nd_fine_tuned_Llama-3.2-3B-Instruct	   huggingface_tokenizers_cache
 3rd_fine_tuned_Llama-3.2-3B-Instruct	   order_dataset2.json
 4TH_fine_tuned_Llama-3.2-3B-Instruct	   order_dataset.json
 all_scenarios2.json			   outputs
'first fine_tuned Llama-3.2-3B-Instruct'   sample_data
 flattened_all_scenarios2.json


Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [None]:
# Set this to True to reload the LoRA adapters saved in "lora_model" instead of using
# the model that is already in memory.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model)

# base_prompt is defined in the Data Prep cell - you MUST run cells from above!
# This cell previously used `alpaca_prompt`, which is never defined in this notebook.
inputs = tokenizer(
[
    base_prompt.format(
        "Customer's 1 Input: ÏïÑÏù¥Ïä§ Ïπ¥ÌéòÎùºÎñº ÎùºÏßÄ ÌïúÏûîÏ£ºÏÑ∏Ïöî", # conversation history
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# Last expression of the cell: shows the decoded prompt plus generated response.
tokenizer.batch_decode(outputs)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


["Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat is a famous tall tower in Paris?\n\n### Input:\n\n\n### Response:\nOne of the most famous tall towers in Paris is the Eiffel Tower. It is a wrought iron tower located on the Champ de Mars in Paris, France. It was built in 1889 as the entrance to the 1889 World's Fair, and it was designed by the French engineers Gustave Eiff"]

You can also use PEFT's `AutoPeftModelForCausalLM` from Hugging Face. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [None]:
if False:
    # Fallback without Unsloth (not recommended - use Unsloth if possible):
    # load the tokenizer and the adapter model from the "lora_model" directory via transformers/PEFT.
    from transformers import AutoTokenizer
    from peft import AutoPeftModelForCausalLM

    tokenizer = AutoTokenizer.from_pretrained("lora_model")
    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model",  # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = load_in_4bit,
    )

We're preparing to save our trained model in a more compact format and then upload it to a cloud platform, which allows us to use less storage and computational power.

In [None]:
# Optional exports. Every line is disabled by `if False:`; change one to `if True:` to run it.
# The "save" variants write to the local directory "model"; the "push" variants target the
# placeholder repo "hf/model" and need a real token in place of the empty string.

# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

We're ready to compress our model using various quantization methods to make it leaner and then upload it to the cloud for easy sharing and access.

In [None]:
# Optional GGUF exports. Every line is disabled by `if False:`; change one to `if True:` to run it.
# The "push" variants target the placeholder repo "hf/model" and need a real token in place
# of the empty string.

# Save to 8bit Q8_0 (no quantization_method argument is passed here)
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in `llama.cpp` or a UI based system like `GPT4All`. You can install GPT4All by going [here](https://gpt4all.io/index.html).

And we're done! If you have any questions on Unsloth, join their [Discord](https://discord.gg/u54VK8m8tk) channel!