In [None]:
!pip uninstall -y xformers


[0m

In [None]:
!pip install "xformers[cuda]" --extra-index-url https://download.pytorch.org/whl/cu122


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu122
Collecting xformers[cuda]
  Downloading xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting torch==2.5.1 (from xformers[cuda])
  Downloading torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.1->xformers[cuda])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.5.1->xformers[cuda])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.5.1->xformers[cuda])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.5.1->xformers[cuda])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux201

In [None]:
# Print the xformers build report; it lists each attention kernel as
# available/unavailable for the current runtime.
!python -m xformers.info


xFormers 0.0.28.post3
memory_efficient_attention.ckF:                    unavailable
memory_efficient_attention.ckB:                    unavailable
memory_efficient_attention.ck_decoderF:            unavailable
memory_efficient_attention.ck_splitKF:             unavailable
memory_efficient_attention.cutlassF-pt:            available
memory_efficient_attention.cutlassB-pt:            available
memory_efficient_attention.fa2F@v2.5.7-pt:         available
memory_efficient_attention.fa2B@v2.5.7-pt:         available
memory_efficient_attention.fa3F@0.0.0:             unavailable
memory_efficient_attention.fa3B@0.0.0:             unavailable
memory_efficient_attention.triton_splitKF:         available
indexing.scaled_index_addF:                        unavailable
indexing.scaled_index_addB:                        unavailable
indexing.index_select:                             unavailable
sequence_parallel_fused.write_values:              available
sequence_parallel_fused.wait_values:         

# APRIL ONLY OFFER 🎁

First we check the compute capability of the GPU available in the environment and install the dependencies that are compatible with it, to prevent version conflicts.

In [None]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

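Next we load the base model with 4-bit quantization to reduce memory usage. The cell also lists several pre-quantized 4-bit checkpoints for reference (including Llama-3, which was trained on 15 trillion tokens); the model actually loaded is `meta-llama/Llama-3.2-3B-Instruct`.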


In [None]:
from unsloth import FastLanguageModel
import torch

# Loader settings; these three names are reused by later cells.
max_seq_length = 2048  # Choose any! Llama 3 is up to 8k
dtype = None  # NOTE(review): presumably lets Unsloth pick the dtype - confirm
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

# Reference list of pre-quantized 4-bit checkpoints. It is informational only:
# nothing below reads it.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit",
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
]

# Change model_name to fine-tune a different checkpoint.
base_model_kwargs = dict(
    model_name="meta-llama/Llama-3.2-3B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.10.7: Fast Llama patching. Transformers = 4.46.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [None]:
# Colab-only: prompt for a file upload and keep the result in `uploaded`.
# NOTE(review): presumably this is where order_dataset2.json (read by the
# data-prep cell) gets uploaded - confirm.
from google.colab import files
uploaded = files.upload()




---



Next, we integrate LoRA adapters into our model, which allows us to efficiently update just a fraction of the model's parameters, enhancing training speed and reducing computational load.

In [None]:
# Modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Adapter hyperparameters, gathered in one place so they are easy to tune.
lora_settings = dict(
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Rebinds `model` to the adapter-wrapped model used by the rest of the notebook.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

<a name="Data"></a>
### Data Prep
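Note that the code below does not load the Alpaca dataset ([yahma/alpaca-cleaned](https://huggingface.co/datasets/yahma/alpaca-cleaned), a filtered version of the 52K original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html)). Instead it loads a local JSON file, `order_dataset2.json`, whose records have an `input` field (the conversation history) and an `output` field (the expected response). You can replace this code section with your own data prep.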

Then, we define a system prompt that formats tasks into instructions, inputs, and responses, and apply it to a dataset to prepare our inputs and outputs for the model, with an EOS token to signal completion.


In [None]:
# Take the end-of-sequence marker from the loaded tokenizer, so the text appended
# to every training example is the token this model actually stops on. A
# hard-coded "<|endoftext|>" is not this tokenizer's EOS token and would be
# learned as ordinary text instead of as a stop signal.
EOS_TOKEN = tokenizer.eos_token
# System prompt shared by training and inference. It has exactly one `{}`
# placeholder (the cumulative conversation history); literal JSON braces are
# doubled ({{ }}) so that str.format leaves them as single braces.
base_prompt = """
You are operating a virtual coffee kiosk that receives speech-to-text (STT) inputs from customers placing coffee orders.
Your role is to understand and process these inputs, respond naturally in Korean, and generate a structured JSON file with the correct details for backend processing.

**Key Requirements**:
- **Menu Items**: The kiosk offers the following drinks:
- Hot Drinks: 허브티 (always served hot)
- Iced Only Drinks: 토마토주스, 키위주스, 망고스무디, 딸기스무디, 레몬에이드, 복숭아아이스티 (always served iced)
- Hot and Iced Coffee: 아메리카노, 라떼, 카푸치노, 카페모카, 바닐라라떼, 에스프레소, 카라멜마끼아또
- Specialty Drinks: 초콜릿라떼 (available in both hot and iced versions)
- **Default Values**:
    - Use default size "미디움" and temperature "핫" only if the customer does not specify these details.
- **Do Not Make Assumptions**:
    - If the customer specifies temperature or size, do not override it with defaults. For instance, if they say "아이스 라떼 두잔 주세요", the output should indicate "아이스" without changing it to "핫".
- **Current Conversation History** is a single-line cumulative log of all customer requests so far in this session. starting from 1
**Customer Input and Expected Output Format**:
- Each response should have:
  1. **Natural Language Confirmation**: Respond in Korean, starting with an action confirmation such as "[Drink] [quantity] 주문되었습니다." and
  follow with a full summary of all items ordered so far in the current conversation history up to the last entry, beginning with "지금까지 주문하신 내용은 다음과 같습니다:".
  2. **Structured JSON Output**: Each JSON output should only contain the items directly requested in the latest input, not a full history.

  - **JSON Output** should include only the latest customer input items (from the most recent entry in **Current Conversation History**), not the entire conversation history.
- In your natural language response:
  - Confirm the items in the latest order entry, followed by a summary of all items ordered so far.


**JSON Output Format**:
- The JSON should be structured as follows:
  ```json
  {{
      "action": "[action_type]",
      "order_items": [
          {{
              "drink": "[Drink Name]",
              "size": "[Size]",
              "temperature": "[Temperature]",
              "quantity": [Quantity],
              "add_ons": [List of add-ons if any],
              "extra_shots": [Number of extra shots if any]
          }}
      ]
  }}
  ```
  - **Example JSON Output**:
    ```json
    {{
        "action": "create_order",
        "order_items": [
            {{
                "drink": "아메리카노",
                "size": "미디움",
                "temperature": "핫",
                "quantity": 1,
                "add_ons": [],
                "extra_shots": 0
            }}
        ]
    }}
    ```

**Available Actions for JSON Output**:
- **create_order**: For new drink orders.
- **add_item**: For adding a new item to the current order.
- **modify_order**: For changing an existing item (e.g., modifying size or temperature).
- **cancel_order**: To remove an order item or reset the order.
- **recommend_closest_item**: If a requested item is unavailable, recommend the closest item.
- **show_order_summary**: Display a summary of all items ordered so far.
- **complete_order**: Finalize the order after confirmation.

**Specific Scenarios and Expected Outputs**:
- **Creating a New Order**:
- **Current Conversation History**:
"Customer's 1 Input: 아메리카노 4잔 주세요."
**Response**:
- **Natural Language Response**: "아메리카노 4잔 주문되었습니다. 지금까지 주문하신 내용은 다음과 같습니다:
-핫 아메리카노 미디움 4잔"
- **JSON Output**:
  ```json
  {{
    "action": "create_order",
    "order_items": [
      {{
        "drink": "아메리카노",
        "size": "미디움",
        "temperature": "핫",
        "quantity": 4,
        "add_ons": [],
        "extra_shots": 0
      }}
    ]
  }}
  ```
**Example**:
- **Current Conversation History**:
"Customer's 1 Input: 아메리카노 4잔 주세요. Customer's 2 Input: 카페라떼 라지로 2잔 주세요"
  **Response**:
  - **Natural Language Response**: "카페라떼 라지로 2잔 주문되었습니다. 지금까지 주문하신 내용은 다음과 같습니다:
  - 핫 아메리카노 미디움 4잔,
  - 핫 카페라떼 라지 2잔."
  - **JSON Output**:
    ```json
    {{
      "action": "create_order",
      "order_items": [
        {{
          "drink": "카페라떼",
          "size": "라지",
          "temperature": "핫",
          "quantity": 2,
          "add_ons": [],
          "extra_shots": 0
        }}
      ]
    }}
    ```
- **Requesting Order Summary**:
  - **Current Conversation History**:
  "Customer's 1 Input: 내가 지금까지 뭘 주문했지?"
  **Response**:
  - **Natural Language Response**: "지금까지 주문하신 내용은 다음과 같습니다:
  -핫 아메리카노 미디움 4잔 1샷 추가
  -아이스 카페라떼 라지 2잔 휘핑크림 추가"
  - **JSON Output**: None (as it is just a summary request without any new action).

- **Modifying an Existing Order**:
  - **Current Conversation History**:
  "Customer's 1 Input: 주문한거 아이스 라떼로 바꿔줘."
  **Response**:
  - **Natural Language Response**: "주문이 아메리카노에서 아이스 라떼로 변경되었습니다. 지금까지 주문하신 내용은 다음과 같습니다:
  -아이스 라떼 미디움 1잔"
  - **JSON Output**:
    ```json
    {{
      "action": "modify_order",
      "old_drink": "아메리카노",
      "new_drink": "라떼",
      "size": "미디움",
      "temperature": "아이스",
      "quantity": 1,
      "add_ons": [],
      "extra_shots": 0
    }}
    ```

- **Short Names or Misspellings**:
  - Recognize common shorthand or misspellings. For example:
    - "아아" should be interpreted as "아이스 아메리카노".
    - "뜨아" should be interpreted as "핫 아메리카노".

- **Unavailable Items**:
  - If the customer requests an item not on the menu, respond politely and recommend a similar item if available.
  - **Example**:
  - **Current Conversation History**:
  "Customer's 1 Input: 초코라떼 주세요."
  **Response**:
    - **Natural Language Response**: "죄송합니다, 초코라떼는 메뉴에 없습니다. 대신 초콜릿라떼를 추천드립니다."
    - **JSON Output**:
      ```json
      {{
        "action": "recommend_closest_item",
        "requested_item": "초코라떼",
        "recommended_item": "초콜릿라떼"
      }}
      ```

- **Order Confirmation**:
  - **Customer Input**: "주문 완료할게요."
  - **Natural Language Response**: "주문이 완료되었습니다. 결제는 카드리더기를 사용해주세요. 감사합니다."
  - **JSON Output**: should include summary of items so far

**Response Rules**:
- Treat each new input as part of the same order until "주문 완료할게" is received, which finalizes the order.
- Always confirm the latest action first in the natural language response, followed by a full order summary.
- Ensure each JSON output reflects only the customer's latest input, not the entire conversation history.
**Current Conversation History**:
{}

**Response**:
"""


# Turn a batch of dataset rows into the training strings the trainer consumes
def formatting_prompts_func(examples):
    """Render each (input, output) pair of a batch into one training string.

    Args:
        examples: Batch mapping with parallel "input" and "output" sequences.

    Returns:
        A dict with a single "text" key holding one string per pair: the
        filled-in ``base_prompt``, followed by the expected output, followed
        by ``EOS_TOKEN``.
    """
    pairs = zip(examples["input"], examples["output"])
    return {
        "text": [
            base_prompt.format(history) + answer + EOS_TOKEN
            for history, answer in pairs
        ]
    }

# Read the local JSON training file, then add the formatted "text" column
from datasets import load_dataset

raw_dataset = load_dataset("json", data_files="order_dataset2.json", split="train")
dataset = raw_dataset.map(formatting_prompts_func, batched=True)

<a name="Train"></a>
### Train the model
- We do 60 steps to speed things up. For a full run, set `num_train_epochs=1` and remove the `max_steps` argument; while `max_steps` is a positive number it takes precedence over `num_train_epochs`.
- At this stage, we're configuring our model's training setup, where we define things like batch size and learning rate, to teach our model effectively with the data we have prepared.

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Supervised fine-tuning on the "text" column produced by the data-prep cell.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # A positive max_steps caps the run and overrides num_train_epochs, so
        # only one of the two is set here. For an epoch-based run, remove
        # max_steps and pass num_train_epochs instead.
        max_steps = 60, # increase this to make the model learn "better"
        learning_rate = 2e-4,
        # Mixed precision follows hardware support: bf16 if available, else fp16.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

In [None]:
#@title Show current memory stats
# Baseline taken before training; the final-stats cell subtracts
# start_gpu_memory and divides by max_memory.
bytes_per_gib = 1024 * 1024 * 1024

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run fine-tuning. The return value is kept because the following cell reads
# trainer_stats.metrics['train_runtime'] for its timing report.
trainer_stats = trainer.train()

In [None]:
#@title Show final memory and time stats
# Peak reserved memory now, compared with the baseline recorded before training.
bytes_per_gib = 1024 * 1024 * 1024
used_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']
report_lines = [
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
print("\n".join(report_lines))

<a name="Inference"></a>
### Inference
Let's run the model! You can change the conversation history passed to `base_prompt.format(...)`; the prompt ends with `**Response**:`, so the model generates the reply from there.

In [None]:
# Put the model in Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# base_prompt has a single placeholder - the cumulative conversation history -
# so exactly one argument is passed to format(). Any extra positional argument
# would be silently ignored.
conversation_history = "Customer's 1 Input: 핫 라떼 6잔 줘 Customer's 2 Input: 음료 추천해줘 ."
inputs = tokenizer(
    [base_prompt.format(conversation_history)],
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 200, use_cache = True)
# The decoded text contains the prompt followed by the generated response.
tokenizer.batch_decode(outputs)


['<|begin_of_text|>\nYou are operating a virtual coffee kiosk that receives speech-to-text (STT) inputs from customers placing coffee orders. Your role is to understand and process these inputs, respond naturally in Korean, and generate a structured JSON file with the correct details for backend processing.\n\n**Key Requirements**:\n- **Menu Items**: The kiosk offers the following drinks:\n- Hot Drinks: 허브티 (always served hot)\n- Iced Only Drinks: 토마토주스, 키위주스, 망고스무디, 딸기스무디, 레몬에이드, 복숭아아이스티 (always served iced)\n- Hot and Iced Coffee: 아메리카노, 라떼, 카푸치노, 카페모카, 바닐라라떼, 에스프레소, 카라멜마끼아또\n- Specialty Drinks: 초콜릿라떼 (available in both hot and iced versions)\n- **Default Values**:\n    - Use default size "미디움" and temperature "핫" only if the customer does not specify these details.\n- **Do Not Make Assumptions**:\n    - If the customer specifies temperature or size, do not override it with defaults. For instance, if they say "아이스 라떼 두잔 주세요", the output should indicate "아이스" without changing it to "핫

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
from transformers import TextStreamer

# Put the model in Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Use the kiosk prompt defined in the data-prep cell. This notebook never
# defines an `alpaca_prompt`, so referencing it raised NameError.
conversation_history = "Customer's 1 Input: 아아 두 잔 주세요."
inputs = tokenizer(
    [base_prompt.format(conversation_history)],
    return_tensors = "pt",
).to("cuda")

# Stream tokens to the cell output as they are generated. 200 new tokens matches
# the non-streaming cell, leaving room for the Korean reply plus the JSON block.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 200)

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
import os

model.save_pretrained("lora_model") # Local saving

# Online saving. The Hub token is read from the HF_TOKEN environment variable
# rather than written into the notebook; a missing variable fails here with a
# KeyError instead of an authentication error from a placeholder token.
model.push_to_hub("wolf010/lora_model", token = os.environ["HF_TOKEN"])

In [None]:
# Write the merged full model in 16-bit precision to a local directory
merged_output_dir = "second fine_tuned Llama-3.2-3B-Instruct"
model.save_pretrained_merged(
    merged_output_dir,
    tokenizer,
    save_method="merged_16bit",
)


In [None]:
import os

# Push merged full model to Hugging Face Hub (16-bit).
# SECURITY: never commit an access token in a notebook. The token is read from
# the HF_TOKEN environment variable instead. Any token that was previously
# embedded in this cell must be treated as leaked and revoked in the Hugging
# Face account settings.
model.push_to_hub_merged(
    "wolf010/2nd_fine_tuned_Llama-3.2-3B-Instruct",
    tokenizer,
    save_method="merged_16bit",
    token=os.environ["HF_TOKEN"],
)


Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [None]:
# Flip this guard to True to reload the saved LoRA adapters from "lora_model"
# instead of reusing the model that is already in memory.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model)

# base_prompt comes from the data-prep cell above - run that cell first. This
# notebook never defines an `alpaca_prompt`, so referencing it raised NameError.
conversation_history = "Customer's 1 Input: 아이스 아메리카노 2잔 주세요."
inputs = tokenizer(
    [base_prompt.format(conversation_history)],
    return_tensors = "pt",
).to("cuda")

# 200 new tokens matches the earlier inference cell, leaving room for the
# Korean reply plus the JSON block.
outputs = model.generate(**inputs, max_new_tokens = 200, use_cache = True)
tokenizer.batch_decode(outputs)

You can also use PEFT's `AutoPeftModelForCausalLM` from Hugging Face. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [None]:
# Fallback loader that uses plain PEFT/Transformers; disabled unless the guard
# below is changed to True.
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from transformers import AutoTokenizer
    from peft import AutoPeftModelForCausalLM

    adapter_dir = "lora_model"  # YOUR MODEL YOU USED FOR TRAINING
    model = AutoPeftModelForCausalLM.from_pretrained(
        adapter_dir,
        load_in_4bit=load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained(adapter_dir)

We're preparing to save our trained model in a more compact format and then upload it to a cloud platform, which allows us to use less storage and computational power.

In [None]:
# Export templates: every line is disabled by `if False:`. Change a guard to
# True to run that export. In each pair, the first line saves to the local
# directory "model" and the second pushes to the Hub repo "hf/model".
# NOTE(review): the push lines carry an empty token placeholder; supply a real
# token at run time (e.g. from an environment variable), never a literal.

# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

We're ready to compress our model using various quantization methods to make it leaner and then upload it to the cloud for easy sharing and access.

In [None]:
# GGUF export templates: every line is disabled by `if False:`. Change a guard
# to True to run that export. In each pair, the first line saves to the local
# directory "model" and the second pushes to the Hub repo "hf/model".
# NOTE(review): the push lines carry an empty token placeholder; supply a real
# token at run time (e.g. from an environment variable), never a literal.

# Save to 8bit Q8_0 (no quantization_method argument is passed on these two lines)
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in `llama.cpp` or a UI based system like `GPT4All`. You can install GPT4All by going [here](https://gpt4all.io/index.html).

And we're done! If you have any questions on Unsloth, join their [Discord](https://discord.gg/u54VK8m8tk) channel!