In [None]:
%%capture
import os

# Training stack installed with --no-deps, i.e. pip skips dependency resolution for
# these packages; xformers is the only one pinned to an exact version.
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
# Tokenizer, dataset, Hub and experiment-tracking packages, installed with normal
# dependency resolution.
!pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer wandb
# Unsloth itself, again installed without dependency resolution.
!pip install --no-deps unsloth

In [None]:
from google.colab import userdata
import os

# Copy each Colab secret into the process environment under the same name.
#   HF_TOKEN       - create a token at https://huggingface.co/settings/tokens
#   WANDB_ORG_NAME - create an organization at https://wandb.ai
#   WANDB_KEY      - create a key at https://wandb.ai
for secret_name in ("HF_TOKEN", "WANDB_ORG_NAME", "WANDB_KEY"):
    os.environ[secret_name] = userdata.get(secret_name)

In [None]:
from unsloth import FastModel
import torch
import wandb


# Load the Gemma-3 4B instruction-tuned checkpoint through Unsloth. The call returns
# two objects, bound here to `model` and `tokenizer`, which every later cell reuses.
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-4b-it",
    max_seq_length = 2048,  # sequence-length setting handed to the loader
    load_in_4bit = True,  # request 4-bit loading ...
    load_in_8bit = False,  # ... and not 8-bit
    full_finetuning = False,  # adapters are attached later via FastModel.get_peft_model
    token = os.environ["HF_TOKEN"]  # populated from Colab secrets in an earlier cell
)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.11: Fast Gemma3 patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


model.safetensors:   0%|          | 0.00/4.56G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

In [None]:
# Authenticate with Weights & Biases using the key placed in the environment by the
# secrets cell.
wandb.login(key=os.environ["WANDB_KEY"])

# Start a run in the "Gemma-3-FT-LoRA" project under the configured entity. The trainer
# cell sets report_to = "wandb", so training metrics are reported to W&B.
run = wandb.init(
    entity=os.environ["WANDB_ORG_NAME"],
    project="Gemma-3-FT-LoRA",
)

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpramodevice[0m ([33mpramodevice-university-of-moratuwa-department-of-archite[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Attach LoRA adapters to the loaded model. `model` is rebound to the PEFT-wrapped
# model, so re-running this cell without reloading would wrap an already wrapped model.
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False,  # vision layers are not fine-tuned (text-only Q&A data)
    finetune_language_layers   = True,
    finetune_attention_modules = True,
    finetune_mlp_modules       = True,

    r = 8,             # LoRA rank
    lora_alpha = 8,    # LoRA scaling factor, set equal to the rank here
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,  # same value as `seed` in the SFTConfig cell
)

Unsloth: Making `model.base_model.model.model.language_model` require gradients


In [None]:
from unsloth.chat_templates import get_chat_template
# Configure the tokenizer with the "gemma-3" chat template; the formatting cell below
# relies on it when calling tokenizer.apply_chat_template.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

In [None]:
from datasets import load_dataset
# Load the "train" split of the Choreo docs Q&A dataset from the Hugging Face Hub,
# authenticating with the HF_TOKEN set in the secrets cell.
dataset = load_dataset("lukepramo221/choreo_concepts_docs_qna_trainset_conversation_formatted_pqt_v1.0", split = "train", token=os.environ["HF_TOKEN"])


Dataset.parquet:   0%|          | 0.00/138k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/970 [00:00<?, ? examples/s]

In [None]:
from unsloth.chat_templates import standardize_data_formats
# Run Unsloth's format standardization over the dataset, then show one record as a
# sanity check (it should expose a "conversations" list of role/content messages).
dataset = standardize_data_formats(dataset)
dataset[100]

Unsloth: Standardizing formats (num_proc=2):   0%|          | 0/970 [00:00<?, ? examples/s]

{'conversations': [{'content': 'What is the nomenclature for a service hosted on the Choreo platform, and what mechanism allows for its identification?',
   'role': 'user'},
  {'content': 'Services made available via the Choreo platform are referred to as Choreo services, and their unique identity is established by an endpoint residing within a Choreo service component.',
   'role': 'assistant'}],
 'source': 'docs\\choreo-concepts\\choreo-marketplace.md/Add a Choreo service',
 'score': 5}

In [None]:
def formatting_prompts_func(examples):
    """Render every conversation of a batch into one chat-template string.

    Args:
        examples: Batch mapping with a "conversations" entry; each item of that
            entry is handed to ``tokenizer.apply_chat_template``.

    Returns:
        A dict with the single key "text" holding the rendered strings, in the
        same order as the input conversations.
    """
    rendered = []
    for conversation in examples["conversations"]:
        prompt = tokenizer.apply_chat_template(
            conversation,
            tokenize = False,
            add_generation_prompt = False,
        )
        # Strip a leading '<bos>' from the rendered text when present.
        rendered.append(prompt.removeprefix('<bos>'))
    return {"text": rendered}

# Add a "text" column by applying formatting_prompts_func to the dataset in batches,
# then display one rendered example to verify the chat markup.
dataset = dataset.map(formatting_prompts_func, batched = True)
dataset[100]["text"]


Map:   0%|          | 0/970 [00:00<?, ? examples/s]

'<start_of_turn>user\nWhat is the nomenclature for a service hosted on the Choreo platform, and what mechanism allows for its identification?<end_of_turn>\n<start_of_turn>model\nServices made available via the Choreo platform are referred to as Choreo services, and their unique identity is established by an endpoint residing within a Choreo service component.<end_of_turn>\n'

In [None]:
from trl import SFTTrainer, SFTConfig
# Build the supervised fine-tuning trainer over the formatted dataset.
# No evaluation set is supplied, so only training loss is produced.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None,
    args = SFTConfig(
        dataset_text_field = "text",  # column created by formatting_prompts_func
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,  # 2 x 4 = 8 examples per optimizer step on one GPU
        warmup_steps = 5,
        num_train_epochs = 20,
        # max_steps = 10,  # left disabled; training length is governed by num_train_epochs
        learning_rate = 2e-4,
        logging_steps = 1,  # log the loss at every step
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,  # same value as random_state in the get_peft_model cell
        report_to = "wandb",  # metrics go to the W&B run started earlier
        dataset_num_proc=2,
    ),
)

Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/970 [00:00<?, ? examples/s]

In [None]:
from unsloth.chat_templates import train_on_responses_only
# Re-wrap the trainer with Unsloth's train_on_responses_only helper. The two marker
# strings are the turn headers that appear in the formatted "text" column:
# "<start_of_turn>user\n" opens a user turn, "<start_of_turn>model\n" opens a model turn.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

Map (num_proc=2):   0%|          | 0/970 [00:00<?, ? examples/s]

In [None]:
# Inspect one tokenized training example after response-only masking.
sample = trainer.train_dataset[100]

# Full token sequence decoded back to text.
full_text = tokenizer.decode(sample["input_ids"])

# Label positions equal to -100 are swapped for the pad token and then rendered as
# spaces, so only the text that remains in the labels stays visible.
visible_label_ids = [tokenizer.pad_token_id if x == -100 else x for x in sample["labels"]]
labels_text = tokenizer.decode(visible_label_ids).replace(tokenizer.pad_token, " ")

# A cell only echoes its final expression, so the first decode used to be computed and
# silently dropped; print both strings explicitly instead.
print(full_text)
print(labels_text)


'                               Services made available via the Choreo platform are referred to as Choreo services, and their unique identity is established by an endpoint residing within a Choreo service component.<end_of_turn>\n'

In [None]:
# @title Show current memory stats
# Record the GPU's total memory and the peak memory reserved by PyTorch so far, both
# converted from bytes by dividing by 1024**3 and rounded to three decimals.
# `start_gpu_memory` and `max_memory` are read again by the final-stats cell.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
5.59 GB of memory reserved.


In [None]:
# Run fine-tuning. The returned object is kept because the final-stats cell reads
# trainer_stats.metrics['train_runtime'] from it.
trainer_stats = trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 970 | Num Epochs = 10 | Total steps = 1,220
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 14,901,248 of 4,000,000,000 (0.37% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,7.3163
2,5.8676
3,4.5393
4,4.5536
5,3.4501
6,4.0539
7,3.1936
8,2.9081
9,2.6123
10,2.5702


Unsloth: Will smartly offload gradients to save VRAM!


In [None]:
# @title Show final memory and time stats
# Compare the peak reserved GPU memory with the baseline stored in `start_gpu_memory`
# and report the training runtime taken from `trainer_stats`.
train_seconds = trainer_stats.metrics['train_runtime']
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"{train_seconds} seconds used for training.")
print(f"{round(train_seconds/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

4407.731 seconds used for training.
73.46 minutes used for training.
Peak reserved memory = 5.59 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 37.921 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [None]:
from unsloth.chat_templates import get_chat_template
# Re-apply the "gemma-3" chat template to the tokenizer (the same call is made earlier
# in the notebook, before the dataset is formatted).
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)
# Single-turn user message; the content is a list holding one text part.
messages = [{
    "role": "user",
    "content": [{
        "type" : "text",
        "text" : "How can I promote my app to production in Choreo?",
    }]
}]
# Render the conversation with a generation prompt appended, so the rendered prompt
# ends ready for the model's reply.
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
)
# Tokenize the rendered prompt, move the tensors to the GPU and generate up to 250 new
# tokens with the given sampling settings.
outputs = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 250,
    temperature = 1.0, top_p = 0.95, top_k = 64,
)
# Decode the generated ids back to text; the result is the cell's displayed output.
tokenizer.batch_decode(outputs)

You have set `compile_config`, but we are unable to meet the criteria for compilation. Compilation will be skipped.


["<bos><start_of_turn>user\nHow can I promote my app to production in Choreo?<end_of_turn>\n<start_of_turn>model\nTo promote your app to production, simply click the 'Deploy to Production' button found within the project's 'Manage → Deploy' area.<end_of_turn>"]

In [None]:
# Second smoke test: same prompt construction as the previous cell, with a different
# question.
messages = [{
    "role": "user",
    "content": [{"type" : "text", "text" : "How can I find a service in Choreo marketplace?",}]
}]
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
)

# Stream the reply as it is generated; skip_prompt = True keeps the prompt itself out
# of the streamed text. The return value of generate is discarded via `_`.
from transformers import TextStreamer
_ = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 250,
    temperature = 1.0, top_p = 0.95, top_k = 64,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

You can find a service in the Choreo marketplace by accessing https://console.choreo.dev/marketplace/, proactively searching for it using the search bar, or by refining your search via various filtering options provided.<end_of_turn>


In [None]:
# Manual publishing step for fine-tuning runs.
# Bump MODEL_VERSION for every new run: the local directory and the Hub repository name
# are both derived from it, so the version only has to be changed in one place and the
# two can no longer drift apart.
MODEL_VERSION = "v0.7"
LOCAL_OUTPUT_DIR = f"choreo-qna-finetuned-gemma-3-{MODEL_VERSION}"
HUB_REPO_ID = f"lukepramo221/choreo-qna-finetuned-gemma-3-qna-{MODEL_VERSION}"

# Save the model and tokenizer locally first ...
model.save_pretrained(LOCAL_OUTPUT_DIR)
tokenizer.save_pretrained(LOCAL_OUTPUT_DIR)

# ... then upload both to the Hugging Face Hub, authenticating with HF_TOKEN.
model.push_to_hub(HUB_REPO_ID, token=os.environ["HF_TOKEN"])
tokenizer.push_to_hub(HUB_REPO_ID, token=os.environ["HF_TOKEN"])


Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in unsloth/gemma-3-4b-it-unsloth-bnb-4bit.


README.md:   0%|          | 0.00/603 [00:00<?, ?B/s]


Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in unsloth/gemma-3-4b-it-unsloth-bnb-4bit.


  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/59.7M [00:00<?, ?B/s]

Saved model to https://huggingface.co/lukepramo221/choreo-qna-finetuned-gemma-3-qna-v0.6


  0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]