In [None]:
from huggingface_hub import login

HUGGINGFACE_TOKEN = ''

login(token=HUGGINGFACE_TOKEN)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
import os

from peft import PeftModel, PeftConfig, LoraConfig, TaskType, get_peft_model, get_peft_config
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset
from trl import SFTTrainer

base_model_id = "google/gemma-3-1b-it"
cache_dir = "./cache"

comet_ml version 3.43.1 is installed, but version 3.43.2 or higher is required. Please update comet_ml to the latest version to enable Comet logging with pip install 'comet-ml>=3.43.2'.


In [4]:
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, trust_remote_code=True, torch_dtype=torch.bfloat16, cache_dir=cache_dir)
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True, cache_dir=cache_dir)

base_model

Gemma3ForCausalLM(
  (model): Gemma3TextModel(
    (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 1152, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma3DecoderLayer(
        (self_attn): Gemma3Attention(
          (q_proj): Linear(in_features=1152, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1152, out_features=256, bias=False)
          (v_proj): Linear(in_features=1152, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1152, bias=False)
          (q_norm): Gemma3RMSNorm((256,), eps=1e-06)
          (k_norm): Gemma3RMSNorm((256,), eps=1e-06)
        )
        (mlp): Gemma3MLP(
          (gate_proj): Linear(in_features=1152, out_features=6912, bias=False)
          (up_proj): Linear(in_features=1152, out_features=6912, bias=False)
          (down_proj): Linear(in_features=6912, out_features=1152, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma3RMSNorm((11

In [5]:
if tokenizer.pad_token is None or tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print(f'Pad token: {tokenizer.pad_token}')
print(f'Pad token id: {tokenizer.pad_token_id}')

print(f'EOS token: {tokenizer.eos_token}')
print(f'EOS token id: {tokenizer.eos_token_id}')

Pad token: <pad>
Pad token id: 0
EOS token: <eos>
EOS token id: 1


In [6]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj"]
)

In [7]:
peft_model = get_peft_model(base_model, peft_config)
peft_model.print_trainable_parameters()
peft_model

trainable params: 1,038,336 || all params: 1,000,924,288 || trainable%: 0.1037


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma3ForCausalLM(
      (model): Gemma3TextModel(
        (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 1152, padding_idx=0)
        (layers): ModuleList(
          (0-25): 26 x Gemma3DecoderLayer(
            (self_attn): Gemma3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=1152, out_features=1024, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1152, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=1024, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
            

In [8]:
# # dataset = load_dataset("philschmid/dolly-15k-oai-style")
# dataset = load_dataset("MBZUAI/Bactrian-X", "en")

# for split in dataset:
#     dataset[split] = dataset[split].select(range(10))
#     print(f"{split}: {len(dataset[split])}")

# # dataset =

# dataset, dataset["train"][0]

In [9]:
import json
from datasets import Dataset

# Load JSONL into list of dicts
with open("dataset/chatbot/train.jsonl", "r", encoding="utf-8") as f:
    raw_data = [json.loads(line) for line in f]

# Create Hugging Face Dataset
dataset = Dataset.from_list(raw_data)

# Optional: Check a sample
print(dataset[0])

{'Unnamed: 0': 0, 'question': 'hi, how are you doing?', 'answer': "i'm fine. how about yourself?"}


In [10]:
INSTRUCTION_TEMPLATE_WITH_INPUT = """### Instruction:
Chat and fullfil request of user

### Input:
{input}

### Response:
"""

def tokenize_function(examples):
    results = {
        "input_ids": [],
        "labels": [],
        "attention_mask": [],
    }

    for i in range(len(examples['question'])):
        cur_input = examples['question'][i]
        cur_output = examples['answer'][i]

        cur_prompt = INSTRUCTION_TEMPLATE_WITH_INPUT.format(input=cur_input)

        cur_prompt_tokenized = tokenizer(cur_prompt, return_tensors="pt", add_special_tokens=True)
        cur_output_tokenized = tokenizer(cur_output, return_tensors="pt", add_special_tokens=False)

        cur_prompt_ids = cur_prompt_tokenized["input_ids"][0]
        cur_output_ids = cur_output_tokenized["input_ids"][0]
        cur_output_ids = torch.cat([cur_output_ids, torch.tensor([tokenizer.eos_token_id])], dim=0)

        input_ids = torch.cat([cur_prompt_ids, cur_output_ids], dim=0)
        label = torch.cat([torch.full_like(cur_prompt_ids, fill_value=-100), cur_output_ids], dim=0)

        results["input_ids"].append(input_ids)
        results["labels"].append(label)
        results['attention_mask'].append(torch.ones_like(input_ids))

    return results

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

Map: 100%|██████████| 3225/3225 [00:01<00:00, 2590.31 examples/s]


In [11]:
print(tokenized_dataset[0])
print(tokenizer.decode(tokenized_dataset[0]['input_ids'], skip_special_tokens=False))

{'input_ids': [2, 10354, 64017, 236787, 107, 20535, 532, 2587, 3267, 2864, 529, 2430, 108, 10354, 13065, 236787, 107, 2202, 236764, 1217, 659, 611, 3490, 236881, 108, 10354, 14503, 236787, 107, 236747, 236789, 236757, 5851, 236761, 1217, 1003, 5869, 236881, 1], 'labels': [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 236747, 236789, 236757, 5851, 236761, 1217, 1003, 5869, 236881, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
<bos>### Instruction:
Chat and fullfil request of user

### Input:
hi, how are you doing?

### Response:
i'm fine. how about yourself?<eos>


In [12]:
training_args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir="./logs",
    logging_steps=2,
    save_steps=2,
    max_steps=200,
    save_total_limit=2,
    report_to="none",
    push_to_hub=False,
)

In [13]:
from transformers import DataCollatorWithPadding
from typing import Any, Dict, List

class RightPaddingDataCollator(DataCollatorWithPadding):
    """The default data collator pads only inputs, not including the labels."""

    def __init__(self, tokenizer, max_length: int = 1024):
        super().__init__(tokenizer, max_length=max_length)

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        input_ids, labels, attention_mask = [], [], []
        max_batch_len = max(len(f["input_ids"]) for f in features)

        for sample in features:
            # Convert to torch tensors
            cur_input_ids = torch.tensor(sample["input_ids"], dtype=torch.long)
            cur_labels = torch.tensor(sample["labels"], dtype=torch.long)
            cur_attention_mask = torch.ones_like(cur_input_ids)

            # Next, we pad the inputs and labels to the maximum length within the batch
            pad_token_id = self.tokenizer.pad_token_id
            padding_length = max_batch_len - len(cur_input_ids)
            cur_input_ids = torch.cat([cur_input_ids, torch.full((padding_length,), fill_value=pad_token_id, dtype=torch.long)])
            cur_labels = torch.cat([cur_labels, torch.full((padding_length,), fill_value=-100, dtype=torch.long)])
            cur_attention_mask = torch.cat([cur_attention_mask, torch.zeros((padding_length,), dtype=torch.long)])

            # Truncate the inputs and labels to the maximum length
            cur_input_ids = cur_input_ids[:max_batch_len]
            cur_labels = cur_labels[:max_batch_len]
            cur_attention_mask = cur_attention_mask[:max_batch_len]

            # Append to the return lists
            input_ids.append(cur_input_ids)
            labels.append(cur_labels)
            attention_mask.append(cur_attention_mask)

        # Return formatted batch.
        return {
            "input_ids": torch.stack(input_ids),
            "labels": torch.stack(labels),
            "attention_mask": torch.stack(attention_mask)
        }


data_collator = RightPaddingDataCollator(tokenizer)

In [14]:
data_collator(tokenized_dataset)

{'input_ids': tensor([[    2, 10354, 64017,  ...,     0,     0,     0],
         [    2, 10354, 64017,  ...,     0,     0,     0],
         [    2, 10354, 64017,  ...,     0,     0,     0],
         ...,
         [    2, 10354, 64017,  ...,     0,     0,     0],
         [    2, 10354, 64017,  ...,     0,     0,     0],
         [    2, 10354, 64017,  ...,     0,     0,     0]]),
 'labels': tensor([[-100, -100, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, -100, -100],
         ...,
         [-100, -100, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, -100, -100]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [15]:
from trl import SFTTrainer
from transformers import Trainer

# trainer = Trainer(
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=None,
    data_collator=data_collator
)

trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


Step,Training Loss
2,2.6697
4,2.8592
6,2.4327
8,2.622
10,2.4361
12,1.8349
14,2.0883
16,2.749
18,2.9762
20,2.4956


TrainOutput(global_step=200, training_loss=1.5283658808469773, metrics={'train_runtime': 141.4456, 'train_samples_per_second': 11.312, 'train_steps_per_second': 1.414, 'total_flos': 329919402700800.0, 'train_loss': 1.5283658808469773, 'epoch': 0.49504950495049505})