In [20]:
%%html
<audio src="https://oob.alwaysdata.net/silence.mp3" controls autoplay loop></audio>

In [2]:
#!git clone https://github.com/AnupJindal07/gemma-3-finetune-resume-data.git

Cloning into 'gemma-3-finetune-resume-data'...
remote: Enumerating objects: 90, done.[K
remote: Counting objects: 100% (90/90), done.[K
remote: Compressing objects: 100% (57/57), done.[K
remote: Total 90 (delta 38), reused 70 (delta 24), pack-reused 0 (from 0)[K
Receiving objects: 100% (90/90), 108.38 KiB | 2.17 MiB/s, done.
Resolving deltas: 100% (38/38), done.


In [3]:
!pip install -q torch transformers sentence-transformers datasets peft accelerate bitsandbytes bs4

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
%cd /content/gemma-3-finetune-resume-data
!pwd

/content/gemma-3-finetune-resume-data
/content/gemma-3-finetune-resume-data


In [6]:
#!mkdir -p data/raw_data data/processed_data models/checkpoints

In [5]:
from datetime import datetime

from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, BitsAndBytesConfig, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

In [6]:
print("CUDA available:", torch.cuda.is_available())

CUDA available: True


In [None]:
model_name = "google/gemma-3-4b-it"
model_checkpoint_path = "models/checkpoints"
fine_tuned_model_name = "fine_tuned_gemma-3-4b-it_resume"
train_version = datetime.now().strftime("%Y%m%d-%H%M%S")    
trained_model_path = f"models/{fine_tuned_model_name}_{train_version}"  

dataset_file_path = "data/datasets/dataset_01.csv"

process_file_path = "data/processed_data/resume_training_data_01.jsonl"

In [8]:
model_name = model_name

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Use the EOS token as padding
tokenizer.padding_side = "right"  # Ensure padding is on the right for causal LM

# Apply 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

In [None]:
print("Loading model...")

In [9]:
# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    dtype=torch.float16,
    quantization_config=quantization_config,
)

# Enable training mode
model.config.use_cache = False

# Prepare model for k-bit training BEFORE applying LoRA
model = prepare_model_for_kbit_training(model)

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

In [10]:
config = LoraConfig(
    task_type="CAUSAL_LM",          # Causal language modeling
    r=8,                                   # Rank of adaptation
    lora_alpha=16,                         # LoRA scaling parameter
    lora_dropout=0.05,                      # LoRA dropout
    target_modules=[                       # Target modules for Gemma-3
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    bias="none",                           # No bias training
    inference_mode=False                   # Training mode
)

model = get_peft_model(model, config)

# Print LoRA info to verify it's working
model.print_trainable_parameters()

trainable params: 32,788,480 || all params: 4,332,867,952 || trainable%: 0.7567


In [None]:
print(model)

In [11]:
dataset = load_dataset("json", data_files={"train": process_file_path})

Generating train split: 0 examples [00:00, ? examples/s]

In [12]:
def tokenize_function(examples):
    input_texts = examples["prompt"]
    output_texts = examples["response"]

    # Format conversations using Gemma-3 IT chat template
    conversations = []
    for prompt, response in zip(input_texts, output_texts):
        # Create conversation in Gemma-3 IT format
        conversation = [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": response}
        ]
        conversations.append(conversation)

    # Use the tokenizer's chat template to format the conversations
    formatted_texts = []
    for conversation in conversations:
        # Apply chat template
        formatted_text = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False
        )
        formatted_texts.append(formatted_text)

    # Tokenize the formatted conversations
    tokenized = tokenizer(
        formatted_texts,
        padding="max_length",
        truncation=True,
        max_length=2048,
        return_tensors=None
    )

    # Create labels by masking the user prompt tokens
    labels = []
    for i, conversation in enumerate(conversations):
        # Create the user part only to find where assistant response starts
        user_conversation = [{"role": "user", "content": conversation[0]["content"]}]
        user_text = tokenizer.apply_chat_template(
            user_conversation,
            tokenize=False,
            add_generation_prompt=True  # This adds the assistant prompt
        )

        # Tokenize user part to find the boundary
        user_tokens = tokenizer(user_text, add_special_tokens=False)["input_ids"]
        full_tokens = tokenized["input_ids"][i]

        # Create label sequence - start with all masked
        label_seq = [-100] * len(full_tokens)

        # Only learn from the assistant response part
        user_length = len(user_tokens)
        if user_length < len(full_tokens):
            # Copy the assistant response tokens to labels (unmask them)
            for j in range(user_length, len(full_tokens)):
                if full_tokens[j] != tokenizer.pad_token_id:  # Don't learn from padding
                    label_seq[j] = full_tokens[j]

        labels.append(label_seq)

    # Debug: print first example
    if len(labels) > 0:
        non_masked_count = sum(1 for x in labels[0] if x != -100)
        print(f"Debug: First example has {non_masked_count} non-masked tokens to learn from")

    tokenized["labels"] = labels
    return tokenized


In [None]:
# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Remove the original text columns that are causing issues
tokenized_datasets = tokenized_datasets.remove_columns(["prompt", "response"])

# Split into train and eval datasets (85-15 ratio)
train_eval_split = tokenized_datasets["train"].train_test_split(test_size=0.15, seed=42)
train_dataset = train_eval_split["train"]
eval_dataset = train_eval_split["test"]

print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Debug: First example has 49 non-masked tokens to learn from
Debug: First example has 47 non-masked tokens to learn from
Train dataset size: 1440
Eval dataset size: 360


In [14]:
batch_size = 4
gradient_accumulation_steps = 8
num_epochs = 5

In [15]:
# Don't use DataCollatorForLanguageModeling as it interferes with our custom labels
# Use a simple data collator that just handles padding
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="pt"
)

In [16]:
training_args = TrainingArguments(
    output_dir=model_checkpoint_path,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size*2,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=num_epochs,
    learning_rate=1e-4,  # Higher learning rate for LoRA, 2e-4
    weight_decay=0.01,
    #warmup_steps=20,
    warmup_ratio=0.05,
    logging_dir="logs",
    logging_steps=25,

    eval_steps=50,
    eval_strategy="steps",
    save_steps=50,
    save_total_limit=4,
    load_best_model_at_end=True,
    max_grad_norm=1.0,   # Gradient clipping
    #metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=False,                      # Use bfloat16 instead
    bf16=True,                       # Better for modern GPUs
    dataloader_pin_memory=False,     # Save memory, Reduce memory pressure
    gradient_checkpointing=True,     # Trade compute for memory
    remove_unused_columns=False,     # Keep all columns including labels
    report_to=None,                  # Disable logging to avoid conflicts
    run_name=f"gemma-3-4b-resume-qlora-{batch_size}bs-{num_epochs}ep",
    seed=42,
    #torch_compile=False,             # Disable torch compile for compatibility
    #dataloader_num_workers=0,        # Single-threaded data loading for stability
)

In [None]:
print("Model loaded and prepared for fine-tuning.")
print(f"Starting training Argument...{training_args}")

In [17]:
from transformers import EarlyStoppingCallback
# Configure a callback to stop training if the evaluation loss
# doesn't improve for 3 consecutive evaluations.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.01
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    #tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stopping]
)


In [19]:
print(f"Started training at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
trainer.train()

Started training at 2026-01-23 17:35:14


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33manup77jindal[0m ([33manup77jindal-na-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
20,1.1591,0.021149
40,0.0174,0.010057
60,0.0109,0.006455
80,0.0099,0.006342
100,0.0059,0.004727


TrainOutput(global_step=100, training_loss=0.24064412422478199, metrics={'train_runtime': 3279.4875, 'train_samples_per_second': 1.756, 'train_steps_per_second': 0.055, 'total_flos': 1.43795036356608e+17, 'train_loss': 0.24064412422478199, 'epoch': 2.2222222222222223})

In [21]:
model.save_pretrained(trained_model_path)
tokenizer.save_pretrained(trained_model_path)
print(f"Completed training at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

Completed training at 2026-01-23 18:30:19


In [59]:
#from huggingface_hub import notebook_login
#notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import os, getpass

if 'HF_TOKEN' in os.environ:
    del os.environ['HF_TOKEN']

os.environ["HF_TOKEN"] = getpass.getpass("Enter your NEW Hugging Face WRITE token: ")
print(os.environ["HF_TOKEN"])


In [50]:
finetuned_model = model
finetuned_model_tokenizer=tokenizer

In [None]:
repo_name = f"{fine_tuned_model_name}-v0.3"

# Push the model to the Hub
finetuned_model.push_to_hub(repo_name)

# Push the tokenizer to the Hub
finetuned_model_tokenizer.push_to_hub(repo_name)