In [None]:
!pip install transformers datasets accelerate peft trl bitsandbytes




In [None]:
from huggingface_hub import login
from google.colab import userdata

# Authenticate with the Hugging Face Hub using the HF_TOKEN value stored in
# Colab secrets. This must run before the model-loading cell.
login(token=userdata.get("HF_TOKEN"))


In [None]:
import os
import transformers
import torch
from datasets import load_dataset, Dataset, DatasetDict
from trl import SFTTrainer
from peft import LoraConfig, PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer

# Base checkpoint that is fine-tuned in this notebook.
model_id = "google/gemma-3-4b-pt"

# Load the base weights in 4-bit NF4 with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},  # place every module on GPU 0
    # Transformers warns at train time that Gemma 3 should be trained with the
    # eager attention implementation instead of sdpa, so request it up front.
    attn_implementation="eager",
)

In [None]:
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
import json

In [None]:
from datasets import Dataset, load_dataset
import json

In [None]:
from datasets import Dataset, load_dataset
import json

# 1. Load the raw chat-style training records.
with open("train_data.json", encoding="utf-8") as f:
    raw_data = json.load(f)

# 2. Flatten every record into prompt/response strings plus one training
#    string in Gemma's turn format.
# NOTE(review): assumes messages[1] is the user turn and messages[2] the
# assistant turn (messages[0] presumably a system message) — confirm against
# train_data.json.
formatted_data = []
for row in raw_data:
    messages = row["messages"]
    prompt = messages[1]["content"].strip()
    response = messages[2]["content"].strip()
    formatted_data.append({
        "prompt": prompt,
        "response": response,
        # Gemma delimits turns as "<start_of_turn>{role}\n...<end_of_turn>" and
        # names the assistant role "model"; each turn is opened and closed.
        "text": (
            f"<start_of_turn>user\n{prompt}<end_of_turn>\n"
            f"<start_of_turn>model\n{response}<end_of_turn>"
        ),
    })

dataset = Dataset.from_list(formatted_data)

# Validation helper for the formatted dataset
def validate_dataset(ds: "Dataset"):
    """Sanity-check the formatted training dataset.

    Verifies that the ``prompt``, ``response`` and ``text`` columns exist, that
    the dataset has at least one row, and that the first row has a prompt
    longer than 10 characters, a response longer than 5 characters, and both
    Gemma turn markers in ``text``. Only the first row is inspected.

    Args:
        ds: Object exposing ``column_names``, ``len()`` and integer indexing
            that returns a row dict (e.g. a ``datasets.Dataset``).

    Returns:
        True if every check passes, False otherwise. Failures are reported by
        printing a message; they are not raised.
    """
    def fail(reason):
        print(f"Validation failed: {reason}")
        return False

    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    required = {"prompt", "response", "text"}
    missing = required - set(ds.column_names)
    if missing:
        return fail(f"Missing columns: {missing}")

    # Guard against an empty dataset before indexing the first row.
    if len(ds) == 0:
        return fail("Dataset is empty")

    sample = ds[0]
    if len(sample["prompt"]) <= 10:
        return fail(f"Prompt too short: '{sample['prompt']}'")
    if len(sample["response"]) <= 5:
        return fail(f"Response too short: '{sample['response']}'")
    if "<start_of_turn>" not in sample["text"]:
        return fail("Missing start_of_turn token")
    if "<end_of_turn>" not in sample["text"]:
        return fail("Missing end_of_turn token")

    print("Dataset validation passed!")
    return True

# 3. Validate the in-memory Dataset built above (the function takes a Dataset
#    object, not a file path). The boolean result is shown as the cell output.
validate_dataset(dataset)


Dataset validation passed!


True

In [None]:
# Prepare the 4-bit quantized model for adapter training.
# NOTE(review): this cell rebinds `model` twice (here and at get_peft_model
# below), so re-running it without reloading the base model would operate on
# an already-wrapped model — presumably meant to run once per kernel session.
model = prepare_model_for_kbit_training(model)

# LoRA configuration: rank-8 adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down).
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
    lora_dropout=0.05,
    bias="none"
)

# Register the Gemma turn markers as special tokens on the tokenizer.
# NOTE(review): the model's embedding matrix is not resized after this call,
# so this presumably relies on both markers already being in the Gemma
# vocabulary — confirm.
tokenizer.add_special_tokens({
    'additional_special_tokens': ['<start_of_turn>', '<end_of_turn>']
})

# Wrap the prepared model with the LoRA adapters defined above.
model = get_peft_model(model, lora_config)

# Load the raw records and format them as Gemma conversations.
with open("train_data.json", encoding="utf-8") as f:
    raw_data = json.load(f)

# NOTE(review): assumes messages[1] is the user turn and messages[2] the
# assistant turn (messages[0] presumably a system message) — confirm against
# train_data.json.
formatted_data = []
for row in raw_data:
    messages = row["messages"]
    prompt = messages[1]["content"].strip()    # user message
    response = messages[2]["content"].strip()  # assistant message

    # Gemma's turn format names the assistant role "model" (not "assistant"),
    # so the response turn is opened with "<start_of_turn>model".
    formatted_text = (
        f"<start_of_turn>user\n{prompt}<end_of_turn>\n"
        f"<start_of_turn>model\n{response}<end_of_turn>"
    )

    formatted_data.append({
        "prompt": prompt,
        "response": response,
        "text": formatted_text,
    })

dataset = Dataset.from_list(formatted_data)

# Tokenization
def tokenize(sample):
    """Tokenize a batch of formatted conversations for causal-LM training.

    Args:
        sample: Batch dict passed by ``Dataset.map(..., batched=True)``; only
            its ``"text"`` column is read.

    Returns:
        The tokenizer's encoding for the batch, one unpadded sequence per
        example, truncated to at most 8192 tokens.
    """
    return tokenizer(
        sample["text"],
        # No padding here: padding inside map() would store pad tokens in every
        # example (padded to the longest text of the whole map batch). Batches
        # are padded later, when the trainer collates them.
        truncation=True,
        max_length=8192,  # training-time truncation cap
        # The formatted text only contains turn markers, not a BOS token, so
        # let the tokenizer prepend it.
        # NOTE(review): the missing BOS may be related to the very high initial
        # training loss recorded in this notebook — confirm after re-running.
        add_special_tokens=True,
    )

tokenized_dataset = dataset.map(tokenize, batched=True)


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [None]:
# `model` was already wrapped with the LoRA adapters by get_peft_model(), so no
# peft_config is passed here: giving the trainer both a PeftModel and a
# peft_config configures the adapters in two places.
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset,  # already tokenized (has input_ids)
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,  # 4 micro-batches per optimizer step
        warmup_steps=7,
        max_steps=90,
        learning_rate=2e-4,
        bf16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
    ),
)

Truncating train dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Run fine-tuning for the max_steps configured on the trainer; the returned
# TrainOutput is displayed as the cell result.
# NOTE(review): the recorded output shows an interactive Weights & Biases
# API-key prompt, which blocks unattended "Run All" — consider setting
# WANDB_API_KEY beforehand or report_to="none" in TrainingArguments.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33muday_t[0m ([33muday_t-iit-roorkee[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
1,18.3043
2,23.2491
3,25.6222
4,27.0042
5,26.335
6,29.3058
7,21.9047
8,19.3702
9,26.0585
10,11.3742


TrainOutput(global_step=90, training_loss=4.679828112696608, metrics={'train_runtime': 4709.4365, 'train_samples_per_second': 0.076, 'train_steps_per_second': 0.019, 'total_flos': 8052209359257600.0, 'train_loss': 4.679828112696608})

In [None]:
# Save the trained LoRA adapter (adapter config + adapter weights only).
fine_tuned_model = "fine_tuned_gemma3"
trainer.model.save_pretrained(fine_tuned_model)

# Reload the base checkpoint unquantized in bfloat16 so the adapter can be
# merged into full-precision weights. Nothing is pushed to the Hub here.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
)

# Attach the saved adapter to the base model (model_id) and fold the LoRA
# weights into it, yielding a plain standalone model.
fine_tuned_merged_model = PeftModel.from_pretrained(base_model, fine_tuned_model)
fine_tuned_merged_model = fine_tuned_merged_model.merge_and_unload()

# Save the merged model together with a freshly loaded base tokenizer.
# trust_remote_code is not needed: the same tokenizer is loaded without it
# at the top of the notebook.
tokenizer = AutoTokenizer.from_pretrained(model_id)
fine_tuned_merged_model.save_pretrained("fine_tuned_science_gemma3", safe_serialization=True)
tokenizer.save_pretrained("fine_tuned_science_gemma3")
# NOTE(review): this is set after save_pretrained(), so it only affects the
# in-memory tokenizer object, not the files written above — confirm intent.
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Archive the LoRA adapter directory (fine_tuned_gemma3)
!zip -r fine_tuned_gemma3.zip fine_tuned_gemma3

# Archive the merged full model directory
!zip -r fine_tuned_science_gemma3.zip fine_tuned_science_gemma3

# Archive the trainer's output_dir ("outputs": checkpoints and logs)
!zip -r outputs.zip outputs


  adding: fine_tuned_gemma3/ (stored 0%)
  adding: fine_tuned_gemma3/README.md (deflated 66%)
  adding: fine_tuned_gemma3/adapter_config.json (deflated 56%)
  adding: fine_tuned_gemma3/adapter_model.safetensors (deflated 12%)
  adding: fine_tuned_science_gemma3/ (stored 0%)
  adding: fine_tuned_science_gemma3/model-00001-of-00002.safetensors (deflated 21%)
  adding: fine_tuned_science_gemma3/added_tokens.json (stored 0%)
  adding: fine_tuned_science_gemma3/tokenizer_config.json (deflated 97%)
  adding: fine_tuned_science_gemma3/generation_config.json (deflated 34%)
  adding: fine_tuned_science_gemma3/special_tokens_map.json (deflated 77%)
  adding: fine_tuned_science_gemma3/tokenizer.model (deflated 52%)
  adding: fine_tuned_science_gemma3/tokenizer.json (deflated 83%)
  adding: fine_tuned_science_gemma3/model-00002-of-00002.safetensors (deflated 21%)
  adding: fine_tuned_science_gemma3/model.safetensors.index.json (deflated 97%)
  adding: fine_tuned_science_gemma3/config.json (deflate

In [None]:
# Smoke-test the merged model: load it from disk and sample one completion.
import os
# NOTE(review): earlier cells have already placed models on the GPU; this
# variable presumably needs to be set before CUDA is initialized to take
# effect — confirm, or move it to the top of the notebook.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # for precise error reporting

from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the merged model and its tokenizer from the directory saved above.
# `torch` is not imported in this cell; it relies on the earlier import.
model_dir = "fine_tuned_science_gemma3"
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    torch_dtype=torch.float32,
    device_map="auto",
    low_cpu_mem_usage=True
)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# NOTE(review): the prompt uses a plain "Answer:" suffix rather than the
# <start_of_turn> formatting used to build the training text — confirm this
# mismatch is intended.
prompt = "give market segmentation for startup idea: ai based leetcode platform\nAnswer:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Disabled sanity check on the EOS id, kept from debugging:
# assert tokenizer.eos_token_id < model.config.vocab_size, "eos_token_id out of bounds!"

# Sample up to 300 new tokens with temperature / nucleus sampling, stopping
# at the tokenizer's EOS token.
outputs = model.generate(
    **inputs,
    max_new_tokens=300,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    eos_token_id=tokenizer.eos_token_id
)

# outputs[0] holds the prompt followed by the generated continuation.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



give market segmentation for startup idea: ai based leetcode platform
Answer:
[
  {
    "Title/Description": "Coding Enthusiasts",
    "End Users": "Programmers, Developers, Software Engineers",
    "Urgency of Need": "High",
    "Example Customers": "Software companies, IT firms, freelancers",
    "Lead Customers": "Large tech companies",
    "Willingness to Change": "Moderate",
    "Concentration of Buyers": "Many small buyers",
    "Other Relevant Market Considerations": "High competition, need for up-to-date resources"
  },
  {
    "Title/Description": "Educators",
    "End Users": "Teachers, Professors, Educational Institutions",
    "Urgency of Need": "Medium",
    "Example Customers": "Universities, schools",
    "Lead Customers": "Educational technology companies",
    "Willingness to Change": "High",
    "Concentration of Buyers": "Many small buyers",
    "Other Relevant Market Considerations": "Need for interactive learning tools"
  },
  {
    "Title/Description": "Online Lea