In [None]:
# Google Colab setup
!pip install transformers datasets peft accelerate bitsandbytes scikit-learn -q

# Check GPU availability
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

# Authenticate with Hugging Face to use Gemma
from huggingface_hub import login
login()  # You'll need a HF token with access to Gemma

In [None]:
import pandas as pd
import json
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Load the "test" split of HuggingFaceH4/MATH-500 and expose it as a DataFrame.
ds = load_dataset("HuggingFaceH4/MATH-500", split="test")
df = ds.to_pandas()

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical across kernel restarts.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Preview the training partition (last expression -> rich display).
df_train.head()

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_id = "google/gemma-3-1b-it"
# Derive the device from the runtime instead of assuming a GPU, so the
# float32-on-CPU branch below is actually reachable.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Only fall back to EOS-as-padding when the tokenizer has no pad token of its own.
# Aliasing pad to EOS unconditionally makes the two ids indistinguishable, so any
# logic that masks "id == pad_token_id" would also hide the genuine end-of-sequence
# token. NOTE(review): Gemma tokenizers are expected to ship a dedicated pad
# token -- confirm by inspecting tokenizer.pad_token after loading.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # right padding for training batches

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # Full precision on CPU, bfloat16 on an accelerator.
    dtype=torch.float32 if device == "cpu" else torch.bfloat16,
    device_map="auto"
)

# Turn off the KV cache flag on the model config for the training run below.
model.config.use_cache = False

print(f"dtype: {model.dtype}")

In [None]:
# Shared system turn used for every prompt in this cell.
SYSTEM_PROMPT = {
    "role": "system",
    "content": "You are a helpful math assistant that solves problems step by step."
}

# One user turn per problem, taken from the first five training rows.
USER_MESSAGES = [
    {
        "role": "user",
        "content": problem
    }
    for problem in df_train["problem"].iloc[:5]
]

# Partial assistant turn; the model is asked to continue it rather than start a new turn.
POST_MESSAGE = {
        "role": "assistant",
        "content": "Solution: "
    }


PROMPTS = [
    [SYSTEM_PROMPT, USER_MSG, POST_MESSAGE]
    for USER_MSG in USER_MESSAGES
]

print(json.dumps(PROMPTS[0], indent=4))

# Batched generation with a decoder-only model needs LEFT padding: with right
# padding the shorter prompts would be continued from pad tokens. Switch the
# padding side only for this call and restore it afterwards so the
# training-time setting is untouched.
_original_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
try:
    encoded = tokenizer.apply_chat_template(
        PROMPTS,
        continue_final_message=True,
        padding=True,
        return_dict=True,  # also return the attention mask, not just input ids
        return_tensors="pt"
    ).to(device)
finally:
    tokenizer.padding_side = _original_padding_side

# `tokenized` stays a plain input-id tensor, as before.
tokenized = encoded["input_ids"]

# Pass the attention mask explicitly so padded positions are ignored.
out = model.generate(
    tokenized,
    attention_mask=encoded["attention_mask"],
    max_new_tokens=14
)

decoded = tokenizer.batch_decode(out)
print(decoded)

# Keep whatever follows the last "\nSolution:" marker in each decoded sequence.
labels = [(d.split("\nSolution:")[-1]).strip() for d in decoded]
print(labels)

In [None]:
def generate_training_sample(conversation, target_response, max_length=512):
    """
    Build one supervised fine-tuning example with prompt and padding masked out.

    The chat template renders ``conversation`` (whose last message is continued,
    not closed), then ``target_response`` and the tokenizer's EOS token are
    appended. The result is tokenized, truncated and padded to ``max_length``.

    Labels are a copy of ``input_ids`` where
    - the first ``prompt_length`` positions are set to -100 (no loss on the prompt), and
    - every padded position (attention_mask == 0) is set to -100.

    Padding is identified through the attention mask rather than by comparing
    ids against ``pad_token_id``. This keeps the appended EOS token as a training
    target even when the tokenizer's pad token shares its id with EOS.

    Args:
        conversation: list of message dicts with role/content (the prompt)
        target_response: the expected model output (string)
        max_length: maximum sequence length; longer sequences are truncated

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``, each a
        tensor with a leading batch dimension of 1. If the prompt alone
        fills ``max_length``, every label is -100.
    """
    prompt_text = tokenizer.apply_chat_template(
        conversation,
        continue_final_message=True,
        tokenize=False,
        add_generation_prompt=False
    )

    full_text = prompt_text + target_response + tokenizer.eos_token

    full_encoding = tokenizer(
        full_text,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_length,
        add_special_tokens=False  # chat_template already added special tokens
    )

    # Tokenize just the prompt to find where labels should start.
    # NOTE(review): this assumes the prompt tokenizes to the same ids on its own
    # as it does as a prefix of full_text; a merge across the prompt/response
    # boundary could shift the split by a token -- confirm for this tokenizer.
    prompt_encoding = tokenizer(
        prompt_text,
        return_tensors="pt",
        add_special_tokens=False
    )
    prompt_length = prompt_encoding["input_ids"].shape[1]

    # Create labels: -100 for prompt tokens (ignore in loss), actual tokens for response
    labels = full_encoding["input_ids"].clone()
    labels[0, :prompt_length] = -100  # Mask the prompt
    # Mask padding by position, not by token id, so a real EOS is never masked.
    labels[full_encoding["attention_mask"] == 0] = -100

    return {
        "input_ids": full_encoding["input_ids"],
        "attention_mask": full_encoding["attention_mask"],
        "labels": labels
    }


In [None]:
# Build one prompt/response pair from the first training row and run it through
# generate_training_sample as a smoke test.
test_conversation = [
    dict(role="system", content="You are a helpful math assistant that solves problems step by step."),
    dict(role="user", content=df_train.iloc[0].problem),
    dict(role="assistant", content="Solution: "),
]
test_response = df_train.iloc[0].solution

sample = generate_training_sample(test_conversation, test_response)

# Shapes of the returned tensors, then how many label positions still carry a
# real target (anything not equal to the -100 ignore value).
print(f"Input IDs shape: {sample['input_ids'].shape}")
print(f"Labels shape: {sample['labels'].shape}")
print(f"Non-masked label tokens: {(sample['labels'] != -100).sum().item()}")

In [None]:
from torch.utils.data import Dataset, DataLoader

class MathDataset(Dataset):
    """Map-style dataset yielding one tokenized training example per dataframe row.

    Each row must expose ``problem`` and ``solution`` attributes. The row is
    wrapped in a system/user/assistant conversation and passed to
    ``generate_training_sample``; the leading batch dimension of the returned
    tensors is squeezed away.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        # The tokenizer is stored for reference only; encoding is delegated to
        # generate_training_sample, which is called without it.
        self.data, self.tokenizer, self.max_length = dataframe, tokenizer, max_length

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx`` (positional)."""
        record = self.data.iloc[idx]

        chat = [
            {"role": "system", "content": "You are a helpful math assistant that solves problems step by step."},
            {"role": "user", "content": record.problem},
            {"role": "assistant", "content": "Solution: "}
        ]

        encoded = generate_training_sample(chat, record.solution, self.max_length)

        # Drop the batch dimension of size 1 from every returned tensor.
        return {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask", "labels")
        }

# Wrap the training split; every example is padded/truncated to 512 tokens.
dataset = MathDataset(dataframe=df_train, tokenizer=tokenizer, max_length=512)
print(f"Dataset size: {len(dataset)}")

# Fetch the first example to confirm indexing works and inspect its shape.
test_item = dataset[0]
print(f"Sample input_ids shape: {test_item['input_ids'].shape}")

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings for causal-LM fine-tuning.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    # Adapters are attached to the four attention projection layers.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=8,               # rank of the update matrices (lower = fewer params, higher = more capacity)
    lora_alpha=16,     # scaling factor
    lora_dropout=0.1,  # dropout applied inside the LoRA layers
    bias="none",
)

# Wrap the base model and report how many parameters remain trainable.
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()


In [None]:
# Training configuration
from torch.optim import AdamW
from tqdm import tqdm

# Tunable training settings.
BATCH_SIZE = 8
LEARNING_RATE = 2e-4
NUM_EPOCHS = 3
# Effective batch size = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
GRADIENT_ACCUMULATION_STEPS = 1

# Shuffled batches; a trailing incomplete batch is dropped.
train_loader = DataLoader(
    dataset,
    shuffle=True,
    drop_last=True,
    batch_size=BATCH_SIZE,
)

# Optimise only the parameters that require gradients.
optimizer = AdamW(
    [param for param in peft_model.parameters() if param.requires_grad],
    lr=LEARNING_RATE,
    weight_decay=0.01,
)

print(f"Training samples: {len(dataset)}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Steps per epoch: {len(train_loader)}")
print(f"Total training steps: {len(train_loader) * NUM_EPOCHS}")


In [None]:
# Training loop
peft_model.train()

steps_per_epoch = len(train_loader)

for epoch in range(NUM_EPOCHS):
    total_loss = 0
    # Start every epoch from clean gradients; this also guards against stale
    # gradients left behind if this cell was interrupted and re-run.
    optimizer.zero_grad()
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{NUM_EPOCHS}")

    for step, batch in enumerate(progress_bar):
        # Move batch to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass; passing labels makes the model return the loss.
        outputs = peft_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        # Scale so that gradients summed over one accumulation window average out.
        loss = outputs.loss / GRADIENT_ACCUMULATION_STEPS
        # Track the unscaled loss for reporting.
        total_loss += loss.item() * GRADIENT_ACCUMULATION_STEPS

        # Backward pass
        loss.backward()

        # Update weights at the end of every accumulation window, and also on
        # the final batch of the epoch. Without the second condition a partial
        # window would never be applied and its gradients would leak into the
        # next epoch's first update.
        window_complete = (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0
        last_batch = (step + 1) == steps_per_epoch
        if window_complete or last_batch:
            optimizer.step()
            optimizer.zero_grad()

        progress_bar.set_postfix({"loss": f"{loss.item() * GRADIENT_ACCUMULATION_STEPS:.4f}"})

    # max(..., 1) avoids a ZeroDivisionError when the loader yields no batches.
    avg_loss = total_loss / max(steps_per_epoch, 1)
    print(f"Epoch {epoch + 1} - Average Loss: {avg_loss:.4f}")

print("Training complete!")


In [None]:
# Mount Google Drive and write the adapter plus tokenizer files into it.
from google.colab import drive

drive.mount('/content/drive')

OUTPUT_DIR = "/content/drive/MyDrive/gemma-math-lora"

# Both objects expose save_pretrained; they are written to the same folder.
for artifact in (peft_model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)

print(f"Model saved to {OUTPUT_DIR}")

In [None]:
# Test inference with the fine-tuned model
# Switch to evaluation mode so training-only behaviour (e.g. the LoRA dropout
# configured above) is turned off during generation.
peft_model.eval()

def generate_response(problem, max_new_tokens=256):
    """Generate the model's continuation for a given math problem.

    The problem is wrapped in the same system/user/assistant conversation used
    for training, with the assistant turn pre-filled with "Solution: " and left
    open so the model continues it. Decoding is greedy.

    Only the newly generated tokens are decoded. Slicing by prompt length
    (instead of splitting the decoded text on "Solution: ") keeps the whole
    answer even when the answer itself contains that phrase, and never returns
    the prompt when the marker is absent from the decoded text.

    Args:
        problem: problem statement placed in the user turn.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The generated text with special tokens removed and surrounding
        whitespace stripped.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful math assistant that solves problems step by step."},
        {"role": "user", "content": problem},
        {"role": "assistant", "content": "Solution: "}
    ]

    prompt = tokenizer.apply_chat_template(
        conversation,
        continue_final_message=True,
        tokenize=False
    )

    # The rendered template already contains the special tokens.
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(device)

    with torch.no_grad():
        outputs = peft_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # Deterministic for testing
            pad_token_id=tokenizer.pad_token_id
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs["input_ids"].shape[1]
    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# Test with a held-out sample from the MATH-500 dataset.
# Use df_test rather than df: row 0 of the full frame may have been part of
# df_train, in which case the model would be checked on an example it was tuned on.
test_row = df_test.iloc[0]
test_problem = test_row.problem
print(f"Problem: {test_problem}\n")
print(f"Expected answer: {test_row.answer}\n")
print(f"Model response:\n{generate_response(test_problem)}")
