# VAZHI Gemma-2B Tamil Fine-tuning (Optimized)

**No Unsloth required** - uses standard HF with speed optimizations.

**Key optimizations:**
- 8-bit AdamW optimizer
- 4-bit NF4 quantization with LoRA adapters (Flash Attention 2 is not used; attention runs in eager mode)
- Gradient checkpointing
- Mixed precision (bf16 on GPUs that support it, fp16 otherwise, e.g. on a T4)

**Expected time:** ~1-2 hours on Kaggle T4x2

In [None]:
# Install the fine-tuning dependencies. Each package gets its own pip command
# (without -q) so the install output of every package is shown.
!pip install bitsandbytes
!pip install peft
!pip install trl
!pip install accelerate

In [None]:
# Report the PyTorch build and list every visible CUDA device with its memory.
import torch

print(f"PyTorch: {torch.__version__}")
cuda_ok = torch.cuda.is_available()
print(f"CUDA: {cuda_ok}")
if cuda_ok:
    for idx in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(idx)
        print(f"GPU {idx}: {torch.cuda.get_device_name(idx)}")
        print(f"  Memory: {props.total_memory / 1e9:.1f} GB")

## Step 1: Load Model in 4-bit

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "abhinand/gemma-2b-it-tamil-v0.1-alpha"

# 4-bit NF4 quantization with double quantization and a bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load tokenizer. EOS is only used as the padding token when the tokenizer
# does not define one of its own: if padding and EOS share an id, collators
# that mask padding positions in the labels also mask the real EOS token, and
# the model never learns when to stop generating.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load model with 4-bit on single GPU (required for training).
# trust_remote_code is deliberately not set: the Gemma architecture ships with
# transformers, so no repository-provided code needs to be executed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0},  # Single GPU - required for 4-bit training
    attn_implementation="eager",  # Gemma doesn't support flash_attention_2 well
)

# Enable gradient checkpointing for memory efficiency
model.gradient_checkpointing_enable()

print("Model loaded in 4-bit!")
print(f"Model memory: {model.get_memory_footprint() / 1e9:.2f} GB")

## Step 2: Add LoRA Adapters

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Names of the projection modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Rank-16 adapters with alpha 32, light dropout, and no bias training.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Prepare the quantized model for k-bit training, then wrap it with LoRA.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)
model.print_trainable_parameters()

## Step 3: Test Before Training

In [None]:
def test_model(prompt, max_tokens=100):
    """Generate a sampled answer for *prompt* using the instruction template.

    Generation always runs in eval mode, and the model's previous mode is
    restored afterwards. Without this, calling the function after training
    would sample with LoRA dropout active and, because gradient checkpointing
    is enabled, without the KV cache.

    Args:
        prompt: Instruction text inserted into the
            ``### Instruction: / ### Response:`` template.
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        The generated continuation only (prompt tokens are excluded), decoded
        without special tokens and stripped of surrounding whitespace.
    """
    inputs = tokenizer(
        f"### Instruction:\n{prompt}\n\n### Response:\n",
        return_tensors="pt"
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )
    finally:
        # Put the model back in the mode the caller left it in.
        model.train(was_training)

    # Decode only the newly generated tokens instead of splitting the full
    # text on the response marker, which breaks if the model repeats it.
    generated = outputs[0][prompt_length:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()

# Baseline: sample one answer from the model before training starts.
baseline_question = "தமிழ்நாட்டின் தலைநகரம் எது?"
print("BEFORE TRAINING:")
print(f"Q: {baseline_question}")
print(f"A: {test_model(baseline_question)[:200]}")

## Step 4: Load VAZHI Dataset

In [None]:
from datasets import load_dataset

# Fetch the VAZHI dataset and report the size of its train/validation splits.
dataset = load_dataset("CryptoYogi/vazhi-tamil-v05")
for label, split_name in (("Train", "train"), ("Val", "validation")):
    print(f"{label}: {len(dataset[split_name])} samples")

In [None]:
def format_prompt(example):
    """Build the training text for one dataset row.

    A non-empty ``text`` value is used as-is. Otherwise the row must provide
    both a non-empty ``instruction`` and a non-empty ``output``, which are
    rendered into the ``### Instruction: / ### Response:`` template. Columns
    that exist but hold ``None`` or an empty string count as missing, so they
    are never rendered as the literal text "None".

    Args:
        example: Mapping holding one dataset row.

    Returns:
        ``{"text": <training text>}``; the text is an empty string when the
        row has no usable content.
    """
    text = example.get("text")
    if text:
        return {"text": text}

    instruction = example.get("instruction")
    output = example.get("output")
    if instruction and output:
        return {"text": f"### Instruction:\n{instruction}\n\n### Response:\n{output}"}

    return {"text": ""}

# Build the "text" column, then drop rows whose text is 10 characters or fewer.
train_data = (
    dataset['train']
    .map(format_prompt)
    .filter(lambda row: len(row['text']) > 10)
)

print(f"Formatted: {len(train_data)} samples")
first_text = train_data[0]['text']
print(f"Sample: {first_text[:200]}...")

## Step 5: Training

In [None]:
from dataclasses import fields

from trl import SFTTrainer, SFTConfig

# Choose mixed precision from the hardware rather than from the GPU's
# marketing name: bf16 needs compute capability >= 8.0 (Ampere or newer).
# Older GPUs such as the T4 use fp16; without CUDA neither mode is enabled.
cuda_available = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_available else ""
use_bf16 = cuda_available and torch.cuda.get_device_capability(0)[0] >= 8
use_fp16 = cuda_available and not use_bf16
print(f"GPU: {gpu_name}")
print(f"Using: {'bf16' if use_bf16 else 'fp16' if use_fp16 else 'fp32'}")

# Batch settings are named once so the summary printed below cannot drift
# from what the trainer actually uses.
per_device_batch_size = 2
grad_accum_steps = 8
effective_batch_size = per_device_batch_size * grad_accum_steps

# TRL renamed SFTConfig's `max_seq_length` to `max_length`; the install is not
# pinned, so use whichever name the installed version defines.
sft_config_fields = {field.name for field in fields(SFTConfig)}
seq_length_kwarg = "max_length" if "max_length" in sft_config_fields else "max_seq_length"

# SFTConfig combines training args
training_config = SFTConfig(
    output_dir="./vazhi-gemma-fast",

    # Dataset
    dataset_text_field="text",
    packing=False,

    # Batch settings - smaller batches, more accumulation
    per_device_train_batch_size=per_device_batch_size,
    gradient_accumulation_steps=grad_accum_steps,

    # Learning
    learning_rate=2e-4,
    num_train_epochs=1,
    warmup_ratio=0.05,

    # 8-bit optimizer - KEY for speed!
    optim="paged_adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",

    # Precision - auto-detect based on GPU
    fp16=use_fp16,
    bf16=use_bf16,

    # Logging
    logging_steps=25,
    save_steps=200,
    save_total_limit=2,

    # Speed optimizations
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},

    seed=42,
    report_to="none",

    # Maximum sequence length, passed under the version-appropriate name.
    **{seq_length_kwarg: 512},
)

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_data,
    args=training_config,
)

print("Training config ready!")
print(f"Effective batch size: {per_device_batch_size} * {grad_accum_steps} = {effective_batch_size}")
print(f"Steps: ~{len(train_data) // effective_batch_size}")

In [None]:
import time

# Run the training job, then report wall-clock duration and the loss value
# returned by the trainer.
print("Starting training...")
print("Expected time: ~1-2 hours")

start = time.time()
trainer_stats = trainer.train()
elapsed = time.time() - start

elapsed_minutes = elapsed / 60
print("\nTraining complete!")
print(f"Total time: {elapsed_minutes:.1f} minutes")
print(f"Final loss: {trainer_stats.training_loss:.4f}")

## Step 6: Test After Training

In [None]:
# Print a banner, then sample an answer for each evaluation prompt.
separator = "=" * 60
print("\n" + separator)
print("AFTER TRAINING:")
print(separator)

# Prompts reused by the later GGUF test as well.
test_prompts = [
    "தமிழ்நாட்டின் தலைநகரம் எது?",
    "திருக்குறளின் முதல் குறள் என்ன?",
    "PM-KISAN திட்டம் என்ன?",
    "இந்த SMS உண்மையா? 'நீங்கள் lottery வென்றீர்கள், ₹500 அனுப்புங்கள்'",
]

for question in test_prompts:
    print(f"\nQ: {question}")
    answer = test_model(question)
    print(f"A: {answer[:300]}")

## Step 7: Save Model

In [None]:
# Persist the trained LoRA adapter and the tokenizer in the same directory.
lora_dir = "./vazhi-lora"
model.save_pretrained(lora_dir)
tokenizer.save_pretrained(lora_dir)
print("LoRA adapter saved!")

In [None]:
from peft import AutoPeftModelForCausalLM

# Reload the saved adapter in float16 and fold the LoRA weights into the
# underlying model in one step.
merged_model = AutoPeftModelForCausalLM.from_pretrained(
    "./vazhi-lora",
    device_map="auto",
    torch_dtype=torch.float16,
).merge_and_unload()

# Write the merged weights and the tokenizer side by side.
merged_dir = "./vazhi-merged"
merged_model.save_pretrained(merged_dir)
tokenizer.save_pretrained(merged_dir)
print("Merged model saved!")

## Step 8: Convert to GGUF

In [None]:
# Install the llama-cpp-python bindings (used later to load the GGUF file) and
# fetch a shallow clone of llama.cpp, which provides the conversion script.
!pip install -q llama-cpp-python
!git clone --depth 1 https://github.com/ggerganov/llama.cpp /kaggle/working/llama.cpp

In [None]:
# Install llama.cpp's Python requirements, then convert the merged Hugging Face
# model in ./vazhi-merged to a single F16 GGUF file (quantized in a later step).
!cd /kaggle/working/llama.cpp && pip install -q -r requirements.txt
!python /kaggle/working/llama.cpp/convert_hf_to_gguf.py ./vazhi-merged --outfile ./vazhi-f16.gguf --outtype f16

In [None]:
# Build llama.cpp for quantization
!cd /kaggle/working/llama.cpp && make -j quantize

In [None]:
# Quantize the F16 GGUF to Q4_K_M with the llama-quantize binary, writing
# ./vazhi-q4km.gguf.
!/kaggle/working/llama.cpp/llama-quantize ./vazhi-f16.gguf ./vazhi-q4km.gguf Q4_K_M

In [None]:
import os

# Report the size in GB of each GGUF artifact that exists on disk; files that
# are missing are skipped without a message.
gguf_outputs = ("./vazhi-f16.gguf", "./vazhi-q4km.gguf")
for gguf_path in filter(os.path.exists, gguf_outputs):
    size_gb = os.path.getsize(gguf_path) / 1e9
    print(f"{gguf_path}: {size_gb:.2f} GB")

## Step 9: Test GGUF

In [None]:
from llama_cpp import Llama

# Load the quantized model with a 512-token context on 4 CPU threads.
llm = Llama(model_path="./vazhi-q4km.gguf", n_ctx=512, n_threads=4, verbose=False)

banner = "=" * 60
print("\n" + banner)
print("GGUF Q4_K_M TEST:")
print(banner)

# Ask every evaluation prompt; generation stops at the next "###" marker or at
# the first blank line.
for question in test_prompts:
    print(f"\nQ: {question}")
    completion = llm(
        f"### Instruction:\n{question}\n\n### Response:\n",
        max_tokens=150,
        stop=["###", "\n\n"],
        echo=False,
    )
    answer = completion['choices'][0]['text'].strip()
    print(f"A: {answer[:300]}")

## Step 10: Download GGUF

Download `vazhi-q4km.gguf` (~1.6 GB) for mobile deployment!

In [None]:
# Copy to output for easy download
import os
import shutil


def copy_for_download(src, dst):
    """Copy *src* to *dst* unless both already refer to the same file.

    When the notebook's working directory is the destination directory, the
    source and destination are the same file and a plain ``shutil.copy``
    raises ``shutil.SameFileError``; that case is treated as "nothing to do".

    Args:
        src: Path of the file to copy.
        dst: Destination path.

    Returns:
        True if a copy was made, False if *src* and *dst* are the same file.
    """
    try:
        shutil.copy(src, dst)
    except shutil.SameFileError:
        return False
    return True


gguf_src = "./vazhi-q4km.gguf"
gguf_dst = "/kaggle/working/vazhi-q4km.gguf"
if not os.path.exists(gguf_src):
    print(f"{gguf_src} not found - run the quantization step first.")
elif copy_for_download(gguf_src, gguf_dst):
    print("GGUF copied to /kaggle/working/ for download!")
else:
    print("GGUF is already in /kaggle/working/ - ready for download!")

## Summary

**Optimizations used:**
- 4-bit quantization with double quant
- 8-bit paged AdamW optimizer
- Gradient checkpointing
- Mixed precision (BF16 on GPUs that support it, FP16 otherwise)

**Output:**
- `vazhi-q4km.gguf` - Q4_K_M for mobile (~1.6 GB)