# VAZHI Gemma-2B Tamil Fine-tuning with Unsloth

**Unsloth = 4x faster training!**

This notebook uses Unsloth for optimized fine-tuning with proper handling of 4-bit quantization.

**Expected time:** ~30-60 minutes (vs 10+ hours without Unsloth)

In [None]:
%%capture
# Install Unsloth, then upgrade huggingface_hub without using pip's cache.
# The %%capture cell magic above hides this cell's output.
!pip install unsloth
!pip install --upgrade --no-cache-dir huggingface_hub

In [None]:
import torch

# Report the PyTorch build and, when a CUDA device is visible, its name
# and total memory in GB (bytes / 1e9).
cuda_ok = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {cuda_ok}")
if cuda_ok:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {gpu_props.total_memory / 1e9:.1f} GB")

## Step 1: Load Model with Unsloth

In [None]:
from unsloth import FastLanguageModel

# Sequence length shared by model loading and the trainer configuration.
max_seq_length = 512

# Loading options gathered in one place before the call.
load_options = {
    "model_name": "abhinand/gemma-2b-it-tamil-v0.1-alpha",
    "max_seq_length": max_seq_length,
    "dtype": None,         # None lets the loader pick the dtype
    "load_in_4bit": True,  # load the weights 4-bit quantized
}

model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

print("Model loaded with Unsloth!")

## Step 2: Add LoRA Adapters

In [None]:
# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters (rank 16, alpha 16, no dropout,
# no bias terms) and fix the random state for reproducibility.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=lora_target_modules,
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

# Show how many parameters are trainable after wrapping.
model.print_trainable_parameters()

## Step 3: Test Before Training

In [None]:
def test_model(prompt):
    """Generate a sampled reply to *prompt* with the notebook's model.

    The prompt is wrapped in the "### Instruction / ### Response" template,
    tokenized with the global ``tokenizer`` and passed to the global
    ``model``.

    Args:
        prompt: Instruction text to send to the model.

    Returns:
        The decoded text generated after the prompt, with surrounding
        whitespace stripped. Sampling is enabled (temperature 0.7), so the
        result can differ between calls.
    """
    encoded = tokenizer(
        f"### Instruction:\n{prompt}\n\n### Response:\n",
        return_tensors="pt",
    ).to(model.device)  # follow the model's device instead of assuming "cuda"

    outputs = model.generate(
        **encoded,
        max_new_tokens=100,
        temperature=0.7,
        do_sample=True,
    )

    # The returned sequence starts with the prompt tokens. Dropping them by
    # length keeps the whole reply even if the model itself emits another
    # "### Response:" marker, which a split on that marker would truncate.
    prompt_length = encoded["input_ids"].shape[1]
    reply_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(reply_tokens, skip_special_tokens=True).strip()

# Baseline: ask one question before fine-tuning and show at most the first
# 200 characters of the answer.
print("BEFORE TRAINING:")
print("Q: தமிழ்நாட்டின் தலைநகரம் எது?")
baseline_answer = test_model('தமிழ்நாட்டின் தலைநகரம் எது?')
print(f"A: {baseline_answer[:200]}")

## Step 4: Load VAZHI Dataset

In [None]:
from datasets import load_dataset

# Fetch the VAZHI dataset and report the size of the two splits used here.
dataset = load_dataset("CryptoYogi/vazhi-tamil-v05")
for split_label, split_name in (("Train", "train"), ("Val", "validation")):
    print(f"{split_label}: {len(dataset[split_name])} samples")

In [None]:
def format_prompt(example):
    """Build the training text for one dataset row.

    Args:
        example: Mapping for a single row. A non-empty ``text`` field is
            used as-is; otherwise ``instruction`` and ``output`` are
            combined into the "### Instruction / ### Response" template.

    Returns:
        The training string, or ``""`` when the row has neither a usable
        ``text`` nor both a non-empty ``instruction`` and ``output``.
    """
    text = example.get('text')
    if text:
        return text

    instruction = example.get('instruction')
    output = example.get('output')
    # A key can be present but hold None or "". Formatting that would yield
    # a sample like "### Instruction:\nNone ..." that is longer than ten
    # characters, so treat such rows as unusable.
    if not instruction or not output:
        return ""

    return f"### Instruction:\n{instruction}\n\n### Response:\n{output}"

# Add a "text" column to the train split via format_prompt, then drop rows
# whose text is ten characters or shorter.
train_data = (
    dataset['train']
    .map(lambda row: {"text": format_prompt(row)})
    .filter(lambda row: len(row['text']) > 10)
)

print(f"Formatted: {len(train_data)} samples")
first_text = train_data[0]['text']
print(f"Sample: {first_text[:200]}...")

## Step 5: Training with Unsloth

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Batch geometry and epoch count are named once so that the summary printed
# at the end of this cell cannot drift from the trainer's real settings.
per_device_batch_size = 4
grad_accum_steps = 4
num_epochs = 1

# Use bf16 when the GPU reports support for it, otherwise fall back to fp16.
use_bf16 = torch.cuda.is_bf16_supported()

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        output_dir="./vazhi-gemma-unsloth",

        # Fast settings
        per_device_train_batch_size=per_device_batch_size,
        gradient_accumulation_steps=grad_accum_steps,

        # Learning
        learning_rate=2e-4,
        num_train_epochs=num_epochs,
        warmup_steps=50,

        # Optimizer
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",

        # Precision (exactly one of the two flags is enabled)
        fp16=not use_bf16,
        bf16=use_bf16,

        # Logging
        logging_steps=25,
        save_steps=200,
        save_total_limit=2,

        # Speed
        seed=42,
        report_to="none",
    ),
)

# Samples per optimizer step, assuming a single device.
effective_batch_size = per_device_batch_size * grad_accum_steps

print("Training config ready!")
print(
    f"Effective batch size: {per_device_batch_size} x {grad_accum_steps}"
    f" = {effective_batch_size}"
)
print(f"Steps: ~{(len(train_data) // effective_batch_size) * num_epochs}")

In [None]:
print("Starting training with Unsloth optimization...")
print("Expected time: ~30-60 minutes")

# Run the fine-tuning loop and keep the returned stats for the report below.
trainer_stats = trainer.train()

print("\nTraining complete!")
# 'train_runtime' is divided by 60 to report minutes.
elapsed_minutes = trainer_stats.metrics['train_runtime'] / 60
print(f"Total time: {elapsed_minutes:.1f} minutes")

## Step 6: Test After Training

In [None]:
divider = "=" * 60
print(f"\n{divider}")
print("AFTER TRAINING:")
print(divider)

# Prompts reused later when testing the exported GGUF model.
test_prompts = [
    "தமிழ்நாட்டின் தலைநகரம் எது?",
    "திருக்குறளின் முதல் குறள் என்ன?",
    "PM-KISAN திட்டம் என்ன?",
    "இந்த SMS உண்மையா? 'நீங்கள் lottery வென்றீர்கள், ₹500 அனுப்புங்கள்'",
]

# Print each question followed by at most 300 characters of the answer.
for question in test_prompts:
    print(f"\nQ: {question}")
    answer = test_model(question)
    print(f"A: {answer[:300]}")

## Step 7: Save Model

In [None]:
# Write the adapter-wrapped model and the tokenizer into the same folder.
lora_dir = "./vazhi-lora"
for artifact in (model, tokenizer):
    artifact.save_pretrained(lora_dir)
print("LoRA saved!")

In [None]:
# Save a merged copy of the model, together with the tokenizer, using the
# "merged_16bit" save method.
merged_dir = "./vazhi-merged"
model.save_pretrained_merged(merged_dir, tokenizer, save_method="merged_16bit")
print("Merged model saved!")

## Step 8: Convert to GGUF

In [None]:
# Export straight to GGUF with the q4_k_m quantization method.
gguf_dir = "./vazhi-gguf"
model.save_pretrained_gguf(gguf_dir, tokenizer, quantization_method="q4_k_m")
print("GGUF Q4_K_M saved!")

In [None]:
import os

# Walk the export folder and list every file larger than 0.1 GB.
for folder, _subdirs, filenames in os.walk("./vazhi-gguf"):
    for filename in filenames:
        size_gb = os.path.getsize(os.path.join(folder, filename)) / 1e9
        if size_gb <= 0.1:
            continue  # skip small files
        print(f"{filename}: {size_gb:.2f} GB")

## Step 9: Test GGUF

In [None]:
# Quietly install llama-cpp-python, which the next cell imports as llama_cpp.
!pip install -q llama-cpp-python

In [None]:
import glob
import os

from llama_cpp import Llama


def _pick_gguf(paths, quant_tag="q4_k_m"):
    """Choose which exported GGUF file to test.

    Args:
        paths: Candidate ``.gguf`` file paths.
        quant_tag: Quantization name to look for in the file name
            (matched case-insensitively).

    Returns:
        The first path, in sorted order, whose file name contains
        *quant_tag*; if none matches, the first path in sorted order;
        ``None`` when *paths* is empty.
    """
    ordered = sorted(paths)
    for candidate in ordered:
        if quant_tag in os.path.basename(candidate).lower():
            return candidate
    return ordered[0] if ordered else None


# glob returns matches in arbitrary order, and the export folder may hold
# more than one .gguf file. Pick the Q4_K_M one explicitly so the test
# matches the banner printed below.
gguf_files = glob.glob("./vazhi-gguf/*.gguf")
gguf_path = _pick_gguf(gguf_files)

if gguf_path is not None:
    print(f"Testing: {gguf_path}")

    llm = Llama(
        model_path=gguf_path,
        n_ctx=512,
        n_threads=4,
        verbose=False
    )

    print("\n" + "="*60)
    print("GGUF Q4_K_M TEST:")
    print("="*60)

    # Same prompt template as the pre-export tests. Generation stops at the
    # next "###" marker or at the first blank line.
    for prompt in test_prompts:
        print(f"\nQ: {prompt}")
        response = llm(
            f"### Instruction:\n{prompt}\n\n### Response:\n",
            max_tokens=150,
            stop=["###", "\n\n"],
            echo=False
        )
        print(f"A: {response['choices'][0]['text'].strip()[:300]}")
else:
    print("No GGUF file found!")

## Step 10: Upload to HuggingFace (Optional)

In [None]:
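# Uncomment to upload.
# Replace "your_hf_token" with your own Hugging Face token before running,
# and do not save or share the notebook with a real token in it.
# model.push_to_hub_gguf(
#     "CryptoYogi/vazhi-gemma-v1",
#     tokenizer,
#     quantization_method="q4_k_m",
#     token="your_hf_token"
# )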

## Summary

**Unsloth Advantages:**
- 4x faster training
- Proper 4-bit handling (no corruption!)
- Direct GGUF export
- Training takes ~30-60 minutes instead of 10+ hours

**Output:**
- `vazhi-gguf/*.gguf` - Ready for mobile deployment!
- Size: ~1.6 GB