# VAZHI v0.4 - High-Tamil LoRA Training

**Key improvements over v0.2:**
- 86.7% Tamil content (vs 2.6% in original data)
- Fixed formatting function (batch-compatible)
- Tamil-first bilingual format
- 512 max new tokens for inference
- Standardized data format
- Content-deduplicated (0 duplicates)

**Training Data:**
- Train: 2,732 samples
- Val: 301 samples
- Total: 3,033 unique samples

**Pack Distribution (balanced across all 6 domains):**
- vazhi_panpaadu (Culture): 625 samples
- vazhi_sattam (Legal): 590 samples
- vazhi_kaval (Security): 467 samples
- vazhi_maruthuvam (Healthcare): 462 samples
- vazhi_arasu (Government): 452 samples
- vazhi_kalvi (Education): 437 samples

**Data Location:** `data/v04/training/`

**Run on a GPU runtime (T4 or A100).**

In [None]:
# Install dependencies
# Unsloth is installed first with its full dependency set. The training
# libraries are then installed with --no-deps, so pip does not install their
# own dependencies (presumably to keep the versions Unsloth selected — confirm).
# NOTE(review): no versions are pinned. The SFTTrainer call later in this
# notebook passes `tokenizer=` and `max_seq_length=` directly; confirm that the
# trl release installed here still accepts those keyword arguments.
!pip install unsloth
!pip install --no-deps trl peft accelerate bitsandbytes
!pip install datasets

In [None]:
# Bring the train/validation JSON files into the Colab working directory
from google.colab import files

print("Upload vazhi_v04_train.json and vazhi_v04_val.json")
uploaded = files.upload()

# List every .json file now present, with its size in KB, as a sanity check
import os

json_names = [name for name in os.listdir('.') if name.endswith('.json')]
for name in json_names:
    size_kb = os.path.getsize(name) / 1024
    print(f"  - {name} ({size_kb:.1f} KB)")

In [None]:
# Load model with Unsloth
# NOTE(review): Unsloth's documentation asks for `unsloth` to be imported
# before other ML libraries so its patches apply — keep this import first; confirm.
from unsloth import FastLanguageModel
import torch

# Base checkpoint that the LoRA adapters are trained on top of.
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"
# Sequence-length cap; passed to the loader here and to SFTTrainer further down.
MAX_SEQ_LENGTH = 2048

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,  # presumably lets Unsloth choose the dtype for the GPU — confirm
    load_in_4bit=True,  # request 4-bit quantized weights
)

print(f"Model loaded! Parameters: {model.num_parameters():,}")

In [None]:
# Attach LoRA adapters to the attention and MLP projection layers
attention_layers = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_layers = ["gate_proj", "up_proj", "down_proj"]

# All adapter settings in one place; passed unchanged to get_peft_model.
lora_options = {
    "r": 16,
    "target_modules": attention_layers + mlp_layers,
    "lora_alpha": 16,
    "lora_dropout": 0,
    "bias": "none",
    "use_gradient_checkpointing": "unsloth",
    "random_state": 42,
}
model = FastLanguageModel.get_peft_model(model, **lora_options)

trainable_params = model.num_parameters(only_trainable=True)
print(f"LoRA configured! Trainable: {trainable_params:,}")

In [None]:
# System prompt placed at the start of every training conversation
SYSTEM_PROMPT = """நீங்கள் வழி (VAZHI), தமிழ்நாடு மக்களுக்கான AI உதவியாளர். தமிழில் தெளிவாகவும் உதவியாகவும் பதிலளியுங்கள். தொழில்நுட்ப சொற்களை தமிழில் முதலில் கூறி, பிறகு ஆங்கிலத்தில் அடைப்புக்குறிக்குள் குறிப்பிடுங்கள்."""


def _render_chat(instruction, output):
    """Render one system/user/assistant exchange with <|im_start|>/<|im_end|> markers."""
    return (
        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
        f"<|im_start|>user\n{instruction}<|im_end|>\n"
        f"<|im_start|>assistant\n{output}<|im_end|>"
    )


def format_prompt(examples):
    """Turn training records into chat-formatted training strings.

    Args:
        examples: Mapping with 'instruction' and 'output' entries. Each entry
            is either a single value (one record) or a list (a batch).

    Returns:
        A list of formatted strings: one per record for a batch (paired with
        zip, so the shorter column decides the length), or a one-element list
        for a single record.
    """
    instructions = examples['instruction']
    outputs = examples['output']

    # A non-list 'instruction' means a single, unbatched record.
    if not isinstance(instructions, list):
        return [_render_chat(instructions, outputs)]

    return [_render_chat(question, answer) for question, answer in zip(instructions, outputs)]


print("Formatting function configured!")

In [None]:
# Read both JSON splits from disk and wrap them as datasets.Dataset objects
import json
from datasets import Dataset


def _read_json(path):
    """Parse the UTF-8 JSON file at `path` and return the decoded object."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


train_data = _read_json("vazhi_v04_train.json")
val_data = _read_json("vazhi_v04_val.json")

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

print(f"Training: {len(train_dataset)} samples")
print(f"Validation: {len(val_dataset)} samples")

# Preview the first training record
first_row = train_dataset[0]
print(f"\nSample instruction: {first_row['instruction'][:80]}...")
print(f"Sample pack: {first_row['pack']}")

In [None]:
# Verify Tamil content in data
import re

def tamil_pct(text):
    """Return the percentage of counted characters in `text` that are Tamil.

    Tamil characters are code points U+0B80 through U+0BFF. The denominator
    counts those plus every regex word character, so whitespace and
    punctuation are left out of the ratio. Falsy input, and text with nothing
    to count, both yield 0.
    """
    if not text:
        return 0

    tamil_count = sum(1 for _ in re.finditer(r'[\u0B80-\u0BFF]', text))
    counted = sum(1 for _ in re.finditer(r'[\u0B80-\u0BFF\w]', text))
    if counted == 0:
        return 0
    return tamil_count / counted * 100

# Check first 100 samples
pcts = [tamil_pct(s['output']) for s in train_data[:100]]

# An empty training file would make sum()/len() divide by zero and min()/max()
# raise ValueError, so report that case explicitly instead of crashing.
if pcts:
    print(f"Average Tamil %: {sum(pcts)/len(pcts):.1f}%")
    print(f"Min Tamil %: {min(pcts):.1f}%")
    print(f"Max Tamil %: {max(pcts):.1f}%")
else:
    print("No training samples loaded - cannot compute Tamil %")

In [None]:
# Training configuration
from transformers import TrainingArguments
from trl import SFTTrainer

# Hyperparameters for the run; checkpoints are written under ./vazhi-v04-lora.
training_args = TrainingArguments(
    output_dir="./vazhi-v04-lora",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # Effective batch size = 8
    learning_rate=2e-4,
    warmup_steps=50,
    optim="adamw_8bit",
    weight_decay=0.01,
    # Mixed precision: bf16 when the GPU reports support for it, fp16 otherwise.
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    logging_steps=10,
    # Checkpoint and evaluate on the same 200-step cadence; keep at most 3 checkpoints.
    save_steps=200,
    save_total_limit=3,
    eval_strategy="steps",
    eval_steps=200,
    seed=42,
    report_to="none",  # no external experiment tracker
)

# format_prompt converts each record's instruction/output pair into the
# chat-formatted training text.
# NOTE(review): passing `tokenizer=` and `max_seq_length=` directly to
# SFTTrainer depends on the installed trl release — confirm against the
# version pulled in by the install cell.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    formatting_func=format_prompt,
    max_seq_length=MAX_SEQ_LENGTH,
    args=training_args,
)

# Derive the step estimate from training_args so this summary cannot drift
# from the configuration above (single-device run assumed: no world-size factor).
import math

effective_batch_size = (
    training_args.per_device_train_batch_size
    * training_args.gradient_accumulation_steps
)
# At least one optimizer step per epoch, even for a dataset smaller than one batch.
steps_per_epoch = max(1, len(train_dataset) // effective_batch_size)
# num_train_epochs may be fractional, so round the total up.
total_steps = math.ceil(steps_per_epoch * training_args.num_train_epochs)
print(f"Ready! Batch size: {effective_batch_size} | Steps/epoch: {steps_per_epoch} | Total steps: {total_steps}")

In [None]:
# Run the fine-tuning job and report its wall-clock duration
import time

banner = "=" * 50
print("\n".join([
    banner,
    "VAZHI v0.4 Training Starting!",
    f"Training {len(train_dataset)} high-Tamil samples",
    banner,
]))

start = time.time()
trainer.train()
elapsed = time.time() - start

# Leading empty entry reproduces the blank line before the closing banner.
elapsed_minutes = elapsed / 60
print("\n".join([
    "",
    banner,
    f"Training Complete! Time: {elapsed_minutes:.1f} minutes",
    banner,
]))

In [None]:
# Save the fine-tuned model and tokenizer, zip the folder, and download the archive
import shutil

# Imported here as well so this cell does not depend on the upload cell having
# been run in the current session.
from google.colab import files

FINAL_MODEL_DIR = "vazhi-v04-lora-final"

model.save_pretrained(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)

# make_archive returns the path of the archive it wrote; download exactly that
# file instead of rebuilding the name by hand.
archive_path = shutil.make_archive(FINAL_MODEL_DIR, 'zip', FINAL_MODEL_DIR)
files.download(archive_path)
print("Model saved and downloaded!")

In [None]:
# Test the model - improved parsing from v0.2
FastLanguageModel.for_inference(model)

def vazhi_chat(user_message, max_tokens=512):
    """Generate a VAZHI v0.4 reply to a single user message.

    Args:
        user_message: The user's question; inserted as the user turn of the
            same chat layout used for training.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The assistant's reply as plain text, with chat markers removed and
        surrounding whitespace stripped.
    """
    prompt = f"""<|im_start|>system
{SYSTEM_PROMPT}<|im_end|>
<|im_start|>user
{user_message}<|im_end|>
<|im_start|>assistant
"""

    # Place the inputs on the model's own device rather than assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() returns the prompt tokens followed by the continuation. Slice
    # the prompt off by length instead of string-splitting the decoded text,
    # which breaks if the marker text reappears inside the reply.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Belt and braces: drop chat markers if the tokenizer did not treat them
    # as special tokens.
    for marker in ("<|im_end|>", "<|im_start|>"):
        response = response.replace(marker, "")

    return response.strip()

print("Inference function ready!")

In [None]:
# Smoke-test prompts: (domain label, Tamil question) pairs covering every pack
tests = [
    ("Culture", "திருக்குறளின் முதல் குறள் என்ன?"),
    ("Culture", "சித்தர்கள் யார்?"),
    ("Security", "மோசடி செய்தியை எப்படி கண்டறிவது?"),
    ("Government", "முதலமைச்சர் காப்பீட்டு அட்டை எப்படி பெறுவது?"),
    ("Education", "நீட் தேர்வுக்கு எப்படி தயாராவது?"),
    ("Legal", "தகவல் அறியும் உரிமைச் சட்டம் என்றால் என்ன?"),
    ("Healthcare", "அரசு மருத்துவமனையில் இலவச சிகிச்சை கிடைக்குமா?"),
]

header_rule = "=" * 60
print(header_rule)
print("VAZHI v0.4 - High Tamil Test")
print(header_rule)

for pack_label, question in tests:
    print(f"\n[{pack_label}] Q: {question}")
    reply = vazhi_chat(question)
    # Tamil share is measured on the full reply, not the truncated preview
    reply_tamil_pct = tamil_pct(reply)
    # Show at most the first 300 characters, marking truncation with "..."
    if len(reply) > 300:
        answer_line = f"A: {reply[:300]}..."
    else:
        answer_line = f"A: {reply}"
    print(answer_line)
    print(f"Tamil %: {reply_tamil_pct:.1f}%")
    print("-" * 50)

In [None]:
# v0.4 Summary
# Static recap banner: every figure below is hard-coded text, not a value
# computed from this session's dataset or training run.
print("""
╔══════════════════════════════════════════════════════════════╗
║              VAZHI v0.4 Training Complete!                   ║
╠══════════════════════════════════════════════════════════════╣
║  Key improvements:                                           ║
║  ┌─────────────────────┬───────────┬───────────┐             ║
║  │ Feature             │ v0.2      │ v0.4      │             ║
║  ├─────────────────────┼───────────┼───────────┤             ║
║  │ Tamil content       │ ~2.6%     │ 86.7%     │             ║
║  │ Training samples    │ 2,860*    │ 3,033     │             ║
║  │ Bilingual format    │ Mixed     │ Tamil 1st │             ║
║  │ Tanglish            │ Heavy     │ Minimal   │             ║
║  │ Data format         │ Mixed     │ Standard  │             ║
║  │ Duplicates          │ 320       │ 0         │             ║
║  └─────────────────────┴───────────┴───────────┘             ║
║  * v0.2 reported 3,180 but had 320 duplicate IDs             ║
║                                                              ║
║  Pack Distribution (all standardized as vazhi_*):            ║
║  - vazhi_panpaadu (Culture): 625 samples                     ║
║  - vazhi_sattam (Legal): 590 samples                         ║
║  - vazhi_kaval (Security): 467 samples                       ║
║  - vazhi_maruthuvam (Healthcare): 462 samples                ║
║  - vazhi_arasu (Government): 452 samples                     ║
║  - vazhi_kalvi (Education): 437 samples                      ║
║                                                              ║
║  Files saved: vazhi-v04-lora-final.zip                       ║
╚══════════════════════════════════════════════════════════════╝
""")

In [None]:
# Optional: Push to HuggingFace Hub
# Uncomment to upload

# from huggingface_hub import login
# login()  # Enter your token
# 
# model.push_to_hub("cryptoyogillc/vazhi-tamil-v04")
# tokenizer.push_to_hub("cryptoyogillc/vazhi-tamil-v04")
# print("Uploaded to HuggingFace Hub!")