# VAZHI v0.5 - Qwen2.5-0.5B Training

**Goal**: Train a Tamil AI assistant using Qwen2.5-0.5B (~250MB GGUF)

**Strategy**:
1. Use Qwen2.5-0.5B-Instruct as base (good multilingual tokenizer)
2. Fine-tune with 11,696 Tamil foundation items
3. Train on a mix of Q&A (instruction-following) and completion (fluency) examples
4. Quantize to Q4_K_M for mobile deployment

**Dataset**: CryptoYogi/vazhi-tamil-v05
- Train: 11,112 items
- Val: 584 items
- Format: Q&A + Completion mix

**Platform**: Google Colab with T4 GPU

## 1. Setup

In [None]:
# Install dependencies
# Training stack: PyTorch, Transformers, Datasets, PEFT (LoRA), Accelerate and
# bitsandbytes (used below for 4-bit loading and the paged 8-bit optimizer).
!pip install -q torch transformers datasets peft accelerate bitsandbytes
# TRL supplies SFTTrainer/SFTConfig; huggingface_hub handles login and uploads.
# NOTE(review): versions are unpinned although the training cell targets TRL 0.27+ — consider pinning.
!pip install -q trl huggingface_hub sentencepiece

In [None]:
from huggingface_hub import login
from google.colab import userdata

# Try to get token from Colab secrets, otherwise prompt.
# `except Exception` (instead of a bare `except:`) lets KeyboardInterrupt and
# SystemExit propagate, so interrupting the cell does not open the login prompt.
try:
    hf_token = userdata.get('HF_TOKEN')
    login(token=hf_token)
    print("Logged in via Colab secret")
except Exception as err:
    # Report why the secret-based login failed before falling back.
    print(f"Colab secret login failed ({type(err).__name__}: {err}); falling back to manual login")
    login()  # Manual token entry

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from trl import SFTTrainer
import gc

# Configuration: model/dataset identifiers and output locations used by later cells
BASE_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
DATASET = "CryptoYogi/vazhi-tamil-v05"
OUTPUT_DIR = "./vazhi-qwen-05b"
LORA_OUTPUT = "./vazhi-qwen-lora"

# Report the CUDA device name and total memory, or "None" when no GPU is visible
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"Memory: {total_bytes / 1e9:.1f} GB")
else:
    print("GPU: None")

## 2. Load Model

In [None]:
print("Loading Qwen2.5-0.5B-Instruct...")

# Pick the 4-bit compute dtype from what the GPU supports natively: bfloat16
# needs compute capability >= 8.0 (Ampere or newer); older GPUs such as the
# T4 (7.5) use float16.
supports_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
compute_dtype = torch.bfloat16 if supports_bf16 else torch.float16

# 4-bit quantization for training (NF4 weights with double quantization)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Only fall back to EOS when the tokenizer has no pad token of its own.
# Reusing EOS as padding lets collators that mask padding in the labels also
# mask the end-of-turn token, so the model may never learn to stop.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print(f"Model loaded. Parameters: {model.num_parameters():,}")
print(f"Vocab size: {tokenizer.vocab_size:,}")

## 3. Test Base Model (Before Training)

In [None]:
def test_model(model, tokenizer, question, max_tokens=150):
    """Generate a sampled answer to a Tamil question and return only the reply.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer for ``model``; must provide a chat template.
        question: User question to place in the chat prompt.
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        The generated reply as a stripped string, without the prompt text.
    """
    system = "நீங்கள் VAZHI, தமிழ் AI உதவியாளர். தமிழில் பதிலளியுங்கள்."

    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": question}
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The generated sequence starts with the prompt tokens. Slice them off by
    # length rather than splitting the decoded text on the word "assistant",
    # which truncates any reply that happens to contain that word.
    prompt_length = inputs["input_ids"].shape[1]
    reply_ids = outputs[0][prompt_length:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True).strip()

# Sample Tamil prompts; the same list is reused for the after-training comparison
test_questions = [
    "திருக்குறளின் முதல் குறள் என்ன?",
    "வணக்கம், நீங்கள் யார்?",
    "தமிழ்நாட்டின் தலைநகரம் எது?",
    "ஔவையாரின் ஆத்திசூடி பற்றி சொல்லுங்கள்",
]

# Header framed by two 60-character rules
separator = "=" * 60
print(separator, "BEFORE TRAINING - Base Qwen2.5-0.5B", separator, sep="\n")

# Print each question first, then generate and print the base model's answer
for question in test_questions:
    print(f"\nQ: {question}")
    answer = test_model(model, tokenizer, question)
    print(f"A: {answer}")

## 4. Load Training Data

In [None]:
# Load dataset from HuggingFace
print(f"Loading dataset: {DATASET}")

dataset = load_dataset(DATASET)

# Check for the split before indexing, so a missing "train" split reaches the
# JSONL fallback instead of raising KeyError.
if "train" in dataset:
    train_dataset = dataset["train"]
else:
    # No explicit train split: build the dataset from a local JSONL file
    import json

    from datasets import Dataset

    # Explicit UTF-8 so the Tamil text decodes the same under any locale;
    # blank lines are skipped because they are not valid JSON records.
    with open("train.jsonl", "r", encoding="utf-8") as f:
        train_data = [json.loads(line) for line in f if line.strip()]
    train_dataset = Dataset.from_list(train_data)

print(f"\nTraining examples: {len(train_dataset)}")
print(f"\nSample:\n{train_dataset[0]['text'][:500]}...")

## 5. Configure LoRA

In [None]:
# LoRA Configuration for Qwen2.5-0.5B
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,  # Rank
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# Prepare the quantized model for training, then wrap it with the LoRA adapters
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

# Count trainable and total parameters in a single pass
trainable_params = 0
all_params = 0
for param in model.parameters():
    size = param.numel()
    all_params += size
    if param.requires_grad:
        trainable_params += size
print(f"Trainable: {trainable_params:,} / {all_params:,} = {100 * trainable_params / all_params:.2f}%")

## 6. Train the Model

In [None]:
# For TRL 0.27+
from trl import SFTConfig, SFTTrainer

# Mixed precision must match the GPU. Native bf16 needs compute capability
# >= 8.0 (Ampere or newer); the Colab T4 is 7.5, where bf16 is either rejected
# or slowly emulated depending on library versions, so use fp16 there.
has_cuda = torch.cuda.is_available()
use_bf16 = has_cuda and torch.cuda.get_device_capability(0)[0] >= 8
use_fp16 = has_cuda and not use_bf16

sft_config = SFTConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # effective batch size 4 * 4 = 16 per device
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_steps=100,
    logging_steps=50,
    save_steps=200,
    save_total_limit=2,
    bf16=use_bf16,
    fp16=use_fp16,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    report_to="none",
    gradient_checkpointing=True,
)

def formatting_func(example):
    """Return the training string stored under the example's ``text`` key."""
    text = example["text"]
    return text

# Gather the trainer arguments first, then build the SFT trainer from them
trainer_kwargs = {
    "model": model,
    "args": sft_config,
    "train_dataset": train_dataset,
    "processing_class": tokenizer,
    "formatting_func": formatting_func,
}
trainer = SFTTrainer(**trainer_kwargs)

print("Starting training...")

In [None]:
# Train!
# Runs the fine-tuning loop configured on `trainer`; with the settings above,
# checkpoints are written to OUTPUT_DIR every 200 steps (only the last 2 kept).
trainer.train()

print("\nTraining complete!")

In [None]:
# Save the LoRA adapter and the tokenizer files side by side in LORA_OUTPUT
for artifact in (model, tokenizer):
    artifact.save_pretrained(LORA_OUTPUT)
print(f"LoRA saved to {LORA_OUTPUT}")

## 7. Test After Training

In [None]:
# Training leaves the model in train mode; switch to eval mode so LoRA dropout
# is disabled while generating the comparison answers.
model.eval()

print("=" * 60)
print("AFTER TRAINING - Fine-tuned Qwen2.5-0.5B")
print("=" * 60)

# Same questions as the before-training run, for a side-by-side comparison
for q in test_questions:
    print(f"\nQ: {q}")
    print(f"A: {test_model(model, tokenizer, q)}")

## 8. Merge and Save Full Model

In [None]:
# Clear memory first
del trainer
gc.collect()
torch.cuda.empty_cache()

# Reload base model in full precision for merging
print("Reloading base model for merging...")
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# Load LoRA adapter
model = PeftModel.from_pretrained(base_model, LORA_OUTPUT)

# Merge
print("Merging LoRA weights...")
merged_model = model.merge_and_unload()

# Save merged model
MERGED_OUTPUT = "./vazhi-qwen-merged"
merged_model.save_pretrained(MERGED_OUTPUT, safe_serialization=True)
tokenizer.save_pretrained(MERGED_OUTPUT)
print(f"Merged model saved to {MERGED_OUTPUT}")

!ls -lh {MERGED_OUTPUT}

## 9. Convert to GGUF

In [None]:
# Setup llama.cpp
!git clone https://github.com/ggerganov/llama.cpp.git
!cd llama.cpp && pip install -q -r requirements.txt

In [None]:
# Convert to GGUF F16
# Exports the merged Hugging Face checkpoint to a single f16 GGUF file; the
# quantize cells below use this file as their input.
print("Converting to GGUF F16...")
!python llama.cpp/convert_hf_to_gguf.py {MERGED_OUTPUT} --outfile vazhi-qwen-f16.gguf --outtype f16
!ls -lh vazhi-qwen-f16.gguf

In [None]:
# Build llama.cpp quantize tool
# Configures a CMake build tree in llama.cpp/build and compiles only the
# llama-quantize target with 4 parallel jobs; the quantize cells run it from build/bin/.
!cd llama.cpp && mkdir -p build && cd build && cmake .. && make -j4 llama-quantize

In [None]:
# Quantize to Q8_0
# Reads the F16 GGUF and writes the Q8_0 file (uploaded later as the backup variant).
print("Quantizing to Q8_0...")
!./llama.cpp/build/bin/llama-quantize vazhi-qwen-f16.gguf vazhi-qwen-q8_0.gguf q8_0
!ls -lh vazhi-qwen-q8_0.gguf

In [None]:
# Quantize to Q4_K_M (target for mobile)
# Quantizes from the F16 GGUF (not from the Q8_0 file), then lists every GGUF
# produced so far so the sizes can be compared.
print("Quantizing to Q4_K_M...")
!./llama.cpp/build/bin/llama-quantize vazhi-qwen-f16.gguf vazhi-qwen-q4_k_m.gguf q4_k_m

print("\nAll GGUF files:")
!ls -lh vazhi-qwen-*.gguf

## 10. Test GGUF Models

In [None]:
# Build llama-cli for testing
# Reuses the existing llama.cpp/build directory (created by the quantize build
# cell, which must have run first) and compiles only the llama-cli target.
!cd llama.cpp && cd build && make -j4 llama-cli

In [None]:
# Test Q4_K_M (primary target)
print("\n" + "="*60)
print("Testing: GGUF Q4_K_M")
print("="*60)

# Hand-written chat prompt using <|im_start|>/<|im_end|> turn markers; it ends
# with an open assistant turn so the model continues with its answer.
# NOTE(review): this system prompt is shorter than the one used in test_model
# (no "reply in Tamil" sentence), so outputs are not directly comparable.
test_prompt = """<|im_start|>system
நீங்கள் VAZHI, தமிழ் AI உதவியாளர்.<|im_end|>
<|im_start|>user
திருக்குறளின் முதல் குறள் என்ன?<|im_end|>
<|im_start|>assistant
"""

# Generate up to 150 tokens at temperature 0.7; -ngl 0 presumably keeps all
# layers on the CPU — confirm against the llama-cli help for the built version.
!./llama.cpp/build/bin/llama-cli -m vazhi-qwen-q4_k_m.gguf \
    -p "{test_prompt}" \
    -n 150 --temp 0.7 -ngl 0

In [None]:
# Test with different questions
print("\nTesting additional questions...")

# Extra Tamil prompts run against the Q4_K_M file only
questions = [
    "வணக்கம், நீங்கள் யார்?",
    "ஔவையாரின் ஆத்திசூடி என்ன?",
    "என்னை ஏமாற்ற முயற்சிக்கிறார்கள், என்ன செய்வது?",
]

for q in questions:
    # Same hand-written chat layout as the single-prompt test, with the question substituted in
    prompt = f"""<|im_start|>system
நீங்கள் VAZHI, தமிழ் AI உதவியாளர்.<|im_end|>
<|im_start|>user
{q}<|im_end|>
<|im_start|>assistant
"""
    print(f"\nQ: {q}")
    # stderr is discarded and only the last 20 lines of output are shown.
    # NOTE(review): the prompt is interpolated into a double-quoted shell argument,
    # so a question containing `"`, `$` or backticks would break the command.
    !./llama.cpp/build/bin/llama-cli -m vazhi-qwen-q4_k_m.gguf -p "{prompt}" -n 150 --temp 0.7 -ngl 0 2>/dev/null | tail -20

## 11. Upload to HuggingFace

In [None]:
from huggingface_hub import HfApi, create_repo

GGUF_REPO = "CryptoYogi/vazhi-qwen-gguf"
api = HfApi()

# Create the model repo (no error if it already exists)
create_repo(GGUF_REPO, repo_type="model", exist_ok=True)
print(f"Repository: {GGUF_REPO}")

# (status line, file) pairs; each file keeps its local name inside the repo
uploads = (
    ("\nUploading Q4_K_M...", "vazhi-qwen-q4_k_m.gguf"),  # primary for mobile
    ("Uploading Q8_0...", "vazhi-qwen-q8_0.gguf"),  # backup
)
for status_line, gguf_file in uploads:
    print(status_line)
    api.upload_file(
        path_or_fileobj=gguf_file,
        path_in_repo=gguf_file,
        repo_id=GGUF_REPO,
    )

print(f"\nDone! Models at: https://huggingface.co/{GGUF_REPO}")

## 12. Summary

In [None]:
print("\n" + "="*60)
print("VAZHI v0.5 TRAINING SUMMARY")
print("="*60)

print("\n Base Model: Qwen2.5-0.5B-Instruct")
print(f" Dataset: {DATASET}")
print(" Training: 11,112 items (3 epochs)")

print("\nGGUF File Sizes:")
!ls -lh vazhi-qwen-*.gguf

print("""
Expected sizes:
- F16:     ~1GB
- Q8_0:    ~500MB
- Q4_K_M:  ~250MB  <-- Target for mobile!

Next Steps:
1. Download Q4_K_M from HuggingFace
2. Test in llama.cpp or mobile app
3. Integrate with VAZHI app
""")