# VAZHI GGUF Diagnostic v2 (Using Unsloth)

**CRITICAL FIX**: The model must be loaded through Unsloth with `load_in_4bit=True`, exactly as it was during training.

**Problem Found**: Training used Unsloth 4-bit, but our first diagnostic used standard transformers float16.

**Test Question**: திருக்குறளின் முதல் குறள் என்ன?

**Expected**: அகர முதல எழுத்தெல்லாம் ஆதி பகவன்...

In [None]:
# Install Unsloth - SAME as training!
!pip install unsloth
# --no-deps: install these four packages without pulling in their own
# dependencies.
!pip install --no-deps trl peft accelerate bitsandbytes

In [None]:
# Log in to the Hugging Face Hub; later cells load BASE_MODEL and LORA_ADAPTER
# by their Hub repository ids.
from huggingface_hub import login
login()

In [None]:
from unsloth import FastLanguageModel
import torch
import gc

# EXACT same config as training
BASE_MODEL = "Qwen/Qwen2.5-3B-Instruct"  # base model repository id
LORA_ADAPTER = "CryptoYogi/vazhi-lora"  # LoRA adapter repository id
MAX_SEQ_LENGTH = 2048  # passed as max_seq_length when loading with Unsloth

# EXACT same system prompt as training
SYSTEM_PROMPT = """நீங்கள் VAZHI (வழி), தமிழ் மக்களுக்கான AI உதவியாளர். தமிழில் தெளிவாகவும் உதவியாகவும் பதிலளியுங்கள். You can respond in Tamil, Tanglish, or English based on how the user asks."""

# Words taken from the expected answer to the test question. The checkpoint
# tests count how many of them occur in the model's reply.
EXPECTED_KEYWORDS = ["அகர", "முதல", "எழுத்தெல்லாம்", "ஆதி", "பகவன்"]

def vazhi_chat(model, tokenizer, question, max_tokens=512):
    """Generate one VAZHI reply to ``question`` using the ChatML prompt layout.

    The prompt (system / user / assistant turns) and the sampling settings are
    unchanged from the training notebook; only the post-processing of the
    generated ids is done here.

    Args:
        model: Object exposing a Hugging Face style ``generate`` method.
        tokenizer: Tokenizer matching ``model``; called on the prompt and used
            to decode the generated ids.
        question: User message placed in the user turn.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The assistant reply as text, without special tokens and without
        leading/trailing whitespace.
    """
    prompt = (
        "<|im_start|>system\n"
        f"{SYSTEM_PROMPT}<|im_end|>\n"
        "<|im_start|>user\n"
        f"{question}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    # Follow the model's own device instead of assuming CUDA (this matches
    # test_hf_model); fall back to "cuda" for objects without ``.device``.
    device = getattr(model, "device", "cuda")
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # NOTE: sampling (do_sample=True, temperature=0.7) makes the reply
    # non-deterministic, so repeated runs may differ.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The returned sequence starts with the prompt ids. Slice them off so the
    # reply is taken from the generated ids directly, rather than by splitting
    # decoded text on "<|im_start|>assistant" (which also left other special
    # tokens in the reply).
    prompt_len = inputs["input_ids"].shape[1]
    reply_ids = outputs[0][prompt_len:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True).strip()

def test_checkpoint(
    model,
    tokenizer,
    name,
    question="திருக்குறளின் முதல் குறள் என்ன?",
    min_keywords=3,
):
    """Ask ``question`` through ``vazhi_chat`` and score the reply by keywords.

    Prints a banner with ``name``, the model's reply, and which entries of
    ``EXPECTED_KEYWORDS`` occur in it, followed by a PASS/FAIL line.

    Args:
        model: Model forwarded to ``vazhi_chat``.
        tokenizer: Tokenizer forwarded to ``vazhi_chat``.
        name: Label printed in the banner to identify the checkpoint.
        question: Prompt sent to the model; defaults to the notebook's test
            question.
        min_keywords: Minimum number of expected keywords that must appear in
            the reply for the checkpoint to pass.

    Returns:
        True when at least ``min_keywords`` expected keywords were found,
        otherwise False.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Testing: {name}")
    print(banner)

    answer = vazhi_chat(model, tokenizer, question)
    print(f"\nResponse:\n{answer}\n")

    # Plain substring match: a keyword counts if it occurs anywhere in the reply.
    found = [kw for kw in EXPECTED_KEYWORDS if kw in answer]
    print(f"Keywords: {len(found)}/{len(EXPECTED_KEYWORDS)} - {found}")

    passed = len(found) >= min_keywords
    print("✅ PASS" if passed else "❌ FAIL")
    return passed

## Checkpoint 1: Load with Unsloth (SAME as training)

In [None]:
# Load base model - EXACT same as training
print("Loading base model with Unsloth (load_in_4bit=True)...")
# NOTE(review): dtype=None presumably lets Unsloth choose the compute dtype -
# confirm against the Unsloth documentation.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,
    load_in_4bit=True,  # CRITICAL - same as training!
)
print(f"Model loaded. Parameters: {model.num_parameters():,}")

In [None]:
# Load LoRA adapter
from peft import PeftModel
print("Loading LoRA adapter from HuggingFace...")
# Wrap the 4-bit base model loaded in the previous cell with the adapter;
# `model` is rebound to the wrapped model.
model = PeftModel.from_pretrained(model, LORA_ADAPTER)
print("LoRA loaded!")

# Set to inference mode - CRITICAL!
FastLanguageModel.for_inference(model)
print("Inference mode enabled")

In [None]:
# Test Checkpoint 1
# cp1 holds the boolean pass/fail result returned by test_checkpoint.
cp1 = test_checkpoint(model, tokenizer, "Checkpoint 1: LoRA + Unsloth 4-bit (same as training)")

## Checkpoint 2: Merge LoRA weights

In [None]:
print("Merging LoRA into base model...")
# NOTE(review): the base model was loaded with load_in_4bit=True, so this merge
# happens on 4-bit weights and may itself change the output - presumably that
# is what this checkpoint is meant to detect; confirm.
model = model.merge_and_unload()
print("Merged!")

# Test Checkpoint 2
# cp2 holds the boolean pass/fail result returned by test_checkpoint.
cp2 = test_checkpoint(model, tokenizer, "Checkpoint 2: Merged (still 4-bit Unsloth)")

## Checkpoint 3: Save to float16 HuggingFace format

GGUF conversion needs float16 weights, and the 4-bit model from Checkpoints 1-2 cannot be converted directly.
So we reload the base model in float16, re-apply and merge the LoRA adapter, and save that merged model.

In [None]:
# Clear memory
# Drop the 4-bit model before the float16 reload. `tokenizer` is kept here and
# is replaced in the next cell.
# NOTE: `del model` raises NameError if this cell is run twice in a row.
del model
gc.collect()
torch.cuda.empty_cache()
print("Memory cleared")

In [None]:
# Reload in float16 for proper merge and save
print("Reloading base model in float16 (for proper GGUF conversion)...")
from transformers import AutoModelForCausalLM, AutoTokenizer

# Plain transformers load (no Unsloth, no 4-bit) of the same BASE_MODEL.
# NOTE(review): trust_remote_code=True allows code from the model repository to
# run - confirm it is actually required for this model.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)
# Replaces the tokenizer returned earlier by FastLanguageModel.from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
print("Base model loaded in float16")

In [None]:
# Load and merge LoRA in float16
from peft import PeftModel
print("Loading LoRA adapter...")
# Apply the same adapter to the float16 base model, then merge it so `model`
# is the merged model that is tested and saved in the following cells.
model = PeftModel.from_pretrained(base_model, LORA_ADAPTER, torch_dtype=torch.float16)
print("Merging...")
model = model.merge_and_unload()
print("Merged in float16!")

In [None]:
# Test Checkpoint 3 - merged float16 model
def test_hf_model(
    model,
    tokenizer,
    name,
    question="திருக்குறளின் முதல் குறள் என்ன?",
    min_keywords=3,
):
    """Run the keyword check against a plain Hugging Face (non-Unsloth) model.

    Builds the ChatML prompt from ``SYSTEM_PROMPT`` and ``question``, samples a
    reply, prints it, and reports which entries of ``EXPECTED_KEYWORDS`` occur
    in it.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        name: Label printed in the banner to identify the checkpoint.
        question: Prompt sent to the model; defaults to the notebook's test
            question.
        min_keywords: Minimum number of expected keywords that must appear in
            the reply for the checkpoint to pass.

    Returns:
        True when at least ``min_keywords`` expected keywords were found,
        otherwise False.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Testing: {name}")
    print(banner)

    prompt = (
        "<|im_start|>system\n"
        f"{SYSTEM_PROMPT}<|im_end|>\n"
        "<|im_start|>user\n"
        f"{question}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # NOTE: sampling (do_sample=True, temperature=0.7) makes the reply
    # non-deterministic, so repeated runs may differ.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt ids. Slice them off so the
    # reply is taken from the generated ids directly, rather than by splitting
    # decoded text on "<|im_start|>assistant" (which also left other special
    # tokens in the reply).
    prompt_len = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    print(f"\nResponse:\n{answer}\n")

    # Plain substring match: a keyword counts if it occurs anywhere in the reply.
    found = [kw for kw in EXPECTED_KEYWORDS if kw in answer]
    print(f"Keywords: {len(found)}/{len(EXPECTED_KEYWORDS)} - {found}")

    passed = len(found) >= min_keywords
    print("✅ PASS" if passed else "❌ FAIL")
    return passed

# cp3 holds the boolean pass/fail result for the merged float16 model.
cp3 = test_hf_model(model, tokenizer, "Checkpoint 3: Merged float16 HF model")

In [None]:
# Save merged model
# Directory that receives the merged float16 weights and tokenizer files; the
# GGUF conversion cell reads from this same path.
MERGED_OUTPUT = "./vazhi-merged-f16"
print(f"Saving to {MERGED_OUTPUT}...")
model.save_pretrained(MERGED_OUTPUT, safe_serialization=True)
tokenizer.save_pretrained(MERGED_OUTPUT)
print("Saved!")
!ls -lh {MERGED_OUTPUT}

## Setup llama.cpp

In [None]:
!git clone https://github.com/ggerganov/llama.cpp.git
!cd llama.cpp && mkdir -p build && cd build && cmake .. && make -j4
!pip install -q -r llama.cpp/requirements.txt

## Checkpoint 4: Convert to GGUF F16

In [None]:
# Clear GPU memory before conversion
# The conversion script reads the saved files in MERGED_OUTPUT, so the
# in-memory models are no longer needed.
# NOTE: these `del` statements raise NameError if the cell is run a second time.
del model
del base_model
gc.collect()
torch.cuda.empty_cache()

print("Converting to GGUF F16...")
# Writes vazhi-f16.gguf, which is the input for both quantization cells.
!python llama.cpp/convert_hf_to_gguf.py {MERGED_OUTPUT} --outfile vazhi-f16.gguf --outtype f16
!ls -lh vazhi-f16.gguf

In [None]:
# Test GGUF F16
# This checkpoint is not scored automatically: read the printed reply and
# compare it with the expected answer by hand.
print("\n" + "="*60)
print("Testing: Checkpoint 4: GGUF F16")
print("="*60)

# The prompt repeats SYSTEM_PROMPT and the test question by hand; keep it in
# sync with the Python cells if either changes.
# NOTE(review): the prompt contains literal "\n" sequences, so this relies on
# llama-cli converting them to newlines - confirm for the cloned build.
# NOTE(review): confirm that this llama-cli build accepts `--stop`; an
# argument error here would be easy to mistake for a model failure.
# NOTE(review): `-n 150` presumably caps generation at 150 tokens, while the
# Python checkpoints use max_new_tokens=512 - confirm the difference is intended.
!./llama.cpp/build/bin/llama-cli \
    -m vazhi-f16.gguf \
    -p "<|im_start|>system\nநீங்கள் VAZHI (வழி), தமிழ் மக்களுக்கான AI உதவியாளர். தமிழில் தெளிவாகவும் உதவியாகவும் பதிலளியுங்கள். You can respond in Tamil, Tanglish, or English based on how the user asks.<|im_end|>\n<|im_start|>user\nதிருக்குறளின் முதல் குறள் என்ன?<|im_end|>\n<|im_start|>assistant\n" \
    -n 150 \
    --temp 0.7 \
    -ngl 0 \
    --stop "<|im_end|>" \
    2>&1 | tail -30

## Checkpoint 5: Quantize to Q8_0

In [None]:
# Quantize the F16 GGUF file to Q8_0, writing vazhi-q8_0.gguf.
print("Quantizing to Q8_0...")
!./llama.cpp/build/bin/llama-quantize vazhi-f16.gguf vazhi-q8_0.gguf q8_0
!ls -lh vazhi-q8_0.gguf

In [None]:
# Test Q8_0
# This checkpoint is not scored automatically: read the printed reply and
# compare it with the expected answer by hand.
print("\n" + "="*60)
print("Testing: Checkpoint 5: GGUF Q8_0")
print("="*60)

# The prompt repeats SYSTEM_PROMPT and the test question by hand; keep it in
# sync with the Python cells if either changes.
# NOTE(review): the prompt contains literal "\n" sequences, so this relies on
# llama-cli converting them to newlines - confirm for the cloned build.
# NOTE(review): confirm that this llama-cli build accepts `--stop`; an
# argument error here would be easy to mistake for a model failure.
!./llama.cpp/build/bin/llama-cli \
    -m vazhi-q8_0.gguf \
    -p "<|im_start|>system\nநீங்கள் VAZHI (வழி), தமிழ் மக்களுக்கான AI உதவியாளர். தமிழில் தெளிவாகவும் உதவியாகவும் பதிலளியுங்கள். You can respond in Tamil, Tanglish, or English based on how the user asks.<|im_end|>\n<|im_start|>user\nதிருக்குறளின் முதல் குறள் என்ன?<|im_end|>\n<|im_start|>assistant\n" \
    -n 150 \
    --temp 0.7 \
    -ngl 0 \
    --stop "<|im_end|>" \
    2>&1 | tail -30

## Checkpoint 6: Quantize to Q4_K_M

In [None]:
# Quantize from the F16 GGUF file (not from the Q8_0 output), writing
# vazhi-q4_k_m.gguf, then list every GGUF file produced so far.
print("Quantizing to Q4_K_M...")
!./llama.cpp/build/bin/llama-quantize vazhi-f16.gguf vazhi-q4_k_m.gguf q4_k_m
!ls -lh vazhi-*.gguf

In [None]:
# Test Q4_K_M
# This checkpoint is not scored automatically: read the printed reply and
# compare it with the expected answer by hand.
print("\n" + "="*60)
print("Testing: Checkpoint 6: GGUF Q4_K_M")
print("="*60)

# The prompt repeats SYSTEM_PROMPT and the test question by hand; keep it in
# sync with the Python cells if either changes.
# NOTE(review): the prompt contains literal "\n" sequences, so this relies on
# llama-cli converting them to newlines - confirm for the cloned build.
# NOTE(review): confirm that this llama-cli build accepts `--stop`; an
# argument error here would be easy to mistake for a model failure.
!./llama.cpp/build/bin/llama-cli \
    -m vazhi-q4_k_m.gguf \
    -p "<|im_start|>system\nநீங்கள் VAZHI (வழி), தமிழ் மக்களுக்கான AI உதவியாளர். தமிழில் தெளிவாகவும் உதவியாகவும் பதிலளியுங்கள். You can respond in Tamil, Tanglish, or English based on how the user asks.<|im_end|>\n<|im_start|>user\nதிருக்குறளின் முதல் குறள் என்ன?<|im_end|>\n<|im_start|>assistant\n" \
    -n 150 \
    --temp 0.7 \
    -ngl 0 \
    --stop "<|im_end|>" \
    2>&1 | tail -30

## Summary

In [None]:
print("""
╔══════════════════════════════════════════════════════════════╗
║                    DIAGNOSTIC SUMMARY                        ║
╠══════════════════════════════════════════════════════════════╣
║                                                              ║
║  Review outputs above and fill in:                          ║
║                                                              ║
║  | Checkpoint | Stage                  | Result |            ║
║  |------------|------------------------|--------|            ║
║  | 1          | LoRA + Unsloth 4-bit   |   ?    |            ║
║  | 2          | Merged Unsloth 4-bit   |   ?    |            ║
║  | 3          | Merged HF float16      |   ?    |            ║
║  | 4          | GGUF F16               |   ?    |            ║
║  | 5          | GGUF Q8_0              |   ?    |            ║
║  | 6          | GGUF Q4_K_M            |   ?    |            ║
║                                                              ║
║  First FAIL checkpoint = where quality loss happens         ║
║                                                              ║
╚══════════════════════════════════════════════════════════════╝
""")

print("\nFile sizes:")
!ls -lh vazhi-*.gguf 2>/dev/null || echo "No GGUF files yet"

## Upload best working model

In [None]:
# Uncomment and run to upload the best working model
# from huggingface_hub import HfApi
# api = HfApi()
# 
# # Upload Q8_0 if it works
# print("Uploading Q8_0...")
# api.upload_file(
#     path_or_fileobj="vazhi-q8_0.gguf",
#     path_in_repo="vazhi-q8_0.gguf",
#     repo_id="CryptoYogi/vazhi-gguf",
#     repo_type="model",
# )
# print("Uploaded!")