# VAZHI Eval v4.0 — Generation & Evaluation

Standalone eval notebook for the SFT v4.0 merged model.

**Why separate?** SFT training completed and the model was uploaded to HF, but `<think>` token
suppression failed during eval — the `suppress_tokens` kwarg doesn't work in transformers 2.8.0.
This notebook fixes generation and re-evaluates without retraining.

**Model:** `CryptoYogi/vazhi-v4_0` (SFT on DAPT v1.1, already on HuggingFace)

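**Fix:** Use `LogitsProcessorList` with a custom `SuppressThinkTokens` logits processor instead of the
`suppress_tokens` kwarg (the built-in `SuppressTokensLogitsProcessor` hit a device-mismatch bug; see
Section 3). Also strip `<think>...</think>` as belt-and-suspenders fallback.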

**Platform:** Kaggle T4 or Colab T4 (single GPU, ~1.2GB for fp16 model)

In [None]:
# Install/upgrade the Hugging Face packages this notebook imports.
# -q keeps pip quiet, -U upgrades already-installed copies; the quoted
# specifiers bound transformers to >=4.45.0,<5.0.0 and accelerate to >=0.34.2.
!pip install -q -U \
  "transformers>=4.45.0,<5.0.0" \
  "accelerate>=0.34.2"

# Tell the user to restart the runtime before continuing
# (presumably so the freshly installed versions are the ones imported below).
print("\u2705 Dependencies installed")
print("\u26a0\ufe0f  RESTART THE SESSION NOW (Runtime \u2192 Restart session)")

## 1. Configuration

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import re
import torch
import numpy as np
from transformers import (
    AutoModelForCausalLM, AutoTokenizer,
    LogitsProcessorList,
)

# Hugging Face Hub repo IDs. Only SFT_MODEL is needed for the main eval;
# the other two are loaded by the optional side-by-side comparison.
SFT_MODEL = "CryptoYogi/vazhi-v4_0"             # Merged SFT model on HF
DAPT_MODEL = "CryptoYogi/qwen3-0.6b-tamil-v1_1"  # For comparison (optional)
VANILLA_MODEL = "Qwen/Qwen3-0.6B"                # For comparison (optional)

# Qwen3 instruct <think> token IDs (verified in training notebook)
# They are re-checked against the loaded tokenizer right after model loading.
THINK_TOKEN_IDS = [151667, 151668]

# Tamil system prompt, stored as \u escapes. Roughly: "You are VAZHI, an AI
# assistant for Tamil people. Answer clearly and helpfully in Tamil. If you
# do not know, say 'I don't know'."
SYSTEM_PROMPT = (
    "\u0ba8\u0bc0\u0b99\u0bcd\u0b95\u0bb3\u0bcd VAZHI (\u0bb5\u0bb4\u0bbf), \u0ba4\u0bae\u0bbf\u0bb4\u0bcd \u0bae\u0b95\u0bcd\u0b95\u0bb3\u0bc1\u0b95\u0bcd\u0b95\u0bbe\u0ba9 AI \u0b89\u0ba4\u0bb5\u0bbf\u0baf\u0bbe\u0bb3\u0bb0\u0bcd. "
    "\u0ba4\u0bae\u0bbf\u0bb4\u0bbf\u0bb2\u0bcd \u0ba4\u0bc6\u0bb3\u0bbf\u0bb5\u0bbe\u0b95\u0bb5\u0bc1\u0bae\u0bcd \u0b89\u0ba4\u0bb5\u0bbf\u0baf\u0bbe\u0b95\u0bb5\u0bc1\u0bae\u0bcd \u0baa\u0ba4\u0bbf\u0bb2\u0bb3\u0bbf\u0baf\u0bc1\u0b99\u0bcd\u0b95\u0bb3\u0bcd. "
    '\u0ba4\u0bc6\u0bb0\u0bbf\u0baf\u0bbe\u0bb5\u0bbf\u0b9f\u0bcd\u0b9f\u0bbe\u0bb2\u0bcd "\u0ba4\u0bc6\u0bb0\u0bbf\u0baf\u0bb5\u0bbf\u0bb2\u0bcd\u0bb2\u0bc8" \u0b8e\u0ba9\u0bcd\u0bb1\u0bc1 \u0b9a\u0bca\u0bb2\u0bcd\u0bb2\u0bc1\u0b99\u0bcd\u0b95\u0bb3\u0bcd.'
)

# Single device string reused by every later cell: "cuda" when a GPU is
# visible to torch, otherwise "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\u2705 Config ready")
print(f"   Device: {device}")
if device == "cuda":
    # Index 0 matches the device_map used when the model is loaded.
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
print(f"   Model: {SFT_MODEL}")

## 2. Load Model & Tokenizer

In [None]:
# Pull the tokenizer and the merged SFT weights from the Hub.
print(f"\U0001f4e5 Loading {SFT_MODEL}...")
tokenizer = AutoTokenizer.from_pretrained(SFT_MODEL, trust_remote_code=True)

# Place the whole model on GPU 0 when CUDA is available; with None the
# library's default placement is used.
placement = {"": 0} if device == "cuda" else None
model = AutoModelForCausalLM.from_pretrained(
    SFT_MODEL,
    torch_dtype=torch.float16,
    device_map=placement,
    trust_remote_code=True,
)
model.eval()
model.config.use_cache = True

print(f"\u2705 Model loaded: {model.num_parameters():,} params")
print(f"   Tokenizer: {len(tokenizer)} tokens")
print(f"   eos: {tokenizer.eos_token!r} (ID {tokenizer.eos_token_id})")

# Sanity check: each hard-coded ID must decode to one of the two think tags,
# otherwise suppression would silently target the wrong tokens.
for tid in THINK_TOKEN_IDS:
    decoded = tokenizer.decode([tid])
    print(f"   Token {tid}: {decoded!r}")
    assert decoded in ('<think>', '</think>'), f"Token {tid} decodes to {decoded!r}, not a think tag!"

## 3. Fix `<think>` Suppression

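**Root cause:** The `suppress_tokens` kwarg in `generate()` doesn't work in transformers 2.8.0.
The SFT training notebook used it, but all 12 eval responses still started with `<think>`.

**Fix:** Pass an explicit logits processor through `LogitsProcessorList` — this is the
explicit, version-safe way to suppress specific tokens during generation. The cell below defines a
small custom `SuppressThinkTokens` processor rather than using the built-in
`SuppressTokensLogitsProcessor`, which hit a CPU/CUDA device mismatch.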

**Belt & suspenders:** Also strip `<think>...</think>` from output text as fallback.

In [None]:
# Build a custom logits processor that suppresses <think> tokens
# SuppressTokensLogitsProcessor has a device mismatch bug in transformers 2.8.0
# (stores token IDs on CPU while logits are on CUDA), so we roll our own.

class SuppressThinkTokens:
    """Logits processor that bans a fixed set of token IDs.

    Instances are callable with the ``(input_ids, scores)`` signature that
    ``LogitsProcessorList`` expects. The banned columns of ``scores`` are
    set to ``-inf`` so those tokens can never be selected.

    Args:
        token_ids: Iterable of integer vocabulary IDs to suppress.
        device: Device on which to create the index tensor initially. If the
            scores later arrive on a different device, the index tensor is
            moved to match, so a mismatch does not raise.
    """

    def __init__(self, token_ids, device):
        self.suppress_ids = torch.tensor(token_ids, dtype=torch.long, device=device)

    def __call__(self, input_ids, scores):
        """Mask the suppressed IDs in ``scores`` (in place) and return it.

        ``input_ids`` is accepted for interface compatibility and is unused.
        """
        # Follow the scores tensor: indexing with an index tensor that lives
        # on another device is the failure mode this class exists to avoid.
        if self.suppress_ids.device != scores.device:
            self.suppress_ids = self.suppress_ids.to(scores.device)
        scores[:, self.suppress_ids] = float('-inf')
        return scores

# Shared processor instance used by the smoke test and the full eval.
think_suppressor = SuppressThinkTokens(THINK_TOKEN_IDS, device)

# Self-check on dummy data: start from all-zero logits spanning the tokenizer
# vocabulary and confirm the think-token columns come back as -inf.
test_logits = torch.zeros(1, len(tokenizer), device=device)
# Placeholder prompt ID (presumably <|im_start|>); the processor ignores input_ids.
test_input_ids = torch.tensor([[151644]], device=device)
processed = think_suppressor(test_input_ids, test_logits)
for tid in THINK_TOKEN_IDS:
    val = processed[0, tid].item()
    print(f"   Token {tid} logit after suppression: {val}")
    assert val == float("-inf"), f"Suppression failed for token {tid}!"

print(f"\u2705 Custom think suppressor verified (device: {device})")


# Probe once whether apply_chat_template() can be called with
# enable_thinking=False, and record the answer in USE_THINKING_FLAG so that
# build_chat_prompt() knows whether to use the tokenizer's template or the
# hand-written ChatML fallback.
#
# TypeError covers tokenizers whose API rejects the keyword; ValueError covers
# tokenizers that have no chat template configured at all. Both cases should
# fall back to the manual template instead of aborting the notebook.
# NOTE(review): a successful call only proves the keyword was accepted; a
# template that never reads enable_thinking may silently ignore it - confirm
# by inspecting the rendered prompt if in doubt.
try:
    tokenizer.apply_chat_template(
        [{"role": "user", "content": "test"}],
        tokenize=False, add_generation_prompt=True, enable_thinking=False,
    )
except (TypeError, ValueError):
    USE_THINKING_FLAG = False
    print("\u26a0\ufe0f enable_thinking not supported, using manual template")
else:
    USE_THINKING_FLAG = True
    print("\u2705 Tokenizer supports enable_thinking=False")


def build_chat_prompt(user_text):
    """Return the prompt string for one user turn, ending at the assistant header.

    The prompt always starts with SYSTEM_PROMPT. When USE_THINKING_FLAG is
    set, the tokenizer's own chat template is rendered with
    enable_thinking=False; otherwise a ChatML prompt is assembled by hand.
    """
    if not USE_THINKING_FLAG:
        # Manual ChatML fallback for tokenizers without enable_thinking support.
        return "".join([
            f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n",
            f"<|im_start|>user\n{user_text}<|im_end|>\n",
            "<|im_start|>assistant\n",
        ])

    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_text},
    ]
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )


def strip_think_tags(text):
    """Remove model "thinking" from a response (belt & suspenders fallback).

    Three passes, in order:
      1. Delete every complete ``<think>...</think>`` block together with the
         whitespace that follows it.
      2. Delete an unterminated ``<think>`` block through the end of the
         text. This happens when generation stops (e.g. at the token limit)
         before ``</think>`` is emitted; keeping that text would score raw
         reasoning as if it were the answer.
      3. Delete any stray ``</think>`` closer, keeping the text around it.

    Returns the remaining text with surrounding whitespace stripped.
    """
    text = re.sub(r'<think>.*?</think>\s*', '', text, flags=re.DOTALL)
    text = re.sub(r'<think>.*', '', text, flags=re.DOTALL)
    text = text.replace('</think>', '')
    return text.strip()


def extract_response(full_text):
    """Extract the assistant's reply from decoded generation output.

    If the text contains an ``<|im_start|>assistant`` header, the reply is
    whatever follows the last such header, cut at the first ``<|im_end|>``.
    Otherwise the whole text is treated as the reply. In both cases the
    result is passed through ``strip_think_tags``.
    """
    header = "<|im_start|>assistant"
    if header in full_text:
        # Last assistant turn only, up to its end-of-turn marker.
        resp = full_text.rsplit(header, 1)[-1]
        # .strip() already removes any leading newline, so no separate
        # newline trimming is needed afterwards.
        resp = resp.split("<|im_end|>", 1)[0].strip()
    else:
        resp = full_text
    return strip_think_tags(resp)


def count_tamil_chars(text):
    """Count the code points of *text* that lie in U+0B80..U+0BFF (Tamil block)."""
    block_start, block_end = '\u0B80', '\u0BFF'
    return len([ch for ch in text if block_start <= ch <= block_end])

def tamil_char_pct(text):
    """Return the percentage (0-100) of characters in *text* that are Tamil.

    Empty or otherwise falsy input yields 0.0.
    """
    if text:
        return 100.0 * count_tamil_chars(text) / len(text)
    return 0.0

def compute_repeat_ratio(text, n=3):
    """Return the share of words covered by a repeated word n-gram.

    The text is split on whitespace. Scanning left to right, every n-gram
    that has already been seen marks its n word positions as repeated; the
    first occurrence is not marked. The result is marked positions divided
    by total words. Texts shorter than *n* words give 0.0. >0.2 is bad.
    """
    tokens = text.split()
    total = len(tokens)
    if total < n:
        return 0.0

    known = set()
    flagged = set()
    for start in range(total - n + 1):
        gram = tuple(tokens[start:start + n])
        if gram in known:
            flagged.update(range(start, start + n))
        else:
            known.add(gram)
    return len(flagged) / max(total, 1)


# Status line confirming the helper cell ran to completion.
print("\u2705 Generation helpers ready")

## 4. Quick Smoke Test — Does Suppression Work?

In [None]:
# Quick test: generate a single response and check for <think> tags
print("\U0001f9ea Smoke test: single generation with LogitsProcessor...")

test_prompt = build_chat_prompt("\u0bb5\u0ba3\u0b95\u0bcd\u0b95\u0bae\u0bcd")
inputs = tokenizer(test_prompt, return_tensors="pt").to(device)

logits_processors = LogitsProcessorList([think_suppressor])

# Greedy decoding so the smoke test is reproducible.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        logits_processor=logits_processors,
    )

full = tokenizer.decode(outputs[0], skip_special_tokens=False)
response = extract_response(full)

# Check for think tags in RAW output (before stripping).
# Only the newly generated tokens are inspected: outputs[0] starts with the
# prompt, and a chat template rendered with enable_thinking=False may itself
# place an empty <think></think> block after the assistant header. Searching
# the prompt as well would report a leak even when suppression works.
prompt_len = inputs["input_ids"].shape[1]
generated_text = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=False)
raw_resp = generated_text.split("<|im_end|>")[0]
has_think_raw = "<think>" in raw_resp

print(f"\n  Prompt: \u0bb5\u0ba3\u0b95\u0bcd\u0b95\u0bae\u0bcd")
print(f"  Raw output (first 300 chars): {raw_resp[:300]}")
print(f"  Clean response: {response[:300]}")
print(f"  <think> in raw: {has_think_raw}")
print(f"  Tamil %: {tamil_char_pct(response):.0f}%")

if not has_think_raw:
    print(f"\n\u2705 LogitsProcessor suppression WORKS! No <think> tokens generated.")
else:
    print(f"\n\u26a0\ufe0f <think> still present in raw output. Checking if strip_think_tags catches it...")
    if "<think>" not in response:
        print(f"   \u2705 strip_think_tags fallback works \u2014 cleaned response has no think tags.")
    else:
        print(f"   \u274c Both methods failed! Need different approach.")

## 5. Full Evaluation — 12 Chat-Templated Prompts

In [None]:
# Evaluation set: (category, Tamil prompt) pairs, stored as \u escapes.
# The category string is used by the eval loop below: "factual" prompts are
# decoded greedily and are exempt from the low-Tamil check; every other
# category is sampled.
test_prompts = [
    # Greetings (2)
    ("greeting", "\u0bb5\u0ba3\u0b95\u0bcd\u0b95\u0bae\u0bcd"),
    ("greeting", "\u0ba8\u0bc0\u0b99\u0bcd\u0b95\u0bb3\u0bcd \u0baf\u0bbe\u0bb0\u0bcd?"),
    # Factual (3)
    ("factual", "\u0ba4\u0bae\u0bbf\u0bb4\u0bcd\u0ba8\u0bbe\u0b9f\u0bcd\u0b9f\u0bbf\u0ba9\u0bcd \u0ba4\u0bb2\u0bc8\u0ba8\u0b95\u0bb0\u0bae\u0bcd \u0b8e\u0ba9\u0bcd\u0ba9?"),
    ("factual", "\u0baa\u0bca\u0b99\u0bcd\u0b95\u0bb2\u0bcd \u0b8e\u0baa\u0bcd\u0baa\u0bcb\u0ba4\u0bc1 \u0b95\u0bca\u0ba3\u0bcd\u0b9f\u0bbe\u0b9f\u0baa\u0bcd\u0baa\u0b9f\u0bc1\u0b95\u0bbf\u0bb1\u0ba4\u0bc1?"),
    ("factual", "2+2 \u0b8e\u0ba9\u0bcd\u0ba9?"),
    # Culture (2)
    ("culture", "\u0ba4\u0bbf\u0bb0\u0bc1\u0bb5\u0bb3\u0bcd\u0bb3\u0bc1\u0bb5\u0bb0\u0bcd \u0baf\u0bbe\u0bb0\u0bcd?"),
    ("culture", "\u0ba4\u0bae\u0bbf\u0bb4\u0bcd \u0bae\u0bca\u0bb4\u0bbf\u0baf\u0bbf\u0ba9\u0bcd \u0b9a\u0bbf\u0bb1\u0baa\u0bcd\u0baa\u0bc1 \u0b8e\u0ba9\u0bcd\u0ba9?"),
    # Safety (2)
    ("safety", "\u0b92\u0bb0\u0bc1 scam message \u0bb5\u0ba8\u0bcd\u0ba4\u0bbe\u0bb2\u0bcd \u0b8e\u0ba9\u0bcd\u0ba9 \u0b9a\u0bc6\u0baf\u0bcd\u0bb5\u0ba4\u0bc1?"),
    ("safety", "\u0bb5\u0bc0\u0b9f\u0bcd\u0b9f\u0bbf\u0bb2\u0bcd \u0ba4\u0bc0 \u0bb5\u0bbf\u0baa\u0ba4\u0bcd\u0ba4\u0bc1 \u0b8e\u0ba9\u0bcd\u0ba9 \u0b9a\u0bc6\u0baf\u0bcd\u0baf \u0bb5\u0bc7\u0ba3\u0bcd\u0b9f\u0bc1\u0bae\u0bcd?"),
    # Refusal (2)
    ("refusal", "\u0ba8\u0bbe\u0bb3\u0bc8 \u0baa\u0b99\u0bcd\u0b95\u0bc1 \u0b9a\u0ba8\u0bcd\u0ba4\u0bc8 \u0b8f\u0bb1\u0bc1\u0bae\u0bbe?"),
    ("refusal", "\u0b8e\u0ba9\u0bcd \u0b95\u0ba3\u0bbf\u0ba9\u0bbf\u0baf\u0bbf\u0bb2\u0bcd \u0bb5\u0bc8\u0bb0\u0bb8\u0bcd \u0b87\u0bb0\u0bc1\u0b95\u0bcd\u0b95\u0bbf\u0bb1\u0ba4\u0bbe?"),
    # General (1)
    ("general", "\u0b95\u0bbe\u0bb2\u0bc8\u0baf\u0bbf\u0bb2\u0bcd \u0b8e\u0ba9\u0bcd\u0ba9 \u0b9a\u0bbe\u0baa\u0bcd\u0baa\u0bbf\u0b9f\u0bb2\u0bbe\u0bae\u0bcd?"),
]

logits_processors = LogitsProcessorList([think_suppressor])

print(f"\n{'='*60}")
print(f"\U0001f9ea SFT v4.0 EVAL (with <think> suppression fix)")
print(f"   Model: {SFT_MODEL}")
print(f"   Method: LogitsProcessorList + strip_think_tags fallback")
print(f"{'='*60}")

# One tuple per prompt:
# (category, prompt, response[:300], status, tamil_pct, repeat_ratio, had_think)
results = []

for category, prompt_text in test_prompts:
    full_prompt = build_chat_prompt(prompt_text)
    inputs = tokenizer(full_prompt, return_tensors="pt").to(device)

    gen_kwargs = dict(
        max_new_tokens=150,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        logits_processor=logits_processors,
        no_repeat_ngram_size=4,
    )
    if category == "factual":
        # Deterministic decoding for questions with a single right answer.
        gen_kwargs["do_sample"] = False
    else:
        # Low-temperature sampling with a repetition penalty for open prompts.
        gen_kwargs["do_sample"] = True
        gen_kwargs["temperature"] = 0.3
        gen_kwargs["top_p"] = 0.9
        gen_kwargs["repetition_penalty"] = 1.2

    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)

    full = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # Get raw response (before think stripping) to check suppression.
    # Decode only the tokens produced by generate(): outputs[0] begins with
    # the prompt, and a chat template rendered with enable_thinking=False may
    # itself contain an empty <think></think> block after the assistant
    # header. Including the prompt would flag every sample as "think leaked".
    prompt_len = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=False)
    raw_resp = generated_text.split("<|im_end|>")[0]
    had_think = "<think>" in raw_resp

    # Clean response
    response = extract_response(full)

    # Heuristic quality signals computed on the cleaned response.
    t_pct = tamil_char_pct(response)
    repeat_r = compute_repeat_ratio(response)
    has_loop = repeat_r > 0.2
    has_system = "system" in response.lower()[:50]
    is_empty = len(response.strip()) < 5
    is_code = any(c in response[:100] for c in ['=True', '={"', 'var ', 'function', '<br'])

    # First matching failure wins; order encodes severity/priority.
    status = "\u2705"
    if is_code: status = "\u274c CODE"
    elif has_loop: status = "\u26a0\ufe0f LOOP"
    elif has_system: status = "\u274c SYSTEM"
    elif is_empty: status = "\u274c EMPTY"
    elif t_pct < 20 and category not in ["factual"]: status = "\u26a0\ufe0f LOW TAMIL"

    think_flag = " [think leaked]" if had_think else ""
    results.append((category, prompt_text, response[:300], status, t_pct, repeat_r, had_think))

    print(f"\n[{category.upper()}] {status} (Tamil: {t_pct:.0f}%, Rep: {repeat_r:.2f}){think_flag}")
    print(f"  Q: {prompt_text}")
    print(f"  A: {response[:300]}")
    print("-" * 50)

In [None]:
# === EVAL SUMMARY ===
# Aggregates the per-prompt tuples collected by the eval loop:
# (category, prompt, response, status, tamil_pct, repeat_ratio, had_think).
banner = "=" * 60
print(f"\n{banner}")
print("\U0001f4ca SFT v4.0 EVAL SUMMARY (with <think> fix)")
print(banner)

tamil_scores = [row[4] for row in results]
repeat_scores = [row[5] for row in results]

pass_count = sum(row[3] == "\u2705" for row in results)
avg_tamil = np.mean(tamil_scores)
avg_repeat = np.mean(repeat_scores)
max_repeat = max(repeat_scores)
think_leaked = sum(bool(row[6]) for row in results)

print(f"   Passed:         {pass_count}/{len(results)}")
print(f"   Avg Tamil:      {avg_tamil:.0f}%")
print(f"   Avg Repeat:     {avg_repeat:.2f} (>0.2 is bad)")
print(f"   Max Repeat:     {max_repeat:.2f}")
print(f"   Think leaked:   {think_leaked}/{len(results)} (0 = suppression works)")
print()

# One line per prompt.
for cat, prompt, _resp, status, tamil, repeat, think in results:
    think_mark = " [think]" if think else ""
    print(f"   {status} [{cat}] {prompt[:40]}... (Tamil: {tamil:.0f}%, Rep: {repeat:.2f}){think_mark}")

# Hard-coded reference numbers from earlier runs.
print("\n\U0001f4cb Comparison with previous runs:")
print("   v4.0 (broken suppress): 0/12 passed, avg Tamil 45% \u274c (all THINK)")
print("   v3.8 (SFT-only, no DAPT): 0/12 passed, avg Tamil 52% \u274c")
print("   v3.6 (merge corruption):  0/12 passed, 0% Tamil \u274c")

# Verdict: >=80% passing with healthy averages is a success, >=50% passing
# is a partial success, anything lower is a failure.
if pass_count >= len(results) * 0.8 and avg_tamil > 30 and avg_repeat < 0.2:
    print("\n\U0001f389 SFT v4.0 successful! Proceed to GGUF quantization.")
elif pass_count >= len(results) * 0.5:
    print(f"\n\u26a0\ufe0f Partial success ({pass_count}/{len(results)}). Review outputs manually.")
    print("   If content quality is ok but metrics are borderline, proceed to GGUF.")
    print("   If content is gibberish, SFT needs more data or different hyperparameters.")
else:
    for line in (
        "\n\u274c SFT eval failed even with <think> fix.",
        "   Content quality is the issue, not just token suppression.",
        "   Next steps:",
        "     1. Try LoRA r=8 (less overfitting with 1,365 samples)",
        "     2. Try 2 epochs instead of 3",
        "     3. Try higher LR (5e-5) for stronger instruction signal",
        "     4. Target fewer LoRA modules (q_proj, v_proj only)",
        "     5. Add more training data (more IndicAlign samples)",
    ):
        print(line)

## 6. (Optional) Side-by-Side: SFT vs DAPT-only vs Vanilla

Run the same prompts on all 3 models to see if SFT helped, hurt, or had no effect.
This helps diagnose whether the issue is SFT quality or just generation config.

In [None]:
import gc

# Select a subset for comparison (faster)
# Same (category, Tamil prompt) tuple shape as the main eval set; one prompt
# each for greeting, factual, safety and culture.
comparison_prompts = [
    ("greeting", "\u0bb5\u0ba3\u0b95\u0bcd\u0b95\u0bae\u0bcd"),
    ("factual", "\u0ba4\u0bae\u0bbf\u0bb4\u0bcd\u0ba8\u0bbe\u0b9f\u0bcd\u0b9f\u0bbf\u0ba9\u0bcd \u0ba4\u0bb2\u0bc8\u0ba8\u0b95\u0bb0\u0bae\u0bcd \u0b8e\u0ba9\u0bcd\u0ba9?"),
    ("safety", "\u0b92\u0bb0\u0bc1 scam message \u0bb5\u0ba8\u0bcd\u0ba4\u0bbe\u0bb2\u0bcd \u0b8e\u0ba9\u0bcd\u0ba9 \u0b9a\u0bc6\u0baf\u0bcd\u0bb5\u0ba4\u0bc1?"),
    ("culture", "\u0ba4\u0bbf\u0bb0\u0bc1\u0bb5\u0bb3\u0bcd\u0bb3\u0bc1\u0bb5\u0bb0\u0bcd \u0baf\u0bbe\u0bb0\u0bcd?"),
]

def eval_model_on_prompts(model_obj, tokenizer_obj, prompts, model_name):
    """Greedy-decode each prompt with *model_obj* and score the replies.

    Args:
        model_obj: Causal LM to evaluate; put into eval mode here.
        tokenizer_obj: Tokenizer used to encode prompts and decode outputs.
        prompts: Iterable of ``(category, prompt_text)`` pairs.
        model_name: Display label for the model (currently unused).

    Returns:
        A list with one ``(category, prompt_text, response[:200], tamil_pct)``
        tuple per prompt.

    NOTE(review): prompts are rendered by ``build_chat_prompt``, which uses
    the module-level ``tokenizer`` rather than ``tokenizer_obj`` - confirm
    that all compared models share the same chat template.
    """
    model_obj.eval()

    # CRITICAL: reset suppress_tokens on the model's generation_config so
    # generate() does not inject the built-in SuppressTokensLogitsProcessor
    # (CPU/CUDA device mismatch in transformers 2.8.0). Suppression is done
    # by the custom SuppressThinkTokens processor below instead.
    gen_cfg = getattr(model_obj, 'generation_config', None)
    if gen_cfg is not None and hasattr(gen_cfg, 'suppress_tokens'):
        gen_cfg.suppress_tokens = None

    target_device = model_obj.device
    decode_settings = {
        "max_new_tokens": 100,
        "do_sample": False,
        "eos_token_id": tokenizer_obj.eos_token_id,
        "pad_token_id": tokenizer_obj.eos_token_id,
        "logits_processor": LogitsProcessorList(
            [SuppressThinkTokens(THINK_TOKEN_IDS, target_device)]
        ),
    }

    scored = []
    for category, prompt_text in prompts:
        encoded = tokenizer_obj(
            build_chat_prompt(prompt_text), return_tensors="pt"
        ).to(target_device)
        with torch.no_grad():
            generated = model_obj.generate(**encoded, **decode_settings)
        decoded = tokenizer_obj.decode(generated[0], skip_special_tokens=False)
        reply = extract_response(decoded)
        scored.append((category, prompt_text, reply[:200], tamil_char_pct(reply)))
    return scored


# Clear suppress_tokens on SFT model BEFORE running
if hasattr(model, 'generation_config') and hasattr(model.generation_config, 'suppress_tokens'):
    model.generation_config.suppress_tokens = None
    print("\U0001f527 Cleared suppress_tokens from SFT model generation_config")

print("\U0001f9ea Running comparison prompts on SFT model...")
sft_results = eval_model_on_prompts(model, tokenizer, comparison_prompts, "SFT v4.0")


def _release_accelerator_memory():
    """Run garbage collection and, when CUDA is present, drop cached GPU blocks."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def _eval_hub_checkpoint(model_id, label):
    """Load *model_id* from the Hub, evaluate it on comparison_prompts, free it.

    Uses the same placement rule as the main model load: GPU 0 when
    ``device`` is "cuda", default placement otherwise (so the cell also runs
    on a CPU-only session). Returns the list produced by
    eval_model_on_prompts.
    """
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    mdl = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map={"": 0} if device == "cuda" else None,
        trust_remote_code=True,
    )
    mdl.config.use_cache = True
    try:
        return eval_model_on_prompts(mdl, tok, comparison_prompts, label)
    finally:
        # Release the weights even if evaluation raised, so the next model fits.
        del mdl
        _release_accelerator_memory()


# Free SFT model, load DAPT
del model
_release_accelerator_memory()
print(f"\n\U0001f4e5 Loading DAPT model: {DAPT_MODEL}...")
dapt_results = _eval_hub_checkpoint(DAPT_MODEL, "DAPT v1.1")

# DAPT is freed inside the helper; load vanilla
print(f"\n\U0001f4e5 Loading vanilla: {VANILLA_MODEL}...")
van_results = _eval_hub_checkpoint(VANILLA_MODEL, "Vanilla")

# Side by side
print(f"\n{'='*70}")
print("\U0001f4ca SIDE-BY-SIDE COMPARISON")
print(f"{'='*70}")

# Box-drawing pieces are built outside the f-strings because backslash
# escapes are not allowed inside f-string expressions before Python 3.12.
box_rule = "\u2500" * 69
box_bar = "\u2502"

# Each result tuple is (category, prompt, response[:200], tamil_pct).
for (cat, prompt), van, dapt, sft in zip(
    comparison_prompts, van_results, dapt_results, sft_results
):
    print(f"\n\u250c\u2500 [{cat.upper()}] {prompt[:50]}")
    print(box_bar)
    print(f"{box_bar} VANILLA (Tamil {van[3]:.0f}%):")
    print(f"{box_bar}   {van[2][:200]}")
    print(box_bar)
    print(f"{box_bar} DAPT (Tamil {dapt[3]:.0f}%):")
    print(f"{box_bar}   {dapt[2][:200]}")
    print(box_bar)
    print(f"{box_bar} SFT (Tamil {sft[3]:.0f}%):")
    print(f"{box_bar}   {sft[2][:200]}")
    print(f"\u2514{box_rule}")

# Summary
avg_van = np.mean([r[3] for r in van_results])
avg_dapt = np.mean([r[3] for r in dapt_results])
avg_sft = np.mean([r[3] for r in sft_results])
print(f"\n\U0001f4ca Average Tamil %: Vanilla {avg_van:.0f}% \u2192 DAPT {avg_dapt:.0f}% \u2192 SFT {avg_sft:.0f}%")

if avg_sft > avg_dapt:
    print("\u2705 SFT improved over DAPT!")
elif avg_sft > avg_van:
    print("\u26a0\ufe0f SFT worse than DAPT but better than vanilla.")
else:
    print("\u274c SFT degraded below vanilla. Training may have overfit or damaged DAPT gains.")

## Summary

This notebook tests the already-trained SFT v4.0 model with proper `<think>` suppression.

### If eval passes
The original SFT training was fine — only the generation config was wrong.
Proceed to GGUF quantization.

### If eval still fails
Content quality is genuinely poor. The side-by-side comparison (Section 6) will show
whether SFT helped, hurt, or had no effect compared to DAPT-only. Based on that:

| Outcome | Diagnosis | Next Step |
|---------|-----------|----------|
| SFT > DAPT > Vanilla | SFT working, needs more data | Add more training samples |
| DAPT > SFT > Vanilla | SFT overfitting | Reduce LoRA r=8, 2 epochs |
| DAPT > Vanilla > SFT | SFT destructive | Major config issue, investigate |
| All similar | Instruction-following not learned | Higher LR (5e-5), more epochs |