# VAZHI DAPT v1.0 — Base vs DAPT Comparison

Quick comparison: run the same 8 Tamil text continuation prompts on:
1. **Vanilla base:** `Qwen/Qwen3-0.6B-Base` (no Tamil-specific training)
2. **DAPT'd model:** `CryptoYogi/qwen3-0.6b-tamil` (375 steps on Sangraha Tamil)

**Platform:** Colab T4 or Kaggle T4 (each model needs about 1.2 GB of GPU memory in fp16)

**Expected outcome:** The base model should mostly produce English, code, or gibberish for Tamil prompts. The DAPT'd model should produce coherent Tamil continuations.

In [None]:
!pip install -q "transformers>=4.45.0,<5.0.0" "accelerate>=0.34.2" "torch"
print("\u2705 Dependencies installed")

In [None]:
import torch
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face model IDs of the two checkpoints being compared.
BASE_MODEL = "Qwen/Qwen3-0.6B-Base"
DAPT_MODEL = "CryptoYogi/qwen3-0.6b-tamil"

# Same eval prompts from DAPT training notebook
# Each entry is a (category, prompt_text) pair. The prompt text is written with
# \u escapes for code points in the Tamil block (U+0B80-U+0BFF); the last
# entry mixes English and Tamil in one prompt.
eval_prompts = [
    ("prose", "\u0ba4\u0bae\u0bbf\u0bb4\u0bcd\u0ba8\u0bbe\u0b9f\u0bc1 \u0b87\u0ba8\u0bcd\u0ba4\u0bbf\u0baf\u0bbe\u0bb5\u0bbf\u0ba9\u0bcd \u0ba4\u0bc6\u0ba9\u0bcd \u0baa\u0b95\u0bc1\u0ba4\u0bbf\u0baf\u0bbf\u0bb2\u0bcd \u0b85\u0bae\u0bc8\u0ba8\u0bcd\u0ba4\u0bc1\u0bb3\u0bcd\u0bb3 \u0b92\u0bb0\u0bc1 \u0bae\u0bbe\u0ba8\u0bbf\u0bb2\u0bae\u0bcd."),
    ("prose", "\u0baa\u0bca\u0b99\u0bcd\u0b95\u0bb2\u0bcd \u0ba4\u0bae\u0bbf\u0bb4\u0bb0\u0bcd\u0b95\u0bb3\u0bbf\u0ba9\u0bcd \u0bae\u0bc1\u0b95\u0bcd\u0b95\u0bbf\u0baf \u0ba4\u0bbf\u0bb0\u0bc1\u0ba8\u0bbe\u0bb3\u0bcd."),
    ("literature", "\u0bb5\u0bb3\u0bcd\u0bb3\u0bc1\u0bb5\u0bb0\u0bcd \u0b95\u0bc2\u0bb1\u0bbf\u0baf \u0b85\u0bb1\u0bae\u0bcd, \u0baa\u0bca\u0bb0\u0bc1\u0bb3\u0bcd, \u0b87\u0ba9\u0bcd\u0baa\u0bae\u0bcd \u0b8e\u0ba9\u0bcd\u0bb1 \u0bae\u0bc2\u0ba9\u0bcd\u0bb1\u0bc1"),
    ("knowledge", "\u0b9a\u0bbf\u0ba4\u0bcd\u0ba4 \u0bae\u0bb0\u0bc1\u0ba4\u0bcd\u0ba4\u0bc1\u0bb5\u0bae\u0bcd \u0b8e\u0ba9\u0bcd\u0baa\u0ba4\u0bc1 \u0ba4\u0bae\u0bbf\u0bb4\u0bcd \u0bae\u0b95\u0bcd\u0b95\u0bb3\u0bbf\u0ba9\u0bcd \u0baa\u0bbe\u0bb0\u0bae\u0bcd\u0baa\u0bb0\u0bbf\u0baf"),
    ("daily", "\u0b95\u0bbe\u0bb2\u0bc8\u0baf\u0bbf\u0bb2\u0bcd \u0b8e\u0bb4\u0bc1\u0ba8\u0bcd\u0ba4\u0ba4\u0bc1\u0bae\u0bcd \u0bae\u0bc1\u0ba4\u0bb2\u0bbf\u0bb2\u0bcd"),
    ("short", "\u0ba4\u0bae\u0bbf\u0bb4\u0bcd"),
    ("short", "\u0ba8\u0ba9\u0bcd\u0bb1\u0bbf"),
    ("mixed", "India has many languages. \u0ba4\u0bae\u0bbf\u0bb4\u0bcd is one of the"),
]

def count_tamil_chars(text):
    """Return how many characters of ``text`` fall in the Tamil Unicode block.

    A character counts when it lies between U+0B80 and U+0BFF inclusive.
    """
    tamil_total = 0
    for ch in text:
        if '\u0B80' <= ch <= '\u0BFF':
            tamil_total += 1
    return tamil_total

def tamil_char_pct(text):
    """Return the percentage (0-100) of characters in ``text`` that are Tamil.

    The denominator is ``len(text)``, so whitespace, digits and punctuation
    count as non-Tamil characters. Empty or falsy input yields ``0.0``.
    """
    if text:
        return count_tamil_chars(text) * 100.0 / len(text)
    return 0.0

def generate_responses(model, tokenizer, device, seed=42):
    """Sample a continuation for every prompt in ``eval_prompts`` and score it.

    Args:
        model: Causal language model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used to encode each prompt and decode the output.
        device: Device the tokenized inputs are moved to before generation.
        seed: Base RNG seed. Before prompt ``i`` is sampled the global torch
            RNG is reset with ``torch.manual_seed(seed + i)``, so repeated
            runs are reproducible and two models evaluated with the same seed
            start each prompt from the same RNG state. Pass ``None`` to leave
            the RNG untouched (the previous, non-reproducible behaviour).

    Returns:
        A list with one dict per prompt, in ``eval_prompts`` order, holding:
        ``category``, ``prompt``, ``response`` (first 300 characters of the
        continuation), ``tamil_pct`` (share of Tamil characters in the full
        continuation), ``unique_ratio`` (distinct whitespace-separated words
        divided by total words) and ``length`` (character count of the full
        continuation).
    """
    # generate() needs a usable pad id; fall back to EOS when the tokenizer
    # defines no pad token instead of passing None through.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    results = []
    model.eval()
    for idx, (category, prompt_text) in enumerate(eval_prompts):
        if seed is not None:
            # Per-prompt seeding keeps each sample independent of how many
            # tokens the previous prompts happened to consume.
            torch.manual_seed(seed + idx)
        inputs = tokenizer(prompt_text, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=150,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.2,
                no_repeat_ngram_size=4,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=pad_token_id,
            )
        # generate() returns prompt + continuation; keep only the new tokens.
        prompt_len = inputs["input_ids"].shape[1]
        generated_ids = outputs[0][prompt_len:]
        response = tokenizer.decode(generated_ids, skip_special_tokens=True)
        words = response.split()
        results.append({
            "category": category,
            "prompt": prompt_text,
            "response": response[:300],
            "tamil_pct": tamil_char_pct(response),
            # max(..., 1) avoids a zero division on an empty continuation.
            "unique_ratio": len(set(words)) / max(len(words), 1),
            "length": len(response),
        })
    return results

# Echo the configuration so the cell output records what is being compared.
config_summary = [
    "\u2705 Config ready",
    f"   Base:  {BASE_MODEL}",
    f"   DAPT:  {DAPT_MODEL}",
    f"   Prompts: {len(eval_prompts)}",
]
print("\n".join(config_summary))

## Run Base Model (Vanilla Qwen3-0.6B)

In [None]:
# Load the tokenizer and the unmodified base checkpoint, run every eval
# prompt through it, print the results, then release the model.
print(f"\U0001f4e5 Loading vanilla base model: {BASE_MODEL}")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
if tokenizer.pad_token is None:
    # Reuse EOS as the padding token when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token

device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    # Half precision, whole model placed on GPU 0.
    base_dtype, base_device_map = torch.float16, {"": 0}
else:
    base_dtype, base_device_map = torch.float32, None

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=base_dtype,
    device_map=base_device_map,
    trust_remote_code=True,
)
base_model.config.use_cache = True
print(f"\u2705 Base model loaded on {device}")

print(f"\n\U0001f9ea Running {len(eval_prompts)} eval prompts on BASE model...")
base_results = generate_responses(base_model, tokenizer, device)

for row in base_results:
    print(
        f"\n[{row['category'].upper()}] Tamil: {row['tamil_pct']:.0f}% | Unique: {row['unique_ratio']:.0%}",
        f"  Prompt: {row['prompt'][:60]}",
        f"  Output: {row['response'][:200]}",
        "-" * 50,
        sep="\n",
    )

# Drop the only reference to the model, then collect garbage and clear the
# CUDA cache so the memory is available for the next model.
del base_model
gc.collect()
if device == "cuda":
    torch.cuda.empty_cache()
print("\n\U0001f5d1\ufe0f Base model freed from GPU")

## Run DAPT Model (Tamil-adapted)

In [None]:
# Load the Tamil-adapted checkpoint and run the same eval prompts through it.
# `tokenizer` and `device` are not created here; they are the ones set up in
# the base-model cell, which therefore has to run first.
print(f"\U0001f4e5 Loading DAPT model: {DAPT_MODEL}")
if device == "cuda":
    # Half precision, whole model placed on GPU 0.
    dapt_dtype, dapt_device_map = torch.float16, {"": 0}
else:
    dapt_dtype, dapt_device_map = torch.float32, None

dapt_model = AutoModelForCausalLM.from_pretrained(
    DAPT_MODEL,
    torch_dtype=dapt_dtype,
    device_map=dapt_device_map,
    trust_remote_code=True,
)
dapt_model.config.use_cache = True
print(f"\u2705 DAPT model loaded on {device}")

print(f"\n\U0001f9ea Running {len(eval_prompts)} eval prompts on DAPT model...")
dapt_results = generate_responses(dapt_model, tokenizer, device)

for row in dapt_results:
    print(
        f"\n[{row['category'].upper()}] Tamil: {row['tamil_pct']:.0f}% | Unique: {row['unique_ratio']:.0%}",
        f"  Prompt: {row['prompt'][:60]}",
        f"  Output: {row['response'][:200]}",
        "-" * 50,
        sep="\n",
    )

# Drop the only reference to the model, then collect garbage and clear the
# CUDA cache.
del dapt_model
gc.collect()
if device == "cuda":
    torch.cuda.empty_cache()
print("\n\U0001f5d1\ufe0f DAPT model freed from GPU")

## Side-by-Side Comparison

In [None]:
import numpy as np

# Scoreboard: for each prompt the model with the strictly higher share of
# Tamil characters wins the row; equal shares are a tie.
banner = "=" * 70
divider = "-" * 70

print(banner)
print("\U0001f4ca SIDE-BY-SIDE COMPARISON: Base vs DAPT")
print(banner)

print(f"\n{'Category':<12} {'Base Tamil%':>12} {'DAPT Tamil%':>12} {'Base Uniq':>10} {'DAPT Uniq':>10} {'Winner':>8}")
print(divider)

base_wins = 0
dapt_wins = 0

for base_row, dapt_row in zip(base_results, dapt_results):
    base_pct = base_row["tamil_pct"]
    dapt_pct = dapt_row["tamil_pct"]
    if dapt_pct > base_pct:
        winner = "DAPT"
        dapt_wins += 1
    elif base_pct > dapt_pct:
        winner = "BASE"
        base_wins += 1
    else:
        winner = "TIE"
    print(f"{base_row['category']:<12} {base_pct:>10.0f}% {dapt_pct:>10.0f}% {base_row['unique_ratio']:>9.0%} {dapt_row['unique_ratio']:>9.0%} {winner:>8}")

print(divider)


def _mean_of(rows, key):
    """Average the values stored under ``key`` across all result rows."""
    return np.mean([row[key] for row in rows])


avg_base_tamil = _mean_of(base_results, "tamil_pct")
avg_dapt_tamil = _mean_of(dapt_results, "tamil_pct")
avg_base_uniq = _mean_of(base_results, "unique_ratio")
avg_dapt_uniq = _mean_of(dapt_results, "unique_ratio")

print(f"{'AVERAGE':<12} {avg_base_tamil:>10.0f}% {avg_dapt_tamil:>10.0f}% {avg_base_uniq:>9.0%} {avg_dapt_uniq:>9.0%}")
print()
print(f"\U0001f3c6 DAPT wins: {dapt_wins}/{len(eval_prompts)} | Base wins: {base_wins}/{len(eval_prompts)}")
print(f"   Tamil% improvement: {avg_base_tamil:.0f}% \u2192 {avg_dapt_tamil:.0f}% ({avg_dapt_tamil - avg_base_tamil:+.0f}%)")

# Verdict from the average Tamil share: more than 5 points up is a clear win,
# any smaller gain is marginal, any drop is a regression.
if avg_dapt_tamil < avg_base_tamil:
    verdict = "\n\u26a0\ufe0f DAPT degraded Tamil output! Check training data quality."
elif avg_dapt_tamil > avg_base_tamil + 5:
    verdict = "\n\U0001f389 DAPT clearly improved Tamil fluency! Proceed to SFT."
elif avg_dapt_tamil > avg_base_tamil:
    verdict = "\n\u2705 DAPT shows marginal improvement. Consider more training tokens."
else:
    verdict = "\n\U0001f914 No change. Base model may already know Tamil — check outputs manually."
print(verdict)

In [None]:
# Detailed side-by-side output for manual inspection
# Prints, for every prompt, the first 200 characters of the base and DAPT
# continuations inside a simple box-drawing frame.

# The horizontal rule is built outside the f-string: a backslash escape inside
# an f-string replacement field is a SyntaxError before Python 3.12.
box_rule = "\u2500" * 69

print("=" * 70)
print("\U0001f50d DETAILED OUTPUT COMPARISON")
print("=" * 70)

for b, d in zip(base_results, dapt_results):
    print(f"\n\u250c\u2500 [{b['category'].upper()}] Prompt: {b['prompt'][:60]}")
    print("\u2502")
    print(f"\u2502 BASE (Tamil {b['tamil_pct']:.0f}%):")
    print(f"\u2502   {b['response'][:200]}")
    print("\u2502")
    print(f"\u2502 DAPT (Tamil {d['tamil_pct']:.0f}%):")
    print(f"\u2502   {d['response'][:200]}")
    print(f"\u2514{box_rule}")