# VAZHI Dataset Factory v4.1.3 — Stage 3 Re-compose (CPU only)

**Fixes two bucket shortfalls from v4.1.2:**
1. **vazhi_packs** (2,429 → ~3,000): bypass the quality_score and PPL filters for hand-curated product data
2. **safety** (105 → ~2,000+): route ALL Toxic_Matrix/HHRLHF_T samples by subset name instead of a narrow wordlist

Loads curated dataset from HF, re-composes, re-uploads. No GPU needed. <2 min.

In [1]:
# 1. Config & Dependencies
!pip install -q datasets huggingface_hub

import re
import random
from collections import Counter
from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import login, HfApi

# Release tag of this notebook run; echoed in the banner and final summary prints.
VERSION = "4.1.3"
# Hugging Face dataset id that is loaded as the curated input pool.
CURATED_DATASET = "CryptoYogi/vazhi-curated-tamil-qa-v1"
# Hugging Face dataset repo that the composed train/validation splits are pushed to.
OUTPUT_DATASET = "CryptoYogi/vazhi-tamil-sft-v4_1"
# Model id printed as the "Base model" hint at the end; the cells shown never load it.
DAPT_MODEL = "CryptoYogi/qwen3-0.6b-tamil-v1_1"
# Seed for Python's `random` module, which drives every shuffle in this notebook.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Upper bound on a sample's `tokenized_length`: longer samples are dropped in
# Stage 3A and the bound is asserted again after the train/eval split.
SFT_MAX_SEQ_LENGTH = 2048

# Default system message that to_chatml() places in every sample.
# Stored as \u escapes; the Tamil text roughly reads: "You are VAZHI, an AI
# assistant for Tamil people. Answer clearly and helpfully in Tamil. If you do
# not know, say 'I don't know'." (translation is approximate - confirm with a
# Tamil speaker before relying on it).
SYSTEM_PROMPT = (
    "\u0ba8\u0bc0\u0b99\u0bcd\u0b95\u0bb3\u0bcd VAZHI (\u0bb5\u0bb4\u0bbf), \u0ba4\u0bae\u0bbf\u0bb4\u0bcd \u0bae\u0b95\u0bcd\u0b95\u0bb3\u0bc1\u0b95\u0bcd\u0b95\u0bbe\u0ba9 AI \u0b89\u0ba4\u0bb5\u0bbf\u0baf\u0bbe\u0bb3\u0bb0\u0bcd. "
    "\u0ba4\u0bae\u0bbf\u0bb4\u0bbf\u0bb2\u0bcd \u0ba4\u0bc6\u0bb3\u0bbf\u0bb5\u0bbe\u0b95\u0bb5\u0bc1\u0bae\u0bcd \u0b89\u0ba4\u0bb5\u0bbf\u0baf\u0bbe\u0b95\u0bb5\u0bc1\u0bae\u0bcd \u0baa\u0ba4\u0bbf\u0bb2\u0bb3\u0bbf\u0baf\u0bc1\u0b99\u0bcd\u0b95\u0bb3\u0bcd. "
    '\u0ba4\u0bc6\u0bb0\u0bbf\u0baf\u0bbe\u0bb5\u0bbf\u0b9f\u0bcd\u0b9f\u0bbe\u0bb2\u0bcd "\u0ba4\u0bc6\u0bb0\u0bbf\u0baf\u0bb5\u0bbf\u0bb2\u0bcd\u0bb2\u0bc8" \u0b8e\u0ba9\u0bcd\u0bb1\u0bc1 \u0b9a\u0bca\u0bb2\u0bcd\u0bb2\u0bc1\u0b99\u0bcd\u0b95\u0bb3\u0bcd.'
)

# Per-bucket size policy consumed by Stage 3B composition:
#   min    - a warning is printed when fewer samples than this are available
#   target - pools larger than this are cut down, keeping the highest quality_score
#   max    - hard cap; the cut keeps min(target, max) samples
# Keys are the bucket names; all but "safety" must match a `source` value in the
# curated dataset, while "safety" is filled from the safety subsets.
BUCKET_TARGETS = {
    "vazhi_packs":    {"min": 2500, "target": 3000, "max": 3000},
    "handcrafted":    {"min": 100,  "target": 147,  "max": 200},
    "general":        {"min": 300,  "target": 500,  "max": 700},
    "indicalign":     {"min": 10000, "target": 12000, "max": 14000},
    "safety":         {"min": 1500, "target": 2000,  "max": 2500},
}

# Echo the effective configuration so the run log is self-describing.
print(f"\u2705 Config loaded: Dataset Factory v{VERSION}")
print(f"   Curated source: {CURATED_DATASET}")
print(f"   Output: {OUTPUT_DATASET}")

‚úÖ Config loaded: Dataset Factory v4.1.3
   Curated source: CryptoYogi/vazhi-curated-tamil-qa-v1
   Output: CryptoYogi/vazhi-tamil-sft-v4_1


In [2]:
# 2. HF Login
# Authenticate against the Hugging Face Hub. The token is looked up in the
# notebook host's secret store first (Colab, then Kaggle); if neither works the
# cell falls back to the interactive login prompt.
#
# Each fallback reports the exception *type* that triggered it, so a missing
# secret or a rejected token is visible in the log instead of being silently
# swallowed. Only the type name is printed, never the message, to avoid echoing
# anything token-related into the saved notebook output.
try:
    from google.colab import userdata
    hf_token = userdata.get('HF_TOKEN')
    login(token=hf_token)
    print("\u2705 Logged in via Colab secrets")
except Exception as colab_error:
    print(f"   Colab secrets unavailable ({type(colab_error).__name__}); trying Kaggle secrets")
    try:
        from kaggle_secrets import UserSecretsClient
        secrets = UserSecretsClient()
        hf_token = secrets.get_secret("HF_TOKEN")
        login(token=hf_token)
        print("\u2705 Logged in via Kaggle secrets")
    except Exception as kaggle_error:
        print(f"   Kaggle secrets unavailable ({type(kaggle_error).__name__}); falling back to interactive login")
        login()
        print("\u2705 Logged in interactively")

‚úÖ Logged in via Colab secrets


In [3]:
# 3. Load curated dataset from HF
# Pull the curated "train" split and materialise it as a plain list of dicts
# (`df`), which is the structure every later stage iterates over.
print(f"Loading curated dataset: {CURATED_DATASET}")
curated_ds = load_dataset(CURATED_DATASET, split="train")
df = curated_ds.to_list()

# Tally sources and subsets once instead of re-scanning the list per lookup.
source_tally = Counter(row['source'] for row in df)
subset_tally = Counter(row['subset'] for row in df)

print(f"\u2705 Loaded {len(df):,} curated samples")
print(f"   Columns: {curated_ds.column_names}")
print(f"   Sources: {source_tally.most_common()}")
print(f"   Subsets with safety data:")
for subset in ('Toxic_Matrix', 'HHRLHF_T'):
    # A Counter yields 0 for a subset that is absent from the dataset.
    count = subset_tally[subset]
    print(f"     {subset}: {count:,}")

Loading curated dataset: CryptoYogi/vazhi-curated-tamil-qa-v1


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/47.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/35047 [00:00<?, ? examples/s]

‚úÖ Loaded 35,047 curated samples
   Columns: ['instruction', 'output', 'source', 'subset', 'char_length', 'tamil_pct', 'lang_id', 'lang_confidence', 'heuristic_flags', 'repetition', 'toxicity_flags', 'is_safety_sample', 'is_duplicate', 'perplexity', 'embedding_cluster', 'auto_category', 'domain', 'quality_score', 'tokenized_length']
   Sources: [('indicalign', 24178), ('general', 7765), ('vazhi_packs', 2968), ('handcrafted', 136)]
   Subsets with safety data:
     Toxic_Matrix: 2,996
     HHRLHF_T: 3,481


In [4]:
# 4. Helper functions

def to_chatml(instruction, output, system_prompt=None):
    """Render one instruction/response pair as a three-turn ChatML string.

    Args:
        instruction: Text placed in the ``user`` turn.
        output: Text placed in the ``assistant`` turn.
        system_prompt: Text for the ``system`` turn. Any falsy value (``None``
            or an empty string) selects the module-level ``SYSTEM_PROMPT``.

    Returns:
        The system, user and assistant turns, each wrapped in
        ``<|im_start|>``/``<|im_end|>``, separated by newlines and with no
        trailing newline after the assistant turn.
    """
    prompt = system_prompt if system_prompt else SYSTEM_PROMPT
    system_turn = f"<|im_start|>system\n{prompt}<|im_end|>\n"
    user_turn = f"<|im_start|>user\n{instruction}<|im_end|>\n"
    assistant_turn = f"<|im_start|>assistant\n{output}<|im_end|>"
    return system_turn + user_turn + assistant_turn

# Matches a system -> user -> assistant ChatML sequence anywhere in a string.
# Group 1 captures the user content and group 2 the assistant content; DOTALL
# lets each turn span several lines, and every turn must hold at least one
# character.
CHATML_PATTERN = re.compile(
    r'<\|im_start\|>system\n.+?<\|im_end\|>\n'
    r'<\|im_start\|>user\n(.+?)<\|im_end\|>\n'
    r'<\|im_start\|>assistant\n(.+?)<\|im_end\|>',
    re.DOTALL
)

def validate_chatml_strict(text):
    """Check that `text` contains a ChatML exchange with real content.

    Args:
        text: Candidate ChatML string.

    Returns:
        ``(True, "ok")`` when the pattern is found and both the user and the
        assistant content have at least two characters after stripping
        whitespace; otherwise ``(False, reason)`` with a short description.
    """
    found = CHATML_PATTERN.search(text)
    if found is None:
        return False, "no ChatML structure found"

    user_content, assistant_content = found.groups()
    # The user turn is checked first, so its message wins when both are too short.
    checks = (
        (user_content, "empty user content"),
        (assistant_content, "empty assistant content"),
    )
    for content, problem in checks:
        if len(content.strip()) < 2:
            return False, problem
    return True, "ok"

def is_verbatim_kural_qa(question, answer):
    """Tell whether `question` matches one of two hard-coded Tamil patterns.

    The patterns appear to target requests to recite a Thirukkural couplet:
    roughly "kural <number> what/tell" and "first kural of the Thirukkural"
    (approximate reading of the Tamil - confirm with a Tamil speaker).

    Args:
        question: Instruction text that is searched.
        answer: Accepted for call-site symmetry; it is not inspected.

    Returns:
        True if either pattern is found anywhere in `question`, else False.
    """
    numbered_kural_request = r'\u0b95\u0bc1\u0bb1\u0bb3\u0bcd\s*\d+\s*(\u0b8e\u0ba9\u0bcd\u0ba9|\u0b9a\u0bca\u0bb2\u0bcd\u0bb2\u0bc1)'
    first_kural_request = r'\u0ba4\u0bbf\u0bb0\u0bc1\u0b95\u0bcd\u0b95\u0bc1\u0bb1\u0bb3\u0bbf\u0ba9\u0bcd\s+\u0bae\u0bc1\u0ba4\u0bb2\u0bcd\s+\u0b95\u0bc1\u0bb1\u0bb3\u0bcd'
    return any(
        re.search(pattern, question, re.IGNORECASE) is not None
        for pattern in (numbered_kural_request, first_kural_request)
    )

print("\u2705 Helper functions ready")

‚úÖ Helper functions ready


In [5]:
# 5. Stage 3A: Filtering (source-aware for vazhi_packs + safety routing by subset)
#
# Narrows the curated pool `df` (a list of sample dicts) through a fixed sequence
# of keep-predicates and prints how many samples each step removed. `df` is
# rebound to the filtered list, which is what Stage 3B consumes. Every predicate
# is idempotent, so re-running this cell keeps the same samples (the printed
# deltas are simply zero the second time).

# Hand-curated product data: exempt from the language-id, quality and perplexity gates.
TRUSTED_SOURCES = ("vazhi_packs", "handcrafted")
# Subsets that hold safety data: kept even when the toxicity detector flagged them.
SAFETY_SUBSETS = ("Toxic_Matrix", "HHRLHF_T")
# Minimum quality_score for samples from non-trusted sources.
MIN_QUALITY_SCORE = 0.45
# Exclusive perplexity ceiling for samples from non-trusted sources.
MAX_PERPLEXITY = 200

# (label, keep-predicate) pairs, applied in order. The label is spliced into the
# "After <label>: ..." progress line.
STAGE3_FILTERS = [
    ("dedup",
     lambda s: not s["is_duplicate"]),
    (f"token \u2264 {SFT_MAX_SEQ_LENGTH}",
     lambda s: s["tokenized_length"] <= SFT_MAX_SEQ_LENGTH),
    ("lang_id (source-aware)",
     lambda s: s["lang_id"] == "ta" or s["source"] in TRUSTED_SOURCES),
    ("clean heuristics",
     lambda s: len(s["heuristic_flags"]) == 0),
    # Safety data is expected to look toxic, so it must survive this gate.
    ("toxicity (safety-aware)",
     lambda s: len(s["toxicity_flags"]) == 0
     or s["is_safety_sample"]
     or s["subset"] in SAFETY_SUBSETS),
    (f"quality \u2265 {MIN_QUALITY_SCORE} (source-aware)",
     lambda s: s["quality_score"] >= MIN_QUALITY_SCORE or s["source"] in TRUSTED_SOURCES),
    # A missing perplexity (None) is treated as acceptable.
    (f"PPL < {MAX_PERPLEXITY} (source-aware)",
     lambda s: s["perplexity"] is None
     or s["perplexity"] < MAX_PERPLEXITY
     or s["source"] in TRUSTED_SOURCES),
]

print("Applying Stage 3 filters (v4.1.3 fixes)...")
before = len(df)

for label, keep in STAGE3_FILTERS:
    b = len(df)  # pool size entering this step, for the delta below
    df = [s for s in df if keep(s)]
    print(f"  After {label}: {len(df):,} (-{b - len(df):,})")

print(f"\n\u2705 Filtering: {before:,} \u2192 {len(df):,}")
filtered_sources = Counter(s["source"] for s in df)
print(f"\nFiltered pool by source:")
for src, count in filtered_sources.most_common():
    print(f"  {src}: {count:,}")

# Show how many surviving samples can feed the safety bucket.
safety_eligible = sum(1 for s in df if s["subset"] in SAFETY_SUBSETS)
print(f"\nSafety-eligible ({' + '.join(SAFETY_SUBSETS)}): {safety_eligible:,}")

Applying Stage 3 filters (v4.1.3 fixes)...
  After dedup: 35,047 (-0)
  After token ‚â§ 2048: 26,327 (-8,720)
  After lang_id (source-aware): 26,327 (-0)
  After clean heuristics: 26,327 (-0)
  After toxicity (safety-aware): 26,076 (-251)
  After quality ‚â• 0.45 (source-aware): 26,076 (-0)
  After PPL < 200 (source-aware): 26,076 (-0)

‚úÖ Filtering: 35,047 ‚Üí 26,076

Filtered pool by source:
  indicalign: 15,396
  general: 7,590
  vazhi_packs: 2,957
  handcrafted: 133

Safety-eligible (Toxic_Matrix + HHRLHF_T): 6,450


In [6]:
# 6. Stage 3B: Composition (safety routed by subset name)
#
# Splits the filtered pool into per-bucket pools and picks the samples for each
# bucket according to BUCKET_TARGETS. Results land in `composed`
# (bucket name -> list of samples); `all_met` records whether every bucket
# reached its minimum.

# Every Toxic_Matrix / HHRLHF_T sample goes to the safety bucket (not just
# wordlist matches); all remaining samples are pooled by their `source`.
_safety_subsets = ("Toxic_Matrix", "HHRLHF_T")
safety_pool, non_safety = [], []
for sample in df:
    destination = safety_pool if sample["subset"] in _safety_subsets else non_safety
    destination.append(sample)

source_pools = {}
for sample in non_safety:
    origin = sample["source"]
    if origin not in source_pools:
        source_pools[origin] = []
    source_pools[origin].append(sample)
source_pools["safety"] = safety_pool


def _quality_of(sample):
    """Sort key: the sample's quality_score."""
    return sample["quality_score"]


print("Composing final dataset...")
composed = {}

for bucket_name, targets in BUCKET_TARGETS.items():
    pool = source_pools.get(bucket_name, [])
    min_count, target, max_count = targets["min"], targets["target"], targets["max"]
    available = len(pool)

    if available < min_count:
        print(f"  \u26a0\ufe0f {bucket_name}: only {len(pool):,} available, min is {min_count}")

    if available < min_count or available <= target:
        # Not enough to be picky: take everything that is available.
        selected = pool
    else:
        # Oversupplied: keep the best-scoring samples. The sort is stable, so
        # ties keep their original pool order.
        ranked = list(pool)
        ranked.sort(key=_quality_of, reverse=True)
        selected = ranked[:min(target, max_count)]

    composed[bucket_name] = selected
    print(f"  {bucket_name}: {len(selected):,} / {len(pool):,} (target: {target}, range: {min_count}-{max_count})")

total_composed = sum(len(chosen) for chosen in composed.values())
print(f"\n\u2705 Composition: {total_composed:,} total")

# Report every bucket that ended up below its configured minimum.
shortfalls = [
    (name, len(composed.get(name, [])), limits["min"])
    for name, limits in BUCKET_TARGETS.items()
    if len(composed.get(name, [])) < limits["min"]
]
for name, actual, floor in shortfalls:
    print(f"  \u274c {name}: {actual} < min {floor}")

all_met = not shortfalls
if all_met:
    print("\u2705 All bucket minimums met")

Composing final dataset...
  vazhi_packs: 2,957 / 2,957 (target: 3000, range: 2500-3000)
  handcrafted: 133 / 133 (target: 147, range: 100-200)
  general: 500 / 7,590 (target: 500, range: 300-700)
  ‚ö†Ô∏è indicalign: only 8,946 available, min is 10000
  indicalign: 8,946 / 8,946 (target: 12000, range: 10000-14000)
  safety: 2,000 / 6,450 (target: 2000, range: 1500-2500)

‚úÖ Composition: 14,536 total
  ‚ùå indicalign: 8946 < min 10000


In [7]:
# 7. Stage 3C: ChatML Conversion + Validation
#
# Turns every composed sample into a ChatML training row. vazhi_packs samples
# whose question matches the verbatim-kural patterns are skipped without being
# counted; rows that fail ChatML validation are counted in `chatml_failures`
# and dropped.


def _to_training_row(sample, bucket):
    """Build the output row for `sample`, or return None if validation fails."""
    text = to_chatml(sample["instruction"], sample["output"])
    valid, _reason = validate_chatml_strict(text)
    if not valid:
        return None
    # Key order determines the column order of the uploaded dataset.
    return {
        "text": text,
        "bucket": bucket,
        "source": sample["source"],
        "subset": sample["subset"],
        "domain": sample.get("domain", "general"),
        "quality_score": sample["quality_score"],
        "tokenized_length": sample["tokenized_length"],
    }


all_samples = []
chatml_failures = 0

for bucket_name, samples in composed.items():
    for s in samples:
        from_packs = s["source"] == "vazhi_packs"
        if from_packs and is_verbatim_kural_qa(s["instruction"], s["output"]):
            continue
        row = _to_training_row(s, bucket_name)
        if row is None:
            chatml_failures += 1
        else:
            all_samples.append(row)

random.shuffle(all_samples)
print(f"\u2705 ChatML: {len(all_samples):,} valid, {chatml_failures} failures")

bucket_counts = Counter(s["bucket"] for s in all_samples)
print(f"\n\U0001f4ca Bucket distribution:")
total_valid = len(all_samples)
for bucket in sorted(bucket_counts):
    count = bucket_counts[bucket]
    pct = 100 * count / total_valid
    print(f"  {bucket}: {count:,} ({pct:.1f}%)")

‚úÖ ChatML: 14,535 valid, 0 failures

üìä Bucket distribution:
  general: 500 (3.4%)
  handcrafted: 133 (0.9%)
  indicalign: 8,946 (61.5%)
  safety: 2,000 (13.8%)
  vazhi_packs: 2,956 (20.3%)


In [8]:
# 8. Stage 3D: Stratified Train/Eval Split (90/10)
#
# Splits `all_samples` into `train_samples` / `eval_samples` bucket by bucket so
# each bucket contributes roughly EVAL_RATIO of its rows to evaluation.

EVAL_RATIO = 0.10

# Fail early with a readable message; otherwise the ratio print below would die
# with a ZeroDivisionError that says nothing about the cause.
if not all_samples:
    raise ValueError(
        "No samples reached the split stage; check the Stage 3 filters and composition"
    )

train_samples = []
eval_samples = []

by_bucket = {}
for s in all_samples:
    by_bucket.setdefault(s["bucket"], []).append(s)

for bucket, samples in by_bucket.items():
    random.shuffle(samples)
    if len(samples) < 2:
        # A bucket's only sample stays in train: holding it out would leave the
        # bucket with no training signal at all.
        n_eval = 0
    else:
        # At least one eval sample per bucket, even when the ratio rounds to 0.
        n_eval = max(1, int(len(samples) * EVAL_RATIO))
    eval_samples.extend(samples[:n_eval])
    train_samples.extend(samples[n_eval:])

random.shuffle(train_samples)
random.shuffle(eval_samples)

total_split = len(train_samples) + len(eval_samples)
assert total_split == len(all_samples), (
    f"split lost samples: {total_split} after split vs {len(all_samples)} before"
)

print(f"\U0001f4ca Split: Train={len(train_samples):,} Eval={len(eval_samples):,}")
print(f"  Eval ratio: {len(eval_samples) / total_split:.1%}")

# Guard against an over-length sample slipping past the Stage 3A token filter.
max_tok = max(s["tokenized_length"] for s in all_samples)
print(f"  Max tokens: {max_tok} (limit: {SFT_MAX_SEQ_LENGTH})")
assert max_tok <= SFT_MAX_SEQ_LENGTH, (
    f"longest sample has {max_tok} tokens, above the limit of {SFT_MAX_SEQ_LENGTH}"
)

üìä Split: Train=13,083 Eval=1,452
  Eval ratio: 10.0%
  Max tokens: 2048 (limit: 2048)


In [9]:
# 9. Stage 3E: Upload to HuggingFace + Summary
#
# Pushes the train/validation splits to OUTPUT_DATASET and prints a run summary:
# per-bucket counts against their targets, the fixes in this version, and two
# example Q/A pairs per bucket.

train_ds = Dataset.from_list(train_samples)
eval_ds = Dataset.from_list(eval_samples)
dataset_dict = DatasetDict({"train": train_ds, "validation": eval_ds})

api = HfApi()
# exist_ok=True makes re-runs reuse the existing repo instead of failing.
api.create_repo(OUTPUT_DATASET, repo_type="dataset", exist_ok=True)
dataset_dict.push_to_hub(OUTPUT_DATASET)

print(f"\n\u2705 Uploaded: https://huggingface.co/datasets/{OUTPUT_DATASET}")
print(f"   Train: {len(train_ds):,} | Eval: {len(eval_ds):,}")

print(f"\n{'=' * 60}")
print(f"VAZHI Dataset Factory v{VERSION} \u2014 COMPLETE")
print(f"{'=' * 60}")

print(f"\n  Curated source: {CURATED_DATASET} ({len(curated_ds):,})")
print(f"  Final SFT:      {OUTPUT_DATASET} ({len(all_samples):,})")

# Iterate the configured buckets rather than the observed counts, so a bucket
# that ended up with zero samples is still listed (and flagged) instead of
# silently disappearing from the summary.
print(f"\n  Buckets:")
for bucket in sorted(BUCKET_TARGETS):
    count = bucket_counts.get(bucket, 0)
    limits = BUCKET_TARGETS[bucket]
    status = "\u2705" if count >= limits["min"] else "\u26a0\ufe0f"
    print(f"    {status} {bucket}: {count:,} (target: {limits['target']})")

print(f"\n  v4.1.3 fixes:")
print(f"    \u2705 vazhi_packs bypass quality_score + PPL filters")
print(f"    \u2705 Safety routed by subset (Toxic_Matrix + HHRLHF_T), not wordlist")

print(f"\n  Sample outputs (2 per bucket):")
shown = Counter()
# How many previews each bucket can actually supply; a bucket with fewer than
# two samples must not keep the loop scanning the whole dataset.
wanted = {b: min(2, bucket_counts.get(b, 0)) for b in BUCKET_TARGETS}
for s in all_samples:
    if shown[s['bucket']] < 2:
        shown[s['bucket']] += 1
        print(f"\n  [{s['bucket'].upper()}] source={s['source']} subset={s['subset']} quality={s['quality_score']:.3f}")
        match = CHATML_PATTERN.search(s["text"])
        if match:
            # Truncate so long samples do not flood the cell output.
            print(f"    Q: {match.group(1)[:100]}")
            print(f"    A: {match.group(2)[:150]}")
    if all(shown[b] >= wanted[b] for b in BUCKET_TARGETS):
        break

print(f"\n\u2705 Done! Next: SFT training with LoRA (r=8, q_proj+v_proj, 2 epochs)")
print(f"   Base model: {DAPT_MODEL}")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              :   6%|6         |  527kB / 8.52MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              :  54%|#####3    |  528kB /  983kB            

README.md:   0%|          | 0.00/587 [00:00<?, ?B/s]


‚úÖ Uploaded: https://huggingface.co/datasets/CryptoYogi/vazhi-tamil-sft-v4_1
   Train: 13,083 | Eval: 1,452

VAZHI Dataset Factory v4.1.3 ‚Äî COMPLETE

  Curated source: CryptoYogi/vazhi-curated-tamil-qa-v1 (35,047)
  Final SFT:      CryptoYogi/vazhi-tamil-sft-v4_1 (14,535)

  Buckets:
    ‚úÖ general: 500 (target: 500)
    ‚úÖ handcrafted: 133 (target: 147)
    ‚ö†Ô∏è indicalign: 8,946 (target: 12000)
    ‚úÖ safety: 2,000 (target: 2000)
    ‚úÖ vazhi_packs: 2,956 (target: 3000)

  v4.1.3 fixes:
    ‚úÖ vazhi_packs bypass quality_score + PPL filters
    ‚úÖ Safety routed by subset (Toxic_Matrix + HHRLHF_T), not wordlist

  Sample outputs (2 per bucket):

  [INDICALIGN] source=indicalign subset=Dolly_T quality=0.862
    Q: ‡ÆÜ‡Æ≤‡Øç‡Æ∏‡Øç‡Æµ‡ØÜ‡Æ©‡Øç‡Æ∏‡Øç‡Æï‡Æ©‡Øç ‡Æé‡Æ©‡Øç‡Æ±‡Ææ‡Æ≤‡Øç ‡Æé‡Æ©‡Øç‡Æ©?
    A: ‡ÆÜ‡Æ≤‡Øç‡Æ∏‡Øç‡Æµ‡ØÜ‡Æ©‡Øç‡Æ∏‡Øç‡Æï‡Æ©‡Øç ‡Æ∏‡Øç‡Æµ‡ØÄ‡Æü‡Æ©‡Æø‡Æ©‡Øç ‡ÆÆ‡ØÅ‡Æ§‡Æ≤‡Øç ‡Æï‡Ææ‡Æ≤‡Øç‡Æ™‡Æ®‡Øç‡Æ§‡ØÅ ‡Æ≤‡ØÄ‡Æï‡Øç ‡ÆÜ‡Æï‡ØÅ‡ÆÆ‡Øç. ‡Æá‡Æ§‡ØÅ 16 ‡ÆÖ‡Æ£‡Æø‡Æï‡Æ≥‡Øà‡Æï