# VAZHI SFT v3.3 - Clean Training Notebook

**Key fixes:**
1. FP32 training (Qwen3 has internal bf16 ops that P100/T4 can't handle)
2. `torch_dtype=torch.float16` for model loading
3. ChatML-only data (no raw text mixing)
4. SKIP_DATA_PREP logic to avoid redundant extraction
5. Single GPU forced

**Target:** Kaggle P100 (16GB)

## 1. Install Dependencies

**IMPORTANT:** After running this cell, **RESTART the session** (Runtime → Restart session).

In [None]:
# Install dependencies
!pip install -q -U \
  "transformers>=4.51.0" \
  "accelerate>=0.34.2" \
  "peft>=0.12.0" \
  "trl>=0.12.0" \
  "bitsandbytes>=0.43.3" \
  "datasets>=2.21.0" \
  "huggingface_hub>=0.24.7"

print("‚úÖ Dependencies installed")
print("‚ö†Ô∏è RESTART THE SESSION NOW (Runtime ‚Üí Restart session)")

## 2. Imports & Configuration

In [None]:
# Force single GPU BEFORE importing torch
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import json
import random
import re
import torch
from collections import defaultdict
from datasets import load_dataset, Dataset
from tqdm.auto import tqdm
from huggingface_hub import login, HfApi, dataset_info

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig

# Config
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Repos
EXISTING_DATASET = "CryptoYogi/vazhi-tamil-v05"
BALANCED_DATASET = "CryptoYogi/vazhi-tamil-sft-v3_3"
BASE_MODEL = "Qwen/Qwen3-0.6B"
OUTPUT_MODEL = "CryptoYogi/vazhi-qwen3-v3_3"

# System prompt
SYSTEM_PROMPT = "‡Æ®‡ØÄ‡Æô‡Øç‡Æï‡Æ≥‡Øç VAZHI (‡Æµ‡Æ¥‡Æø), ‡Æ§‡ÆÆ‡Æø‡Æ¥‡Øç ‡ÆÆ‡Æï‡Øç‡Æï‡Æ≥‡ØÅ‡Æï‡Øç‡Æï‡Ææ‡Æ© AI ‡Æâ‡Æ§‡Æµ‡Æø‡ÆØ‡Ææ‡Æ≥‡Æ∞‡Øç. ‡Æ§‡ÆÆ‡Æø‡Æ¥‡Æø‡Æ≤‡Øç ‡Æ§‡ØÜ‡Æ≥‡Æø‡Æµ‡Ææ‡Æï‡Æµ‡ØÅ‡ÆÆ‡Øç ‡Æâ‡Æ§‡Æµ‡Æø‡ÆØ‡Ææ‡Æï‡Æµ‡ØÅ‡ÆÆ‡Øç ‡Æ™‡Æ§‡Æø‡Æ≤‡Æ≥‡Æø‡ÆØ‡ØÅ‡Æô‡Øç‡Æï‡Æ≥‡Øç. ‡Æ§‡ØÜ‡Æ∞‡Æø‡ÆØ‡Ææ‡Æµ‡Æø‡Æü‡Øç‡Æü‡Ææ‡Æ≤‡Øç \"‡Æ§‡ØÜ‡Æ∞‡Æø‡ÆØ‡Æµ‡Æø‡Æ≤‡Øç‡Æ≤‡Øà\" ‡Æé‡Æ©‡Øç‡Æ±‡ØÅ ‡Æö‡Øä‡Æ≤‡Øç‡Æ≤‡ØÅ‡Æô‡Øç‡Æï‡Æ≥‡Øç."

print(f"‚úÖ Configuration loaded")
print(f"   PyTorch: {torch.__version__}")
print(f"   CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
print(f"   Source dataset: {EXISTING_DATASET}")
print(f"   Balanced dataset: {BALANCED_DATASET}")
print(f"   Base model: {BASE_MODEL}")

In [None]:
# Login to HuggingFace
# The token is read from the secret named "HF_TOKEN" via kaggle_secrets
# (presumably provided by the Kaggle notebook runtime - confirm the secret has
# been added to this notebook before running).
from kaggle_secrets import UserSecretsClient
# NOTE(review): the name `secrets` is also the name of a stdlib module; it
# would be shadowed if that module were imported later in the notebook.
secrets = UserSecretsClient()
hf_token = secrets.get_secret("HF_TOKEN")
# Authenticate the huggingface_hub client with the token read above.
login(token=hf_token)
print("‚úÖ Logged in to HuggingFace")

## 3. Helper Functions

In [None]:
def count_tamil_chars(text):
    """Return how many characters of *text* lie in the Tamil Unicode block.

    A character is counted when it falls in the inclusive range
    U+0B80..U+0BFF.
    """
    block_start, block_end = '\u0B80', '\u0BFF'
    total = 0
    for ch in text:
        if block_start <= ch <= block_end:
            total += 1
    return total

def is_good_tamil_sample(text, min_tamil_pct=30, min_len=10):
    """Decide whether *text* is long enough and sufficiently Tamil.

    Args:
        text: Candidate string; falsy values (None, "") are rejected.
        min_tamil_pct: Minimum share of Tamil characters, as a percentage of
            all characters in the text (whitespace and punctuation included).
        min_len: Minimum total length in characters.

    Returns:
        True when both the length and the Tamil-share thresholds are met,
        otherwise False.
    """
    too_short = not text or len(text) < min_len
    if too_short:
        return False
    tamil_share = 100 * count_tamil_chars(text) / len(text)
    return tamil_share >= min_tamil_pct

def clean_text(text):
    """Normalise whitespace in *text*.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed into a
    single space, then leading and trailing whitespace is removed.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text).strip()

def is_chatml_formatted(text):
    """Report whether *text* contains both ChatML delimiters.

    Only presence is checked: the text must contain at least one
    '<|im_start|>' and at least one '<|im_end|>'. Ordering and pairing of
    the markers are not validated.
    """
    required_markers = ('<|im_start|>', '<|im_end|>')
    return all(marker in text for marker in required_markers)

def is_kural(text):
    """Return True when *text* mentions any Thirukkural-related marker.

    The text is lower-cased before the substring search, so the Latin-script
    markers match regardless of the case used in *text*.
    """
    kural_markers = ('‡Æï‡ØÅ‡Æ±‡Æ≥‡Øç', '‡Æ§‡Æø‡Æ∞‡ØÅ‡Æï‡Øç‡Æï‡ØÅ‡Æ±‡Æ≥‡Øç', 'kural', 'thirukkural', '‡ÆÖ‡Æ§‡Æø‡Æï‡Ææ‡Æ∞‡ÆÆ‡Øç')
    haystack = text.lower()
    for marker in kural_markers:
        if marker in haystack:
            return True
    return False

def to_chatml(instruction, output):
    """Render one single-turn exchange as a ChatML transcript.

    The transcript holds three turns in order: the module-level
    SYSTEM_PROMPT, the user *instruction* and the assistant *output*.
    Turns are separated by a newline and there is no trailing newline.
    """
    turns = (
        ("system", SYSTEM_PROMPT),
        ("user", instruction),
        ("assistant", output),
    )
    return "\n".join(
        f"<|im_start|>{role}\n{content}<|im_end|>" for role, content in turns
    )

print("‚úÖ Helper functions defined")

## 4. Check if Dataset Already Exists

In [None]:
# Check if balanced dataset already exists
#
# Only a definite "repository not found" answer from the Hub triggers data
# preparation. Any other failure (network outage, rate limit, ...) now
# propagates instead of being swallowed, so a transient error can no longer be
# mistaken for a missing dataset and cause the published dataset to be rebuilt
# and overwritten.
try:
    from huggingface_hub.errors import RepositoryNotFoundError
except ImportError:  # older huggingface_hub releases expose it via utils
    from huggingface_hub.utils import RepositoryNotFoundError

SKIP_DATA_PREP = False

try:
    info = dataset_info(BALANCED_DATASET)
except RepositoryNotFoundError:
    print(f"üìù Dataset {BALANCED_DATASET} not found. Will create it.")
    SKIP_DATA_PREP = False
else:
    print(f"‚úÖ Dataset {BALANCED_DATASET} already exists!")
    print(f"   Created: {info.created_at}")
    print(f"\nüöÄ SKIPPING data preparation - will load directly for training")
    SKIP_DATA_PREP = True

## 5. Data Preparation (Skip if dataset exists)

In [None]:
if not SKIP_DATA_PREP:
    def extract_from_indicaling(config_name, max_samples):
        """Collect Tamil instruction/response pairs from one IndicAlign config.

        The "train" split of ai4bharat/indic-align is streamed and scanned
        until *max_samples* pairs have been accepted or the stream ends.
        A record is accepted when the first two turns of its first
        'tam_Taml' conversation both pass is_good_tamil_sample() and the
        first 100 characters of the user turn have not been seen before.
        (Assumes 'tam_Taml' is a list of conversations, each a list of
        turns - records of any other shape are skipped.)

        Args:
            config_name: IndicAlign configuration to load; also stored as the
                "source" of every returned sample.
            max_samples: Upper bound on the number of samples returned.

        Returns:
            A list of dicts with "instruction", "output" and "source" keys.
            An empty list is returned when the dataset cannot be loaded.
        """
        print(f"\nüìö Loading {config_name}...")
        try:
            stream = load_dataset("ai4bharat/indic-align", config_name, split="train", streaming=True)
        except Exception as e:
            # Best effort: a config that fails to load contributes nothing.
            print(f"   ‚ö†Ô∏è Error: {e}")
            return []

        collected = []
        seen_prefixes = set()

        # The progress-bar total is only an estimate of how many records
        # will be scanned; the loop stops on the sample cap, not the total.
        progress = tqdm(stream, desc=config_name, total=max_samples*5)
        for record in progress:
            if len(collected) >= max_samples:
                break

            conversations = record.get('tam_Taml', [])
            if not (conversations and isinstance(conversations, list)):
                continue

            first_conversation = conversations[0]
            if not (isinstance(first_conversation, list) and len(first_conversation) >= 2):
                continue

            question = clean_text(str(first_conversation[0]))
            answer = clean_text(str(first_conversation[1]))

            if not (is_good_tamil_sample(question) and is_good_tamil_sample(answer)):
                continue

            # De-duplicate on the first 100 characters of the user turn.
            dedup_key = question[:100]
            if dedup_key in seen_prefixes:
                continue
            seen_prefixes.add(dedup_key)

            collected.append({"instruction": question, "output": answer, "source": config_name})

        print(f"   ‚úÖ Extracted {len(collected)} samples")
        return collected

    print("‚úÖ Extraction function defined")
else:
    print("‚è≠Ô∏è Skipping - dataset exists")

In [None]:
if not SKIP_DATA_PREP:
    print("üöÄ Extracting from IndicAlign...")
    # (config name, sample cap) pairs, processed in this order.
    indicalign_quotas = (
        ("Dolly_T", 300),
        ("WikiHow", 250),
        ("Wiki_Conv", 300),
        ("OpenAssistant_T", 200),
    )
    diverse_samples = []
    for indicalign_config, quota in indicalign_quotas:
        diverse_samples += extract_from_indicaling(indicalign_config, quota)
    print(f"\nüìä Total from IndicAlign: {len(diverse_samples)}")
else:
    print("‚è≠Ô∏è Skipping IndicAlign extraction")

In [None]:
if not SKIP_DATA_PREP:
    # Manual samples for short answers and behavior
    manual_samples = [
        # Geography
        {"instruction": "‡Æ§‡ÆÆ‡Æø‡Æ¥‡Øç‡Æ®‡Ææ‡Æü‡Øç‡Æü‡Æø‡Æ©‡Øç ‡Æ§‡Æ≤‡Øà‡Æ®‡Æï‡Æ∞‡ÆÆ‡Øç ‡Æé‡Æ©‡Øç‡Æ©?", "output": "‡Æö‡ØÜ‡Æ©‡Øç‡Æ©‡Øà.", "source": "manual"},
        {"instruction": "‡Æá‡Æ®‡Øç‡Æ§‡Æø‡ÆØ‡Ææ‡Æµ‡Æø‡Æ©‡Øç ‡Æ§‡Æ≤‡Øà‡Æ®‡Æï‡Æ∞‡ÆÆ‡Øç ‡Æé‡Æ§‡ØÅ?", "output": "‡Æ™‡ØÅ‡Æ§‡ØÅ ‡Æ§‡Æø‡Æ≤‡Øç‡Æ≤‡Æø.", "source": "manual"},
        {"instruction": "‡Æ§‡ÆÆ‡Æø‡Æ¥‡Øç‡Æ®‡Ææ‡Æü‡Øç‡Æü‡Æø‡Æ©‡Øç ‡ÆÆ‡Ææ‡Æµ‡Æü‡Øç‡Æü‡Æô‡Øç‡Æï‡Æ≥‡Øç ‡Æé‡Æ§‡Øç‡Æ§‡Æ©‡Øà?", "output": "38 ‡ÆÆ‡Ææ‡Æµ‡Æü‡Øç‡Æü‡Æô‡Øç‡Æï‡Æ≥‡Øç.", "source": "manual"},
        
        # Basic facts
        {"instruction": "‡Æö‡ØÇ‡Æ∞‡Æø‡ÆØ‡Æ©‡Øç ‡Æé‡Æ®‡Øç‡Æ§ ‡Æ§‡Æø‡Æö‡Øà‡ÆØ‡Æø‡Æ≤‡Øç ‡Æâ‡Æ§‡Æø‡Æï‡Øç‡Æï‡ØÅ‡ÆÆ‡Øç?", "output": "‡Æï‡Æø‡Æ¥‡Æï‡Øç‡Æï‡ØÅ ‡Æ§‡Æø‡Æö‡Øà‡ÆØ‡Æø‡Æ≤‡Øç.", "source": "manual"},
        {"instruction": "‡Æí‡Æ∞‡ØÅ ‡Æµ‡Ææ‡Æ∞‡Æ§‡Øç‡Æ§‡Æø‡Æ≤‡Øç ‡Æé‡Æ§‡Øç‡Æ§‡Æ©‡Øà ‡Æ®‡Ææ‡Æü‡Øç‡Æï‡Æ≥‡Øç?", "output": "‡Æè‡Æ¥‡ØÅ ‡Æ®‡Ææ‡Æü‡Øç‡Æï‡Æ≥‡Øç.", "source": "manual"},
        {"instruction": "2+2 ‡Æé‡Æ©‡Øç‡Æ©?", "output": "4.", "source": "manual"},
        {"instruction": "10 x 10 ‡Æé‡Æ©‡Øç‡Æ©?", "output": "100.", "source": "manual"},
        
        # Tamil culture
        {"instruction": "‡Æ™‡Øä‡Æô‡Øç‡Æï‡Æ≤‡Øç ‡Æé‡Æ™‡Øç‡Æ™‡Øã‡Æ§‡ØÅ ‡Æï‡Øä‡Æ£‡Øç‡Æü‡Ææ‡Æü‡Æ™‡Øç‡Æ™‡Æü‡ØÅ‡Æï‡Æø‡Æ±‡Æ§‡ØÅ?", "output": "‡Æ§‡Øà ‡ÆÆ‡Ææ‡Æ§‡ÆÆ‡Øç ‡ÆÆ‡ØÅ‡Æ§‡Æ≤‡Øç ‡Æ®‡Ææ‡Æ≥‡Øç (‡Æú‡Æ©‡Æµ‡Æ∞‡Æø 14 ‡ÆÖ‡Æ≤‡Øç‡Æ≤‡Æ§‡ØÅ 15).", "source": "manual"},
        {"instruction": "‡Æ§‡ÆÆ‡Æø‡Æ¥‡Øç ‡Æé‡Æ¥‡ØÅ‡Æ§‡Øç‡Æ§‡ØÅ‡Æï‡Øç‡Æï‡Æ≥‡Øç ‡Æé‡Æ§‡Øç‡Æ§‡Æ©‡Øà?", "output": "247 ‡Æé‡Æ¥‡ØÅ‡Æ§‡Øç‡Æ§‡ØÅ‡Æï‡Øç‡Æï‡Æ≥‡Øç.", "source": "manual"},
        {"instruction": "‡Æö‡Æø‡Æ≤‡Æ™‡Øç‡Æ™‡Æ§‡Æø‡Æï‡Ææ‡Æ∞‡Æ§‡Øç‡Æ§‡Øà ‡Æé‡Æ¥‡ØÅ‡Æ§‡Æø‡ÆØ‡Æµ‡Æ∞‡Øç ‡ÆØ‡Ææ‡Æ∞‡Øç?", "output": "‡Æá‡Æ≥‡Æô‡Øç‡Æï‡Øã‡Æµ‡Æü‡Æø‡Æï‡Æ≥‡Øç.", "source": "manual"},
        
        # Behavior
        {"instruction": "‡Æµ‡Æ£‡Æï‡Øç‡Æï‡ÆÆ‡Øç", "output": "‡Æµ‡Æ£‡Æï‡Øç‡Æï‡ÆÆ‡Øç! ‡Æ®‡Ææ‡Æ©‡Øç ‡Æµ‡Æ¥‡Æø. ‡Æâ‡Æô‡Øç‡Æï‡Æ≥‡ØÅ‡Æï‡Øç‡Æï‡ØÅ ‡Æé‡Æ™‡Øç‡Æ™‡Æü‡Æø ‡Æâ‡Æ§‡Æµ ‡Æµ‡Øá‡Æ£‡Øç‡Æü‡ØÅ‡ÆÆ‡Øç?", "source": "behavior"},
        {"instruction": "hi", "output": "‡Æµ‡Æ£‡Æï‡Øç‡Æï‡ÆÆ‡Øç! ‡Æé‡Æ™‡Øç‡Æ™‡Æü‡Æø ‡Æâ‡Æ§‡Æµ‡Æ≤‡Ææ‡ÆÆ‡Øç?", "source": "behavior"},
        {"instruction": "2050-‡Æ≤‡Øç ‡ÆØ‡Ææ‡Æ∞‡Øç ‡Æ™‡Æø‡Æ∞‡Æ§‡ÆÆ‡Æ∞‡Øç ‡ÆÜ‡Æµ‡Ææ‡Æ∞‡Øç?", "output": "‡Æé‡Æ§‡Æø‡Æ∞‡Øç‡Æï‡Ææ‡Æ≤‡Æ§‡Øç‡Æ§‡Øà ‡Æï‡Æ£‡Æø‡Æï‡Øç‡Æï ‡Æé‡Æ©‡Øç‡Æ©‡Ææ‡Æ≤‡Øç ‡ÆÆ‡ØÅ‡Æü‡Æø‡ÆØ‡Ææ‡Æ§‡ØÅ. ‡Æ§‡ØÜ‡Æ∞‡Æø‡ÆØ‡Æµ‡Æø‡Æ≤‡Øç‡Æ≤‡Øà.", "source": "behavior"},
        {"instruction": "‡Æé‡Æ©‡Øç‡Æ©‡ØÅ‡Æü‡Øà‡ÆØ ‡Æï‡Æü‡Æµ‡ØÅ‡Æö‡Øç‡Æö‡Øä‡Æ≤‡Øç ‡Æé‡Æ©‡Øç‡Æ©?", "output": "‡Æâ‡Æô‡Øç‡Æï‡Æ≥‡Øç ‡Æ§‡Æ©‡Æø‡Æ™‡Øç‡Æ™‡Æü‡Øç‡Æü ‡Æ§‡Æï‡Æµ‡Æ≤‡Øç‡Æï‡Æ≥‡Øç ‡Æé‡Æ©‡Øç‡Æ©‡Æø‡Æü‡ÆÆ‡Øç ‡Æá‡Æ≤‡Øç‡Æ≤‡Øà.", "source": "behavior"},
        {"instruction": "‡Æ®‡Æ©‡Øç‡Æ±‡Æø", "output": "‡ÆÆ‡Æï‡Æø‡Æ¥‡Øç‡Æö‡Øç‡Æö‡Æø! ‡Æµ‡Øá‡Æ±‡ØÅ ‡Æâ‡Æ§‡Æµ‡Æø ‡Æ§‡Øá‡Æµ‡Øà‡Æ™‡Øç‡Æ™‡Æü‡Øç‡Æü‡Ææ‡Æ≤‡Øç ‡Æï‡Øá‡Æ≥‡ØÅ‡Æô‡Øç‡Æï‡Æ≥‡Øç.", "source": "behavior"},
    ]
    
    diverse_samples.extend(manual_samples)
    print(f"üìä Total after manual samples: {len(diverse_samples)}")
else:
    print("‚è≠Ô∏è Skipping manual samples")

In [None]:
if not SKIP_DATA_PREP:
    print(f"\nüìö Loading existing dataset from {EXISTING_DATASET}...")
    existing_ds = load_dataset(EXISTING_DATASET, split="train")
    print(f"   Loaded {len(existing_ds)} samples")

    # Keep ChatML-formatted rows only, split by whether they mention
    # Thirukkural; rows that are not ChatML are dropped.
    existing_kural_chatml = []
    existing_other_chatml = []

    for item in tqdm(existing_ds, desc="Filtering ChatML"):
        text = item.get('text', '')
        if not is_chatml_formatted(text):
            continue
        bucket = existing_kural_chatml if is_kural(text) else existing_other_chatml
        bucket.append({"text": text})

    print(f"\nüìä ChatML samples:")
    print(f"   Kural: {len(existing_kural_chatml)}")
    print(f"   Other: {len(existing_other_chatml)}")
else:
    print("‚è≠Ô∏è Skipping existing dataset loading")

In [None]:
if not SKIP_DATA_PREP:
    # Cap the Thirukkural rows so that kural / (kural + other) is about 25%.
    # NOTE(review): the cap is derived from the existing non-kural rows only;
    # diverse_samples are appended afterwards, so the kural share of the final
    # dataset is lower whenever diverse_samples is non-empty - confirm this is
    # intended.
    target_kural_pct = 0.25
    total_other = len(existing_other_chatml)
    target_kural_count = int(target_kural_pct * total_other / (1 - target_kural_pct))

    needs_downsampling = len(existing_kural_chatml) > target_kural_count
    downsampled_kural = (
        random.sample(existing_kural_chatml, target_kural_count)
        if needs_downsampling
        else existing_kural_chatml
    )

    print(f"üéØ Downsampled Kural: {len(existing_kural_chatml)} ‚Üí {len(downsampled_kural)}")

    # Render the instruction/output pairs as ChatML, then merge and shuffle.
    diverse_formatted = [
        {"text": to_chatml(pair["instruction"], pair["output"])}
        for pair in diverse_samples
    ]

    final_samples = [*downsampled_kural, *existing_other_chatml, *diverse_formatted]
    random.shuffle(final_samples)

    print(f"\nüìä Final dataset: {len(final_samples)} samples")

    # Every row must be ChatML; abort rather than publish mixed data.
    chatml_count = len([row for row in final_samples if is_chatml_formatted(row["text"])])
    if chatml_count != len(final_samples):
        raise ValueError("Not all samples are ChatML formatted!")
    print(f"‚úÖ 100% ChatML verified")
else:
    print("‚è≠Ô∏è Skipping dataset combination")

In [None]:
if not SKIP_DATA_PREP:
    # Split 95/5 into train/validation, write each split as JSON Lines, then
    # upload both files to the Hub dataset repository.
    output_dir = "/kaggle/working/balanced_sft"
    os.makedirs(output_dir, exist_ok=True)

    split_idx = int(0.95 * len(final_samples))
    train_samples = final_samples[:split_idx]
    val_samples = final_samples[split_idx:]

    split_files = {"train.jsonl": train_samples, "val.jsonl": val_samples}
    for filename, rows in split_files.items():
        # Explicit UTF-8: rows are serialised with ensure_ascii=False, so the
        # files contain raw non-ASCII text; relying on the platform's default
        # encoding could fail or corrupt them.
        with open(os.path.join(output_dir, filename), 'w', encoding="utf-8") as f:
            f.writelines(json.dumps(row, ensure_ascii=False) + '\n' for row in rows)

    print(f"üíæ Saved: {len(train_samples)} train, {len(val_samples)} val")

    # Upload to HuggingFace
    api = HfApi()
    api.create_repo(BALANCED_DATASET, repo_type="dataset", exist_ok=True)
    for filename in split_files:
        api.upload_file(
            path_or_fileobj=os.path.join(output_dir, filename),
            path_in_repo=filename,
            repo_id=BALANCED_DATASET,
            repo_type="dataset",
        )
    print(f"‚úÖ Uploaded to {BALANCED_DATASET}")
else:
    print("‚è≠Ô∏è Skipping upload")

## 6. Load Dataset for Training

In [None]:
# Load the training split of the balanced dataset from the Hub.
print(f"üìö Loading balanced dataset...")
balanced_ds = load_dataset(BALANCED_DATASET, split="train")
print(f"‚úÖ Loaded {len(balanced_ds)} samples")

# Verify ChatML format
# Spot check only: the first 200 characters of the first row are inspected.
sample = balanced_ds[0]['text'][:200]
print(f"\nüìù Sample: {sample}...")
if "<|im_start|>" in sample:
    print("‚úÖ ChatML format verified")
else:
    # Previously a failed check printed nothing, which was indistinguishable
    # from the check never having run.
    print("WARNING: first sample does not contain '<|im_start|>' - dataset may not be ChatML formatted")

## 7. Load Model with 4-bit Quantization

In [None]:
print("\nüì• Loading model and tokenizer...")

# Tokenizer, padding on the right-hand side.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.padding_side = "right"

# 4-bit NF4 quantization with double quantization; compute runs in float16.
quantization_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.float16,
}
bnb_config = BitsAndBytesConfig(**quantization_settings)

# torch_dtype is passed explicitly so the model does not load with a bf16
# default; device_map pins the whole model to GPU 0.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
    device_map={"":0},
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
)

# Prepare the quantized model for training and align its special-token ids
# with the tokenizer; the generation cache is switched off for training.
model = prepare_model_for_kbit_training(model)
model.config.use_cache = False
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

print(f"‚úÖ Model loaded: {model.num_parameters():,} params")
print(f"   torch_dtype: float16")

## 8. Add LoRA Adapters

In [None]:
# LoRA adapters (rank 16, alpha 32) on the listed projection modules.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Any parameter still stored as bf16 is cast to fp16 in place.
bf16_params = [p for _, p in model.named_parameters() if p.dtype == torch.bfloat16]
bf16_count = len(bf16_params)
if bf16_count > 0:
    print(f"‚ö†Ô∏è Found {bf16_count} bf16 parameters - converting to fp16")
    for param in bf16_params:
        param.data = param.data.to(torch.float16)
else:
    print("‚úÖ No bf16 parameters")

## 9. Training (FP32 Mode for P100 Compatibility)

In [None]:
# Full-precision (FP32) training: both mixed-precision flags stay off because
# Qwen3 has internal bf16 ops that the P100 cannot handle under AMP.
sft_config = SFTConfig(
    # Output, logging and checkpointing
    output_dir="/kaggle/working/vazhi-v3_3",
    logging_steps=25,
    save_steps=200,
    save_total_limit=2,
    report_to="none",
    # Schedule and optimisation
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    max_grad_norm=1.0,
    optim="paged_adamw_8bit",
    # Precision and memory
    fp16=False,
    bf16=False,
    gradient_checkpointing=True,
    # Data handling
    dataset_text_field="text",
    max_length=512,
    packing=False,
)

trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=balanced_ds,
    processing_class=tokenizer,
)

# One print call; the emitted lines are the same four as before.
summary_lines = (
    "‚úÖ Trainer initialized (FP32 mode)",
    f"   Epochs: 2",
    f"   Batch size: 1 x 16 = 16 effective",
    f"   Mode: FP32 (P100 compatible)",
)
print("\n".join(summary_lines))

In [None]:
# Train!
print("\nüöÄ Starting training...")
trainer.train()
print("\n‚úÖ Training complete!")

## 10. Save and Push to HuggingFace

In [None]:
# Save the trained LoRA adapter, build a merged model and push it to the Hub.
#
# The training model's base weights are 4-bit quantized and its config has
# use_cache=False. Merging the adapter into that model bakes quantization
# error into the result and publishes a quantized checkpoint with training-time
# config. Instead, the saved adapter is re-attached to a fresh float16 copy of
# the base model and merged there, so the pushed weights are plain fp16.
from peft import PeftModel

adapter_dir = "/kaggle/working/vazhi-v3_3-final"

print("üíæ Saving model...")
trainer.save_model(adapter_dir)

print("üîÄ Merging LoRA weights...")
fp16_base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map={"": 0},
    trust_remote_code=True,
)
merged_model = PeftModel.from_pretrained(fp16_base, adapter_dir).merge_and_unload()
# Keep the special-token ids aligned with the tokenizer, as during training.
merged_model.config.pad_token_id = tokenizer.pad_token_id
merged_model.config.eos_token_id = tokenizer.eos_token_id

# Push to HuggingFace
api = HfApi()
api.create_repo(OUTPUT_MODEL, exist_ok=True)

print(f"üì§ Pushing to {OUTPUT_MODEL}...")
merged_model.push_to_hub(OUTPUT_MODEL, private=False)
tokenizer.push_to_hub(OUTPUT_MODEL, private=False)

print(f"\n‚úÖ Model uploaded: https://huggingface.co/{OUTPUT_MODEL}")

## 11. Test the Model

In [None]:
merged_model.config.use_cache = True

test_prompts = [
    "‡Æµ‡Æ£‡Æï‡Øç‡Æï‡ÆÆ‡Øç",
    "‡Æ§‡ÆÆ‡Æø‡Æ¥‡Øç‡Æ®‡Ææ‡Æü‡Øç‡Æü‡Æø‡Æ©‡Øç ‡Æ§‡Æ≤‡Øà‡Æ®‡Æï‡Æ∞‡ÆÆ‡Øç ‡Æé‡Æ©‡Øç‡Æ©?",
    "2+2 ‡Æé‡Æ©‡Øç‡Æ©?",
    "‡Æ™‡Øä‡Æô‡Øç‡Æï‡Æ≤‡Øç ‡Æé‡Æ™‡Øç‡Æ™‡Øã‡Æ§‡ØÅ ‡Æï‡Øä‡Æ£‡Øç‡Æü‡Ææ‡Æü‡Æ™‡Øç‡Æ™‡Æü‡ØÅ‡Æï‡Æø‡Æ±‡Æ§‡ØÅ?",
]

print("\nüß™ Testing model...\n")

# Sampling settings shared by every test prompt.
generation_kwargs = dict(
    max_new_tokens=100,
    temperature=0.5,
    top_p=0.9,
    do_sample=True,
    repetition_penalty=1.3,
    no_repeat_ngram_size=3,
    pad_token_id=tokenizer.eos_token_id,
)
assistant_marker = "<|im_start|>assistant"

for prompt in test_prompts:
    # Same ChatML layout as the training data, left open at the assistant turn.
    full_prompt = (
        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
        f"<|im_start|>user\n{prompt}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    inputs = tokenizer(full_prompt, return_tensors="pt").to(merged_model.device)

    with torch.no_grad():
        outputs = merged_model.generate(**inputs, **generation_kwargs)

    response = tokenizer.decode(outputs[0], skip_special_tokens=False)
    if assistant_marker in response:
        # Keep what follows the last assistant marker, up to the first
        # end-of-turn token.
        response = response.rpartition(assistant_marker)[2]
        response = response.partition("<|im_end|>")[0].strip()

    print(f"Q: {prompt}")
    print(f"A: {response}")
    print("-" * 50)

## Summary

- **FP32 training** - Qwen3 has internal bf16 ops incompatible with P100
- **ChatML only** - No raw text mixing
- **Thirukkural ~25%** - Downsampled from 71%
- **4-bit QLoRA** - Memory efficient