# VAZHI Full Training

**Full training on all 11K+ samples with validated settings.**

**Validated Settings (from upper boundary test):**
- Learning rate: 5e-5 ✓
- Tokenizer: UNMODIFIED ✓
- Loss decreased: 3.39 → 3.00 ✓
- Responses: Coherent Tamil ✓

**This Run:**
- Training samples: ALL (~11,000)
- Epochs: 1 (full dataset is large enough)
- Expected time: ~2-3 hours

In [None]:
# CRITICAL: Force single GPU
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Suppress TensorFlow/JAX CUDA factory warnings (Kaggle has all frameworks installed)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["GRPC_VERBOSITY"] = "ERROR"
os.environ["GLOG_minloglevel"] = "3"

# Suppress harmless warnings
import warnings
warnings.filterwarnings("ignore", message=".*UnsupportedFieldAttributeWarning.*")
warnings.filterwarnings("ignore", message=".*'repr' attribute.*")
warnings.filterwarnings("ignore", message=".*'frozen' attribute.*")
warnings.filterwarnings("ignore", message=".*OrderedVocab.*holes.*")  # Gemma tokenizer quirk
warnings.filterwarnings("ignore", category=UserWarning)

# Suppress absl logging (JAX/TF)
import logging
logging.getLogger("absl").setLevel(logging.ERROR)

# Install dependencies
!pip install -q bitsandbytes peft trl accelerate datasets

In [None]:
import torch

# Report the PyTorch build and, when CUDA is usable, the name and total
# memory of device 0.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {cuda_ready}")
if cuda_ready:
    device_name = torch.cuda.get_device_name(0)
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {device_name}")
    print(f"Memory: {total_bytes / 1e9:.1f} GB")

## Step 1: Load Model WITHOUT Modifying Tokenizer

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Forked checkpoint (run Vazhi_Fork_Base_Model.ipynb first to create it).
model_name = "CryptoYogi/gemma-2b-tamil-base"
print(f"Using model: {model_name}")

# The tokenizer is loaded as published and must stay untouched: no tokens are
# added and no special-token ids are reassigned anywhere in this cell.
tokenizer = AutoTokenizer.from_pretrained(model_name)

_rule = "=" * 50
print(_rule)
print("TOKENIZER INFO (UNMODIFIED):")
print(_rule)
# Print each special token next to its id.
for _label, _attr in (("PAD", "pad_token"), ("EOS", "eos_token"), ("BOS", "bos_token")):
    _token = getattr(tokenizer, _attr)
    _token_id = getattr(tokenizer, _attr + "_id")
    print(f"{_label} token: {_token} (id: {_token_id})")
print(f"\n⚠️  IMPORTANT: We are NOT modifying the tokenizer!")

In [None]:
# Quantization recipe: 4-bit NF4 weights with double quantization, bf16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load the checkpoint with every module placed on device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0},
    trust_remote_code=True,
    attn_implementation="eager",
)

# IMPORTANT: copy the tokenizer's special-token ids onto the model BEFORE
# training. This prevents the "tokenizer has new PAD/BOS/EOS tokens" warning.
_special_id_fields = ("pad_token_id", "bos_token_id", "eos_token_id")
_id_holders = [model.config]
if hasattr(model, 'generation_config'):
    _id_holders.append(model.generation_config)
for _holder in _id_holders:
    for _field in _special_id_fields:
        setattr(_holder, _field, getattr(tokenizer, _field))

# The KV cache is switched off for training.
model.config.use_cache = False

_footprint_gb = model.get_memory_footprint() / 1e9
print(f"Model loaded! Memory: {_footprint_gb:.2f} GB")
print(f"Token IDs aligned: PAD={model.config.pad_token_id}, BOS={model.config.bos_token_id}, EOS={model.config.eos_token_id}")

## Step 2: Test Model BEFORE Training

In [None]:
def test_model(prompt, max_tokens=100):
    """Generate one sampled answer to *prompt* using the Alpaca template.

    Relies on the notebook-level ``model`` and ``tokenizer``.

    Args:
        prompt: Instruction text placed under ``### Instruction:``.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The generated continuation only (the prompt is excluded), stripped of
        surrounding whitespace.
    """
    full_prompt = f"### Instruction:\n{prompt}\n\n### Response:\n"
    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
    # Remember where the prompt ends so only new tokens are decoded below.
    prompt_len = inputs["input_ids"].shape[1]

    # Generation wants the KV cache on and dropout off; remember the current
    # state so the model is handed back exactly as it was found.
    original_cache = model.config.use_cache
    was_training = model.training
    model.config.use_cache = True
    model.eval()

    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,  # Use ORIGINAL pad token
                use_cache=True,  # Explicitly enable for generation
            )
    finally:
        # Restore even when generate() raises, so a failed test call cannot
        # leave the cache enabled or the model stuck in eval mode.
        model.config.use_cache = original_cache
        if was_training:
            model.train()

    # Decode only the tokens produced after the prompt. Splitting the full
    # decoded text on the last "### Response:" would return a later,
    # hallucinated turn whenever the model emits that marker itself.
    completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return completion.strip()

# Fixed probe questions, one per content pack; reused after training so the
# answers can be compared side by side.
test_questions = [
    "தமிழ்நாட்டின் தலைநகரம் எது?",  # General
    "திருக்குறள் யார் எழுதினார்?",  # Culture
    "OTP மோசடி என்றால் என்ன?",  # Security
]

_rule = "=" * 60
print(_rule)
print("BEFORE TRAINING - Baseline Responses:")
print(_rule)

# Full baseline answers are kept; only the first 200 characters are printed.
before_responses = []
for question in test_questions:
    baseline = test_model(question)
    before_responses.append(baseline)
    print(f"\nQ: {question}")
    print(f"A: {baseline[:200]}")

## Step 3: Prepare Training Data

In [None]:
from datasets import load_dataset
import re

# Pull the complete dataset and report the size of the two splits used below.
dataset = load_dataset("CryptoYogi/vazhi-tamil-v05")
_train_rows = len(dataset['train'])
_val_rows = len(dataset['validation'])
print(f"Full dataset: {_train_rows} training, {_val_rows} validation")

def format_to_alpaca(example):
    """Normalize one dataset row to an Alpaca-style prompt ending in EOS.

    Reads the notebook-level ``tokenizer`` for its ``eos_token``.

    Returns ``{"text": ...}`` where the value is:
      * the first ChatML user/assistant pair rewritten as
        ``### Instruction`` / ``### Response`` with EOS appended;
      * the original text (EOS appended if absent) when it already contains
        ``### Instruction``;
      * an empty string when the row has no text or matches neither format.
    """
    raw = example['text'] if 'text' in example else None
    if not raw:
        return {"text": ""}

    # ChatML rows: pull out the first user turn and the first assistant turn.
    if '<|im_start|>' in raw:
        user_turn = re.search(r'<\|im_start\|>user\n(.+?)<\|im_end\|>', raw, re.DOTALL)
        assistant_turn = re.search(r'<\|im_start\|>assistant\n(.+?)<\|im_end\|>', raw, re.DOTALL)

        if user_turn is not None and assistant_turn is not None:
            question = user_turn.group(1).strip()
            answer = assistant_turn.group(1).strip()
            # EOS goes at the very end of the training sample.
            return {"text": f"### Instruction:\n{question}\n\n### Response:\n{answer}{tokenizer.eos_token}"}

    # Rows that are already Alpaca-formatted only need the trailing EOS.
    if '### Instruction' in raw:
        suffix = "" if raw.endswith(tokenizer.eos_token) else tokenizer.eos_token
        return {"text": raw + suffix}

    # Unknown layout: emit an empty text.
    return {"text": ""}

# FULL TRAINING - every sample of both splits is used.
print("\n" + "=" * 60)
print("FULL TRAINING MODE - Using all samples")
print("=" * 60)


def _format_split(split_name):
    """Convert one split to Alpaca text and keep rows longer than 50 characters."""
    converted = dataset[split_name].map(format_to_alpaca)
    return converted.filter(lambda x: len(x['text']) > 50)


# Rows that format_to_alpaca could not convert come back as empty text and
# are removed by the length filter.
formatted_train = _format_split('train')
formatted_val = _format_split('validation')

print(f"\nFormatted: {len(formatted_train)} train, {len(formatted_val)} validation")
print(f"\nSample:")
_first_sample = formatted_train[0]['text']
print(_first_sample[:300])

## Step 4: Setup LoRA with Conservative Settings

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Turn on gradient checkpointing, then let PEFT prepare the quantized model
# for k-bit training.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# CONSERVATIVE LoRA settings, collected in one mapping.
_lora_settings = {
    "r": 4,  # Lower rank = gentler adaptation
    "lora_alpha": 8,  # alpha = 2*r
    "target_modules": ["q_proj", "v_proj"],  # Only query and value, not all
    "lora_dropout": 0.1,  # Higher dropout for regularization
    "bias": "none",
    "task_type": "CAUSAL_LM",
}
lora_config = LoraConfig(**_lora_settings)

# Wrap the model with the adapter and report how many parameters train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

## Step 5: Training with Ultra-Conservative Settings

In [None]:
from trl import SFTTrainer, SFTConfig
import time

# Batch geometry, named once so the step estimate further down is computed
# from the same numbers that go into the config.
_micro_batch = 2
_grad_accum = 4  # Effective batch = 8
_epochs = 1      # 1 epoch for full dataset

# FULL TRAINING CONFIG - settings validated in the earlier test run.
training_config = SFTConfig(
    output_dir="./vazhi-full-training",
    # --- dataset ---
    dataset_text_field="text",
    max_length=512,
    packing=False,
    # --- batch size ---
    per_device_train_batch_size=_micro_batch,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=_grad_accum,
    # --- schedule (learning rate validated at 5e-5) ---
    learning_rate=5e-5,
    num_train_epochs=_epochs,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    # --- regularization ---
    weight_decay=0.01,
    max_grad_norm=0.3,
    # --- optimizer ---
    optim="paged_adamw_8bit",
    # --- precision ---
    fp16=False,
    bf16=True,
    # --- evaluation: less frequent for large dataset ---
    eval_strategy="steps",
    eval_steps=100,
    # --- logging ---
    logging_steps=50,
    # --- checkpointing: save periodically for long training ---
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
    # --- gradient checkpointing ---
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    seed=42,
    report_to="none",
)

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=formatted_train,
    eval_dataset=formatted_val,
    args=training_config,
)

# Rough optimizer-step estimate: samples / (batch * accum) * epochs.
total_steps = (len(formatted_train) // (_micro_batch * _grad_accum)) * _epochs

_summary_lines = [
    "=" * 60,
    "FULL TRAINING",
    "=" * 60,
    f"  Training samples: {len(formatted_train)}",
    f"  Validation samples: {len(formatted_val)}",
    f"  Estimated steps: ~{total_steps}",
    "  Learning rate: 5e-5 (validated)",
    "  Epochs: 1",
    "  Expected time: ~2-3 hours",
    "",
    "  Checkpoints saved every 200 steps",
]
for _line in _summary_lines:
    print(_line)

In [None]:
# Launch the fine-tuning run and time it with a wall clock.
print("Starting training...")
start = time.time()
trainer_stats = trainer.train()
elapsed = time.time() - start

# Report the duration in minutes and the loss returned by the trainer.
_rule = "=" * 60
_minutes = elapsed / 60
print("\n" + _rule)
print("Training Complete!")
print(_rule)
print(f"Time: {_minutes:.1f} minutes")
print(f"Final training loss: {trainer_stats.training_loss:.4f}")

## Step 6: Evaluate AFTER Training

In [None]:
# IMPORTANT: Prepare model for inference
# 1. Disable gradient checkpointing (not needed for inference)
# 2. Re-enable cache (needed for efficient generation)
# 3. Switch to eval mode: training normally leaves the model in train mode,
#    where LoRA dropout stays active and perturbs every generated answer,
#    making the BEFORE/AFTER comparison unfair.
model.gradient_checkpointing_disable()
model.config.use_cache = True
model.eval()

print("=" * 60)
print("AFTER TRAINING - Comparing Responses:")
print("=" * 60)

# Ask the same questions as the baseline cell and print both answers,
# truncated to 150 characters; the full post-training answers are kept in
# `results` for the analysis cell.
results = []
for i, q in enumerate(test_questions):
    after_resp = test_model(q)
    print(f"\n{'='*60}")
    print(f"Q: {q}")
    print(f"\nBEFORE: {before_responses[i][:150]}")
    print(f"\nAFTER:  {after_resp[:150]}")
    results.append(after_resp)

# Analyze results
import re

def analyze_response(text):
    """Heuristically judge whether *text* looks like coherent Tamil.

    Returns a ``(is_coherent, tamil_pct)`` pair. ``tamil_pct`` is the
    percentage of characters in the Tamil Unicode block (U+0B80-U+0BFF).
    A response counts as coherent when more than 30% of it is Tamil, it
    contains at least one space, and it is longer than 20 characters.
    Empty input or anything shorter than 10 characters yields ``(False, 0)``.
    """
    length = len(text) if text else 0
    if length < 10:
        return False, 0

    tamil_count = sum(1 for _ in re.finditer(r'[\u0B80-\u0BFF]', text))
    tamil_pct = tamil_count / length * 100

    # Any single failing criterion marks the response as garbage.
    looks_like_garbage = tamil_pct <= 30 or ' ' not in text or length <= 20
    return (not looks_like_garbage), tamil_pct

print("\n" + "=" * 60)
print("TRAINING RESULTS:")
print("=" * 60)
print(f"Final loss: {trainer_stats.training_loss:.4f}")

# Score every post-training answer and show its Tamil percentage.
_verdicts = []
for _number, _answer in enumerate(results, start=1):
    _ok, _pct = analyze_response(_answer)
    _verdicts.append(_ok)
    _mark = "✓" if _ok else "✗"
    print(f"  Response {_number}: {_mark} (Tamil: {_pct:.0f}%)")
all_coherent = all(_verdicts)

# Updated success criteria (loss < 3.2 is good for Tamil LM)
_final_loss = trainer_stats.training_loss
if _final_loss < 3.2 and all_coherent:
    print(f"\n✅ TRAINING SUCCESSFUL!")
    print(f"   - Loss decreased to {_final_loss:.4f}")
    print(f"   - All responses are coherent Tamil")
    print(f"   - Ready to save and convert to GGUF!")
else:
    print(f"\n⚠️  TRAINING COMPLETED WITH ISSUES")
    # Each failing criterion gets its own line.
    if _final_loss >= 3.2:
        print(f"   - Loss could be lower: {_final_loss:.4f}")
    if not all_coherent:
        print(f"   - Some responses may need review")

## Step 7: If Passed - Save Adapter

Only run this if validation passed!

In [None]:
import os

# Write the LoRA adapter and the tokenizer files into one local folder.
adapter_path = "./vazhi-lora-adapter"
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(adapter_path)
print(f"Adapter saved to: {adapter_path}")

# Print each entry of the folder with its size in MB, then the total.
total_size = 0
for _entry in os.listdir(adapter_path):
    _entry_mb = os.path.getsize(os.path.join(adapter_path, _entry)) / 1e6
    total_size += _entry_mb
    print(f"  {_entry}: {_entry_mb:.2f} MB")
print(f"\nTotal adapter size: {total_size:.2f} MB")

In [None]:
## Summary

**Training Configuration:**
- Base Model: CryptoYogi/gemma-2b-tamil-base
- Training Samples: ~11,000
- Learning Rate: 5e-5
- Epochs: 1
- LoRA Rank: 4

**Outputs:**
1. `CryptoYogi/vazhi-lora-v1` - LoRA adapter (~1MB)
2. `CryptoYogi/vazhi-v1` - Merged full model (~5GB)

**Next Steps:**
1. Convert merged model to GGUF Q4_K_M
2. Upload GGUF to `CryptoYogi/vazhi-gguf`
3. Update app to use new model

## Step 8: Merge Adapter and Prepare for GGUF

Merge the LoRA adapter with base model, then upload for GGUF conversion.

In [None]:
# Merge LoRA adapter with base model
import os

from peft import PeftModel

print("Merging LoRA adapter with base model...")

# The training model holds 4-bit bitsandbytes weights. Merging into those is
# lossy and leaves a quantized checkpoint, not the full-precision model that
# the GGUF conversion step expects. Instead: persist the adapter, reload the
# base weights unquantized, attach the adapter and merge there.
merge_adapter_dir = "./vazhi-lora-adapter"
model.save_pretrained(merge_adapter_dir)

# Loaded on CPU so the merge does not compete with the training model for
# GPU memory; bf16 matches the compute dtype used during training.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map={"": "cpu"},
    trust_remote_code=True,
)
# Same token-id alignment as applied to the training model.
base_model.config.pad_token_id = tokenizer.pad_token_id
base_model.config.bos_token_id = tokenizer.bos_token_id
base_model.config.eos_token_id = tokenizer.eos_token_id

# Merge and unload
merged_model = PeftModel.from_pretrained(base_model, merge_adapter_dir).merge_and_unload()

# Save merged model
merged_path = "./vazhi-merged"
merged_model.save_pretrained(merged_path, safe_serialization=True)
tokenizer.save_pretrained(merged_path)

print(f"✅ Merged model saved to: {merged_path}")

# Show size (regular files only; sub-directories are skipped)
total_size = 0
for f in os.listdir(merged_path):
    fpath = os.path.join(merged_path, f)
    if os.path.isfile(fpath):
        size = os.path.getsize(fpath) / 1e9
        total_size += size
        print(f"  {f}: {size:.2f} GB")
print(f"\nTotal merged model size: {total_size:.2f} GB")

In [None]:
# Upload merged model to HuggingFace
MERGED_REPO = "CryptoYogi/vazhi-v1"

# No visible cell of this notebook creates `api`, so running the cell as-is
# raises NameError. Create a Hub client only when one does not already exist.
# NOTE(review): the client needs write credentials (e.g. a prior Hub login
# or an HF_TOKEN environment variable) - confirm how this notebook
# authenticates.
try:
    api
except NameError:
    from huggingface_hub import HfApi
    api = HfApi()

# Create the public target repo if it does not exist yet.
api.create_repo(repo_id=MERGED_REPO, exist_ok=True, private=False)

print(f"Uploading merged model to {MERGED_REPO}...")
print("This may take 10-15 minutes for ~5GB...")

# Push the whole merged-model folder as a single commit.
api.upload_folder(
    folder_path=merged_path,
    repo_id=MERGED_REPO,
    commit_message="VAZHI v1.0 - Full merged model trained on 11K Tamil samples"
)

print(f"\n✅ Merged model uploaded!")
print(f"View at: https://huggingface.co/{MERGED_REPO}")
print(f"\nNext step: Convert to GGUF using llama.cpp or online converter")

## Notes

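**Key Differences from Failed Runs:**

> **Note:** The "This Run" column below appears to be carried over from an earlier, smaller test run and does not match this notebook. The training cell above uses learning rate 5e-5, warmup 10%, the full ~11,000-sample dataset, and evaluation every 100 steps; LoRA rank 4, q/v-only target modules, the unmodified tokenizer, and gradient clipping at 0.3 do match.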

| Setting | Failed Run | This Run |
|---------|-----------|----------|
| Tokenizer | Modified (pad=eos) | UNMODIFIED |
| Learning Rate | 5e-5 | 1e-6 (50x lower) |
| LoRA Rank | 8 | 4 |
| Target Modules | q,k,v,o | q,v only |
| Warmup | 10% | 30% |
| Samples | 50 | 500 |
| Evaluation | None | Every 25 steps |
| Gradient Clipping | None | 0.3 |

**If This Still Fails:**
1. Try learning rate 5e-7 or 1e-7
2. Use the base Gemma model instead of instruction-tuned
3. Consider using QLoRA with different quantization
4. Look into using PEFT with different adapters (IA3, etc.)