# GENESIS Level 0 Training v2 - Target 98%+

**Improvements over v1:**
- 100 epochs (vs 50)
- LoRA r=32 (vs 16) - more capacity
- Warmup + cosine decay LR schedule
- Early stopping
- Data augmentation (case variation)

1. **Runtime → Change runtime type → GPU (T4)**
2. Run all cells in order

In [None]:
# Step 1: Setup environment
# Clones the repository, installs its dependencies and makes the checkout the
# working directory. Safe to re-run: every run starts from the directory the
# notebook was in the first time this cell executed.
import os
import shutil
import subprocess
import sys

# The first run ends with os.chdir('genesis'); without returning to the
# starting directory a second run would delete/clone *inside* the old checkout.
_SETUP_START_DIR = globals().get('_SETUP_START_DIR') or os.getcwd()
os.chdir(_SETUP_START_DIR)

# Clone fresh (portable replacement for `rm -rf genesis`)
if os.path.isdir('genesis') and not os.path.islink('genesis'):
    shutil.rmtree('genesis')
elif os.path.lexists('genesis'):
    os.remove('genesis')
subprocess.run(['git', 'clone', 'https://github.com/0xMayoor/genesis.git', 'genesis'], check=True)
os.chdir('genesis')

# Install
subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'torch', 'transformers', 'peft', 'accelerate', 'capstone', 'hypothesis'], check=True)
subprocess.run([sys.executable, '-m', 'pip', 'install', '-e', '.', '--force-reinstall', '--no-deps', '-q'], check=True)

# Add to path (only once, so re-runs do not stack duplicate entries)
repo_root = os.getcwd()
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

print(f"✅ Setup complete. Working dir: {repo_root}")

In [None]:
# Step 2: Check GPU
# Report whether CUDA is usable and, if so, which device and how much memory.
import torch

cuda_ok = torch.cuda.is_available()
print(f"CUDA: {cuda_ok}")
if cuda_ok:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"VRAM: {total_bytes / 1e9:.1f} GB")

In [None]:
# Step 3: Train with improved settings
from pathlib import Path
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType
from torch.utils.data import DataLoader, Dataset
import json
import random
import math

# Config - optimized for 98%+ accuracy
MODEL_NAME = "distilgpt2"
BATCH_SIZE = 16          # Larger batch for stability
EPOCHS = 100             # More epochs
LR = 3e-5                # Lower peak LR
WARMUP_EPOCHS = 5        # Warmup period
MAX_LENGTH = 128         # Shorter sequences (our data is simple)
DATASET_PATH = Path("genesis_datasets/level0/train.jsonl")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# Load tokenizer and model
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=False)
# The tokenizer is given no dedicated pad token here, so EOS doubles as padding.
tokenizer.pad_token = tokenizer.eos_token

# Load the weights in float32. The training loop uses plain AdamW with no
# autocast/GradScaler, and optimising float16 weights that way underflows
# gradients and can produce NaN losses; half precision is also poorly supported
# on the CPU fallback path. NOTE(review): assumes the base model is small
# enough for full precision on the target GPU - confirm if MODEL_NAME changes.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float32, token=False)

# Apply LoRA with higher capacity
print("Applying LoRA...")
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=32,            # Higher rank for more capacity
    lora_alpha=64,   # 2x rank
    lora_dropout=0.1,
    target_modules=["c_attn", "c_proj"],
)
model = get_peft_model(model, lora_config)
model.enable_input_require_grads()
model.print_trainable_parameters()
model.to(device)

# Dataset with augmentation
class Level0Dataset(Dataset):
    """Map-style dataset turning JSONL disassembly samples into LM training items.

    Each non-blank line of ``path`` is parsed as a JSON object. The keys
    ``raw_bytes`` and ``expected_mnemonic`` are read from it (missing keys fall
    back to ``""`` and ``"unknown"`` respectively).

    Args:
        path: Location of the JSONL file.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors; its ``eos_token`` is appended to every
            sample.
        max_length: Length every sample is truncated/padded to.
    """

    def __init__(self, path, tokenizer, max_length):
        with open(path, encoding="utf-8") as f:
            # Blank lines (e.g. a trailing newline) would make json.loads raise.
            self.samples = [json.loads(line) for line in f if line.strip()]
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Count unique mnemonics
        mnemonics = {s.get("expected_mnemonic", "") for s in self.samples}
        print(f"Unique mnemonics: {len(mnemonics)}")

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for sample ``idx``.

        Padding positions in ``labels`` are set to -100 so the loss ignores
        them; ``input_ids`` is left untouched.
        """
        sample = self.samples[idx]
        hex_bytes = sample.get("raw_bytes", "")
        expected = sample.get("expected_mnemonic", "unknown")

        # Randomly vary case for robustness
        if random.random() < 0.3:
            hex_bytes = hex_bytes.upper()

        # Explicit EOS: once padding is masked out of the labels below, this is
        # the only end-of-sequence token the model is still trained to emit.
        eos = self.tokenizer.eos_token or ""
        text = f"Disassemble: {hex_bytes}\nOutput: {expected}{eos}"

        encoded = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )
        # squeeze(0) drops only the batch axis added by return_tensors="pt".
        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)

        # Clone so masking the labels does not also overwrite input_ids, then
        # exclude padding from the loss; otherwise most of each max_length
        # sequence is pad tokens and they dominate the training signal.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

# Build the training set and a shuffled batch iterator over it.
print(f"Loading dataset from {DATASET_PATH}...")
dataset = Level0Dataset(path=DATASET_PATH, tokenizer=tokenizer, max_length=MAX_LENGTH)
dataloader = DataLoader(
    dataset,
    shuffle=True,
    drop_last=True,
    batch_size=BATCH_SIZE,
)
print(f"Dataset: {len(dataset)} samples, {len(dataloader)} batches")

# AdamW over the model's parameters, with weight decay.
optimizer = torch.optim.AdamW(params=model.parameters(), weight_decay=0.01, lr=LR)

# Warmup + Cosine decay scheduler
def lr_lambda(epoch):
    """Return the learning-rate multiplier for a 0-based ``epoch``.

    The multiplier ramps linearly from ``1 / WARMUP_EPOCHS`` to 1.0 over the
    first ``WARMUP_EPOCHS`` epochs, then follows a half-cosine from 1.0 down to
    0.0 at ``EPOCHS``. Past ``EPOCHS`` it stays at 0.0.
    """
    if epoch < WARMUP_EPOCHS:
        return (epoch + 1) / WARMUP_EPOCHS
    # The scheduler is stepped once more after the last epoch, so this is also
    # evaluated at epoch == EPOCHS: keep the divisor positive when
    # EPOCHS <= WARMUP_EPOCHS and clamp progress so the cosine never turns
    # around and raises the rate again.
    decay_epochs = max(1, EPOCHS - WARMUP_EPOCHS)
    progress = min(1.0, (epoch - WARMUP_EPOCHS) / decay_epochs)
    return 0.5 * (1 + math.cos(progress * math.pi))

# Per-epoch schedule: scheduler.step() is called once per epoch below.
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
model.train()

# With drop_last=True a dataset smaller than one batch yields zero batches,
# which would otherwise surface later as a ZeroDivisionError.
if len(dataloader) == 0:
    raise ValueError("Dataloader produced no batches; dataset is smaller than BATCH_SIZE")

print(f"\nStarting training for {EPOCHS} epochs...")
print(f"Warmup: {WARMUP_EPOCHS} epochs, then cosine decay")
print(f"Estimated time: ~2-3 hours on T4")
print("="*50)

best_loss = float('inf')
patience = 0
max_patience = 15  # Early stopping patience

for epoch in range(EPOCHS):
    total_loss = 0.0
    # Learning rate in effect for this epoch; read before scheduler.step()
    # so the log line does not report the next epoch's value.
    epoch_lr = scheduler.get_last_lr()[0]
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Fail fast instead of spending the remaining epochs on NaN/inf, and
        # do it before the bad gradients reach the weights.
        batch_loss = loss.item()
        if not math.isfinite(batch_loss):
            raise RuntimeError(f"Non-finite loss ({batch_loss}) at epoch {epoch+1}; aborting training")

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_loss += batch_loss

    scheduler.step()
    avg_loss = total_loss / len(dataloader)

    # Print every 10 epochs
    if (epoch + 1) % 10 == 0 or epoch == 0:
        print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {avg_loss:.4f} | LR: {epoch_lr:.2e}")

    # Early stopping check: an epoch counts as progress only if it beats the
    # best average loss by more than 0.001.
    if avg_loss < best_loss - 0.001:
        best_loss = avg_loss
        patience = 0
    else:
        patience += 1
        # Never stop before epoch index 50, regardless of patience.
        if patience >= max_patience and epoch > 50:
            print(f"Early stopping at epoch {epoch+1}")
            break

print("="*50)
print(f"🎉 Training Complete! Best Loss: {best_loss:.4f}")

In [None]:
# Step 4: Quick test
# Greedy-decodes a handful of hex inputs and compares the first generated word
# against the expected mnemonic (case-insensitive).
model.eval()
test_cases = [
    ("90", "nop"),
    ("c3", "ret"),
    ("cc", "int3"),
    ("55", "push"),
    ("5d", "pop"),
    ("c9", "leave"),
    ("f4", "hlt"),
    ("50", "push"),
]

print("Quick inference test:")
correct = 0
for hex_bytes, expected in test_cases:
    prompt = f"Disassemble: {hex_bytes}\nOutput:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=10, do_sample=False, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Take the first word after the last "Output:" marker. The completion can
    # be empty (the model may emit nothing but EOS), in which case indexing
    # the split result would raise IndexError - report "?" instead.
    _, marker, completion = response.rpartition("Output:")
    words = completion.split() if marker else []
    result = words[0] if words else "?"

    is_match = result.lower() == expected.lower()
    if is_match:
        correct += 1
    match = "✓" if is_match else "✗"
    print(f"  {hex_bytes} -> {result} (expected: {expected}) {match}")
print(f"\nQuick test accuracy: {correct}/{len(test_cases)} = {100*correct/len(test_cases):.0f}%")

In [None]:
# Step 5: Save model
# Writes the model and tokenizer files into models/level0 (created if missing).
save_path = Path("models/level0")
save_path.mkdir(parents=True, exist_ok=True)
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"✅ Model saved to {save_path}")

In [None]:
# Step 6: Download
# Packs models/level0 into level0_model_v2.zip and hands it to the Colab
# download helper.
import shutil

from google.colab import files

# Standard-library archiver instead of shelling out to `zip`; base_dir keeps
# the "models/level0/..." prefix inside the archive, and an archive left over
# from a previous run is overwritten rather than appended to.
shutil.make_archive('level0_model_v2', 'zip', root_dir='.', base_dir='models/level0')
files.download('level0_model_v2.zip')
print("📦 Download started!")