# DPO Fine-tuning Qwen3 8B for Code Vulnerability Detection with Unsloth

This notebook implements DPO fine-tuning for Qwen3 8B (or 4B) on the `CyberNative/Code_Vulnerability_Security_DPO` dataset using Unsloth for 2-5x speedup.
The goal is to align the model with preferences for secure code generation.

**Key improvements:**
- Uses Qwen3 4B by default (the 8B checkpoint is listed in the model-loading cell but commented out)
- Leverages Unsloth for significant training speedup
- Optimized for T4 GPU x 2 setup (Kaggle)

In [None]:
# Install required packages with Unsloth.
# NOTE(review): none of these installs pin a version, so a fresh run may pull
# different releases than the ones this notebook was last run with.
# Alternative single-step install, kept for reference:
# !pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Step 1: install the released package together with its dependencies.
!pip install unsloth
# Step 2: replace it with the current GitHub version; --no-deps leaves the
# dependencies installed in step 1 untouched.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
# Step 3: the rest of the training stack, installed quietly (-q).
!pip install -q transformers datasets accelerate bitsandbytes peft trl sentencepiece evaluate

print("Installation complete")

In [None]:
# WandB Authentication and Setup
import wandb
import os

# Get WandB API key from Kaggle secrets.
# Start from whatever is already in the environment so `wandb_api_key` is always
# defined, even when the Kaggle secrets client is unavailable (e.g. outside Kaggle).
wandb_api_key = os.environ.get("WANDB_API_KEY")
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    secret_key = user_secrets.get_secret("WANDB_API_KEY")
    os.environ["WANDB_API_KEY"] = secret_key
    # Only replace the fallback once the secret was fetched and exported successfully.
    wandb_api_key = secret_key
    print("✅ WandB API key loaded from Kaggle secrets")
except Exception as e:
    print(f"⚠️ Could not load WandB API key from secrets: {e}")
    print("Please add your WandB API key to Kaggle secrets with key name 'WANDB_API_KEY'")

# Login to WandB (only when a key is actually available).
# `wandb_user` stays None when login is skipped or fails.
wandb_user = None
if wandb_api_key:
    try:
        wandb.login(key=wandb_api_key)
        print("Successfully logged in to WandB")

        # Get the actual username after login
        wandb_user = wandb.api.default_entity
        print(f"WandB username: {wandb_user}")

    except Exception as e:
        print(f"WandB login failed: {e}")
else:
    print("WandB login skipped: no API key available")

In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported, PatchDPOTrainer
import torch

# Patch DPO Trainer for Unsloth compatibility.
# This call sits before the `trl` import below — presumably so the patch is
# already in place when DPOTrainer is imported; keep this ordering (TODO confirm).
PatchDPOTrainer()

from datasets import load_dataset
from transformers import TrainingArguments
from trl import DPOTrainer, DPOConfig

# Allocator hint intended to reduce CUDA memory fragmentation.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Pick the compute device: CUDA when present, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Version details are only reported for CUDA runs.
if device.type == 'cuda':
    print(f"Device name: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
    print(f"PyTorch version: {torch.__version__}")

print("✅ All imports successful with Unsloth optimizations enabled")

In [None]:
# Snapshot of GPU 0 memory, reported in GiB. Nothing is printed without CUDA.
if torch.cuda.is_available():
    total = torch.cuda.get_device_properties(0).total_memory / 1024**3
    reserved = torch.cuda.memory_reserved() / 1024**3
    allocated = torch.cuda.memory_allocated() / 1024**3
    print(f"GPU Memory:")
    print(f"   Total: {total:.2f} GB")
    print(f"   Allocated: {allocated:.2f} GB")
    print(f"   Reserved: {reserved:.2f} GB")
    # "Free" here is total device memory minus what PyTorch has reserved.
    print(f"   Free: {total - reserved:.2f} GB")

## 1. Data Acquisition and Preparation

We will load the `CyberNative/Code_Vulnerability_Security_DPO` dataset and prepare it for DPO training.

In [None]:
dataset_name = "CyberNative/Code_Vulnerability_Security_DPO"

# Fetch the dataset by name
print(f"Loading dataset: {dataset_name}")
dataset = load_dataset(dataset_name)
print("Dataset loaded.")

# If the train split exposes a 'question' column, rename it to 'prompt'
train_columns = dataset['train'].column_names
if 'question' in train_columns:
    dataset = dataset.rename_column("question", "prompt")
    print("Renamed 'question' column to 'prompt'.")

# Carve out a 10% test split (fixed seed) when the dataset ships without one
needs_split = 'test' not in dataset
if needs_split:
    print("Splitting dataset into train and test (90/10)...")
    dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
    print("Dataset split into train and test sets.")

# Report the number of rows in each split
print("Dataset sizes:")
for split_name in dataset:
    print(f"- {split_name}: {len(dataset[split_name])}")

# Show the first training example
print("\nSample datapoint:")
print(dataset['train'][0])

## 2. Model Selection and Loading with Unsloth

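The loader tries each entry of `model_options` in order and keeps the first one that loads. The Qwen3 8B entry is currently commented out, so Qwen3 4B is the model that gets loaded; uncomment the 8B entry to try it first, with 4B as the fallback if it fails (for example, when memory is insufficient).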

In [None]:
# Loading options shared by every candidate checkpoint
max_seq_length = 2048  # Qwen3 supports longer contexts
dtype = None  # Auto-detect (bfloat16 for modern GPUs, float16 for older)
load_in_4bit = True  # Use 4-bit quantization for memory efficiency

# Candidates are tried in list order; the first that loads wins.
# The 8B entry is currently disabled, so only the 4B checkpoint is attempted.
model_options = [
    # "unsloth/Qwen3-8B-unsloth-bnb-4bit",  # 8B model
    "unsloth/Qwen3-4B-unsloth-bnb-4bit",  # 4B fallback
]

model = tokenizer = selected_model = None

for model_name in model_options:
    try:
        print(f"Attempting to load: {model_name}")
        load_kwargs = dict(
            model_name=model_name,
            max_seq_length=max_seq_length,
            dtype=dtype,
            load_in_4bit=load_in_4bit,
            # token="hf_...",  # Use if needed for gated models
        )
        model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)
        selected_model = model_name
        print(f"✅ Successfully loaded: {model_name}")
        break
    except Exception as err:
        # Report the failure and move on to the next candidate, if any
        print(f"❌ Failed to load {model_name}: {err}")

# Stop here when no candidate could be loaded
if model is None:
    raise RuntimeError("Failed to load any model. Please check your setup.")

# Summary of what was loaded and how
print(f"\nUsing model: {selected_model}")
print(f"Max sequence length: {max_seq_length}")
print(f"Data type: {dtype or 'auto-detected'}")
print(f"4-bit quantization: {load_in_4bit}")

In [None]:
# Add LoRA adapters with Unsloth optimizations
print("Adding LoRA adapters with Unsloth optimizations...")

# LoRA settings are named once so the summary printed below always reflects
# the values actually passed to get_peft_model.
lora_rank = 32  # LoRA rank - higher for better performance
lora_alpha = 32  # LoRA scaling factor
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",  # Include MLP layers for Qwen3
]

# NOTE(review): this rebinds `model`, so re-running this cell passes the
# already-wrapped model back into get_peft_model — re-run the model-loading
# cell first if this cell needs to be executed again.
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=lora_target_modules,
    lora_alpha=lora_alpha,
    lora_dropout=0,  # Unsloth optimized for 0 dropout
    bias="none",  # Unsloth optimized for no bias
    use_gradient_checkpointing="unsloth",  # Unsloth's optimized checkpointing
    random_state=3407,
    use_rslora=False,  # Rank stabilized LoRA
    loftq_config=None,  # LoftQ config
)

print("LoRA adapters added successfully")
print(f"LoRA rank: {lora_rank}")
print(f"Target modules: {len(lora_target_modules)} modules")

In [None]:
# Training configuration
output_dir = "./dpo_qwen3_security_finetuned"
os.makedirs(output_dir, exist_ok=True)

# CRITICAL: Much more conservative sequence lengths for DPO
max_prompt_length = 256  # REDUCED from 512 - DPO needs shorter prompts
max_length = 1024  # REDUCED from 1792 - DPO is memory intensive

# Training hyperparameters optimized for T4 + DPO
num_train_epochs = 3
per_device_train_batch_size = 1  # Keep at 1 for DPO
gradient_accumulation_steps = 4  # REDUCED from 8 for memory
learning_rate = 5e-6  # Lower LR for DPO stability
beta = 0.1  # DPO beta parameter

# Initialize WandB run with proper error handling.
# On success `wandb_enabled` is True; on any failure it is False and
# WANDB_MODE is set to "disabled" so later cells can run without WandB.
try:
    # Get the current WandB user (if logged in).
    # `except Exception` (not a bare `except`) so that KeyboardInterrupt and
    # SystemExit are not swallowed here.
    try:
        wandb_entity = wandb.api.default_entity
    except Exception:
        wandb_entity = None  # Let WandB use default

    wandb.init(
        project="llm-guard-qwen3-dpo-training",
        entity=wandb_entity,  # Use detected entity or None for default
        name=f"qwen3-{selected_model.split('/')[-1]}-security-dpo-v2-conservative",
        config={
            "model_name": selected_model,
            "dataset": "CyberNative/Code_Vulnerability_Security_DPO",
            "method": "DPO",
            "framework": "Unsloth",
            "quantization": "4-bit",
            "lora_rank": 32,
            "lora_alpha": 32,
            "batch_size": per_device_train_batch_size,
            "gradient_accumulation_steps": gradient_accumulation_steps,
            "effective_batch_size": per_device_train_batch_size * gradient_accumulation_steps,
            "learning_rate": learning_rate,
            "num_epochs": num_train_epochs,
            "max_prompt_length": max_prompt_length,
            "max_length": max_length,
            "beta": beta,
            "gpu": "T4",
            "platform": "Kaggle",
            "memory_optimization": "conservative"
        },
        tags=["dpo", "qwen3", "security", "code-vulnerability", "unsloth", "conservative"]
    )

    print(f"WandB run initialized: {wandb.run.name}")
    print(f"Dashboard URL: {wandb.run.url}")
    wandb_enabled = True

except Exception as e:
    print(f"WandB initialization failed: {e}")
    print("Continuing without WandB logging...")
    wandb_enabled = False

    # Set environment variable to disable WandB for training
    os.environ["WANDB_MODE"] = "disabled"

In [None]:
# First ("ultra-conservative") DPO setup: evaluation is switched off entirely.
# NOTE: a later cell in this notebook assigns `training_args` and `dpo_trainer`
# again; when the notebook is run top to bottom, that later pair is the one
# used by `dpo_trainer.train()`.
print("Initializing DPO Trainer with ultra-conservative memory settings...")

use_bf16 = is_bfloat16_supported()
effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps

# Batch sizes, learning rate and run length (epoch-based: max_steps=-1)
schedule_settings = dict(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    max_steps=-1,
)

# No evaluation during training; log every 50 steps; keep one checkpoint per epoch
reporting_settings = dict(
    eval_strategy="no",
    logging_strategy="steps",
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
    save_safetensors=True,
    load_best_model_at_end=False,
    report_to="wandb" if wandb_enabled else "none",
    disable_tqdm=False,
)

# Mixed precision (bf16 when supported, fp16 otherwise) and memory-saving switches
memory_settings = dict(
    bf16=use_bf16,
    fp16=not use_bf16,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    remove_unused_columns=False,
    dataloader_num_workers=0,
    dataloader_pin_memory=False,
    dataloader_persistent_workers=False,
    prediction_loss_only=True,
    include_inputs_for_metrics=False,
)

# 8-bit AdamW with a short linear warmup
optimizer_settings = dict(
    warmup_ratio=0.03,
    optim="adamw_8bit",
    weight_decay=0.0,
    lr_scheduler_type="linear",
    seed=42,
)

# DPO loss and sequence-length limits
dpo_settings = dict(
    beta=beta,
    max_prompt_length=max_prompt_length,
    max_length=max_length,
    loss_type="sigmoid",
)

training_args = DPOConfig(
    **schedule_settings,
    **reporting_settings,
    **memory_settings,
    **optimizer_settings,
    **dpo_settings,
)

print(f"Conservative settings applied:")
print(f"   • Max length: {max_length} (reduced from 1792)")
print(f"   • Max prompt length: {max_prompt_length} (reduced from 512)")
print(f"   • Effective batch size: {effective_batch_size}")
print(f"   • Evaluation: DISABLED (prevents memory spikes)")
print(f"   • Expected memory usage: ~8-10GB peak")

# Build the trainer on the training split only (no reference model, no eval set)
trainer_kwargs = dict(
    model=model,
    ref_model=None,  # Unsloth handles reference model internally
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=None,
    tokenizer=tokenizer,
)
dpo_trainer = DPOTrainer(**trainer_kwargs)

print("DPO Trainer initialized with ultra-conservative memory settings!")
print(f"Ready for memory-efficient training on T4 GPU")

In [None]:
# Memory monitoring before training
def print_memory_stats(stage=""):
    """Print a memory summary for GPU 0 and return the estimated free memory.

    All figures are in GiB. "Free" is the device's total memory minus
    ``torch.cuda.memory_reserved()``.

    Args:
        stage: Label appended to the "GPU Memory" heading.

    Returns:
        The free-memory estimate in GiB, or 0 when CUDA is not available
        (in which case nothing is printed).
    """
    if not torch.cuda.is_available():
        return 0

    total = torch.cuda.get_device_properties(0).total_memory / 1024**3
    reserved = torch.cuda.memory_reserved() / 1024**3
    allocated = torch.cuda.memory_allocated() / 1024**3
    free = total - reserved

    print(f"GPU Memory {stage}:")
    print(f"   Total: {total:.2f} GB")
    print(f"   Allocated: {allocated:.2f} GB")
    print(f"   Reserved: {reserved:.2f} GB")
    print(f"   Free: {free:.2f} GB")

    # Classify the headroom: under 4 GB is critical, under 6 GB is a warning.
    if free < 4.0:
        verdict = "CRITICAL: Very low memory!"
    elif free < 6.0:
        verdict = "WARNING: Low memory"
    else:
        verdict = "GOOD: Sufficient memory"
    print(verdict)

    return free

# Check memory before training
free_memory = print_memory_stats("before training")

In [None]:
# DPO Training configuration with Unsloth optimizations.
# NOTE: this cell rebinds `training_args` and `dpo_trainer`, replacing the
# evaluation-free configuration built in the earlier "ultra-conservative" cell.
# Here evaluation on the test split runs every 200 steps.
print("Initializing DPO Trainer with Unsloth optimizations...")

# Use DPOConfig for newer TRL versions
training_args = DPOConfig(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=1,  # REDUCED for eval memory
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,

    # MEMORY CRITICAL SETTINGS
    eval_strategy="steps",  # CHANGED from "epoch" to reduce memory spikes
    eval_steps=200,         # ADDED - less frequent evaluation
    logging_strategy="steps",
    logging_steps=25,       # INCREASED from 10 to reduce overhead
    save_strategy="epoch",
    save_total_limit=1,     # REDUCED from 2 to 1 (save memory)

    # PRECISION & OPTIMIZATION
    bf16=is_bfloat16_supported(),
    fp16=not is_bfloat16_supported(),
    gradient_checkpointing=True,  # ADDED - crucial for memory
    gradient_checkpointing_kwargs = {"use_reentrant": False},  # More memory efficient

    # MEMORY OPTIMIZATIONS
    remove_unused_columns=False,
    # Report to WandB only when the WandB run was initialized successfully;
    # `wandb_enabled` is set by the cell that calls wandb.init.
    report_to="wandb" if wandb_enabled else "none",
    dataloader_num_workers=0,     # REDUCED from 2 to 0 (prevents memory leaks)
    dataloader_pin_memory=False,  # ADDED - reduce memory pressure

    # OPTIMIZER SETTINGS
    warmup_ratio=0.05,      # REDUCED from 0.1 (fewer warmup steps)
    optim="adamw_8bit",
    weight_decay=0.0,
    lr_scheduler_type="linear",
    seed=42,

    # DPO-specific parameters
    beta=beta,
    max_prompt_length=max_prompt_length,
    max_length=max_length,
    loss_type="sigmoid",

    # ADDITIONAL MEMORY OPTIMIZATIONS
    prediction_loss_only=True,           # ADDED - reduce memory for metrics
    include_inputs_for_metrics=False,    # ADDED - save memory
    disable_tqdm=False,                  # Keep progress bars for monitoring
)

# Initialize DPO Trainer with both the train and test splits
dpo_trainer = DPOTrainer(
    model=model,
    ref_model=None,  # Unsloth handles reference model internally
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
)

print("DPO Trainer initialized successfully with Unsloth optimizations!")

## 4. Training with Unsloth Acceleration

Start the DPO fine-tuning process with Unsloth's optimizations.

In [None]:
# Record the GPU baseline before training. `start_gpu_memory` and `max_memory`
# are read again by the final-stats cell after training.
if torch.cuda.is_available():
    gpu_stats = torch.cuda.get_device_properties(0)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved before training.")

# Announce the run and the size of each split
train_split = dataset['train']
test_split = dataset['test']
print("\nStarting DPO training with Unsloth acceleration...")
print(f"Training on {len(train_split)} samples")
print(f"Evaluating on {len(test_split)} samples")

# Run the DPO fine-tuning loop and keep its result object
training_results = dpo_trainer.train()

print("\nTraining completed!")
print("\nTraining metrics:")
print(training_results.metrics)

In [27]:
import os
import torch

# CRITICAL: Set before any CUDA operations.
# NOTE(review): this cell sits after the training cell, so on a top-to-bottom
# run the variable is assigned only after CUDA has already been used. For the
# setting to matter, run this cell first on a fresh kernel.
# The value is assigned once; it combines expandable segments with the extra
# roundup_power2_divisions setting.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,roundup_power2_divisions:16"

# Clear PyTorch's CUDA cache and wait for outstanding GPU work.
# Guarded so the cell does not fail on a machine without CUDA.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

In [None]:
# Show final memory and time stats.
# Relies on `start_gpu_memory` and `max_memory` from the cell that started
# training, and on `training_results` returned by `dpo_trainer.train()`.
if torch.cuda.is_available():
    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    used_memory_for_training = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory / max_memory * 100, 3)
    training_percentage = round(used_memory_for_training / max_memory * 100, 3)

    train_runtime = training_results.metrics['train_runtime']
    training_minutes = round(train_runtime/60, 2)

    print(f"{train_runtime} seconds used for training.")
    print(f"{training_minutes} minutes used for training.")
    print(f" Peak reserved memory = {used_memory} GB.")
    print(f" Peak reserved memory for training = {used_memory_for_training} GB.")
    print(f" Peak reserved memory % of max memory = {used_percentage}%.")
    print(f" Peak reserved memory for training % of max memory = {training_percentage}%.")

    # Log to WandB only when a run is active; calling wandb.log without an
    # initialized run raises an error (e.g. after WandB initialization failed).
    if wandb.run is not None:
        wandb.log({
            "final_memory_usage_gb": used_memory,
            "training_memory_usage_gb": used_memory_for_training,
            "memory_usage_percentage": used_percentage,
            "training_time_minutes": training_minutes
        })

## 5. Model Saving with Unsloth

Save the trained LoRA adapters and the tokenizer with `save_pretrained`.

In [None]:
# Save the trained adapters and tokenizer under <output_dir>/final_checkpoint
final_model_path = os.path.join(output_dir, "final_checkpoint")
print(f"Saving model to: {final_model_path}")

# Save LoRA adapters
model.save_pretrained(final_model_path)
tokenizer.save_pretrained(final_model_path)
print("LoRA adapters and tokenizer saved")

print(f"\n Model training and saving completed!")
print(f" Main model location: {final_model_path}")
# `wandb.run` is None when no run is active (e.g. WandB initialization failed),
# so only dereference it when a run exists.
if wandb.run is not None:
    print(f" WandB dashboard: {wandb.run.url}")
else:
    print(" WandB dashboard: not available (no active WandB run)")