# Phase 5: GPU Training with Self-Play

This notebook enables GPU-accelerated training on JupyterHub servers.

## Features
- Automatic GPU detection (CUDA/MPS)
- Self-play training
- Large map support (500x500+)
- Continue training from Phase 3 models
- Real-time monitoring

## Prerequisites
- JupyterHub with GPU access
- CUDA 11.7+ (for NVIDIA GPUs)
- Python 3.8+
- All dependencies installed (see requirements.txt)

## 1. Setup and Imports

In [None]:
import os
import sys
import torch
from datetime import datetime

# Make the Phase 5 and Phase 3 source directories importable. Each entry is
# inserted at the front of sys.path, so the Phase 3 directory (inserted last)
# ends up ahead of the Phase 5 one.
working_dir = os.getcwd()
for relative_parts in (
    ('..', 'src'),
    ('..', '..', 'phase3-implementation', 'src'),
):
    sys.path.insert(0, os.path.join(working_dir, *relative_parts))

# Report the interpreter and PyTorch versions in use
print("✓ Imports successful")
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")

## 2. Check GPU Availability

In [None]:
# Select the compute device for training: CUDA if available, otherwise
# Apple Metal (MPS), otherwise CPU. The result is stored in `device`.
print("GPU Status:")
print("=" * 50)

# `torch.backends.mps` is missing on PyTorch builds that predate the MPS
# backend; look it up defensively so those builds fall back to CPU instead
# of raising AttributeError.
mps_backend = getattr(torch.backends, 'mps', None)

if torch.cuda.is_available():
    print("✓ CUDA is available!")
    print(f"  Device: {torch.cuda.get_device_name(0)}")
    print(f"  CUDA Version: {torch.version.cuda}")
    print(f"  Number of GPUs: {torch.cuda.device_count()}")

    # Per-GPU details: name, total memory (decimal GB) and compute capability
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        print(f"\n  GPU {i}: {props.name}")
        print(f"    Total Memory: {props.total_memory / 1e9:.2f} GB")
        print(f"    Compute Capability: {props.major}.{props.minor}")

    device = 'cuda'
elif mps_backend is not None and mps_backend.is_available():
    print("✓ Apple Metal (MPS) is available!")
    device = 'mps'
else:
    print("⚠️  No GPU detected - using CPU (training will be VERY slow)")
    device = 'cpu'

print("\n" + "=" * 50)
print(f"Selected device: {device.upper()}")

## 3. Configuration

Adjust these parameters for your training run.

In [None]:
# Settings for this training run. The output directory is stamped with the
# time at which this cell is executed.
run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

CONFIG = {
    # Model
    'phase3_model_path': None,  # Path to a Phase 3 model to resume from (None = train from scratch)

    # Environment
    'map_name': 'australia_500x500',  # Map used for training
    'num_bots': 10,                   # Opponent bot count
    'n_envs': 16,                     # Parallel environments (tune to available GPU memory)

    # Training
    'total_timesteps': 1_000_000,     # Total number of training steps
    'learning_rate': 3e-4,            # Learning rate (1e-4 is suggested for fine-tuning)
    'batch_size': 512,                # Batch size (512 on CUDA, 256 on MPS/CPU)
    'n_steps': 2048,                  # Rollout steps (2048 on CUDA, 1024 on MPS/CPU)

    # Self-Play
    'use_self_play': True,            # Train with self-play

    # Output
    'output_dir': f'../runs/run_{run_stamp}',
}

# Without CUDA, scale down the batch size, rollout length and environment count
if device in ('mps', 'cpu'):
    CONFIG.update(batch_size=256, n_steps=1024, n_envs=8)

# Echo the effective configuration
divider = "=" * 50
print("Training Configuration:")
print(divider)
for option, setting in CONFIG.items():
    print(f"  {option}: {setting}")
print(divider)

## 4. Load Phase 3 Model (Optional)

If you want to continue training from a Phase 3 model, set the path in the configuration above.

In [None]:
from stable_baselines3 import PPO

# Optionally resume from a Phase 3 checkpoint. `model` stays None when no
# path is configured or when loading fails, which tells the model-creation
# cell to build a fresh PPO model instead.

# Learning rate used when continuing training from a loaded model.
FINE_TUNE_LEARNING_RATE = 1e-4

model = None

if CONFIG['phase3_model_path']:
    print(f"Loading Phase 3 model from: {CONFIG['phase3_model_path']}")
    try:
        # A saved model restores the learning rate it was trained with, and
        # changing CONFIG afterwards does not affect an already-built model.
        # Override the stored value at load time so the fine-tuning rate is
        # the one the loaded model actually uses.
        model = PPO.load(
            CONFIG['phase3_model_path'],
            device=device,
            custom_objects={'learning_rate': FINE_TUNE_LEARNING_RATE},
        )
        print("✓ Model loaded successfully")

        # Print model info
        print(f"  Policy: {type(model.policy).__name__}")
        print(f"  Device: {model.device}")

        # Keep CONFIG in sync with the rate applied to the loaded model
        CONFIG['learning_rate'] = FINE_TUNE_LEARNING_RATE
        print(f"  Adjusted learning rate to {CONFIG['learning_rate']} for fine-tuning")
    except Exception as e:
        # Deliberate best-effort: any load failure falls back to training
        # from scratch (CONFIG['learning_rate'] is left untouched).
        print(f"✗ Failed to load model: {e}")
        print("  Starting from scratch instead")
        model = None
else:
    print("No Phase 3 model specified - starting from scratch")

## 5. Create Training Environment

In [None]:
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.monitor import Monitor
from environment_large import OpenFrontEnvLarge
from self_play_env import SelfPlayEnv

def make_env(rank):
    """Return a zero-argument factory that builds one monitored environment.

    The factory reads CONFIG when it is called: it builds a SelfPlayEnv when
    CONFIG['use_self_play'] is truthy and an OpenFrontEnvLarge otherwise,
    passing CONFIG['num_bots'] and CONFIG['map_name'], and wraps the result
    in Monitor. `rank` is accepted but not used.
    """
    def _init():
        env_cls = SelfPlayEnv if CONFIG['use_self_play'] else OpenFrontEnvLarge
        return Monitor(
            env_cls(
                num_bots=CONFIG['num_bots'],
                map_name=CONFIG['map_name']
            )
        )
    return _init

# Build one factory per rank (0 .. n_envs - 1) and hand them to SubprocVecEnv
print(f"Creating {CONFIG['n_envs']} parallel environments...")
env = SubprocVecEnv(list(map(make_env, range(CONFIG['n_envs']))))
print("✓ Environments created")

## 6. Create or Update Model

In [None]:
from model import BattleRoyaleExtractor

# Prepare the run's output tree: <output_dir>/checkpoints and <output_dir>/logs
checkpoint_dir = os.path.join(CONFIG['output_dir'], 'checkpoints')
log_dir = os.path.join(CONFIG['output_dir'], 'logs')
os.makedirs(CONFIG['output_dir'], exist_ok=True)
os.makedirs(checkpoint_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

if model is None:
    # No Phase 3 model was loaded: build a fresh PPO model that uses the
    # custom feature extractor and logs TensorBoard data into this run.
    print("Creating new PPO model...")
    model = PPO(
        policy="MultiInputPolicy",
        env=env,
        learning_rate=CONFIG['learning_rate'],
        n_steps=CONFIG['n_steps'],
        batch_size=CONFIG['batch_size'],
        n_epochs=10,
        gamma=0.995,
        gae_lambda=0.95,
        clip_range=0.2,
        ent_coef=0.03,
        vf_coef=0.5,
        max_grad_norm=0.5,
        policy_kwargs={
            'features_extractor_class': BattleRoyaleExtractor,
            'features_extractor_kwargs': {'features_dim': 256}
        },
        verbose=1,
        device=device,
        tensorboard_log=log_dir
    )
    print("✓ Model created")
else:
    print("Using loaded model, updating environment...")
    model.set_env(env)
    # A loaded model keeps the TensorBoard directory it was saved with
    # (possibly none). Point it at this run's log directory so the logs
    # end up where the rest of the notebook says they are.
    model.tensorboard_log = log_dir
    print("✓ Model updated")

# Print GPU memory usage (decimal GB) for the first CUDA device
if device == 'cuda':
    allocated = torch.cuda.memory_allocated(0) / 1e9
    reserved = torch.cuda.memory_reserved(0) / 1e9
    print(f"\nGPU Memory: Allocated={allocated:.2f}GB, Reserved={reserved:.2f}GB")

## 7. Setup Callbacks

In [None]:
from stable_baselines3.common.callbacks import CheckpointCallback, CallbackList
from training_callback import DetailedLoggingCallback

# Checkpoint callback - save every 50K steps
# 50K total steps between checkpoints, divided by the number of parallel
# environments to give the per-environment save frequency.
steps_between_saves = 50_000 // CONFIG['n_envs']
checkpoints_path = os.path.join(CONFIG['output_dir'], 'checkpoints')

# Periodic checkpoint saving
checkpoint_callback = CheckpointCallback(
    save_freq=steps_between_saves,
    save_path=checkpoints_path,
    name_prefix='phase5_model'
)

# Project-specific detailed logging
logging_callback = DetailedLoggingCallback(verbose=1)

# Bundle both callbacks into the single object passed to model.learn()
callbacks = CallbackList([checkpoint_callback, logging_callback])

print("✓ Callbacks configured")
print(f"  Checkpoint frequency: {steps_between_saves} steps per env")
print(f"  Checkpoint directory: {checkpoints_path}")

## 8. Start Training

This cell will start the training process. It may take several hours depending on:
- Your GPU performance
- Number of timesteps
- Number of parallel environments

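**Note:** You can interrupt training at any time (Kernel → Interrupt). Interrupting does not write a new checkpoint by itself: the periodic checkpoints saved so far (one every 50K steps) remain on disk, and running the "Save Final Model" cell afterwards saves the model's current state.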

In [None]:
# Print a summary of the run, then train. Kernel → Interrupt is caught so
# that the cells after this one can still be executed.
rule = "=" * 70

print(rule)
print("STARTING TRAINING")
print(rule)
run_summary = (
    ("Device", device.upper()),
    ("Map", CONFIG['map_name']),
    ("Opponents", f"{CONFIG['num_bots']} bots"),
    ("Self-Play", CONFIG['use_self_play']),
    ("Total Timesteps", f"{CONFIG['total_timesteps']:,}"),
    ("Parallel Environments", CONFIG['n_envs']),
    ("Output", CONFIG['output_dir']),
)
for label, shown in run_summary:
    print(f"{label}: {shown}")
print(rule)
print("\nPress Kernel → Interrupt to stop training early.\n")

try:
    model.learn(
        total_timesteps=CONFIG['total_timesteps'],
        callback=callbacks,
        progress_bar=True,
        tb_log_name='phase5_training'
    )
    print("\n" + rule)
    print("✓ TRAINING COMPLETE!")
    print(rule)
except KeyboardInterrupt:
    print("\n" + rule)
    print("⚠️  Training interrupted by user")
    print(rule)
finally:
    # Whether training finished or was interrupted, report GPU memory
    # (decimal GB) for the first CUDA device.
    if device == 'cuda':
        gb_allocated = torch.cuda.memory_allocated(0) / 1e9
        gb_reserved = torch.cuda.memory_reserved(0) / 1e9
        print(f"\nFinal GPU Memory: Allocated={gb_allocated:.2f}GB, Reserved={gb_reserved:.2f}GB")

## 9. Save Final Model

In [None]:
# Save the model's current state as <output_dir>/phase5_final, then list
# where this run's artefacts are located.
run_dir = CONFIG['output_dir']
final_path = os.path.join(run_dir, 'phase5_final')
model.save(final_path)

saved_checkpoints = os.path.join(run_dir, 'checkpoints')
tensorboard_dir = os.path.join(run_dir, 'logs')
separator = "=" * 70

print(separator)
print(f"✓ Model saved to: {final_path}")
print(separator)
print("\nCheckpoints available at:")
print(f"  {saved_checkpoints}")
print("\nTensorBoard logs available at:")
print(f"  {tensorboard_dir}")
print("\nTo view logs, run:")
print(f"  tensorboard --logdir {tensorboard_dir}")

## 10. Cleanup

In [None]:
# Close the vectorised environment
env.close()
print("✓ Environments closed")

# On CUDA, also clear PyTorch's GPU cache
on_cuda = device == 'cuda'
if on_cuda:
    torch.cuda.empty_cache()
    print("✓ CUDA cache cleared")

print("\n✓ All done!")

## 11. Monitor Training (Optional)

Run the following commands in a separate terminal, from the repository root, to monitor training progress:

```bash
# Start TensorBoard
tensorboard --logdir phase5-implementation/runs

# Then open: http://localhost:6006
```

## Tips for JupyterHub

1. **Keep Session Alive**: Make sure your JupyterHub session doesn't timeout during long training runs
2. **Monitor GPU Usage**: Use `nvidia-smi` in a terminal to monitor GPU usage
3. **Save Checkpoints**: Training automatically saves checkpoints every 50K steps
4. **Adjust Batch Size**: If you get OOM errors, reduce `batch_size` and `n_steps` in CONFIG
5. **Use TensorBoard**: Monitor training metrics in real-time with TensorBoard

### Recommended GPU Settings:

| GPU Memory | n_envs | batch_size | n_steps |
|------------|--------|------------|----------|
| 8 GB       | 8      | 256        | 1024    |
| 16 GB      | 16     | 512        | 2048    |
| 24 GB+     | 24     | 1024       | 4096    |