# 💎 Diamond PPO Demo

This notebook demonstrates the core features of Diamond PPO, a lightweight PyTorch implementation of Proximal Policy Optimisation.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/auxeno/diamond-ppo/blob/main/notebooks/diamond-ppo-demo.ipynb)

## Installation

First, let's install Diamond PPO and its dependencies:

In [None]:
# Install Diamond PPO from GitHub
!pip install -q git+https://github.com/auxeno/diamond-ppo

import gymnasium as gym
import numpy as np
import torch

# Report the installed library versions and whether a CUDA device is visible,
# so that results from this notebook can be reproduced and debugged.
print(f"PyTorch version: {torch.__version__}")
print(f"Gymnasium version: {gym.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

## 1. Basic Usage - Discrete Actions

Let's start with a simple example using CartPole, a classic control task with discrete actions:

In [None]:
from diamond import PPO, PPOConfig

def make_cartpole_env():
    """Environment factory handed to PPO: builds one CartPole-v1 instance."""
    return gym.make("CartPole-v1")


# Hyperparameters for a short demo run, collected in one mapping
ppo_settings = {
    "total_steps": 50_000,  # Total training steps
    "rollout_steps": 128,   # Steps per rollout
    "num_envs": 4,          # Parallel environments
    "lr": 3e-4,             # Learning rate
    "gamma": 0.99,          # Discount factor
    "gae_lambda": 0.95,     # GAE lambda
    "ppo_clip": 0.2,        # PPO clipping parameter
    "verbose": True,        # Print training progress
}
config = PPOConfig(**ppo_settings)

# Build the agent from the factory and config, then train it
agent = PPO(env_fn=make_cartpole_env, cfg=config)

print("Training PPO on CartPole...")
agent.train()
print("\nTraining complete!")

## 2. Continuous Control

For continuous action spaces, use `ContinuousPPO`:

In [None]:
from diamond import ContinuousPPO, ContinuousPPOConfig

def make_pendulum_env():
    """Environment factory handed to ContinuousPPO: builds one Pendulum-v1 instance."""
    return gym.make("Pendulum-v1")


# Settings for the continuous-action agent
config = ContinuousPPOConfig(
    total_steps=100_000, rollout_steps=256, num_envs=4, lr=3e-4, verbose=True
)

# Pendulum-v1 is the continuous control task used for this demo
agent = ContinuousPPO(env_fn=make_pendulum_env, cfg=config)

print("Training Continuous PPO on Pendulum...")
agent.train()
print("\nTraining complete!")

## 3. Custom Neural Networks

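Diamond PPO supports custom network architectures. A network class is constructed with the observation space, action space, and config; the examples below also implement the `get_actions`, `get_values`, and `get_logits_and_values` methods: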

In [None]:
import torch.nn as nn
from dataclasses import dataclass

# Example 1: Simple custom network
class SimpleCustomNetwork(nn.Module):
    """Small actor-critic MLP for discrete action spaces.

    A two-layer ReLU trunk (``base``) feeds two linear heads: ``actor_head``
    produces one logit per action and ``critic_head`` produces a scalar value.
    """

    def __init__(self, observation_space, action_space, cfg):
        """Build the shared trunk and the two heads.

        Args:
            observation_space: Object whose ``shape`` is flattened to get the
                input width.
            action_space: Object whose ``n`` is the number of discrete actions.
            cfg: Config object; ``cfg.network_hidden_dim`` is the width of
                both hidden layers.
        """
        super().__init__()
        n_inputs = int(np.prod(observation_space.shape))
        n_actions = int(action_space.n)
        width = cfg.network_hidden_dim

        # Shared feature extractor used by both heads
        trunk_layers = [
            nn.Linear(n_inputs, width),
            nn.ReLU(),
            nn.Linear(width, width),
            nn.ReLU(),
        ]
        self.base = nn.Sequential(*trunk_layers)

        # Policy logits and state-value heads
        self.actor_head = nn.Linear(width, n_actions)
        self.critic_head = nn.Linear(width, 1)

    def get_actions(self, observations, device):
        """Sample one action per observation and return them as a NumPy array.

        ``observations`` is converted to a float32 tensor on ``device``; the
        forward pass runs without gradient tracking.
        """
        obs_tensor = torch.as_tensor(observations, dtype=torch.float32, device=device)
        with torch.inference_mode():
            logits = self.actor_head(self.base(obs_tensor))
        policy = torch.distributions.Categorical(logits=logits)
        sampled = policy.sample()
        return sampled.cpu().numpy()

    def get_values(self, observations):
        """Return value estimates (last dimension squeezed), without gradients."""
        with torch.inference_mode():
            raw_values = self.critic_head(self.base(observations))
            return raw_values.squeeze(-1)

    def get_logits_and_values(self, observations):
        """Return ``(logits, values)`` with gradient tracking left enabled."""
        hidden = self.base(observations)
        return self.actor_head(hidden), self.critic_head(hidden).squeeze(-1)

def make_cartpole_env():
    """Environment factory handed to PPO: builds one CartPole-v1 instance."""
    return gym.make("CartPole-v1")


# network_hidden_dim is read by SimpleCustomNetwork via cfg.network_hidden_dim
config = PPOConfig(
    total_steps=30_000, rollout_steps=128, num_envs=4, network_hidden_dim=128, verbose=True
)

# Plug the custom architecture in through network_cls
agent = PPO(env_fn=make_cartpole_env, cfg=config, network_cls=SimpleCustomNetwork)

print("Training with custom network...")
agent.train()
print("\nTraining complete!")

In [None]:
# Example 2: Custom config with additional parameters
@dataclass
class CustomPPOConfig(PPOConfig):
    """PPOConfig extended with the architecture options that CustomNetwork reads."""

    # All base PPOConfig fields are inherited unchanged.
    # The fields below are additional, network-specific settings.
    num_layers: int = 3  # number of Linear layers stacked in the network trunk
    use_dropout: bool = True  # whether a Dropout layer follows each activation
    dropout_rate: float = 0.1  # probability passed to nn.Dropout
    activation: str = "relu"  # "relu" selects nn.ReLU in CustomNetwork; "tanh" selects nn.Tanh

class CustomNetwork(nn.Module):
    """Configurable actor-critic MLP for discrete action spaces.

    The trunk (``base``) is a stack of ``cfg.num_layers`` blocks, each made of
    Linear -> activation -> optional Dropout. Two linear heads sit on top:
    ``actor_head`` (one logit per action) and ``critic_head`` (scalar value).
    """

    def __init__(self, observation_space, action_space, cfg):
        """Build the network from the custom config fields.

        Args:
            observation_space: Object whose ``shape`` is flattened to get the
                input width.
            action_space: Object whose ``n`` is the number of discrete actions.
            cfg: Config providing ``network_hidden_dim``, ``num_layers``,
                ``use_dropout``, ``dropout_rate`` and ``activation``.

        Raises:
            ValueError: If ``cfg.activation`` is neither ``"relu"`` nor
                ``"tanh"``.
        """
        super().__init__()
        obs_dim = int(np.prod(observation_space.shape))
        act_dim = int(action_space.n)
        hidden_dim = cfg.network_hidden_dim

        # Reject unknown names instead of silently falling back to Tanh, so a
        # typo in the config cannot quietly change the architecture.
        if cfg.activation == "relu":
            activation_cls = nn.ReLU
        elif cfg.activation == "tanh":
            activation_cls = nn.Tanh
        else:
            raise ValueError(
                f"Unsupported activation {cfg.activation!r}; expected 'relu' or 'tanh'"
            )

        # Build the trunk; in_dim tracks the width flowing into the next layer.
        layers = []
        in_dim = obs_dim
        for _ in range(cfg.num_layers):
            layers.append(nn.Linear(in_dim, hidden_dim))
            layers.append(activation_cls())  # fresh instance per layer
            if cfg.use_dropout:
                layers.append(nn.Dropout(cfg.dropout_rate))
            in_dim = hidden_dim

        self.base = nn.Sequential(*layers)
        # in_dim equals hidden_dim after at least one layer; with no layers the
        # trunk is the identity, so the heads must accept obs_dim directly.
        self.actor_head = nn.Linear(in_dim, act_dim)
        self.critic_head = nn.Linear(in_dim, 1)

    def get_actions(self, observations, device):
        """Sample one action per observation and return them as a NumPy array.

        ``observations`` is converted to a float32 tensor on ``device`` and the
        forward pass runs without gradient tracking.

        Note: ``torch.inference_mode`` does not disable Dropout; dropout layers
        stay active while the module is in training mode (call ``.eval()`` to
        turn them off).
        """
        x = torch.as_tensor(observations, dtype=torch.float32, device=device)
        with torch.inference_mode():
            features = self.base(x)
            logits = self.actor_head(features)
        return torch.distributions.Categorical(logits=logits).sample().cpu().numpy()

    def get_values(self, observations):
        """Return value estimates (last dimension squeezed), without gradients."""
        with torch.inference_mode():
            features = self.base(observations)
            return self.critic_head(features).squeeze(-1)

    def get_logits_and_values(self, observations):
        """Return ``(logits, values)`` with gradient tracking left enabled."""
        features = self.base(observations)
        logits = self.actor_head(features)
        values = self.critic_head(features).squeeze(-1)
        return logits, values

def make_cartpole_env():
    """Environment factory handed to PPO: builds one CartPole-v1 instance."""
    return gym.make("CartPole-v1")


# Settings inherited from the base PPO config
base_settings = dict(
    total_steps=30_000,
    rollout_steps=128,
    num_envs=4,
    network_hidden_dim=256,
    verbose=True,
)
# Extra fields declared on CustomPPOConfig and read by CustomNetwork
network_settings = dict(
    num_layers=4,
    use_dropout=True,
    dropout_rate=0.2,
    activation="relu",
)
custom_config = CustomPPOConfig(**base_settings, **network_settings)

# Pair the extended config with the network class that consumes it
agent = PPO(env_fn=make_cartpole_env, cfg=custom_config, network_cls=CustomNetwork)

print(f"Training with {custom_config.num_layers}-layer network, dropout={custom_config.use_dropout}")
agent.train()
print("\nTraining complete!")

### Advanced: Custom Config with Additional Parameters

As Example 2 above shows, you can subclass the config class (as a `@dataclass`) to add custom parameters, which your network then reads from `cfg`.

## 4. Training Utilities

Diamond PPO includes helpful utilities for monitoring training:

In [None]:
from diamond.utils import Logger

# Example of using the Logger
logger = Logger()

# Simulate some training metrics.
# A local seeded Generator keeps the demo reproducible without reseeding
# NumPy's process-global RNG, which would affect other code using np.random.
rng = np.random.default_rng(42)
for step in range(100):
    # Upward-trending reward and decaying loss, each with Gaussian noise
    reward = 100 + step * 2 + rng.standard_normal() * 10
    loss = 1.0 / (1 + step * 0.1) + rng.standard_normal() * 0.01

    logger.log("episode_reward", step, reward)
    logger.log("policy_loss", step, loss)

# Plot the logged metrics
print("Episode Rewards:")
logger.plot("episode_reward")
print("\nPolicy Loss:")
logger.plot("policy_loss")

In [None]:
# Example of using the Timer for profiling
from diamond.utils import Timer
import time

timer = Timer()

# Each entry is (timer label, seconds to sleep) standing in for one phase of a
# training loop: env.step(), a network forward pass, and an optimisation step.
simulated_phases = (
    ("environment_step", 0.01),
    ("network_forward", 0.005),
    ("optimisation", 0.008),
)

# Run five simulated iterations, timing every phase in order
for _ in range(5):
    for phase_name, sleep_seconds in simulated_phases:
        with timer.time(phase_name):
            time.sleep(sleep_seconds)

# Display timing statistics
timer.plot_timings()

## 5. Evaluation and Visualisation

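The helper below evaluates an agent over a few episodes and can optionally record rendered frames from the first episode for visualisation. The demo call at the end passes `None` as the agent, so it runs a random policy: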

In [None]:
def _greedy_action(network, obs):
    """Return the highest-logit action for a single observation."""
    # Place the observation on the same device as the network's parameters,
    # when the network exposes any; otherwise leave it on the default device.
    parameters = getattr(network, "parameters", None)
    first_param = next(parameters(), None) if callable(parameters) else None
    device = first_param.device if first_param is not None else None

    obs_tensor = torch.as_tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)
    with torch.no_grad():
        # The custom networks in this notebook expose get_logits_and_values
        # rather than forward(); fall back to calling the network directly.
        if hasattr(network, "get_logits_and_values"):
            logits, _ = network.get_logits_and_values(obs_tensor)
        else:
            logits, _ = network(obs_tensor)
    return int(torch.argmax(logits, dim=-1).item())


def evaluate_agent(agent, env_name, num_episodes=5, render=False):
    """Evaluate an agent and print per-episode and average rewards.

    Args:
        agent: Object with a ``network`` attribute used to pick greedy
            (argmax-logit) actions. If it has no ``network`` (e.g. ``None``),
            random actions are sampled from the action space instead.
        env_name: Environment id passed to ``gym.make``.
        num_episodes: Number of episodes to run.
        render: If True, the environment is created with
            ``render_mode="rgb_array"`` and frames from the first episode
            are collected.

    Returns:
        Tuple ``(episode_rewards, frames)``: the list of per-episode reward
        sums and the list of rendered frames (empty unless ``render``).
    """
    env = gym.make(env_name, render_mode="rgb_array" if render else None)
    network = getattr(agent, "network", None)

    episode_rewards = []
    frames = []

    try:
        for episode in range(num_episodes):
            obs, _ = env.reset()
            episode_reward = 0.0
            done = False

            while not done:
                if network is not None:
                    action = _greedy_action(network, obs)
                else:
                    # For demonstration, use random actions
                    action = env.action_space.sample()

                obs, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                episode_reward += float(reward)

                if render and episode == 0:  # Only record first episode
                    frames.append(env.render())

            episode_rewards.append(episode_reward)
            print(f"Episode {episode + 1}: Reward = {episode_reward:.2f}")
    finally:
        # Release the environment even if a step or the network call fails
        env.close()

    print(f"\nAverage Reward: {np.mean(episode_rewards):.2f} ± {np.std(episode_rewards):.2f}")

    return episode_rewards, frames

# Quick evaluation: passing None as the agent makes evaluate_agent fall back to
# env.action_space.sample(), so this demo run uses a random policy rather than
# a trained network.
print("Evaluating agent on CartPole...")
rewards, _ = evaluate_agent(None, "CartPole-v1", num_episodes=5, render=False)