# Imports

In [1]:
import slimevolleygym
from Models.PPO.PPO_Agent import PPO_Agent
import torch
from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
from slimevolleygym import BaselinePolicy
import numpy as np
from utils import convert_to_vector
import types

  from .autonotebook import tqdm as notebook_tqdm


# Setup environment

In [2]:
# Build a throwaway environment just to report its action/observation spaces
env = slimevolleygym.SlimeVolleyEnv()
print(
    f"Action space: {env.action_space.n}",
    f"Observation space: {env.observation_space.shape}",
    sep="\n",
)
# The probe environment is not needed afterwards (train() creates its own)
env.close()

Action space: 3
Observation space: (12,)


# Device

In [3]:
# Pick the compute device in order of preference: CUDA GPU, then Apple MPS, then CPU
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
elif torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
else:
    DEVICE = torch.device('cpu')

# Report the selected device as a sanity check
print("Device used: ", DEVICE)

Device used:  cuda:0


# Hyperparameters setup

In [4]:
# ---- PPO hyperparameters (forwarded to PPO_Agent by train) ----
timesteps_per_batch = 4_096         # Timesteps collected per batch
max_timesteps_per_episode = 1_600   # Upper bound on the length of a single episode
n_updates_per_iteration = 10        # Actor/critic update passes per iteration
lr_choices = [3e-4, 5e-3]           # Candidate learning rates; one full training run is launched per value
gamma = 0.99                        # Discount factor used for the rewards-to-go
clip = 0.2                          # Clipping threshold for the PPO probability ratio (0.2 is the usual recommendation)
lam = 0.95                          # Lambda parameter for GAE
num_minibatches = 6                 # Number of mini-batches per update
ent_coef = 0                        # Entropy-regularization coefficient
target_kl = 0.02                    # KL-divergence threshold
max_grad_norm = 0.5                 # Gradient-clipping threshold
render = False                      # Whether to render the environment

# ---- Run-level settings ----
seed = 42                                   # Seed passed to torch and to the environment
max_num_steps = 20_000_000                  # Total environment steps per training run
num_test_runs = 10                          # Evaluation episodes per test phase
num_iterations_before_test_runs = 50        # Training iterations between test phases
# Test-return threshold for upgrading the opponent.
# NOTE(review): not referenced by the training code shown in this notebook - confirm it is still needed.
threshold_test_return_to_update_opponents = 1
num_iterations_before_save = 100            # Training iterations between checkpoints

In [5]:
def evaluate(env, agent1, agent2, num_eval_episodes):
    """Run evaluation episodes and return agent1's average episode return.

    agent1 acts greedily and is switched to evaluation mode for the duration
    of the call; it is always switched back to training mode afterwards, even
    if an episode raises.

    Args:
        env: Environment exposing ``reset()`` and
            ``step(action, otherAction=...)``; the ``info`` dict returned by
            ``step`` must contain the opponent's observation under
            ``'otherObs'``.
        agent1: Agent under evaluation. Must provide ``evaluation_mode()``,
            ``training_mode()`` and ``select_action(state, greedy=True)``
            returning ``(action, extra)``.
        agent2: Opponent providing ``select_action(state)``. If ``None``, no
            opponent action is passed to ``env.step`` (``otherAction=None``),
            which makes slimevolleygym fall back to its built-in baseline
            policy.
        num_eval_episodes: Number of episodes to play; must be positive.

    Returns:
        The sum of all rewards returned by ``env.step`` divided by
        ``num_eval_episodes``.

    Raises:
        ValueError: If ``num_eval_episodes`` is not positive.
    """
    if num_eval_episodes <= 0:
        raise ValueError(f"num_eval_episodes must be positive, got {num_eval_episodes}")

    # Set the model in evaluation mode
    agent1.evaluation_mode()

    total_return = 0
    try:
        for _ in range(num_eval_episodes):

            # Both players start from the observation returned by reset()
            state1 = env.reset()
            state2 = state1
            done = False
            while not done:

                with torch.no_grad():

                    # Select the actions for each agent
                    action1, _ = agent1.select_action(state1, greedy=True)
                    if agent2 is None:
                        # Let the environment choose the opponent's action
                        other_action = None
                    else:
                        action2, _ = agent2.select_action(state2)
                        other_action = convert_to_vector(action2)

                # Step the environment forward
                state1, reward, done, info = env.step(convert_to_vector(action1), otherAction=other_action)
                state2 = info['otherObs']

                # Accumulate the reward reported for agent1
                total_return += reward
    finally:
        # Always set the model back in training mode
        agent1.training_mode()

    # Return the average return
    return total_return / num_eval_episodes

In [6]:
def train(timesteps_per_batch, max_timesteps_per_episode, n_updates_per_iteration, lr, gamma, clip, lam,
          num_minibatches, ent_coef, target_kl, max_grad_norm, seed, max_num_steps, num_test_runs,
          num_iterations_before_test_runs, num_iterations_before_save,
          writer, logging_dir, render):
    """Train a PPO agent against the slimevolleygym baseline policy.

    Each iteration gathers one batch of experience against the baseline,
    runs a PPO update on it and logs metrics. Every
    ``num_iterations_before_test_runs`` iterations the agent is evaluated
    against the baseline, and checkpoints are written periodically and once
    more at the end. The environment created here is always closed on exit,
    including when training raises.

    Args:
        timesteps_per_batch, max_timesteps_per_episode,
        n_updates_per_iteration, lr, gamma, clip, lam, num_minibatches,
        ent_coef, target_kl, max_grad_norm, render: Forwarded unchanged to
            ``PPO_Agent``.
        seed: Seed applied to torch and to the environment.
        max_num_steps: Training stops once at least this many environment
            steps have been gathered.
        num_test_runs: Number of evaluation episodes per test phase.
        num_iterations_before_test_runs: Iterations between test phases.
        num_iterations_before_save: Iterations between checkpoints.
        writer: TensorBoard-style writer providing ``add_scalar`` and
            ``flush``. It is not closed here; the caller owns it.
        logging_dir: Directory passed to ``agent1.save_models``.

    Returns:
        None.
    """
    # Create the environment
    env = slimevolleygym.SlimeVolleyEnv()
    try:
        torch.manual_seed(seed)
        env.seed(seed)

        # Create the player agent (6 possible actions because pressing forward and backward
        # at the same time is not considered)
        agent1 = PPO_Agent(obs_dim=12,
                           act_dim=6,
                           DEVICE=DEVICE,
                           timesteps_per_batch=timesteps_per_batch,
                           max_timesteps_per_episode=max_timesteps_per_episode,
                           n_updates_per_iteration=n_updates_per_iteration,
                           lr=lr,
                           gamma=gamma,
                           clip=clip,
                           lam=lam,
                           num_minibatches=num_minibatches,
                           ent_coef=ent_coef,
                           target_kl=target_kl,
                           max_grad_norm=max_grad_norm,
                           render=render)

        # Create the baseline policy and give it the same select_action interface as the PPO
        # agent: it returns (action, None) and ignores the greedy flag
        agent2 = BaselinePolicy()
        def select_action(self, state, greedy=False):
            return self.predict(state), None
        agent2.select_action = types.MethodType(select_action, agent2)

        # Progress counters
        n_steps = 0  # Environment steps gathered so far
        i = 0        # Training iteration number
        writer.flush()

        # Train the agent
        while n_steps < max_num_steps:

            # Print the progress
            print(flush=True)
            print(f"Training step {n_steps}/{max_num_steps}")
            print(flush=True)

            # Check if it's time to save the models
            # (second argument presumably identifies the agent - TODO confirm against save_models)
            if i > 0 and (i+1) % num_iterations_before_save == 0:
                agent1.save_models(logging_dir, 1, n_steps+1)

            if i % num_iterations_before_test_runs == 0:

                # Evaluate the agent against the baseline policy
                average_test_return_baseline = evaluate(env, agent1, agent2, num_test_runs)
                writer.add_scalar("Average baseline test return - Training step", average_test_return_baseline, n_steps)

                # Flush the result
                writer.flush()

            # Gather a batch of experiences
            batch_obs, batch_acts, batch_log_probs, batch_rews, batch_lens, batch_vals, batch_dones = agent1.gather_data(env, agent2)

            # Increment the number of steps
            n_steps += sum(batch_lens)

            # Run a training iteration on that batch
            agent1.learn(batch_acts=batch_acts,
                         batch_obs=batch_obs,
                         batch_log_probs=batch_log_probs,
                         batch_rews=batch_rews,
                         batch_vals=batch_vals,
                         batch_dones=batch_dones,
                         n_steps_so_far=n_steps,
                         total_n_steps=max_num_steps,
                         writer=writer)

            # Log metrics
            # NOTE(review): the second tag says "self-play" although the opponent here is the
            # baseline policy; the tag is kept unchanged so existing TensorBoard runs stay comparable.
            writer.add_scalar("Average episode length - Training step", np.mean(batch_lens), n_steps)
            writer.add_scalar("Average self-play train return - Training step", np.mean([np.sum(ep_rews) for ep_rews in batch_rews]), n_steps)
            writer.flush()

            # Increment the iteration number
            i += 1

        # Save the final version of the models
        agent1.save_models(logging_dir, 1, n_steps+1)
    finally:
        # Release the environment even if training fails part-way
        env.close()

In [7]:
# Launch one full training run per candidate learning rate; `i` numbers the
# hyperparameter combinations in the progress output
for i, lr in enumerate(lr_choices):

    # Print the hyperparameters
    print(f"{i}) Alpha: {lr}, Beta: {ent_coef}")

    # Create the writer (one timestamped log directory per run)
    logging_dir = f"./Logging/PPO-BASELINE/{datetime.now().strftime('%Y%m%d-%H%M%S')}-lr-{lr}-entcoef-{ent_coef}"
    writer = SummaryWriter(logging_dir)

    try:
        # Train the agent
        train(timesteps_per_batch, max_timesteps_per_episode, n_updates_per_iteration, lr, gamma,
              clip, lam, num_minibatches, ent_coef, target_kl, max_grad_norm, seed, max_num_steps,
              num_test_runs, num_iterations_before_test_runs, num_iterations_before_save, writer,
              logging_dir, render)
    finally:
        # Close the writer even if training is interrupted or fails
        writer.close()

0) Alpha: 0.0003, Beta: 0



Training step 0/20000000



  obs = torch.tensor(obs,dtype=torch.float).to(self.DEVICE)
  batch_obs = torch.tensor(batch_obs, dtype=torch.float).to(self.DEVICE)



Training step 4572/20000000


Training step 8754/20000000


Training step 13381/20000000


Training step 17693/20000000



  return F.mse_loss(input, target, reduction=self.reduction)



Training step 22021/20000000


Training step 26430/20000000


Training step 30675/20000000


Training step 35001/20000000


Training step 39658/20000000


Training step 43775/20000000


Training step 48148/20000000


Training step 52621/20000000


Training step 56979/20000000


Training step 61214/20000000


Training step 65637/20000000


Training step 69902/20000000


Training step 74270/20000000


Training step 78769/20000000


Training step 83074/20000000

