# Imports

In [1]:
import slimevolleygym
from Models.PPO.PPO_Agent import PPO_Agent
import torch
from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
from slimevolleygym import BaselinePolicy

  from .autonotebook import tqdm as notebook_tqdm


# Setup environment

In [2]:
# Build a throwaway environment only to inspect its action/observation spaces
env = slimevolleygym.SlimeVolleyEnv()

action_space = env.action_space
print(f"Action space: {action_space.n}")

observation_space = env.observation_space
print(f"Observation space: {observation_space.shape}")

# Release the environment; train() creates its own instance
env.close()

Action space: 3
Observation space: (12,)


# Device

In [3]:
# Pick the compute device in order of preference: CUDA, then MPS, then CPU.
# The MPS check is only reached when CUDA is unavailable.
if torch.cuda.is_available():
    _device_name = 'cuda:0'
elif torch.backends.mps.is_available():
    _device_name = 'mps'
else:
    _device_name = 'cpu'

DEVICE = torch.device(_device_name)

# Show the selected device as a sanity check
print("Device used: ", DEVICE)

Device used:  cuda:0


# Hyperparameters setup

In [4]:
# Hyperparameters shared by every training run in this notebook.
seed = 42 # Read as a global inside train() for torch.manual_seed and env.seed
batch_size = 64 # Forwarded to PPO_Agent as batch_size
n_epochs = 10 # Forwarded to PPO_Agent as num_epoch
alpha_choices = [0.0003, 0.0005] # Values of alpha swept over; presumably the learning rate -- confirm in PPO_Agent
lamda = 0.95 # Forwarded to PPO_Agent as lamda; presumably the GAE lambda -- confirm in PPO_Agent
gamma = 0.99 # Forwarded to PPO_Agent as gamma; presumably the discount factor -- confirm in PPO_Agent
clip = 0.2 # From the paper
beta_choices = [0, 0.1] # Entropy regularization coefficient (values swept over)
horizon = 4096 # Number of steps before training the agent

max_num_episodes = 10000 # Number of training episodes per hyperparameter combination
num_test_runs = 10 # Number of evaluation episodes per test (self-play and baseline)
num_episodes_before_test_runs = 50 # Run the evaluations every this many training episodes
threshold_test_return_to_update_opponents = 0.5 # Average self-play test return (per episode, over num_test_runs episodes) that must be exceeded before the agent is copied into the opponent
num_episodes_before_save = 1000 # Read as a global inside train(): save both agents' models every this many episodes

In [5]:
# If agent2 = None, then the agent is playing against the baseline policy
def evaluate(env, agent1, num_eval_episodes, agent2=None):
    """Play greedy evaluation episodes and return agent1's average return.

    Args:
        env: Environment exposing ``reset()`` and
            ``step(action, otherAction=...)``; the ``info`` dict returned by
            ``step`` must contain the opponent's observation under
            ``'otherObs'``.
        agent1: Agent under evaluation. Must expose ``actor`` (with
            ``eval()``/``train()``) and ``select_action(state, greedy=True)``
            returning a 3-tuple whose first element is the action.
        num_eval_episodes: Number of episodes to play.
        agent2: Optional opponent with the same interface as ``agent1``.
            When ``None``, a fresh ``BaselinePolicy`` is used and queried
            through ``predict(observation)``.

    Returns:
        The sum of agent1's rewards over all episodes divided by
        ``num_eval_episodes``.

    Raises:
        ZeroDivisionError: If ``num_eval_episodes`` is 0.
    """

    # Decide once which kind of opponent we face. The baseline policy is not a
    # PPO agent: it is queried through predict() and has no actor to toggle,
    # so it must not be confused with a learned opponent later on.
    use_baseline = agent2 is None
    opponent = BaselinePolicy() if use_baseline else agent2

    # Set the model(s) in evaluation mode
    agent1.actor.eval()
    if not use_baseline:
        opponent.actor.eval()

    total_return = 0
    try:
        # Run num_eval_episodes episodes and accumulate agent1's rewards
        for _ in range(num_eval_episodes):

            # Both sides start from the same initial observation
            state1 = env.reset()
            state2 = state1
            done = False
            while not done:

                with torch.no_grad():

                    # Select the actions for each side
                    action1, _, _ = agent1.select_action(torch.FloatTensor(state1).to(DEVICE), greedy=True)

                    if use_baseline:
                        action2 = opponent.predict(state2)
                    else:
                        action2, _, _ = opponent.select_action(torch.FloatTensor(state2).to(DEVICE), greedy=True)

                # Step the environment forward
                state1, reward, done, info = env.step(action1, otherAction=action2)
                state2 = info['otherObs']

                # Only agent1's reward is tracked
                total_return += reward
    finally:
        # Always put the models back in training mode, even if evaluation is
        # interrupted, so the caller's training loop is not left in eval mode
        agent1.actor.train()
        if not use_baseline:
            opponent.actor.train()

    # Return the average return
    return total_return / num_eval_episodes

In [6]:
def train(alpha, beta, lamb, gamma, horizon, batch_size, n_epochs, clip, max_num_episodes, num_test_runs, num_episodes_before_test_runs, threshold_test_return_to_update_opponents, writer, logging_dir):
    """Train a PPO agent on SlimeVolley by self-play against a frozen opponent.

    Two ``PPO_Agent`` instances are created with identical hyperparameters.
    Only ``agent1`` learns (``learn()`` is called every ``horizon`` environment
    steps). ``agent2`` changes only when ``agent1``'s average self-play test
    return exceeds ``threshold_test_return_to_update_opponents``, at which
    point ``agent1``'s actor and critic weights are copied into it.

    Besides its arguments, the function reads the module-level names ``seed``,
    ``DEVICE`` and ``num_episodes_before_save``.

    Args:
        alpha, beta, lamb, gamma, horizon, batch_size, n_epochs, clip:
            Forwarded to ``PPO_Agent`` (as ``alpha``, ``beta``, ``lamda``,
            ``gamma``, ``horizon``, ``batch_size``, ``num_epoch``, ``clip``).
            ``horizon`` is also the number of environment steps between two
            calls to ``agent1.learn()``.
        max_num_episodes: Number of training episodes to run.
        num_test_runs: Number of episodes per evaluation.
        num_episodes_before_test_runs: Evaluate every this many episodes.
        threshold_test_return_to_update_opponents: Average self-play test
            return above which the opponent is replaced by a copy of agent1.
        writer: TensorBoard-style writer exposing ``add_scalar`` and ``flush``.
        logging_dir: Directory passed to ``save_models`` for checkpoints.

    Returns:
        None. Results are reported through ``writer`` and saved checkpoints.
    """

    # Create the environment
    env = slimevolleygym.SlimeVolleyEnv()

    try:
        torch.manual_seed(seed)
        env.seed(seed)

        # Both agents are built from exactly the same hyperparameters
        agent_kwargs = dict(state_size=env.observation_space.shape,
                            action_size=env.action_space.n,
                            alpha=alpha,
                            beta=beta,
                            lamda=lamb,
                            gamma=gamma,
                            horizon=horizon,
                            batch_size=batch_size,
                            num_epoch=n_epochs,
                            clip=clip,
                            device=DEVICE)
        agent1 = PPO_Agent(**agent_kwargs)
        agent2 = PPO_Agent(**agent_kwargs)

        n_steps = 0 # Total number of environment steps taken so far
        generation_number = 0 # The number of times we copied the agent into the opponent

        # NOTE(review): this tag differs from the "Training step - Generation number"
        # tag used when the opponent is upgraded, so generation 0 shows up in a
        # separate chart. Kept unchanged to stay comparable with existing logs.
        writer.add_scalar("TrainStep-GenerationNumber", 0, 0) # Log the number of steps it took for each generation
        writer.flush()

        # Train the agent
        for e in tqdm(range(max_num_episodes)):

            # Check if it's time to save the models
            if e > 0 and (e+1) % num_episodes_before_save == 0:
                agent1.save_models(logging_dir, 1, e+1)
                agent2.save_models(logging_dir, 2, e+1)

            if e % num_episodes_before_test_runs == 0:

                # Evaluate the 2 agents against each other
                average_test_return = evaluate(env, agent1, num_test_runs, agent2=agent2)
                writer.add_scalar("Average self-play test return - Training step", average_test_return, n_steps)
                writer.add_scalar("Average self-play test return - Training episode", average_test_return, e)

                # Evaluate the agent against the baseline policy
                average_test_return_baseline = evaluate(env, agent1, num_test_runs)
                writer.add_scalar("Average baseline test return - Training step", average_test_return_baseline, n_steps)
                writer.add_scalar("Average baseline test return - Training episode", average_test_return_baseline, e)

                # Flush both results
                writer.flush()

                # Check if the average test return is above the threshold
                if average_test_return > threshold_test_return_to_update_opponents:

                    # Copy the player agent into the opponent agent
                    agent2.actor.load_state_dict(agent1.actor.state_dict())
                    agent2.critic.load_state_dict(agent1.critic.state_dict())

                    # Increment the generation number
                    generation_number += 1

                    # Store the number of steps it took for each generation
                    writer.add_scalar("Training step - Generation number", generation_number, n_steps)
                    writer.add_scalar("Training episode - Generation number", generation_number, e)

                    # Store the score against the baseline policy for the generation
                    writer.add_scalar("Average baseline test return - Generation number", average_test_return_baseline, generation_number)

                    # Flush both sets of results
                    writer.flush()

            # Reset the environment; both sides start from the same observation
            state1 = env.reset()
            state2 = state1

            # Run an episode
            done = False
            total_return = 0
            while not done:

                # Select the actions for each agent. The opponent's probability
                # and value estimates are not needed because it never learns.
                action1, prob1, val1 = agent1.select_action(torch.FloatTensor(state1).to(DEVICE))
                action2, _, _ = agent2.select_action(torch.FloatTensor(state2).to(DEVICE))

                # Step the environment forward
                next_state1, reward, done, info = env.step(action1, otherAction=action2)
                next_state2 = info['otherObs']

                # Store the transition for the learning agent. prob1 and val1 were
                # computed from state1, so that is the observation stored with
                # them (the previous code stored next_state1).
                # NOTE(review): assumes PPO_Agent.remember expects the observation
                # the action was selected from -- confirm against its definition.
                agent1.remember(state1, action1, prob1, val1, reward, done)

                # Nothing is stored for agent2: learn() is never called on it, so
                # its buffer would only grow (and it was being given agent1's
                # reward rather than its own).

                # Update the states
                state1 = next_state1
                state2 = next_state2

                # Accumulate agent1's return for this episode
                total_return += reward

                # Count the step first so that learning happens after exactly
                # `horizon` stored transitions (checking before the increment
                # made the first update use horizon + 1 transitions)
                n_steps += 1
                if n_steps % horizon == 0:
                    agent1.learn()

            # Log the training returns
            writer.add_scalar("TrainReturn-TrainStep", total_return, n_steps)
            writer.add_scalar("TrainReturn-TrainEpisode", total_return, e)
            writer.flush()

    finally:
        # Release the environment even if training fails or is interrupted
        env.close()

In [7]:
i = 0 # Index of the current hyperparameter combination

# Sweep over every (alpha, beta) combination, one training run per combination
for alpha in alpha_choices:

    for beta in beta_choices:

        # Print the hyperparameters
        print(f"{i}) Alpha: {alpha}, Beta: {beta}")

        # Create the writer; each run logs to its own timestamped directory
        logging_dir = f"./Logging/PPO/{datetime.now().strftime('%Y%m%d-%H%M%S')}-alpha-{alpha}-beta-{beta}"
        writer = SummaryWriter(logging_dir)

        try:
            # Train the agent
            train(alpha=alpha,
                  beta=beta,
                  lamb=lamda,
                  gamma=gamma,
                  horizon=horizon,
                  batch_size=batch_size,
                  n_epochs=n_epochs,
                  clip=clip,
                  max_num_episodes=max_num_episodes,
                  num_test_runs=num_test_runs,
                  num_episodes_before_test_runs=num_episodes_before_test_runs,
                  threshold_test_return_to_update_opponents=threshold_test_return_to_update_opponents,
                  writer=writer,
                  logging_dir=logging_dir)
        finally:
            # Close the writer even if training fails or is interrupted, so
            # pending events are written out
            writer.close()

        # Move on to the next combination (the counter was never advanced before,
        # so every run was printed as "0)")
        i += 1

0) Alpha: 0.0003, Beta: 0


 13%|█▎        | 128/1000 [45:13<5:08:03, 21.20s/it]


KeyboardInterrupt: 