# Imports

In [46]:
from Models.PPO.PPO_Agent import PPO_Agent
from Models.DDQN.DDQN_Agent import DDQN_Agent
from Models.DDQN.PRB import PrioritizedReplayBuffer
from stable_baselines3 import A2C
import slimevolleygym.mlp as mlp
from slimevolleygym.mlp import Model
import torch
import slimevolleygym
from slimevolleygym import BaselinePolicy
from utils import convert_to_vector, convert_to_value, convert_list_to_vectors
import types
import json
import numpy as np
from tqdm import tqdm
from stable_baselines3.common.env_util import make_vec_env
from IPython.display import clear_output
import pandas as pd

# Select Device

In [47]:
# Pick the compute device in order of preference: CUDA GPU, then Apple MPS, then CPU.
# The MPS check is only evaluated when CUDA is unavailable.
if torch.cuda.is_available():
    device_name = 'cuda:0'
elif torch.backends.mps.is_available():
    device_name = 'mps'
else:
    device_name = 'cpu'
DEVICE = torch.device(device_name)

# Echo the selection so the reader can confirm which backend is active
print("Device used: ", DEVICE)

Device used:  cuda:0


# Load all the models as a list

In [48]:
# Shared environment instance: its observation/action spaces are handed to
# A2C.load below and its action space is sampled by the random agent.
env = slimevolleygym.SlimeVolleyEnv()

# Registry of the agents to evaluate; each entry is {"name": str, "agent": object}
# where the agent exposes select_action(state, greedy) and evaluation_mode().
models = []

# NOTE: DEVICE is deliberately NOT redefined in this cell. It is selected once in
# the "Select Device" cell (CUDA -> MPS -> CPU); re-assigning it here with a
# CUDA-or-CPU choice silently discarded the MPS option and made the printed
# device check above misleading.

# PPO agents (expert-trained baseline run and self-play run).
# Each spec is (display name, run directory, last load_models argument).
# The meaning of the last two load_models arguments is defined by PPO_Agent
# (presumably a run/seed index and a checkpoint timestep -- TODO confirm).
for ppo_name, ppo_run_dir, ppo_checkpoint in (
    # PPO Baseline
    ("PPO - Expert training",
     "Logging/PPO-BASELINE/20240411-150526-lr-0.0003-entcoef-0.1-mlp-64-kl-0.03",
     18492436),
    # PPO Selfplay
    ("PPO - Selfplay",
     "Logging/PPO-SELFPLAY/20240410-171658-lr-0.0003-entcoef-0",
     18534177),
):
    agent = PPO_Agent(12, 6, DEVICE, mlp_layers=[64, 64])
    agent.load_models(ppo_run_dir, 1, ppo_checkpoint)
    models.append(dict(name=ppo_name, agent=agent))

# Genetic agent
def select_action(self, state, greedy=False):
    """Adapter giving the genetic MLP the common agent interface.

    Runs the model's own ``predict`` (``greedy`` is forwarded as ``mean_mode``),
    thresholds the raw outputs into a 0/1 vector, and returns
    ``(convert_to_value(binary_vector), None)``.
    """
    raw_output = self.predict(state, mean_mode=greedy)
    # Anything positive means a 1, 0 or negative means a 0
    binary_action = (raw_output > 0).astype(int)
    return convert_to_value(binary_action), None

def evaluation_mode(self):
    """No-op: this model has no separate evaluation mode to switch to."""
    return None

agent = Model(mlp.games['slimevolleylite'])
with open('Logging/GENETIC-SELFPLAY/20240409-021844-numagents-128-totalnumgames-546000/game_546000') as f:
    d = json.load(f)
# The first element of the saved JSON holds the parameters passed to the model
agent.set_model_params(d[0])

# Attach the adapters to this instance only (not to the Model class)
agent.evaluation_mode = types.MethodType(evaluation_mode, agent)
agent.select_action = types.MethodType(select_action, agent)
models.append(dict(name="Genetic - Selfplay", agent=agent))

# A2C Baseline
def select_action(self, state, greedy=False):
    """Adapter giving the SB3 A2C model the common agent interface.

    ``greedy`` is forwarded as SB3's ``deterministic`` flag; the predicted action
    is passed through ``convert_to_value`` and returned as ``(value, None)``.
    """
    action, _ = self.predict(state, deterministic=greedy)
    return convert_to_value(action), None

def evaluation_mode(self):
    """No-op: nothing to switch for this agent."""
    return None

agent = A2C.load(
    "Logging/A2C-BASELINE-LIBRARY/20240416-004821-lr-0.0007-entcoef-0.1/best_model",
    env,
    print_system_info=True,
    # Override the pickled spaces with the ones of the local environment
    custom_objects={
        'observation_space': env.observation_space,
        'action_space': env.action_space,
    },
)

# Attach the adapters to this instance only
agent.evaluation_mode = types.MethodType(evaluation_mode, agent)
agent.select_action = types.MethodType(select_action, agent)
models.append(dict(name="A2C - Expert training", agent=agent))

# A2C Self-play
def select_action(self, state, greedy=False):
    """Adapter giving the SB3 A2C model the common agent interface.

    ``greedy`` is forwarded as SB3's ``deterministic`` flag; the predicted action
    is passed through ``convert_to_value`` and returned as ``(value, None)``.
    """
    action, _ = self.predict(state, deterministic=greedy)
    return convert_to_value(action), None

def evaluation_mode(self):
    """No-op: nothing to switch for this agent."""
    return None

# TODO: Choose the model to load
agent = A2C.load(
    "Logging/A2C-SELFPLAY-LIBRARY/20240416-192851-lr-0.0007-entcoef-0.1/history_00000080",
    env,
    print_system_info=True,
    # Override the pickled spaces with the ones of the local environment
    custom_objects={
        'observation_space': env.observation_space,
        'action_space': env.action_space,
    },
)

# Attach the adapters to this instance only
agent.evaluation_mode = types.MethodType(evaluation_mode, agent)
agent.select_action = types.MethodType(select_action, agent)
models.append(dict(name="A2C - Selfplay", agent=agent))

# Baseline
def select_action(self, state, greedy=False):
    """Adapter giving the built-in BaselinePolicy the common agent interface.

    ``greedy`` is accepted for interface compatibility but not used; the policy's
    prediction is passed through ``convert_to_value`` and returned as ``(value, None)``.
    """
    return convert_to_value(self.predict(state)), None

def evaluation_mode(self):
    """No-op: nothing to switch for this agent."""
    return None

agent = BaselinePolicy()

# Attach the adapters to this instance only
agent.evaluation_mode = types.MethodType(evaluation_mode, agent)
agent.select_action = types.MethodType(select_action, agent)
models.append(dict(name="Expert baseline", agent=agent))

# Random agent
def select_action(self, state, greedy=False):
    """Return a uniformly sampled action, ignoring ``state`` and ``greedy``.

    The action is drawn from the shared ``env``'s action space, passed through
    ``convert_to_value`` and returned as ``(value, None)``.
    """
    return convert_to_value(env.action_space.sample()), None

def evaluation_mode(self):
    """No-op: nothing to switch for this agent."""
    return None

# The BaselinePolicy instance only serves as an object to hang the two adapter
# methods on; its own predict() is never called by select_action above.
agent = BaselinePolicy()
agent.evaluation_mode = types.MethodType(evaluation_mode, agent)
agent.select_action = types.MethodType(select_action, agent)
models.append(dict(name="Random baseline", agent=agent))


== CURRENT SYSTEM INFO ==
- OS: Windows-10-10.0.22631-SP0 10.0.22631
- Python: 3.10.4
- Stable-Baselines3: 2.3.0
- PyTorch: 2.2.2+cu118
- GPU Enabled: True
- Numpy: 1.23.1
- Cloudpickle: 3.0.0
- Gymnasium: 0.29.1
- OpenAI Gym: 0.21.0

== SAVED MODEL SYSTEM INFO ==
- OS: Linux-5.4.0-144-generic-x86_64-with-glibc2.27 # 161~18.04.1-Ubuntu SMP Fri Feb 10 15:55:22 UTC 2023
- Python: 3.9.12
- Stable-Baselines3: 2.3.0
- PyTorch: 1.13.0+cu116
- GPU Enabled: True
- Numpy: 1.26.4
- Cloudpickle: 2.2.1
- Gymnasium: 0.29.1
- OpenAI Gym: 0.26.2

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
== CURRENT SYSTEM INFO ==
- OS: Windows-10-10.0.22631-SP0 10.0.22631
- Python: 3.10.4
- Stable-Baselines3: 2.3.0
- PyTorch: 2.2.2+cu118
- GPU Enabled: True
- Numpy: 1.23.1
- Cloudpickle: 3.0.0
- Gymnasium: 0.29.1
- OpenAI Gym: 0.21.0

== SAVED MODEL SYSTEM INFO ==
- OS: Linux-5.4.0-144-generic-x86_64-with-glibc2.27 # 161~18.04.1-Ubuntu SMP Fri Feb 10 15:55:22 UTC 2023
- Python: 3.9.



# Writing a wrapper to evaluate the agents in parallel using sb3

In [49]:
class SlimeVolleyParallelWrapper(slimevolleygym.SlimeVolleyEnv):
    """SlimeVolley environment whose built-in opponent is replaced by a given agent.

    The environment instance acts as its own opponent policy: it exposes a
    ``predict(obs)`` method that delegates to ``opponent_agent``.

    :param opponent_agent: object exposing ``evaluation_mode()`` and
        ``select_action(obs, greedy) -> (action, extra)``.
    """

    def __init__(self, opponent_agent):
        super().__init__()
        # The opponent is only ever evaluated here, never trained
        opponent_agent.evaluation_mode()
        self.opponent_agent = opponent_agent
        # Upstream slimevolleygym asks ``self.policy.predict(obs)`` for the
        # opponent's move and initialises ``self.policy`` to the built-in baseline.
        # Point it at this object so that ``predict`` below is actually used;
        # without this the env would keep playing against the default baseline.
        self.policy = self

    # We need to override this function with the opponent's policy
    # This gym env will then play with that opponent whenever it runs
    def predict(self, obs):
        """Return the opponent agent's greedy action for observation ``obs``."""
        # NOTE(review): the action is returned exactly as select_action produced it
        # (no convert_to_vector call); confirm the env accepts that encoding.
        action, _ = self.opponent_agent.select_action(obs, greedy=True)
        return action

We then modify the evaluate_policy function from the stable baselines 3 library:

In [50]:
# Source: https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/common/evaluation.py
# Modified to be able to run our custom agents in a parallelized way (1 environment per CPU core)

import warnings
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import gymnasium as gym
import numpy as np

from stable_baselines3.common import type_aliases
from stable_baselines3.common.vec_env import DummyVecEnv, VecEnv, VecMonitor, is_vecenv_wrapped


def _actions_to_numpy(actions):
    """Return ``actions`` as a NumPy array regardless of what the agent produced.

    Torch tensors are detached and moved to the CPU first (``Tensor.numpy()`` alone
    fails for CUDA tensors); anything else goes through ``np.asarray``.
    """
    if isinstance(actions, torch.Tensor):
        return actions.detach().cpu().numpy()
    return np.asarray(actions)


def _print_progress(combination, episodes_done, episodes_total):
    """Overwrite the cell output with the overall progress of the current pairing.

    Does nothing when ``combination`` is None (progress reporting disabled).
    """
    if combination is None:
        return
    clear_output(wait=True)
    print(f"Model {combination[0]} VS Model {combination[1]} progress:({100 * episodes_done / episodes_total:.2f}%)")


def evaluate_policy(
    model,
    env: Union[gym.Env, VecEnv],
    n_eval_episodes: int = 10,
    render: bool = False,
    callback: Optional[Callable[[Dict[str, Any], Dict[str, Any]], None]] = None,
    reward_threshold: Optional[float] = None,
    return_episode_rewards: bool = False,
    warn: bool = True,
    combination = None # Modification: For printing the progress
) -> Union[Tuple[float, float], Tuple[List[float], List[int]]]:
    """
    Runs policy for ``n_eval_episodes`` episodes and returns average reward.
    If a vector env is passed in, this divides the episodes to evaluate onto the
    different elements of the vector env. This static division of work is done to
    remove bias. See https://github.com/DLR-RM/stable-baselines3/issues/402 for more
    details and discussion.

    Adapted from Stable-Baselines3's ``evaluate_policy``: the agent is queried
    through ``model.select_action(observations, greedy=True)`` instead of
    ``model.predict``, so actions are always chosen greedily.

    .. note::
        If environment has not been wrapped with ``Monitor`` wrapper, reward and
        episode lengths are counted as it appears with ``env.step`` calls. If
        the environment contains wrappers that modify rewards or episode lengths
        (e.g. reward scaling, early episode reset), these will affect the evaluation
        results as well. You can avoid this by wrapping environment with ``Monitor``
        wrapper before anything else.

    :param model: The agent to evaluate. It must implement ``evaluation_mode()``
        and ``select_action(observations, greedy) -> (actions, extra)``.
    :param env: The gym environment or ``VecEnv`` environment.
    :param n_eval_episodes: Number of episode to evaluate the agent
    :param render: Whether to render the environment or not
    :param callback: callback function to do additional checks,
        called after each step. Gets locals() and globals() passed as parameters.
    :param reward_threshold: Minimum expected reward per episode,
        this will raise an error if the performance is not met
    :param return_episode_rewards: If True, a list of rewards and episode lengths
        per episode will be returned instead of the mean.
    :param warn: If True (default), warns user about lack of a Monitor wrapper in the
        evaluation environment.
    :param combination: Optional pair ``(i, j)`` of model identifiers used only to
        label the progress line. When None (default) no progress is printed.
    :return: Mean reward per episode, std of reward per episode.
        Returns ([float], [int]) when ``return_episode_rewards`` is True, first
        list containing per-episode rewards and second containing per-episode lengths
        (in number of steps).
    """

    # Modification: Set the model to evaluation mode
    model.evaluation_mode()

    # Avoid circular import
    from stable_baselines3.common.monitor import Monitor

    if not isinstance(env, VecEnv):
        env = DummyVecEnv([lambda: env])  # type: ignore[list-item, return-value]

    is_monitor_wrapped = is_vecenv_wrapped(env, VecMonitor) or env.env_is_wrapped(Monitor)[0]

    if not is_monitor_wrapped and warn:
        warnings.warn(
            "Evaluation environment is not wrapped with a ``Monitor`` wrapper. "
            "This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. "
            "Consider wrapping environment first with ``Monitor`` wrapper.",
            UserWarning,
        )

    n_envs = env.num_envs
    episode_rewards = []
    episode_lengths = []

    episode_counts = np.zeros(n_envs, dtype="int")
    # Divides episodes among different sub environments in the vector as evenly as possible
    episode_count_targets = np.array([(n_eval_episodes + i) // n_envs for i in range(n_envs)], dtype="int")

    current_rewards = np.zeros(n_envs)
    current_lengths = np.zeros(n_envs, dtype="int")
    observations = env.reset()
    # Kept so that callbacks inspecting locals() still find it, as in upstream SB3
    episode_starts = np.ones((env.num_envs,), dtype=bool)
    while (episode_counts < episode_count_targets).any():
        actions, _ = model.select_action( # Modification: Use our function header
            observations,  # type: ignore[arg-type]
            greedy=True
        )
        # Modification: translate the agent's action encoding into the vectors
        # expected by env.step (see utils.convert_list_to_vectors)
        actions = convert_list_to_vectors(_actions_to_numpy(actions))
        new_observations, rewards, dones, infos = env.step(actions)
        current_rewards += rewards
        current_lengths += 1
        for i in range(n_envs):
            # Sub-environments that already reached their quota are ignored
            if episode_counts[i] < episode_count_targets[i]:
                # unpack values so that the callback can access the local variables
                reward = rewards[i]
                done = dones[i]
                info = infos[i]
                episode_starts[i] = done

                if callback is not None:
                    callback(locals(), globals())

                if dones[i]:
                    if is_monitor_wrapped:
                        # Atari wrapper can send a "done" signal when
                        # the agent loses a life, but it does not correspond
                        # to the true end of episode
                        if "episode" in info.keys():
                            # Do not trust "done" with episode endings.
                            # Monitor wrapper includes "episode" key in info if environment
                            # has been wrapped with it. Use those rewards instead.
                            episode_rewards.append(info["episode"]["r"])
                            episode_lengths.append(info["episode"]["l"])
                            # Only increment at the real end of an episode
                            episode_counts[i] += 1
                            # Modification: Print the progress over ALL sub-environments
                            _print_progress(combination, int(episode_counts.sum()), n_eval_episodes)
                    else:
                        episode_rewards.append(current_rewards[i])
                        episode_lengths.append(current_lengths[i])
                        episode_counts[i] += 1
                        # Modification: Print the progress over ALL sub-environments
                        _print_progress(combination, int(episode_counts.sum()), n_eval_episodes)
                    current_rewards[i] = 0
                    current_lengths[i] = 0

        observations = new_observations

        if render:
            env.render()

    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)
    if reward_threshold is not None:
        assert mean_reward > reward_threshold, "Mean reward below threshold: " f"{mean_reward:.2f} < {reward_threshold:.2f}"
    if return_episode_rewards:
        return episode_rewards, episode_lengths
    return mean_reward, std_reward

# Evaluation loop

## Return evaluation

In [51]:
import os

NUM_EVALUATIONS = 1000
LOGGING_DIR = "Logging/EVALUATION"
# Number of environment copies per pairing. NOTE(review): make_vec_env is called
# without vec_env_cls, which in SB3 means a DummyVecEnv (copies stepped one after
# another in this process), so this is not a number of CPU cores actually used.
N_CPU = 50
SEED = 32

# Create the output directory up front so the (long) evaluation cannot be lost
# to a missing folder when the results are saved at the end.
os.makedirs(LOGGING_DIR, exist_ok=True)

model_names = [model["name"] for model in models]

# Make a returns matrix
# Last dimension stores the return of each evaluation episode;
# returns[i, j] holds the returns of model i when playing against model j.
returns = np.zeros((len(models), len(models), NUM_EVALUATIONS))

for i in range(len(models)):
    for j in range(i+1, len(models)):

        # Extract the models
        agent1 = models[i]["agent"]
        agent2 = models[j]["agent"]

        # Set the models in evaluation mode
        agent1.evaluation_mode()
        agent2.evaluation_mode()

        # Vectorize the environment: agent2 is embedded in every copy as the opponent
        vec_env = make_vec_env(SlimeVolleyParallelWrapper, n_envs=N_CPU, seed=SEED, env_kwargs={"opponent_agent": agent2})

        try:
            # Run the evaluations
            rewards, _ = evaluate_policy(agent1, vec_env, n_eval_episodes=NUM_EVALUATIONS, return_episode_rewards=True, combination=(i,j))
        finally:
            # Release the N_CPU environments of this pairing, even on error/interrupt,
            # so they do not pile up over all pairings
            vec_env.close()

        # Convert the rewards to a numpy array
        rewards = np.array(rewards)

        # Store the returns (The rewards are from the perspective of agent 1,
        # so they are negated for agent 2)
        returns[i, j] = rewards
        returns[j, i] = -rewards

# Save the returns
np.savez(f"{LOGGING_DIR}/eval_returns.npz", returns)

# Print the pairwise mean returns with standard deviation in table format with the model names as column and row headers
mean_returns = np.mean(returns, axis=-1)
std_returns = np.std(returns, axis=-1)
# Build the "mean +- std" strings directly (avoids DataFrame.applymap, which is
# deprecated in recent pandas releases)
df = pd.DataFrame(
    [
        [f"{mean:.2f} +- {std:.2f}" for mean, std in zip(mean_row, std_row)]
        for mean_row, std_row in zip(mean_returns, std_returns)
    ],
    columns=model_names,
    index=model_names,
)
print(df)

Progress:(0.02%)


KeyboardInterrupt: 

## ELO evaluation

In [None]:
# Load the returns
# returns = np.load(f"{LOGGING_DIR}/eval_returns.npz")["arr_0"]

# Compute the new ELOs of both players
def calculate_elos(elo1, elo2, s1, s2, K=32):
    """Apply one Elo update for a single game between two players.

    :param elo1: current rating of player 1
    :param elo2: current rating of player 2
    :param s1: score obtained by player 1 in the game
    :param s2: score obtained by player 2 in the game
    :param K: update step size (K-factor)
    :return: tuple ``(new_elo1, new_elo2)``
    """
    # Expected score of player 1 given the rating gap (logistic curve, base 10, scale 400)
    rating_gap = elo2 - elo1
    expected_score = 1 / (1 + 10**(rating_gap / 400))

    # Each rating moves by K times (actual score - expected score);
    # player 2's expected score is the complement of player 1's.
    return (
        elo1 + K * (s1 - expected_score),
        elo2 + K * (s2 - (1 - expected_score)),
    )

# Initialize every ELO at 0. The update rule only depends on rating differences,
# so the common starting value merely shifts all final ratings by a constant.
elos = np.zeros(len(models))

# Extract the list of (agent1, agent2, s1, s2)
# We do this to avoid replaying all the episodes
# Only the games that were actually played (i < j) are listed, and the player
# indices are kept as Python ints so they can be used to index `elos`.
games = []
for i in range(len(models)):
    for j in range(i+1, len(models)):
        for e in range(NUM_EVALUATIONS):
            if returns[i, j, e] > 0:
                s1 = 1.0    # model i won
            elif returns[i, j, e] < 0:
                s1 = 0.0    # model j won
            else:
                s1 = 0.5    # draw
            games.append((i, j, s1, 1.0 - s1))

# Replay the games in a random but reproducible order (the final ELOs depend on
# the order of the updates) and update the elos based on the results
rng = np.random.default_rng(SEED)
for game_idx in rng.permutation(len(games)):
    i, j, s1, s2 = games[game_idx]
    elos[i], elos[j] = calculate_elos(elos[i], elos[j], s1, s2)

# Save the ELOs
np.savez(f"{LOGGING_DIR}/eval_elos.npz", elos)

# Print the ELOs
print("ELO computation complete!")
for i, model in enumerate(models):
    print(f"{model['name']} ELO: {elos[i]}")

# Load the ELOs
# elos = np.load(f"{LOGGING_DIR}/eval_elos.npz")["arr_0"]
