# Imports

In [1]:
from Models.PPO.PPO_Agent import PPO_Agent
from Models.DDQN.DDQN_Agent import DDQN_Agent
from Models.DDQN.PRB import PrioritizedReplayBuffer
from stable_baselines3 import A2C
import slimevolleygym.mlp as mlp
from slimevolleygym.mlp import Model
import torch
import slimevolleygym
from slimevolleygym import BaselinePolicy
from utils import convert_to_vector, convert_to_value
import types
import json
import numpy as np
from tqdm import tqdm

# Select Device

In [2]:
# Pick the compute device: CUDA first, then Apple MPS, otherwise the CPU
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
elif torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
else:
    DEVICE = torch.device('cpu')

# Print the device as a check
print("Device used: ", DEVICE)

Device used:  cuda:0


# Load all the models as a list

In [3]:
# Environment instance used further down for model loading, random action
# sampling and the evaluation episodes
env = slimevolleygym.SlimeVolleyEnv()

# Registry of {"name": ..., "agent": ...} entries, filled in section by section below
models = []

# NOTE(review): this rebinds the DEVICE selected in the previous cell and only
# considers CUDA/CPU (no MPS) - confirm whether the override is intentional
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# PPO Baseline
# Build the agent, restore its saved weights, then register it under a display name
agent = PPO_Agent(12, 6, DEVICE, mlp_layers=[64, 64])
agent.load_models(
    "Logging/PPO-BASELINE/20240411-150526-lr-0.0003-entcoef-0.1-mlp-64-kl-0.03",
    1,
    18492436,
)
models.append(dict(name="PPO - Expert training", agent=agent))

# PPO Selfplay
# Same architecture as the PPO baseline, restored from the self-play run
agent = PPO_Agent(12, 6, DEVICE, mlp_layers=[64, 64])
agent.load_models(
    "Logging/PPO-SELFPLAY/20240410-171658-lr-0.0003-entcoef-0",
    1,
    18534177,
)
models.append(dict(name="PPO - Selfplay training", agent=agent))

# Genetic agent
agent = Model(mlp.games['slimevolleylite'])

# The saved file is JSON; its first element holds the parameters to restore
with open('Logging/GENETIC-SELFPLAY/20240409-021844-numagents-128-totalnumgames-546000/game_546000') as f:
    d = json.load(f)
agent.set_model_params(d[0])


def evaluation_mode(self):
    """No-op, present so this agent exposes the same interface as the others."""
    pass


def select_action(self, state, greedy=False):
    """Return ``(action_value, None)`` for ``state``.

    ``greedy`` is forwarded to ``predict`` as ``mean_mode``.
    """
    raw_output = self.predict(state, mean_mode=greedy)
    # Anything positive means a 1, 0 or negative means a 0
    binary_action = (raw_output > 0).astype(int)
    return convert_to_value(binary_action), None


# Bind both adapters to this particular instance
agent.select_action = types.MethodType(select_action, agent)
agent.evaluation_mode = types.MethodType(evaluation_mode, agent)
models.append(dict(name="Genetic - Selfplay training", agent=agent))

# A2C Baseline
# TODO: Choose the model to load
agent = A2C.load(
    "Logging/A2C-BASELINE-LIBRARY/20240415-184854-lr-0.0007-entcoef-0/best_model",
    env,
    print_system_info=True,
    # Override the saved spaces with the ones of the local environment
    custom_objects={
        'observation_space': env.observation_space,
        'action_space': env.action_space,
    },
)


def evaluation_mode(self):
    """No-op, present so this agent exposes the same interface as the others."""
    pass


def select_action(self, state, greedy=False):
    """Return ``(action_value, None)`` for ``state``.

    ``greedy`` is forwarded to ``predict`` as ``deterministic``.
    """
    predicted_action, _ = self.predict(state, deterministic=greedy)
    return convert_to_value(predicted_action), None


# Bind both adapters to this particular instance
agent.select_action = types.MethodType(select_action, agent)
agent.evaluation_mode = types.MethodType(evaluation_mode, agent)
models.append(dict(name="A2C - Expert training", agent=agent))

# A2C Self-play
# TODO: Choose the model to load
# agent = A2C.load("Logging/A2C-BASELINE-LIBRARY/20240415-184854-lr-0.0007-entcoef-0/final_model", print_system_info=True)
# def select_action(self, state, greedy=False):
#     action, _ = self.predict(state, deterministic=greedy)
#     return convert_to_value(action), None
# def evaluation_mode(self):
#     pass
# agent.select_action = types.MethodType(select_action, agent)
# agent.evaluation_mode = types.MethodType(evaluation_mode, agent)
# models.append({
#     "name": "A2C - Selfplay training",
#     "agent": agent
# })

# DDQN Baseline
# TODO: Choose the model to load
# buffer = PrioritizedReplayBuffer(
#         buffer_size = 1, 
#         state_dim = 1, 
#         alpha = 1, 
#         beta_init = 1, 
#         device = DEVICE
#     )
# agent = DDQN_Agent(state_dim = 12,
#                     action_dim = 6,
#                     hidden_layer_shape = 256,
#                     device = DEVICE,
#                     lr = 0,
#                     gamma = 0,
#                     batch_size = 0,
#                     epsilon = 0)
# agent.load("Logging/DDQN-BASELINE/20240414-052734-lr-0.0005", 1, 10589200, buffer)
# agent.select_action_original = agent.select_action
# def select_action(self, state, greedy=False, deterministic=False):
#     action = self.select_action_original(state, deterministic=greedy)
#     return action, None
# agent.select_action = types.MethodType(select_action, agent)
# models.append({
#     "name": "DDQN - Expert training",
#     "agent": agent
# })

# DDQN Self-play
# TODO: Choose the model to load
# buffer = PrioritizedReplayBuffer(
#         buffer_size = 1, 
#         state_dim = 1, 
#         alpha = 1, 
#         beta_init = 1, 
#         device = DEVICE
#     )
# agent = DDQN_Agent(state_dim = 12,
#                     action_dim = 6,
#                     hidden_layer_shape = 256,
#                     device = DEVICE,
#                     lr = 0,
#                     gamma = 0,
#                     batch_size = 0,
#                     epsilon = 0)
# agent.load("Logging/DDQN-BASELINE/20240414-052734-lr-0.0005", 1, 10589200, buffer)
# agent.select_action_original = agent.select_action
# def select_action(self, state, greedy=False, deterministic=False):
#     action = self.select_action_original(state, deterministic=greedy)
#     return action, None
# agent.select_action = types.MethodType(select_action, agent)
# models.append({
#     "name": "DDQN - Selfplay training",
#     "agent": agent
# })

# Baseline
agent = BaselinePolicy()


def evaluation_mode(self):
    """No-op, present so this agent exposes the same interface as the others."""
    pass


def select_action(self, state, greedy=False):
    """Return ``(action_value, None)`` for ``state``; ``greedy`` is ignored."""
    return convert_to_value(self.predict(state)), None


# Bind both adapters to this particular instance
agent.select_action = types.MethodType(select_action, agent)
agent.evaluation_mode = types.MethodType(evaluation_mode, agent)
models.append(dict(name="Expert baseline", agent=agent))

# Random agent
# The BaselinePolicy instance only serves as the object the methods are bound to;
# its own predict() is never called by select_action below
agent = BaselinePolicy()


def evaluation_mode(self):
    """No-op, present so this agent exposes the same interface as the others."""
    pass


def select_action(self, state, greedy=False):
    """Return ``(action_value, None)`` for an action sampled from ``env.action_space``.

    Both ``state`` and ``greedy`` are ignored.
    """
    return convert_to_value(env.action_space.sample()), None


# Bind both adapters to this particular instance
agent.select_action = types.MethodType(select_action, agent)
agent.evaluation_mode = types.MethodType(evaluation_mode, agent)
models.append(dict(name="Random baseline", agent=agent))


== CURRENT SYSTEM INFO ==
- OS: Windows-10-10.0.22631-SP0 10.0.22631
- Python: 3.10.4
- Stable-Baselines3: 2.3.0
- PyTorch: 2.2.2+cu118
- GPU Enabled: True
- Numpy: 1.23.1
- Cloudpickle: 3.0.0
- Gymnasium: 0.29.1
- OpenAI Gym: 0.21.0

== SAVED MODEL SYSTEM INFO ==
- OS: Linux-5.4.0-144-generic-x86_64-with-glibc2.27 # 161~18.04.1-Ubuntu SMP Fri Feb 10 15:55:22 UTC 2023
- Python: 3.9.12
- Stable-Baselines3: 2.3.0
- PyTorch: 1.13.0+cu116
- GPU Enabled: True
- Numpy: 1.26.4
- Cloudpickle: 2.2.1
- Gymnasium: 0.29.1
- OpenAI Gym: 0.26.2

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




# Evaluation loop

## Return evaluation

In [4]:
import os

NUM_EVALUATIONS = 1000
LOGGING_DIR = "Logging/EVALUATION"

# Create the output directory before the long evaluation starts: the final
# np.savez call would otherwise fail on a missing directory and the results
# of the whole run would be lost
os.makedirs(LOGGING_DIR, exist_ok=True)

# Make a returns matrix
# returns[i, j, e] holds the return recorded for models[i] playing models[j]
# in evaluation episode e (the last dimension indexes the evaluation episodes)
returns = np.zeros((len(models), len(models), NUM_EVALUATIONS))

# Each unordered pair of distinct models is played once (j > i); the mirrored
# entry [j, i] is filled with the negated return
for i in range(len(models)):
    for j in range(i+1, len(models)):

        # Extract the models
        agent1 = models[i]["agent"]
        agent2 = models[j]["agent"]

        # Run the evaluations
        # Set the model in evaluation mode
        agent1.evaluation_mode()
        agent2.evaluation_mode()

        # Run NUM_EVALUATIONS episodes and record the total return of each one
        for e in tqdm(range(NUM_EVALUATIONS)):

            # Initialize the variables
            # Both agents start from the observation returned by reset()
            state1 = env.reset()
            state2 = state1
            done = False
            total_return = 0

            while not done:

                # Select the actions for each agent
                with torch.no_grad():
                    action1, _ = agent1.select_action(state1, greedy=True)
                    action2, _ = agent2.select_action(state2, greedy=True)

                # Step the environment forward
                # agent1 supplies the main action, agent2 the `otherAction`;
                # agent2's next observation is read from info['otherObs']
                next_state1, reward, done, info = env.step(convert_to_vector(action1), otherAction=convert_to_vector(action2))
                next_state2 = info['otherObs']

                # Accumulate the reward returned by the environment (recorded
                # for agent1; agent2 gets the negated total below)
                total_return += reward

                # Update the states
                state1 = next_state1
                state2 = next_state2

            # Store the returns
            returns[i, j, e] = total_return
            returns[j, i, e] = -total_return

# Save the returns
np.savez(f"{LOGGING_DIR}/eval_returns.npz", returns)

## ELO evaluation

In [7]:
# Load the returns
# returns = np.load(f"{LOGGING_DIR}/eval_returns.npz")["arr_0"]

# Apply a single Elo update to both players of one game
def calculate_elos(elo1, elo2, s1, s2, K=32):
    """Return the updated ``(elo1, elo2)`` after one game.

    ``s1`` and ``s2`` are the scores obtained by player 1 and player 2, and
    ``K`` scales how far a rating moves per game.
    """
    # Expected score of player 1; player 2 gets the complement
    expected_1 = 1 / (1 + 10**((elo2 - elo1) / 400))
    expected_2 = 1 - expected_1

    # Each rating moves by K times (actual score - expected score)
    return elo1 + K * (s1 - expected_1), elo2 + K * (s2 - expected_2)

# Initialize the ELOs at 0
# (the update only depends on rating differences, so the starting value is
# just a common offset)
elos = np.zeros(len(models))

# Extract the array of (agent1, agent2, s1, s2)
# We do this to avoid replaying all the episodes
# Only the pairs with j > i were played, so rows are collected for exactly
# those games. A zero-filled array sized for every (i, j) combination would
# leave padding rows that get shuffled in and replayed as bogus
# (agent 0 vs agent 0, scores 0/0) games.
game_rows = []
for i in range(len(models)):
    for j in range(i+1, len(models)):
        for e in range(NUM_EVALUATIONS):
            if returns[i, j, e] > 0:
                # Win for agent i
                game_rows.append((i, j, 1, 0))
            elif returns[i, j, e] < 0:
                # Win for agent j
                game_rows.append((i, j, 0, 1))
            else:
                # Draw
                game_rows.append((i, j, 0.5, 0.5))

# reshape keeps the (n_games, 4) layout even when no game was collected
games = np.array(game_rows, dtype=float).reshape(-1, 4)

# Shuffle the list of games and update the elos based on the results
np.random.shuffle(games)
for first, second, s1, s2 in games:
    # The rows are floats; convert the agent ids back to integers before
    # using them as indices into `elos`
    first, second = int(first), int(second)
    elos[first], elos[second] = calculate_elos(elos[first], elos[second], s1, s2)

# Save the ELOs
np.savez(f"{LOGGING_DIR}/eval_elos.npz", elos)

# Load the ELOs
# elos = np.load(f"{LOGGING_DIR}/eval_elos.npz")["arr_0"]
