In [1]:
# Inspired by: https://github.com/hardmaru/slimevolleygym/blob/master/training_scripts/train_ppo_selfplay.py

import os
import slimevolleygym
import numpy as np
from datetime import datetime
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.callbacks import BaseCallback
from shutil import copyfile # keep track of generations
import torch
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from slimevolleygym import BaselinePolicy
from torch.utils.tensorboard import SummaryWriter

In [2]:
# --- Reproducibility and training budget ---
SEED = 17
NUM_TIMESTEPS = int(5e7)  # passed to model.learn(total_timesteps=...)

# --- Evaluation settings ---
EVAL_EPISODES_SELFPLAY = 100  # episodes per self-play evaluation
EVAL_EPISODES_BASELINE = 50  # episodes per evaluation against the baseline opponent
EVAL_EPISODES_RANDOM = 10  # NOTE(review): not referenced by the visible code
BEST_THRESHOLD = 0.5 # must achieve a mean score above this to replace prev best self
RENDER_MODE = False # when True, rollout() renders every step; keep False for long runs
n_cpu = 50  # number of parallel environments handed to make_vec_env
# The callbacks compare eval_freq against their call counter; dividing by n_cpu
# presumably targets ~250000 environment steps between evaluations (assuming
# one callback call advances all n_cpu envs) -- TODO confirm.
EVAL_FREQ = 250000 // n_cpu

# --- A2C hyperparameters (forwarded as-is to the A2C constructor in train()) ---
learning_rate=0.0007
n_steps=5
gamma=0.99
gae_lambda=1.0
ent_coef=0.1
vf_coef=0.5
max_grad_norm=0.5
rms_prop_eps=1e-05
use_rms_prop=True
use_sde=False
sde_sample_freq=-1
rollout_buffer_class=None
rollout_buffer_kwargs=None
normalize_advantage=False
stats_window_size=100
policy_kwargs=None
verbose=1
# Prefer CUDA, then Apple MPS, and fall back to CPU.
device=torch.device('cuda:0' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
_init_setup_model=True

# Log dir: one timestamped folder per run, tagged with the key hyperparameters.
LOGDIR = f"./Logging/A2C-SELFPLAY-LIBRARY/{datetime.now().strftime('%Y%m%d-%H%M%S')}-lr-{learning_rate}-entcoef-{ent_coef}"
# makedirs (not mkdir) so the run also works on a fresh checkout where the parent
# folders do not exist yet; exist_ok keeps re-running this cell harmless.
os.makedirs(LOGDIR, exist_ok=True)

In [3]:
# Wrapper over the normal single-player env that plays against the latest
# "best" self-play checkpoint found in LOGDIR.
class SlimeVolleySelfPlayEnv(slimevolleygym.SlimeVolleyEnv):
  """SlimeVolley environment whose opponent is the most recent saved best model.

  The env installs itself as ``self.policy`` and exposes ``predict``;
  presumably the base environment queries ``self.policy.predict(obs)`` for the
  opponent's action (verify against slimevolleygym). Until a ``history*``
  checkpoint exists in LOGDIR the opponent acts randomly.
  """

  def __init__(self):
    super(SlimeVolleySelfPlayEnv, self).__init__()
    self.policy = self
    self.best_model = None            # opponent model, None until a checkpoint is loaded
    self.best_model_filename = None   # path of the checkpoint currently loaded

  def predict(self, obs): # the policy
    """Return the opponent's action for ``obs`` (random if no model is loaded)."""
    if self.best_model is None:
      return self.action_space.sample() # return a random action
    action, _ = self.best_model.predict(obs)
    return action

  def reset(self):
    """Load the newest ``history*`` checkpoint if it changed, then reset the env.

    Raises whatever ``A2C.load`` raises. In that case the previously loaded
    opponent (or the random fallback) is left untouched, so the env stays
    usable and the load is retried on the next reset.
    """
    # File names are zero-padded by the saver, so lexicographic order is
    # chronological and the last entry is the latest best model.
    modellist = sorted(f for f in os.listdir(LOGDIR) if f.startswith("history"))
    if modellist:
      filename = os.path.join(LOGDIR, modellist[-1])
      if filename != self.best_model_filename:
        # Load into a local first and only commit on success, so a failed load
        # cannot leave the env without a `best_model` attribute or with a
        # filename recorded for a model that was never loaded.
        # NOTE(review): `weights_only` does not appear to be a documented
        # parameter of stable-baselines3's `load`; it may never reach
        # `torch.load` (the UnpicklingError in this notebook's output
        # suggests as much) -- confirm against the installed SB3/torch versions.
        new_model = A2C.load(filename, env=self, weights_only=False)
        self.best_model = new_model
        self.best_model_filename = filename
    return super(SlimeVolleySelfPlayEnv, self).reset()

# A new "best" checkpoint is archived only when the agent beats its previous
# self by more than BEST_THRESHOLD; the bar is then reset to BEST_THRESHOLD.
class SelfPlayCallback(EvalCallback):
  """EvalCallback that archives each new best model as a numbered generation."""

  def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    # Start the bar at the threshold rather than the parent's default.
    self.best_mean_reward = BEST_THRESHOLD
    self.generation = 0

  def _on_step(self) -> bool:
    """Run the parent's step, then snapshot best_model.zip if the bar was beaten."""
    keep_training = super()._on_step()

    # Nothing to archive unless the parent succeeded and raised the best score.
    if not keep_training or not (self.best_mean_reward > BEST_THRESHOLD):
      return keep_training

    self.generation += 1
    snapshot_name = f"history_{self.generation:08d}.zip"
    copyfile(
      os.path.join(LOGDIR, "best_model.zip"),
      os.path.join(LOGDIR, snapshot_name),
    )
    # Reset the bar so the next generation again has to beat BEST_THRESHOLD.
    self.best_mean_reward = BEST_THRESHOLD
    return keep_training

class BaselineEvalCallback(BaseCallback):
    """
    A custom callback that derives from ``BaseCallback``.

    Used to evaluate the agent against the baseline every certain number of iterations
    and to write the results to TensorBoard, indexed both by callback call count
    and by self-play generation.

    :param model: The model being trained; evaluated with ``evaluate_policy``
    :param eval_freq: Evaluate every ``eval_freq`` calls of this callback
    :param num_evals: Number of evaluation episodes per evaluation
    :param n_cpu: Number of parallel environments used for the evaluation
    :param selfplaycallback: Callback exposing a ``generation`` counter
    :param verbose: Verbosity level: 0 for no output, 1 for info messages, 2 for debug messages
    """
    def __init__(self, model, eval_freq, num_evals, n_cpu, selfplaycallback, verbose: int = 0):
        super().__init__(verbose)

        # Store the instance variables
        self.model = model
        self.eval_freq = eval_freq
        self.num_evals = num_evals
        self.baseline = BaselinePolicy()  # NOTE(review): not used by the visible code
        self.last_generation = 0
        self.selfplaycallback = selfplaycallback # Store the selfplay callback to access the generation number

        # Make the regular environment with the opponent being the baseline instead of selfplay for this callback
        self.vecenv = make_vec_env(slimevolleygym.SlimeVolleyEnv, n_envs=n_cpu, seed=SEED)

        # Create a summarywriter at the logdir
        self.writer = SummaryWriter(log_dir=LOGDIR)

    def _on_step(self) -> bool:
        """
        This method will be called by the model after each call to `env.step()`.

        For child callback (of an `EventCallback`), this will be called
        when the event is triggered.

        :return: If the callback returns False, training is aborted early.
        """

        if self.n_calls > 0 and self.n_calls % self.eval_freq == 0:

            # Evaluate the model in the vectorized environment
            mean_reward, std_reward = evaluate_policy(self.model, self.vecenv, n_eval_episodes=self.num_evals)

            # Log the mean and std reward
            self.writer.add_scalar("Average baseline test return - Training step", mean_reward, self.n_calls)
            self.writer.add_scalar("Baseline test return standard deviation - Training step", std_reward, self.n_calls)

            # If the generation number increased
            if self.selfplaycallback.generation > self.last_generation:

                # Update the generation number to match
                self.last_generation = self.selfplaycallback.generation

                # Log the mean and std reward, indexed by generation.
                # The std tag must receive std_reward (it was previously fed mean_reward).
                self.writer.add_scalar("Average baseline test return - Generation", mean_reward, self.last_generation)
                self.writer.add_scalar("Baseline test return standard deviation - Generation", std_reward, self.last_generation)

        return True

    def _on_training_end(self) -> None:
        """Flush pending scalars so the last evaluations reach disk."""
        self.writer.flush()

def rollout(env, policy):
  """Run a single episode of `policy` in `env` and return the summed reward.

  `policy.predict(obs)` must return an `(action, state)` pair and `env.step`
  a 4-tuple `(obs, reward, done, info)`. When the module-level RENDER_MODE
  flag is truthy, the env is rendered after every step.
  """
  observation = env.reset()
  episode_return = 0

  while True:
    chosen_action, _ = policy.predict(observation)
    observation, step_reward, finished, _info = env.step(chosen_action)
    episode_return += step_reward

    if RENDER_MODE:
      env.render()

    if finished:
      return episode_return

def train():
  """Train an A2C agent by self-play and save the final model under LOGDIR.

  Two callbacks run during learning: SelfPlayCallback evaluates against the
  current best self and archives new generations, and BaselineEvalCallback
  evaluates against the baseline opponent and logs to TensorBoard.
  """

  vec_env = make_vec_env(SlimeVolleySelfPlayEnv, n_envs=n_cpu, seed=SEED)

  # Dedicated evaluation envs: evaluating on the training envs would reset and
  # step them mid-rollout, leaving the learner with stale observations and
  # mixing evaluation episodes into the training environments' state.
  eval_env = make_vec_env(SlimeVolleySelfPlayEnv, n_envs=n_cpu, seed=SEED)

  model = A2C("MlpPolicy", 
              vec_env, 
              learning_rate=learning_rate, 
              n_steps=n_steps, 
              gamma=gamma, 
              gae_lambda=gae_lambda, 
              ent_coef=ent_coef, 
              vf_coef=vf_coef, 
              max_grad_norm=max_grad_norm,
              rms_prop_eps=rms_prop_eps, 
              use_rms_prop=use_rms_prop, 
              use_sde=use_sde, 
              sde_sample_freq=sde_sample_freq, 
              rollout_buffer_class=rollout_buffer_class, 
              rollout_buffer_kwargs=rollout_buffer_kwargs,
              normalize_advantage=normalize_advantage, 
              stats_window_size=stats_window_size, 
              tensorboard_log=LOGDIR, 
              policy_kwargs=policy_kwargs, 
              verbose=verbose, 
              seed=SEED, 
              device=device,
              _init_setup_model=_init_setup_model)

  selfplay_eval_callback = SelfPlayCallback(eval_env,
                  best_model_save_path=LOGDIR,
                  log_path=LOGDIR,
                  eval_freq=EVAL_FREQ,
                  n_eval_episodes=EVAL_EPISODES_SELFPLAY)
  
  baseline_eval_callback = BaselineEvalCallback(model, 
                                                EVAL_FREQ, 
                                                EVAL_EPISODES_BASELINE, 
                                                n_cpu, 
                                                selfplay_eval_callback)

  # The self-play callback is listed first; the baseline callback reads its
  # generation counter (presumably callbacks run in list order -- TODO confirm).
  model.learn(total_timesteps=NUM_TIMESTEPS, callback=[selfplay_eval_callback, baseline_eval_callback])
  model.save(os.path.join(LOGDIR, "final_model"))

# Entry point: start training when this cell/script is run directly.
if __name__ == "__main__":
  train()



Using cuda:0 device
Logging to ./Logging/A2C-SELFPLAY-LIBRARY/20240416-120029-lr-0.0007-entcoef-0.1/A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 447      |
|    ep_rew_mean        | -2.83    |
| time/                 |          |
|    fps                | 5508     |
|    iterations         | 100      |
|    time_elapsed       | 4        |
|    total_timesteps    | 25000    |
| train/                |          |
|    entropy_loss       | -2.08    |
|    explained_variance | 0.878    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.0749  |
|    value_loss         | 0.0563   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 624      |
|    ep_rew_mean        | -0.0588  |
| time/                 |          |
|    fps                | 5715     |
|    iterations         | 200      |
|    t

UnpicklingError: Weights only load failed. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution.Do it only if you get the file from a trusted source. WeightsUnpickler error: Unsupported operand 71