In [1]:
# Inspired by: https://github.com/hardmaru/slimevolleygym/blob/master/training_scripts/train_ppo_selfplay.py

import os
import slimevolleygym
import numpy as np
from datetime import datetime
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import EvalCallback
from shutil import copyfile # keep track of generations
import torch

In [4]:
# Settings
SEED = 17
NUM_TIMESTEPS = int(2e7)  # total environment steps passed to model.learn()
EVAL_FREQ = 250000  # evaluation interval handed to the eval callback
EVAL_EPISODES = 100  # episodes per evaluation round
BEST_THRESHOLD = 0.5 # must achieve a mean score above this to replace prev best self
RENDER_MODE = False # keep this False for long unattended runs; True renders every step in rollout()

# Hyperparameters (forwarded verbatim to the A2C constructor in train())
learning_rate=0.0007
n_steps=5
gamma=0.99
gae_lambda=1.0
ent_coef=0.0
vf_coef=0.5
max_grad_norm=0.5
rms_prop_eps=1e-05
use_rms_prop=True
use_sde=False
sde_sample_freq=-1
rollout_buffer_class=None
rollout_buffer_kwargs=None
normalize_advantage=False
stats_window_size=100
policy_kwargs=None
verbose=0
# Prefer CUDA, then Apple MPS, then fall back to CPU.
if torch.cuda.is_available():
  device = torch.device('cuda:0')
elif torch.backends.mps.is_available():
  device = torch.device('mps')
else:
  device = torch.device('cpu')
_init_setup_model=True

# Log dir: one directory per run, tagged with the start time and key hyperparameters.
LOGDIR = f"./Logging/A2C-SELFPLAY-LIBRARY/{datetime.now().strftime('%Y%m%d-%H%M%S')}-lr-{learning_rate}-entcoef-{ent_coef}"
# makedirs (not mkdir) so missing parent directories are created on a fresh
# checkout, and exist_ok so re-running the cell within the same second is harmless.
os.makedirs(LOGDIR, exist_ok=True)

SyntaxError: cannot assign to literal (2679061148.py, line 28)

In [3]:
# wrapper over the normal single player env, but loads the best self play model
class SlimeVolleySelfPlayEnv(slimevolleygym.SlimeVolleyEnv):
  """Slime volley env that plays against the latest saved "history" model.

  The env registers itself as ``self.policy`` and exposes ``predict``;
  presumably the base env calls ``self.policy.predict`` to drive the
  opponent -- confirm against slimevolleygym.
  """

  def __init__(self):
    super().__init__()
    self.policy = self
    self.best_model = None           # loaded A2C model, or None until one exists
    self.best_model_filename = None  # path of the checkpoint currently loaded

  def predict(self, obs): # the policy
    """Return an action for ``obs``: random until a saved model has been loaded."""
    if self.best_model is not None:
      chosen_action, _ = self.best_model.predict(obs)
      return chosen_action
    return self.action_space.sample() # return a random action

  def reset(self):
    """Load the newest ``history*`` checkpoint from LOGDIR (if changed), then reset."""
    checkpoints = sorted(
      name for name in os.listdir(LOGDIR) if name.startswith("history")
    )
    if checkpoints:
      # Names sort lexicographically, so the last entry is the latest best model.
      latest_path = os.path.join(LOGDIR, checkpoints[-1])
      if latest_path != self.best_model_filename:
        print("loading model: ", latest_path)
        self.best_model_filename = latest_path
        if self.best_model is not None:
          del self.best_model
        self.best_model = A2C.load(latest_path, env=self)
    return super().reset()

# hacked it to only save new version of best model if beats prev self by BEST_THRESHOLD score
# after saving model, resets the best score to be BEST_THRESHOLD
class SelfPlayCallback(EvalCallback):
  """EvalCallback that archives ``best_model.zip`` as a new self-play generation.

  A new ``history_<generation>.zip`` copy is written only when
  ``best_mean_reward`` exceeds BEST_THRESHOLD; afterwards the bar is reset to
  BEST_THRESHOLD so the next generation has to beat the new opponent.
  """

  def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.best_mean_reward = BEST_THRESHOLD
    self.generation = 0

  def _on_step(self) -> bool:
    """Run the parent step, then snapshot the best model if it beat the threshold."""
    keep_going = super()._on_step()
    if not keep_going or not (self.best_mean_reward > BEST_THRESHOLD):
      return keep_going

    self.generation += 1
    print("SELFPLAY: mean_reward achieved:", self.best_mean_reward)
    print("SELFPLAY: new best model, bumping up generation to", self.generation)
    archive_name = "history_" + str(self.generation).zfill(8) + ".zip"
    copyfile(
      os.path.join(LOGDIR, "best_model.zip"),
      os.path.join(LOGDIR, archive_name),
    )
    # Reset the bar for the next generation.
    self.best_mean_reward = BEST_THRESHOLD
    return keep_going

def rollout(env, policy, render=None):
  """Play one episode of ``policy`` in ``env`` and return the summed reward.

  Args:
    env: object with ``reset()`` returning an observation, ``step(action)``
      returning ``(obs, reward, done, info)``, and ``render()``.
    policy: object whose ``predict(obs)`` returns ``(action, state)``.
    render: whether to call ``env.render()`` after every step. ``None``
      (the default) falls back to the module-level ``RENDER_MODE``, which
      preserves the previous behaviour.

  Returns:
    The sum of the rewards returned by ``env.step`` over the episode.
  """
  if render is None:
    render = RENDER_MODE

  obs = env.reset()
  done = False
  total_reward = 0

  while not done:
    action, _ = policy.predict(obs)  # second element is unused here
    obs, reward, done, _ = env.step(action)
    total_reward += reward

    if render:
      env.render()

  return total_reward

def train():
  """Train an A2C agent by self-play and save it under LOGDIR.

  Builds a SlimeVolleySelfPlayEnv, trains an A2C model on it for
  NUM_TIMESTEPS steps using the module-level hyperparameters, and writes the
  result to ``LOGDIR/final_model``. The same env instance is handed to
  SelfPlayCallback for periodic evaluation.

  The env is closed even when training is aborted (e.g. by interrupting the
  kernel), so it is not leaked.
  """
  env = SlimeVolleySelfPlayEnv()
  try:
    env.seed(SEED)

    model = A2C("MlpPolicy",
                env,
                learning_rate=learning_rate,
                n_steps=n_steps,
                gamma=gamma,
                gae_lambda=gae_lambda,
                ent_coef=ent_coef,
                vf_coef=vf_coef,
                max_grad_norm=max_grad_norm,
                rms_prop_eps=rms_prop_eps,
                use_rms_prop=use_rms_prop,
                use_sde=use_sde,
                sde_sample_freq=sde_sample_freq,
                rollout_buffer_class=rollout_buffer_class,
                rollout_buffer_kwargs=rollout_buffer_kwargs,
                normalize_advantage=normalize_advantage,
                stats_window_size=stats_window_size,
                tensorboard_log=LOGDIR,
                policy_kwargs=policy_kwargs,
                verbose=verbose,
                seed=SEED,
                device=device,
                _init_setup_model=_init_setup_model)

    eval_callback = SelfPlayCallback(env,
                    best_model_save_path=LOGDIR,
                    log_path=LOGDIR,
                    eval_freq=EVAL_FREQ,
                    n_eval_episodes=EVAL_EPISODES)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)
    model.save(os.path.join(LOGDIR, "final_model"))
  finally:
    # Runs on normal completion, on errors, and on KeyboardInterrupt.
    env.close()

# Start training only when this code runs as the main module.
if __name__ == "__main__":
  train()

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




Logging to ./Logging/A2C-SELFPLAY-LIBRARY/20240415-094421\A2C_1




------------------------------------
| time/                 |          |
|    fps                | 249      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -2.08    |
|    explained_variance | -298     |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.00401 |
|    value_loss         | 0.00108  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 864      |
|    ep_rew_mean        | 1        |
| time/                 |          |
|    fps                | 273      |
|    iterations         | 200      |
|    time_elapsed       | 3        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -2.04    |
|    explained_variance | -282     |
|    learning_rate      | 0.0007   |
|

KeyboardInterrupt: 