In [26]:
# Inspired by: https://github.com/hardmaru/slimevolleygym/blob/master/training_scripts/train_ppo.py
import os
import gym
import slimevolleygym
from datetime import datetime
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import EvalCallback
import torch
from stable_baselines3.common.env_util import make_vec_env

In [27]:
# --- Run-level settings ------------------------------------------------------
NUM_TIMESTEPS = int(5e7)   # total training budget passed to model.learn()
SEED = 721                 # shared by the vectorized env and the A2C model
EVAL_EPISODES = 100        # episodes played per evaluation round
n_cpu = 50                 # number of parallel environment copies
# Evaluation period: a 500000-step budget divided by the number of parallel envs.
EVAL_FREQ = 500000 // n_cpu

# --- A2C hyperparameters (forwarded as keyword arguments to the A2C constructor)
learning_rate = 0.0007
n_steps = 5
gamma = 0.99
gae_lambda = 1.0
ent_coef = 0
vf_coef = 0.5
max_grad_norm = 0.5
rms_prop_eps = 1e-05
use_rms_prop = True
use_sde = False
sde_sample_freq = -1
rollout_buffer_class = None
rollout_buffer_kwargs = None
normalize_advantage = False
stats_window_size = 100
policy_kwargs = None
verbose = 1
_init_setup_model = True

# --- Compute device: first CUDA GPU if present, else Apple MPS, else CPU ------
if torch.cuda.is_available():
    device = torch.device('cuda:0')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

# --- Output directory ----------------------------------------------------------
# Timestamped per run and tagged with the learning rate and entropy coefficient;
# used for TensorBoard logs, evaluation results and saved models.
LOGDIR = f"./Logging/A2C-BASELINE-LIBRARY/{datetime.now().strftime('%Y%m%d-%H%M%S')}-lr-{learning_rate}-entcoef-{ent_coef}"

In [None]:
# Training environments: n_cpu copies of SlimeVolley, seeded starting from SEED.
vec_env = make_vec_env(slimevolleygym.SlimeVolleyEnv, n_envs=n_cpu, seed=SEED)

# Dedicated evaluation environments.
# Evaluating on `vec_env` itself would reset and step the training envs in the
# middle of a rollout (leaving the model's cached observation stale) and would
# mix evaluation episodes into the training episode statistics.
# The seed is offset by n_cpu so the evaluation copies do not reuse the seeds
# assigned to the training copies.
eval_env = make_vec_env(slimevolleygym.SlimeVolleyEnv, n_envs=n_cpu, seed=SEED + n_cpu)

# A2C agent with an MLP policy; every hyperparameter comes from the config cell.
model = A2C(
    "MlpPolicy",
    vec_env,
    learning_rate=learning_rate,
    n_steps=n_steps,
    gamma=gamma,
    gae_lambda=gae_lambda,
    ent_coef=ent_coef,
    vf_coef=vf_coef,
    max_grad_norm=max_grad_norm,
    rms_prop_eps=rms_prop_eps,
    use_rms_prop=use_rms_prop,
    use_sde=use_sde,
    sde_sample_freq=sde_sample_freq,
    rollout_buffer_class=rollout_buffer_class,
    rollout_buffer_kwargs=rollout_buffer_kwargs,
    normalize_advantage=normalize_advantage,
    stats_window_size=stats_window_size,
    tensorboard_log=LOGDIR,
    policy_kwargs=policy_kwargs,
    verbose=verbose,
    seed=SEED,
    device=device,
    _init_setup_model=_init_setup_model,
)

# Periodic evaluation on the separate envs; the best model and the evaluation
# log are both written under LOGDIR.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=LOGDIR,
    log_path=LOGDIR,
    eval_freq=EVAL_FREQ,
    n_eval_episodes=EVAL_EPISODES,
)

model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

# Persist the final policy next to the logs and best-model checkpoint.
model.save(os.path.join(LOGDIR, "final_model"))

Using cuda:0 device
Logging to ./Logging/A2C-BASELINE-LIBRARY/20240415-184854-lr-0.0007-entcoef-0/A2C_1




------------------------------------
| rollout/              |          |
|    ep_len_mean        | 447      |
|    ep_rew_mean        | -5       |
| time/                 |          |
|    fps                | 6679     |
|    iterations         | 100      |
|    time_elapsed       | 3        |
|    total_timesteps    | 25000    |
| train/                |          |
|    entropy_loss       | -1.89    |
|    explained_variance | 0.847    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.0226  |
|    value_loss         | 0.011    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 598      |
|    ep_rew_mean        | -4.95    |
| time/                 |          |
|    fps                | 6657     |
|    iterations         | 200      |
|    time_elapsed       | 7        |
|    total_timesteps    | 50000    |
| train/                |          |
|