In [1]:
import slimevolleygym
from Models.A2C.A2C_Agent import A2C_Agent
import torch
from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
from slimevolleygym import BaselinePolicy
import numpy as np
from utils import convert_to_vector, convert_to_value
import types
from IPython.display import clear_output

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create the environment and report the sizes of its action and observation
# spaces. The instance is intentionally NOT closed here: the sweep cell further
# down reads `env.observation_space` / `env.action_space` and trains on this
# same object, and using an environment after `close()` is not something the
# Gym API guarantees to work.
env = slimevolleygym.SlimeVolleyEnv()
print(f"Action space: {env.action_space.n}")
print(f"Observation space: {env.observation_space.shape}")

Action space: 3
Observation space: (12,)


In [3]:
# Choose the compute device in priority order: first CUDA GPU, then Apple's
# MPS backend, and finally the CPU when neither accelerator is available.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
elif torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
else:
    DEVICE = torch.device('cpu')

# Echo the selection so the run log records which device was used.
print("Device used: ", DEVICE)

Device used:  cpu


In [4]:
# ---------------------------------------------------------------------------
# Hyperparameters
# ---------------------------------------------------------------------------

# Rollout size and network shape.
timesteps_per_batch = 4096     # timesteps collected per batch
mlp_layers = [64, 64]          # neurons in each MLP layer
render = False                 # render the environment while running?

# Optimiser settings.
lr_choices = [3e-4]            # learning rates to sweep (shared by actor and critic)
eps = 1e-5                     # Adam epsilon
max_grad_norm = 0.5            # gradient-clipping threshold

# Return / advantage estimation and regularisation.
gamma = 0.99                   # discount factor for rewards-to-go
lam = 0.95                     # lambda for GAE
ent_coef = 0                   # entropy-regularisation coefficient

# The four settings below are not referenced by any of the cells shown in this
# notebook; they are kept so that code elsewhere relying on them keeps working.
n_updates_per_iteration = 10   # actor/critic updates per iteration
clip = 0.2                     # ratio-clipping threshold (0.2 recommended)
num_minibatches = 6            # mini-batches per update
target_kl = 0.03               # KL-divergence threshold

# ---------------------------------------------------------------------------
# Custom parameters
# ---------------------------------------------------------------------------
seed = 42
max_num_steps = 20_000_000               # total environment steps to train for
num_test_runs_vs_baseline = 50           # evaluation episodes against the baseline policy
num_test_runs_vs_random = 10             # evaluation episodes against a random opponent
num_iterations_before_test_runs = 150    # evaluate every this many training iterations
num_iterations_before_save = 100         # checkpoint every this many training iterations

In [5]:

def evaluate(env, agent1, num_eval_episodes, agent2="random"):
    """Play evaluation episodes with ``agent1`` and return its mean episode return.

    ``agent1`` is switched to evaluation mode, acts greedily for
    ``num_eval_episodes`` complete episodes, and is always switched back to
    training mode afterwards -- even if an episode raises.

    Args:
        env: Environment whose ``step`` accepts an ``otherAction`` keyword and
            returns the opponent's observation under ``info['otherObs']``.
        agent1: Agent being evaluated. Must provide ``evaluation_mode()``,
            ``training_mode()`` and ``select_action(state, greedy=True)``
            returning an ``(action, extra)`` pair.
        num_eval_episodes: Number of episodes to play; must be at least 1.
        agent2: Either the string ``"random"`` (opponent actions are sampled
            from ``env.action_space``) or an object providing
            ``select_action(state)`` that returns an ``(action, extra)`` pair.

    Returns:
        The sum of ``agent1``'s rewards over all episodes divided by
        ``num_eval_episodes``.

    Raises:
        ValueError: If ``num_eval_episodes`` is smaller than 1.
    """
    # Reject an empty evaluation up front instead of failing later with a
    # ZeroDivisionError after the agent's mode has already been toggled.
    if num_eval_episodes < 1:
        raise ValueError(
            f"num_eval_episodes must be at least 1, got {num_eval_episodes}"
        )

    agent1.evaluation_mode()
    try:
        total_return = 0
        for _ in range(num_eval_episodes):
            # Both players start from the observation returned by reset().
            state1 = env.reset()
            state2 = state1
            done = False

            while not done:
                # No gradients are needed while only selecting actions.
                with torch.no_grad():
                    action1, _ = agent1.select_action(state1, greedy=True)

                    if agent2 == "random":
                        action2 = convert_to_value(env.action_space.sample())
                    else:
                        action2, _ = agent2.select_action(state2)

                state1, reward, done, info = env.step(
                    convert_to_vector(action1),
                    otherAction=convert_to_vector(action2),
                )
                # The opponent's view of the new state is delivered via `info`.
                state2 = info['otherObs']

                total_return += reward
    finally:
        # Restore training mode no matter how the evaluation ended, so a
        # failed or interrupted evaluation cannot leave the agent in eval mode.
        agent1.training_mode()

    return total_return / num_eval_episodes

In [6]:
def train_a2c(env, agent1, timesteps_per_actor, lr, eps, gamma, lam, ent_coef, max_grad_norm, seed, max_num_steps,
              num_test_runs_vs_baseline, num_iterations_before_test_runs, num_iterations_before_save,
              num_test_runs_vs_random, writer, logging_dir, mlp_layers, render, HP_string, timesteps_per_batch):
    """Train ``agent1`` against the SlimeVolley baseline policy.

    Each iteration gathers a batch via ``agent1.gather_data``, runs one
    ``agent1.learn`` update on it, and advances the step counter by the total
    length of the gathered episodes. At fixed iteration intervals the agent is
    evaluated (against the baseline and against a random opponent) and
    checkpointed. A final checkpoint is written when training completes, and
    ``writer`` is closed on every exit path.

    Args:
        env: Environment to train on; it is seeded with ``seed``.
        agent1: Agent to train. Must provide ``gather_data(env, opponent)``,
            ``learn(...)`` and ``save_models(...)``.
        seed: Seed applied to torch, NumPy and ``env``.
        max_num_steps: Training stops once this many steps have been gathered.
        num_test_runs_vs_baseline: Evaluation episodes against the baseline.
        num_test_runs_vs_random: Evaluation episodes against a random opponent.
        num_iterations_before_test_runs: Evaluate every this many iterations.
        num_iterations_before_save: Checkpoint every this many iterations.
        writer: TensorBoard ``SummaryWriter`` used for logging; closed on exit.
        logging_dir: Directory handed to ``agent1.save_models``.
        HP_string: Human-readable run label shown in the progress line.
        timesteps_per_actor, lr, eps, gamma, lam, ent_coef, max_grad_norm,
        mlp_layers, render, timesteps_per_batch: Accepted so existing callers
            keep working; this function does not read them.

    Returns:
        None.
    """
    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Wrap the baseline policy so it exposes the same `select_action`
    # interface as the learning agent (it natively only offers `predict`).
    opponent = BaselinePolicy()

    def _baseline_select_action(self, state, greedy=False):
        # `greedy` is accepted for signature compatibility and ignored.
        action = self.predict(state)
        return convert_to_value(action), None

    opponent.select_action = types.MethodType(_baseline_select_action, opponent)

    n_steps = 0
    iteration = 0

    try:
        while n_steps < max_num_steps:
            clear_output(wait=True)
            print(f"{HP_string} Training step {n_steps}/{max_num_steps} ({n_steps/max_num_steps*100:.2f}%)")

            (batch_obs, batch_acts, batch_log_probs, batch_rews,
             batch_lens, batch_vals, batch_dones) = agent1.gather_data(env, opponent)

            agent1.learn(batch_obs=batch_obs,
                         batch_acts=batch_acts,
                         batch_log_probs=batch_log_probs,
                         batch_rews=batch_rews,
                         batch_vals=batch_vals,
                         batch_dones=batch_dones,
                         writer=writer,
                         n_steps_so_far=n_steps)

            # Keep the counter a plain Python int: np.sum returns a NumPy
            # scalar, which would otherwise leak into the TensorBoard step
            # and the checkpoint tag.
            n_steps += int(np.sum(batch_lens))

            # NOTE(review): evaluate() resets the shared `env`; whether
            # gather_data relies on env state carried across iterations is
            # not visible from this function -- confirm.
            if (iteration + 1) % num_iterations_before_test_runs == 0:
                average_test_return_baseline = evaluate(env, agent1, num_test_runs_vs_baseline, opponent)
                writer.add_scalar("Evaluation/Average baseline test return", average_test_return_baseline, n_steps)

                average_test_return_random = evaluate(env, agent1, num_test_runs_vs_random, "random")
                writer.add_scalar("Evaluation/Average random test return", average_test_return_random, n_steps)

            if (iteration + 1) % num_iterations_before_save == 0:
                # The literal 1 is passed through unchanged; its meaning is
                # defined by A2C_Agent.save_models (presumably an agent id --
                # TODO confirm).
                agent1.save_models(logging_dir, 1, n_steps)

            writer.flush()
            iteration += 1

        # Final checkpoint once the step budget has been reached.
        agent1.save_models(logging_dir, 1, n_steps)
    finally:
        # Release the event-file writer even if training raised or was
        # interrupted, so buffered events are not left dangling.
        writer.close()


In [7]:
# Sweep over the candidate learning rates. Every run gets its own TensorBoard
# log directory, a freshly constructed agent, and a full training session.
# `enumerate` supplies the run index used in the progress label; previously
# the index was initialised to 0 and never advanced, so every run was
# labelled "0)".
for i, lr in enumerate(lr_choices):

    # Label shown in the live progress line printed during training.
    HP_string = f"{i}) LR: {lr}, Ent_Coef: {ent_coef}, Layers: {mlp_layers[0]}, MaxGradNorm: {max_grad_norm}"

    # Timestamped directory so repeated runs never overwrite each other's logs.
    logging_dir = f"./Logging/A2C-BASELINE/{datetime.now().strftime('%Y%m%d-%H%M%S')}-lr-{lr}-entcoef-{ent_coef}-mlp-{mlp_layers[0]}-maxgradnorm-{max_grad_norm}"
    writer = SummaryWriter(logging_dir)

    # `env` is the environment instance created in an earlier cell; its spaces
    # size the agent's networks. `n_actors` is fixed at 16 here rather than
    # taken from the hyperparameter cell.
    agent = A2C_Agent(obs_dim=env.observation_space.shape[0],
                      act_dim=env.action_space.n,
                      DEVICE=DEVICE,
                      timesteps_per_actor=timesteps_per_batch,
                      n_actors=16,
                      lr=lr,
                      eps=eps,
                      gamma=gamma,
                      lam=lam,
                      ent_coef=ent_coef,
                      max_grad_norm=max_grad_norm,
                      mlp_layers=mlp_layers,
                      render=render,
                      timesteps_per_batch=timesteps_per_batch)

    train_a2c(env=env,
              agent1=agent,
              timesteps_per_actor=timesteps_per_batch,
              lr=lr,
              eps=eps,
              gamma=gamma,
              lam=lam,
              ent_coef=ent_coef,
              max_grad_norm=max_grad_norm,
              seed=seed,
              max_num_steps=max_num_steps,
              num_test_runs_vs_baseline=num_test_runs_vs_baseline,
              num_iterations_before_test_runs=num_iterations_before_test_runs,
              num_iterations_before_save=num_iterations_before_save,
              num_test_runs_vs_random=num_test_runs_vs_random,
              writer=writer,
              logging_dir=logging_dir,
              mlp_layers=mlp_layers,
              render=render,
              HP_string=HP_string,
              timesteps_per_batch=timesteps_per_batch)

    # train_a2c closes the writer when it finishes normally; this extra close
    # is kept as a harmless safety net.
    writer.close()

    

0) LR: 0.0003, Ent_Coef: 0, Layers: 64, MaxGradNorm: 0.5 Training step 70165/20000000 (0.35%)


Exception in thread Thread-5:
Traceback (most recent call last):
  File "c:\Users\Admin\Desktop\579final\SlimeVolleyball\SlimeVolleyball\.venv\lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_7788\3893934939.py", line 28, in <module>
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_7788\2665278339.py", line 51, in train_a2c
  File "c:\Users\Admin\Desktop\579final\SlimeVolleyball\SlimeVolleyball\.venv\lib\site-packages\torch\utils\tensorboard\writer.py", line 1256, in flush
    writer.flush()
  File "c:\Users\Admin\Desktop\579final\SlimeVolleyball\SlimeVolleyball\.venv\lib\site-packages\torch\utils\tensorboard\writer.py", line 152, in flush
    self.event_writer.flush()
  File "c:\Users\Admin\Desktop\579final\SlimeVolleyball\SlimeVolleyball\.venv\lib\site-packages\tensorboard\summary\writer\event_file_writer.py", line 125, in flush
    self._async_write

OverflowError: timeout value is too large