# Imports

In [5]:
import gym
import slimevolleygym
from Models.PPO.PPO_Agent import PPO_Agent
import torch
from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

# TensorBoard logging: every run of this cell gets its own timestamped directory
now = datetime.now()
logdir = f"Logging/PPO/Tensorboard/{now.strftime('%Y%m%d-%H%M%S')}/"
writer = SummaryWriter(log_dir=logdir)

# Setup environment

In [8]:
# Build the environment once just to report its spaces, then release it
env = gym.make("SlimeVolley-v0")
action_space, observation_space = env.action_space, env.observation_space
print(f"Action space: {action_space.n}")
print(f"Observation space: {observation_space.shape}")
env.close()

Action space: 3
Observation space: (12,)


# Device

In [9]:
# Prefer CUDA, then Apple's MPS backend, and fall back to the CPU otherwise
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
elif torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
else:
    DEVICE = torch.device('cpu')

# Print the device as a check
print("Device used: ", DEVICE)

Device used:  cuda:0


# Hyperparameters setup

In [10]:
# Reproducibility (read by train() for torch.manual_seed)
seed = 42

# Settings forwarded to both PPO_Agent constructors by train()
alpha = 3e-4
lamda = 0.95
gamma = 0.99
clip = 0.2  # From the paper
n_epochs = 4
batch_size = 5
horizon = 20  # Number of steps before training the agent

# Training-loop schedule
max_num_steps = 1_000_000  # Stop starting new episodes once this many steps were taken
num_episodes_before_test_runs = 100  # Evaluate every this many training episodes
num_test_runs = 10  # Episodes played per evaluation
num_steps_before_save = 500_000  # Step interval used for checkpointing

# Passed to train() but currently unused by it
threshold_test_return_to_update_opponents = 0.5
time_scale = 90

In [None]:
def evaluate(env, agent1, agent2, num_eval_episodes):
    """Play greedy evaluation episodes between two agents and return the mean episode return.

    Both actors are switched to evaluation mode for the duration of the call and
    are put back in training mode afterwards, even if an episode raises.

    Args:
        env: Two-player environment whose ``step`` accepts both agents' actions
            and returns ``(obs, reward, terminated, truncated, info)``.
        agent1: Agent whose action is passed first to ``env.step``.
        agent2: Agent whose action is passed second to ``env.step``.
        num_eval_episodes: Number of episodes to play; must be positive.

    Returns:
        The sum of the rewards returned by ``env.step`` over all episodes,
        divided by ``num_eval_episodes``.

    Raises:
        ValueError: If ``num_eval_episodes`` is not positive.
    """
    if num_eval_episodes <= 0:
        raise ValueError("num_eval_episodes must be a positive integer")

    # Set the models in evaluation mode
    agent1.actor.eval()
    agent2.actor.eval()

    total_return = 0
    try:
        for _ in tqdm(range(num_eval_episodes)):

            state, _ = env.reset()
            # slimevolleygym's multi-agent example starts both sides from the
            # same initial observation.
            opponent_state = state
            done = False
            while not done:

                with torch.no_grad():
                    # Each agent acts greedily on its own view of the game
                    action1, _, _ = agent1.select_action(torch.FloatTensor(state).to(DEVICE), greedy=True)
                    action2, _, _ = agent2.select_action(torch.FloatTensor(opponent_state).to(DEVICE), greedy=True)

                # Step the environment forward
                state, reward, terminated, truncated, info = env.step(action1.item(), action2.item())

                # slimevolleygym reports the second player's observation in
                # info["otherObs"]; fall back to the shared observation if the
                # environment does not provide it.
                opponent_state = info.get("otherObs", state)

                # Check if the episode is over
                done = terminated or truncated

                # Accumulate the reward returned by the environment
                # NOTE(review): presumably this is agent1's reward (zero-sum game) — confirm
                total_return += reward
    finally:
        # Always hand the models back in training mode
        agent1.actor.train()
        agent2.actor.train()

    # Return the average return
    return total_return / num_eval_episodes

In [None]:
def _as_env_action(action):
    """Return ``action.item()`` when the policy output exposes it, else the value unchanged."""
    return action.item() if hasattr(action, "item") else action


def train(alpha, lamb, gamma, horizon, batch_size, n_epochs, clip,
          max_num_steps, num_test_runs, num_episodes_before_test_runs,
          threshold_test_return_to_update_opponents, time_scale,
          num_steps_before_save, env_name="SlimeVolley-v0"):
    """Train two PPO agents against each other and log returns to TensorBoard.

    Relies on the notebook-level ``seed``, ``DEVICE``, ``writer`` and ``evaluate``.

    Args:
        alpha, lamb, gamma, batch_size, n_epochs, clip: Forwarded to both
            ``PPO_Agent`` constructors (``lamb`` as ``lamda``, ``n_epochs`` as
            ``num_epoch``).
        horizon: Forwarded to ``PPO_Agent``; also the step interval at which
            ``learn()`` is called on both agents.
        max_num_steps: No new episode is started once this many environment
            steps have been taken.
        num_test_runs: Number of episodes per call to ``evaluate``.
        num_episodes_before_test_runs: An evaluation is run every this many
            training episodes (including before the first one).
        threshold_test_return_to_update_opponents: Currently unused.
        time_scale: Currently unused.
        num_steps_before_save: Both agents are checkpointed at the first
            episode boundary after each multiple of this many steps.
        env_name: Gym id of the two-player environment to train in.

    Returns:
        None.
    """
    # NOTE(review): the directory name predates the move to the two-player
    # environment; it is kept unchanged so existing checkpoints stay in place.
    checkpoint_dir = "./Logging/PPO-CARTPOLE/Checkpoints"

    # Create the environment. env.step(action1, action2) below needs a
    # two-player environment; CartPole's step only accepts a single action.
    env = gym.make(env_name)
    torch.manual_seed(seed)
    # NOTE(review): the environment itself is not seeded — confirm whether
    # env.reset(seed=seed) is supported by the installed environment.

    try:
        # Create the player agents (identical configuration)
        # NOTE(review): the agents act through a discrete head of size
        # action_space.n — confirm env.step accepts that action format.
        agent_kwargs = dict(state_size=env.observation_space.shape,
                            action_size=env.action_space.n,
                            alpha=alpha,
                            lamda=lamb,
                            gamma=gamma,
                            horizon=horizon,
                            batch_size=batch_size,
                            num_epoch=n_epochs,
                            clip=clip,
                            device=DEVICE)
        agent1 = PPO_Agent(**agent_kwargs)
        agent2 = PPO_Agent(**agent_kwargs)

        n_steps = 0
        e = 0  # Episode number
        next_save_step = num_steps_before_save

        # Train the agents
        while n_steps < max_num_steps:

            # Checkpoint once the step counter has crossed the next threshold.
            # n_steps only reaches this line at episode boundaries, so an exact
            # "multiple of num_steps_before_save" test would almost never fire.
            if n_steps >= next_save_step:
                agent1.save_models(checkpoint_dir, 1, n_steps)
                agent2.save_models(checkpoint_dir, 2, n_steps)
                next_save_step = (n_steps // num_steps_before_save + 1) * num_steps_before_save

            if e % num_episodes_before_test_runs == 0:
                average_test_return = evaluate(env, agent1, agent2, num_test_runs)
                writer.add_scalar("AverageTestReturn-TrainStep", average_test_return, n_steps)
                writer.add_scalar("AverageTestReturn-TrainEpisode", average_test_return, e)
                writer.flush()

            # Reset the environment; slimevolleygym's multi-agent example starts
            # both sides from the same initial observation.
            state, _ = env.reset()
            opponent_state = state

            # Run an episode
            done = False
            total_return = 0
            while not done:

                # Select the action of each agent from its own observation
                action1, prob1, val1 = agent1.select_action(torch.FloatTensor(state).to(DEVICE))
                action2, prob2, val2 = agent2.select_action(torch.FloatTensor(opponent_state).to(DEVICE))

                # Step the environment forward with plain Python actions,
                # as evaluate() does
                next_state, reward, terminated, truncated, info = env.step(
                    _as_env_action(action1), _as_env_action(action2))

                # slimevolleygym reports the second player's observation in
                # info["otherObs"]; fall back to the shared observation if the
                # environment does not provide it.
                next_opponent_state = info.get("otherObs", next_state)

                # Check if the episode is done
                done = terminated or truncated

                # Store each agent's own experience.
                # Use terminated so only true episode ends are learned as terminal.
                # NOTE(review): the original passed next_state; PPO needs the
                # observation the action was sampled from, so the pre-step
                # observation is stored — confirm against PPO_Agent.remember.
                agent1.remember(state, action1, prob1, val1, reward, terminated)
                # NOTE(review): assumes a zero-sum game where the opponent's
                # reward is the negation of the returned reward — confirm.
                agent2.remember(opponent_state, action2, prob2, val2, -reward, terminated)

                # Update the states
                state, opponent_state = next_state, next_opponent_state

                # Track the return as reported by the environment
                total_return += reward

                # Train the agents if the number of steps is a multiple of the horizon
                if n_steps > 0 and n_steps % horizon == 0:
                    agent1.learn()
                    agent2.learn()

                # Increment the number of steps
                n_steps += 1

            # Increment the episode number
            e += 1

            # Log the training returns
            writer.add_scalar("TrainReturn-TrainStep", total_return, n_steps)
            writer.add_scalar("TrainReturn-TrainEpisode", total_return, e)
            writer.flush()
    finally:
        # Release the environment even if training is interrupted
        env.close()

In [None]:
# Launch training with the notebook-level hyperparameters
train(
    alpha=alpha,
    lamb=lamda,
    gamma=gamma,
    horizon=horizon,
    batch_size=batch_size,
    n_epochs=n_epochs,
    clip=clip,
    max_num_steps=max_num_steps,
    num_test_runs=num_test_runs,
    num_episodes_before_test_runs=num_episodes_before_test_runs,
    threshold_test_return_to_update_opponents=threshold_test_return_to_update_opponents,
    time_scale=time_scale,
    num_steps_before_save=num_steps_before_save,
)

100%|██████████| 10/10 [00:00<00:00, 75.00it/s]
100%|██████████| 10/10 [00:00<00:00, 37.08it/s]
100%|██████████| 10/10 [00:02<00:00,  4.22it/s]
100%|██████████| 10/10 [00:05<00:00,  1.89it/s]
100%|██████████| 10/10 [00:03<00:00,  3.05it/s]
100%|██████████| 10/10 [00:01<00:00,  5.56it/s]
 10%|█         | 1/10 [00:00<00:02,  3.42it/s]