In [1]:
import gym

# Eight copies of Pendulum-v0 behind a single vectorised interface.
env = gym.vector.make(
    "Pendulum-v0",
    num_envs=8,
    asynchronous=True,
)
env.seed(0)  # fixed seed passed to the vectorised environment



In [4]:
from sac import SAC

# The agent is constructed from the vector env's `single_*` spaces.
actor = SAC(
    env.single_observation_space,
    env.single_action_space,
)

In [7]:
import math
import time
from collections import deque

import numpy as np

def train(agent, env, total_timesteps):
    """Interact with a vectorised environment and train ``agent`` online.

    Each loop iteration asks the agent for a batch of actions, steps every
    sub-environment once, hands the transition batch to ``agent.remember`` and
    then calls ``agent.train(64)``. A one-line progress report is rewritten in
    place (carriage return, no newline) after every iteration.

    Parameters
    ----------
    agent
        Object providing ``act(observations)``,
        ``remember(obs, actions, rewards, next_obs, dones, n_envs=...)`` and
        ``train(batch_size)``.
    env
        Vectorised environment exposing ``num_envs``, ``reset()`` and
        ``step(actions)``; ``step`` must return a 4-tuple
        ``(observations, rewards, dones, info)`` with one entry per
        sub-environment.
    total_timesteps : int
        Budget of environment steps, summed over all sub-environments. The
        loop stops once at least this many steps have been taken.

    Returns
    -------
    timesteps : numpy.ndarray
        The loop iteration indices ``0, 1, 2, ...`` (one per vectorised step).
        Despite the name these are iteration counts, not environment steps.
    avg_total_rewards : list of tuple
        ``(iteration, timestep, mean_return)`` entries, appended on every
        iteration once at least one episode has finished. ``mean_return`` is
        the mean undiscounted return of the (up to) 100 most recently
        finished episodes.
    """
    timesteps = []
    avg_total_rewards = []

    # Returns of the latest finished episodes in completion order. Only the
    # last 100 are ever averaged, so a bounded deque replaces re-concatenating
    # and re-sorting the full episode history on every step.
    recent_returns = deque(maxlen=100)

    total_reward = np.zeros(env.num_envs)  # running return per sub-environment
    observations = env.reset()
    timestep = 0  # environment steps, summed over sub-environments
    episode = 0   # finished episodes, summed over sub-environments
    t = 0         # loop iterations (vectorised steps)

    start_time = time.time()

    while timestep < total_timesteps:
        actions = agent.act(observations)
        next_observations, rewards, dones, _ = env.step(actions)
        agent.remember(observations, actions, rewards, next_observations, dones, n_envs=env.num_envs)
        agent.train(64)

        timestep += len(observations)
        timesteps.append(t)
        t += 1

        total_reward += rewards

        # Accept any boolean sequence for `dones`, not only ndarrays.
        finished = np.asarray(dones, dtype=bool)
        for i in np.flatnonzero(finished):
            recent_returns.append(total_reward[i])
            episode += 1

        if recent_returns:
            avg_total_rewards.append((t, timestep, np.mean(list(recent_returns))))

        # Start a fresh return for every sub-environment that just finished.
        # Masking (rather than multiplying by 0) also resets non-finite totals.
        total_reward[finished] = 0.0
        observations = next_observations

        # The last batch may overshoot the budget; never report more than 100%.
        ratio = min(100, math.ceil(100 * timestep / total_timesteps))
        uptime = math.ceil(time.time() - start_time)

        avg_return = avg_total_rewards[-1][-1] if avg_total_rewards else np.nan

        print(f"[{ratio:3d}% / {uptime:3d}s] timestep = {timestep}/{total_timesteps}, episode = {episode:3d}, avg_return = {avg_return:10.4f}\r", end="")

    return np.array(timesteps), avg_total_rewards

In [9]:
# Train on the vectorised environment for a budget of 50k environment steps.
timesteps, returns = train(agent=actor, env=env, total_timesteps=50_000)

[ 40% / 179s] timestep = 19640/50000, episode =  96, avg_return =  -208.2734

KeyboardInterrupt: 

In [16]:
def evaluate(agent, env, n_episodes=5, render=False):
    """Roll out ``agent`` for whole episodes in ``env`` and print a summary of each.

    Parameters
    ----------
    agent
        Object providing ``act(observation)``.
    env
        Single (non-vectorised) environment whose ``step`` returns the 4-tuple
        ``(observation, reward, done, info)``.
    n_episodes : int, optional
        Number of episodes to run.
    render : bool, optional
        When true, ``env.render()`` is called after every step and
        ``env.close()`` is called once after the last episode. When false the
        environment is neither rendered nor closed.

    Returns
    -------
    None
        Results are reported only through ``print``.
    """
    try:
        for episode in range(n_episodes):
            obs = env.reset()
            total_reward = 0.0
            episode_length = 0

            done = False
            while not done:
                action = agent.act(obs)
                obs, reward, done, _ = env.step(action)

                total_reward += reward
                episode_length += 1

                if render:
                    env.render()

            print(f">> episode = {episode + 1} / {n_episodes}, total_reward = {total_reward:10.4f}, episode_length = {episode_length}")
    finally:
        # Close once, after all episodes: closing inside the loop meant later
        # episodes reset and stepped an already-closed environment. `finally`
        # also releases the render window when an episode is interrupted.
        if render:
            env.close()

In [17]:
# Evaluate on a separate, non-vectorised environment; render a single episode.
eval_env = gym.make("Pendulum-v0")
evaluate(actor, eval_env, n_episodes=1, render=True)

>> episode = 1 / 1, total_reward =  -240.9904, episode_length = 200
