In [1]:
import warnings ; warnings.filterwarnings('ignore')
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
from IPython.display import display
from collections import namedtuple, deque
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from itertools import cycle, count
from textwrap import wrap

import matplotlib
import subprocess
import os.path
import tempfile
import random
import base64
import pprint
import glob
import time
import json
import sys
import gym
import io
import os
import gc

from gym import wrappers
from subprocess import check_output
from IPython.display import HTML

from zoo.actor_critic_agents import *
from zoo.advanced_actor_critic_agents import *
from zoo.exploration_strategies import *
from zoo.replay_buffers import *
from zoo.utils import *

# Random seeds shared by the training cells: each cell loops over SEEDS and
# passes the current seed to agent.train(), producing one run per seed.
SEEDS = (12, 34, 56)

%matplotlib inline

In [4]:
# DDPG training cell: train one agent per seed in SEEDS, collect every run's
# result, and remember the agent with the highest final evaluation score.
#
# All configuration below is independent of the seed, so it is built once
# before the loop instead of being re-created on every iteration.
environment_settings = {
    'env_name': 'Pendulum-v0',
    'gamma': 0.99,
    'max_minutes': 5,
    'max_episodes': 500,
    'goal_mean_100_reward': -150
}
# Look settings up by key rather than unpacking .values() positionally, so
# adding or reordering a key can never silently swap two settings.
env_name = environment_settings['env_name']
gamma = environment_settings['gamma']
max_minutes = environment_settings['max_minutes']
max_episodes = environment_settings['max_episodes']
goal_mean_100_reward = environment_settings['goal_mean_100_reward']

# Policy model factory and its optimizer settings.
policy_model_fn = lambda nS, bounds: FCDP(nS, bounds, hidden_dims=(256,256))
policy_max_grad_norm = float('inf')
policy_optimizer_fn = lambda net, lr: optim.Adam(net.parameters(), lr=lr)
policy_optimizer_lr = 0.0003

# Value model factory and its optimizer settings.
value_model_fn = lambda nS, nA: FCQV(nS, nA, hidden_dims=(256,256))
value_max_grad_norm = float('inf')
value_optimizer_fn = lambda net, lr: optim.Adam(net.parameters(), lr=lr)
value_optimizer_lr = 0.0003

# Action-selection strategies used while training and while evaluating.
training_strategy_fn = lambda bounds: NormalNoiseStrategy(bounds, exploration_noise_ratio=0.1)
evaluation_strategy_fn = lambda bounds: GreedyStrategyC(bounds)

# Replay buffer factory and the remaining agent hyperparameters.
replay_buffer_fn = lambda: ReplayBuffer(m_size=100000, batch_size=256)
n_warmup_batches = 5
update_target_every_steps = 1
tau = 0.005

ddpg_results = []
best_agent, best_eval_score = None, float('-inf')
for seed in SEEDS:
    # A fresh agent per seed so runs do not share networks or buffers.
    agent = DDPG(replay_buffer_fn,
                 policy_model_fn,
                 policy_max_grad_norm,
                 policy_optimizer_fn,
                 policy_optimizer_lr,
                 value_model_fn,
                 value_max_grad_norm,
                 value_optimizer_fn,
                 value_optimizer_lr,
                 training_strategy_fn,
                 evaluation_strategy_fn,
                 n_warmup_batches,
                 update_target_every_steps,
                 tau)

    make_env_fn, make_env_kargs = get_make_env_fn(env_name=env_name)
    result, final_eval_score, training_time, wallclock_time = agent.train(
        make_env_fn, make_env_kargs, seed, gamma, max_minutes, max_episodes, goal_mean_100_reward)
    ddpg_results.append(result)
    # Strict '>' keeps the earliest seed when two runs tie.
    if final_eval_score > best_eval_score:
        best_eval_score = final_eval_score
        best_agent = agent
# Stack the per-seed results into a single array.
ddpg_results = np.array(ddpg_results)
_ = BEEP()

[2Kel 00:00:00, ep 0000, ts 0000200, ar 10 -1296.7±000.0, 100 -1296.7±000.0, ex 100 0.3±0.0, ev -1391.7±000.0
[2Kel 00:01:01, ep 0029, ts 0006000, ar 10 -163.7±096.1, 100 -798.0±631.2, ex 100 0.1±0.1, ev -785.3±601.3
[2Kel 00:02:02, ep 0053, ts 0010800, ar 10 -161.0±053.1, 100 -510.1±571.4, ex 100 0.1±0.1, ev -491.8±557.3
[2Kel 00:03:03, ep 0076, ts 0015400, ar 10 -168.5±090.7, 100 -407.9±506.2, ex 100 0.1±0.1, ev -390.7±494.0
[2Kel 00:04:05, ep 0099, ts 0020000, ar 10 -156.2±086.5, 100 -349.7±458.5, ex 100 0.1±0.1, ev -330.6±449.5
[2Kel 00:04:51, ep 0116, ts 0023400, ar 10 -144.7±069.7, 100 -176.4±174.4, ex 100 0.0±0.0, ev -146.3±097.0
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score -125.91±78.89 in 274.11s training time, 310.43s wall-clock time.

[2Kel 00:00:00, ep 0000, ts 0000200, ar 10 -1373.7±000.0, 100 -1373.7±000.0, ex 100 0.3±0.0, ev -1743.3±000.0
[2Kel 00:01:01, ep 0026, ts 0005400, ar 10 -260.7±252.5, 100 -906.3±538.2, ex 100 0.1±0.1, ev -100

In [3]:
class RenderUint8(gym.Wrapper):
    """Environment wrapper whose ``render`` returns frames cast to ``np.uint8``.

    ``reset`` is forwarded to the wrapped environment unchanged; only the
    value returned by ``render`` is post-processed.
    """

    def __init__(self, env):
        """Wrap ``env``; no additional state is kept."""
        super().__init__(env)

    def reset(self, **kwargs):
        """Forward ``reset`` and its keyword arguments to the wrapped env."""
        return self.env.reset(**kwargs)

    def render(self, mode='rgb_array', **kwargs):
        """Render the wrapped env and cast the frame to ``np.uint8``.

        Args:
            mode: render mode handed to the wrapped environment.
            **kwargs: any extra keyword arguments, forwarded as-is.

        Returns:
            The rendered frame converted with ``astype(np.uint8)``. If the
            wrapped environment returns something without an ``astype``
            method (e.g. ``None`` or a bool for a non-array mode), that value
            is returned unchanged instead of raising ``AttributeError``.
        """
        frame = self.env.render(mode=mode, **kwargs)
        # Only array-like results can be cast; pass anything else through.
        if not hasattr(frame, 'astype'):
            return frame
        return frame.astype(np.uint8)

In [7]:
# TD3 training cell: train one agent per seed in SEEDS, collect every run's
# result, and remember the agent with the highest final evaluation score.
#
# All configuration below is independent of the seed, so it is built once
# before the loop instead of being re-created on every iteration.
environment_settings = {
    'env_name': 'Pendulum-v0',
    'gamma': 0.99,
    'max_minutes': 5,
    'max_episodes': 500,
    'goal_mean_100_reward': -150
}
# Look settings up by key rather than unpacking .values() positionally, so
# adding or reordering a key can never silently swap two settings.
env_name = environment_settings['env_name']
gamma = environment_settings['gamma']
max_minutes = environment_settings['max_minutes']
max_episodes = environment_settings['max_episodes']
goal_mean_100_reward = environment_settings['goal_mean_100_reward']

# Policy model factory and its optimizer settings.
policy_model_fn = lambda nS, bounds: FCDP(nS, bounds, hidden_dims=(256,256))
policy_max_grad_norm = float('inf')
policy_optimizer_fn = lambda net, lr: optim.Adam(net.parameters(), lr=lr)
policy_optimizer_lr = 0.0003

# Value model factory and its optimizer settings.
value_model_fn = lambda nS, nA: FCTQV(nS, nA, hidden_dims=(256,256))
value_max_grad_norm = float('inf')
value_optimizer_fn = lambda net, lr: optim.Adam(net.parameters(), lr=lr)
value_optimizer_lr = 0.0003

# Action-selection strategies used while training and while evaluating.
training_strategy_fn = lambda bounds: NormalNoiseDecayStrategy(bounds,
                                                               init_noise_ratio=0.5,
                                                               min_noise_ratio=0.1,
                                                               decay_steps=200000)
evaluation_strategy_fn = lambda bounds: GreedyStrategyC(bounds)

# Replay buffer factory and the remaining agent hyperparameters.
replay_buffer_fn = lambda: ReplayBuffer(m_size=1000000, batch_size=256)
n_warmup_batches = 5
update_value_target_every_steps = 2
update_policy_target_every_steps = 2
train_policy_every_steps = 2
policy_noise_ratio = 0.1
policy_noise_clip_ratio = 0.5
tau = 0.005

td3_results = []
best_agent, best_eval_score = None, float('-inf')
for seed in SEEDS:
    # A fresh agent per seed so runs do not share networks or buffers.
    agent = TD3(replay_buffer_fn,
                policy_model_fn,
                policy_max_grad_norm,
                policy_optimizer_fn,
                policy_optimizer_lr,
                value_model_fn,
                value_max_grad_norm,
                value_optimizer_fn,
                value_optimizer_lr,
                training_strategy_fn,
                evaluation_strategy_fn,
                n_warmup_batches,
                update_value_target_every_steps,
                update_policy_target_every_steps,
                train_policy_every_steps,
                tau,
                policy_noise_ratio,
                policy_noise_clip_ratio)

    make_env_fn, make_env_kargs = get_make_env_fn(env_name=env_name)
    result, final_eval_score, training_time, wallclock_time = agent.train(
        make_env_fn, make_env_kargs, seed, gamma, max_minutes, max_episodes, goal_mean_100_reward)
    td3_results.append(result)
    # Strict '>' keeps the earliest seed when two runs tie.
    if final_eval_score > best_eval_score:
        best_eval_score = final_eval_score
        best_agent = agent
# Stack the per-seed results into a single array.
td3_results = np.array(td3_results)
_ = BEEP()

[2Kel 00:00:01, ep 0000, ts 0000200, ar 10 -1194.6±000.0, 100 -1194.6±000.0, ex 100 0.3±0.0, ev -884.9±000.0
[2Kel 00:01:02, ep 0030, ts 0006200, ar 10 -980.7±229.8, 100 -1255.5±312.9, ex 100 0.2±0.1, ev -1217.9±363.7
[2Kel 00:02:03, ep 0055, ts 0011200, ar 10 -182.8±109.9, 100 -797.2±570.7, ex 100 0.2±0.0, ev -736.8±602.6
[2Kel 00:03:04, ep 0078, ts 0015800, ar 10 -171.2±096.6, 100 -621.2±556.1, ex 100 0.2±0.0, ev -560.4±578.6
[2Kel 00:04:06, ep 0102, ts 0020600, ar 10 -199.9±086.3, 100 -503.8±520.1, ex 100 0.2±0.0, ev -443.9±533.1
[2Kel 00:05:01, ep 0122, ts 0024600, ar 10 -277.9±089.1, 100 -269.4±237.5, ex 100 0.1±0.0, ev -192.0±212.5
--> reached_max_minutes ✕
Training complete.
Final evaluation score -138.83±81.93 in 281.81s training time, 316.41s wall-clock time.

[2Kel 00:00:00, ep 0000, ts 0000200, ar 10 -1350.2±000.0, 100 -1350.2±000.0, ex 100 0.3±0.0, ev -1885.2±000.0
[2Kel 00:01:00, ep 0028, ts 0005800, ar 10 -889.5±351.3, 100 -1264.8±370.6, ex 100 0.2±0.1, ev -1296.4

In [2]:
# SAC training cell: train one agent per seed in SEEDS, collect every run's
# result, and remember the agent with the highest final evaluation score.
#
# All configuration below is independent of the seed, so it is built once
# before the loop instead of being re-created on every iteration.
environment_settings = {
    'env_name': 'Pendulum-v0',
    'gamma': 0.99,
    'max_minutes': 5,
    'max_episodes': 500,
    'goal_mean_100_reward': -150
}
# Look settings up by key rather than unpacking .values() positionally, so
# adding or reordering a key can never silently swap two settings.
env_name = environment_settings['env_name']
gamma = environment_settings['gamma']
max_minutes = environment_settings['max_minutes']
max_episodes = environment_settings['max_episodes']
goal_mean_100_reward = environment_settings['goal_mean_100_reward']

# Policy model factory and its optimizer settings.
policy_model_fn = lambda nS, bounds: FCGP(nS, bounds, hidden_dims=(256,256))
policy_max_grad_norm = float('inf')
policy_optimizer_fn = lambda net, lr: optim.Adam(net.parameters(), lr=lr)
policy_optimizer_lr = 0.0005

# Value model factory and its optimizer settings.
value_model_fn = lambda nS, nA: FCQSA(nS, nA, hidden_dims=(256,256))
value_max_grad_norm = float('inf')
value_optimizer_fn = lambda net, lr: optim.Adam(net.parameters(), lr=lr)
value_optimizer_lr = 0.0007

# Replay buffer factory and the remaining agent hyperparameters.
replay_buffer_fn = lambda: ReplayBuffer(m_size=1000000, batch_size=256)
n_warmup_batches = 10
update_target_every_steps = 1
tau = 0.005

sac_results = []
best_agent, best_eval_score = None, float('-inf')
for seed in SEEDS:
    # A fresh agent per seed so runs do not share networks or buffers.
    agent = SAC(replay_buffer_fn,
                policy_model_fn,
                policy_max_grad_norm,
                policy_optimizer_fn,
                policy_optimizer_lr,
                value_model_fn,
                value_max_grad_norm,
                value_optimizer_fn,
                value_optimizer_lr,
                n_warmup_batches,
                update_target_every_steps,
                tau)

    make_env_fn, make_env_kargs = get_make_env_fn(env_name=env_name)
    result, final_eval_score, training_time, wallclock_time = agent.train(
        make_env_fn, make_env_kargs, seed, gamma, max_minutes, max_episodes, goal_mean_100_reward)
    sac_results.append(result)
    # Strict '>' keeps the earliest seed when two runs tie.
    if final_eval_score > best_eval_score:
        best_eval_score = final_eval_score
        best_agent = agent
# Stack the per-seed results into a single array.
sac_results = np.array(sac_results)
_ = BEEP()

[2Kel 00:00:02, ep 0000, ts 0000200, ar 10 -904.2±000.0, 100 -904.2±000.0, ex 100 0.3±0.0, ev -1342.1±000.0
[2Kel 00:01:06, ep 0024, ts 0005000, ar 10 -1176.2±323.1, 100 -1246.6±348.4, ex 100 0.2±0.1, ev -1273.5±471.8
[2Kel 00:02:11, ep 0039, ts 0008000, ar 10 -124.1±053.0, 100 -835.6±601.3, ex 100 0.2±0.1, ev -851.8±661.8
[2Kel 00:03:12, ep 0052, ts 0010600, ar 10 -167.0±081.3, 100 -673.3±596.1, ex 100 0.2±0.1, ev -671.7±657.0
[2Kel 00:04:14, ep 0066, ts 0013400, ar 10 -193.2±078.4, 100 -565.0±572.1, ex 100 0.2±0.1, ev -562.0±622.9
[2Kel 00:05:03, ep 0077, ts 0015600, ar 10 -145.4±071.3, 100 -508.1±549.5, ex 100 0.2±0.1, ev -501.3±597.5
--> reached_max_minutes ✕
Training complete.
Final evaluation score -136.60±82.74 in 275.30s training time, 337.30s wall-clock time.

[2Kel 00:00:00, ep 0000, ts 0000200, ar 10 -1242.3±000.0, 100 -1242.3±000.0, ex 100 0.3±0.0, ev -1877.7±000.0
[2Kel 00:01:01, ep 0023, ts 0004800, ar 10 -1251.1±294.2, 100 -1318.8±285.2, ex 100 0.2±0.1, ev -1285.