In [1]:
import warnings ; warnings.filterwarnings('ignore')
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
from IPython.display import display
from collections import namedtuple, deque
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from itertools import cycle, count
from textwrap import wrap

import matplotlib
import subprocess
import os.path
import tempfile
import random
import base64
import pprint
import glob
import time
import json
import sys
import gym
import io
import os
import gc

from gym import wrappers
from subprocess import check_output
from IPython.display import HTML

from zoo.value_based_agents import *
from zoo.exploration_strategies import *
from zoo.utils import *
from zoo.replay_buffers import *

# Random seeds iterated by the training loops in this notebook; each value is
# passed as the `seed` argument of `agent.train`, giving one training run per seed.
SEEDS = (12, 34, 56)

%matplotlib inline

In [3]:
# Train one DuelingDDQN agent per seed on CartPole-v1.
# Outputs left in the notebook namespace:
#   dueling_ddqn_results        - np.array stacking the per-seed `result` returned by agent.train
#   dueling_ddqn_agents         - dict mapping seed -> trained agent
#   best_dueling_ddqn_agent_key - seed whose run had the highest final evaluation score
#   best_eval_score             - that highest final evaluation score
dueling_ddqn_results = []
dueling_ddqn_agents, best_dueling_ddqn_agent_key, best_eval_score = {}, None, float('-inf')

# Everything below is independent of the seed, so it is built once instead of
# being re-created on every loop iteration.
environment_settings = {
    'env_name': 'CartPole-v1',
    'gamma': 1.00,
    'max_minutes': 5,
    'max_episodes': 2500,
    'goal_mean_100_reward': 475
}
# Read each setting by key: unpacking `.values()` positionally would silently
# assign the wrong values if the dict entries were ever reordered.
env_name = environment_settings['env_name']
gamma = environment_settings['gamma']
max_minutes = environment_settings['max_minutes']
max_episodes = environment_settings['max_episodes']
goal_mean_100_reward = environment_settings['goal_mean_100_reward']

# Alternative (non-dueling) network kept for reference:
# value_model_fn = lambda nS, nA: FCQ(nS, nA, hidden_dims=(512,128))
value_model_fn = lambda nS, nA: FCDuelingQ(nS, nA, hidden_dims=(512,128))
value_optimizer_fn = lambda net, lr: optim.RMSprop(net.parameters(), lr=lr)
value_optimizer_lr = 0.0005
max_gradient_norm = float('inf')

training_strategy_fn = lambda: EGreedyExpStrategy(init_epsilon=1.0,
                                                  min_epsilon=0.3,
                                                  decay_steps=20000)
evaluation_strategy_fn = lambda: GreedyStrategy()

# NOTE(review): the keyword was `m_size`, which looks like a typo for `max_size`
# and would raise a TypeError when the buffer is built; confirm against the
# ReplayBuffer signature in zoo.replay_buffers.
replay_buffer_fn = lambda: ReplayBuffer(max_size=50000, batch_size=64)
n_warmup_batches = 5
update_target_every_steps = 1
tau = 0.1

make_env_fn, make_env_kargs = get_make_env_fn(env_name=env_name)

for seed in SEEDS:
    # A fresh agent per seed so runs do not share networks or replay memory.
    agent = DuelingDDQN(replay_buffer_fn,
                        value_model_fn,
                        value_optimizer_fn,
                        value_optimizer_lr,
                        max_gradient_norm,
                        training_strategy_fn,
                        evaluation_strategy_fn,
                        n_warmup_batches,
                        update_target_every_steps,
                        tau)

    result, final_eval_score, training_time, wallclock_time = agent.train(
        make_env_fn, make_env_kargs, seed, gamma, max_minutes, max_episodes, goal_mean_100_reward)
    dueling_ddqn_results.append(result)
    dueling_ddqn_agents[seed] = agent
    # Strict '>' keeps the earliest seed when several runs tie on the final score.
    if final_eval_score > best_eval_score:
        best_eval_score = final_eval_score
        best_dueling_ddqn_agent_key = seed

dueling_ddqn_results = np.array(dueling_ddqn_results)
_ = BEEP()

[2Kel 00:00:00, ep 0000, ts 000016, ar 10 016.0±000.0, 100 016.0±000.0, ex 100 0.3±0.0, ev 009.0±000.0
[2Kel 00:01:01, ep 0105, ts 005928, ar 10 127.7±075.1, 100 058.2±051.8, ex 100 0.4±0.1, ev 304.1±132.0
[2Kel 00:02:02, ep 0144, ts 013433, ar 10 276.3±099.7, 100 120.9±099.6, ex 100 0.3±0.1, ev 319.8±095.8
[2Kel 00:03:03, ep 0170, ts 020753, ar 10 396.9±106.6, 100 179.5±127.6, ex 100 0.2±0.1, ev 337.9±097.8
[2Kel 00:04:03, ep 0185, ts 028096, ar 10 500.0±000.0, 100 244.2±155.9, ex 100 0.2±0.1, ev 368.4±107.4
[2Kel 00:05:01, ep 0199, ts 034430, ar 10 433.4±138.2, 100 293.7±162.6, ex 100 0.2±0.1, ev 390.2±109.2
--> reached_max_minutes ✕
Training complete.
Final evaluation score 500.00±0.00 in 259.22s training time, 333.17s wall-clock time.

[2Kel 00:00:00, ep 0000, ts 000034, ar 10 034.0±000.0, 100 034.0±000.0, ex 100 0.6±0.0, ev 008.0±000.0
[2Kel 00:01:01, ep 0125, ts 006325, ar 10 119.4±064.1, 100 057.2±046.0, ex 100 0.4±0.1, ev 217.4±090.1
[2Kel 00:02:01, ep 0164, ts 013762,

In [None]:
# Train one PER agent per seed on CartPole-v1.
# Outputs left in the notebook namespace:
#   per_results     - np.array stacking the per-seed `result` returned by agent.train
#   best_agent      - the agent whose run had the highest final evaluation score
#   best_eval_score - that highest final evaluation score
per_results = []
best_agent, best_eval_score = None, float('-inf')

# Seed-independent configuration, built once up front.
environment_settings = {
    'env_name': 'CartPole-v1',
    'gamma': 1.00,
    'max_minutes': 5,
    'max_episodes': 2500,
    'goal_mean_100_reward': 475
}
env_name = environment_settings['env_name']
gamma = environment_settings['gamma']
max_minutes = environment_settings['max_minutes']
max_episodes = environment_settings['max_episodes']
goal_mean_100_reward = environment_settings['goal_mean_100_reward']

value_model_fn = lambda nS, nA: FCDuelingQ(nS, nA, hidden_dims=(512,128))
value_optimizer_fn = lambda net, lr: optim.RMSprop(net.parameters(), lr=lr)
value_optimizer_lr = 0.0005
max_gradient_norm = float('inf')

training_strategy_fn = lambda: EGreedyExpStrategy(
    init_epsilon=1.0, min_epsilon=0.3, decay_steps=20000)
evaluation_strategy_fn = lambda: GreedyStrategy()

# Buffer alternatives tried earlier, kept for reference:
#   ReplayBuffer(max_size=10000, batch_size=64)
#   PrioritizedReplayBuffer(max_samples=10000, batch_size=64, rank_based=True,
#                           alpha=0.6, beta0=0.1, beta_rate=0.99995)
replay_buffer_fn = lambda: PrioritizedReplayBuffer(
    max_samples=20000, batch_size=64, rank_based=False,
    alpha=0.6, beta0=0.1, beta_rate=0.99995)
n_warmup_batches = 5
update_target_every_steps = 1
tau = 0.1

make_env_fn, make_env_kargs = get_make_env_fn(env_name=env_name)

for seed in SEEDS:
    # A fresh agent per seed so runs do not share networks or replay memory.
    agent = PER(
        replay_buffer_fn,
        value_model_fn,
        value_optimizer_fn,
        value_optimizer_lr,
        max_gradient_norm,
        training_strategy_fn,
        evaluation_strategy_fn,
        n_warmup_batches,
        update_target_every_steps,
        tau,
    )

    result, final_eval_score, training_time, wallclock_time = agent.train(
        make_env_fn, make_env_kargs, seed, gamma,
        max_minutes, max_episodes, goal_mean_100_reward)
    per_results.append(result)

    # Strict '>' keeps the earliest agent when several runs tie on the final score.
    if final_eval_score > best_eval_score:
        best_eval_score, best_agent = final_eval_score, agent

per_results = np.array(per_results)
_ = BEEP()

[2Kel 00:00:00, ep 0000, ts 000016, ar 10 016.0±000.0, 100 016.0±000.0, ex 100 0.3±0.0, ev 009.0±000.0
[2Kel 00:01:00, ep 0097, ts 005200, ar 10 168.2±070.0, 100 053.1±054.3, ex 100 0.4±0.1, ev 216.1±138.3
[2Kel 00:02:02, ep 0126, ts 010084, ar 10 212.6±087.1, 100 094.3±083.8, ex 100 0.4±0.1, ev 272.8±119.0
[2Kel 00:03:05, ep 0148, ts 014623, ar 10 221.4±038.4, 100 130.7±089.5, ex 100 0.3±0.1, ev 291.9±114.3
[2Kel 00:04:05, ep 0168, ts 018433, ar 10 204.5±048.3, 100 160.5±081.5, ex 100 0.2±0.1, ev 316.2±107.6
[2Kel 00:05:01, ep 0186, ts 021730, ar 10 140.3±062.7, 100 182.8±074.9, ex 100 0.2±0.1, ev 310.4±104.4
--> reached_max_minutes ✕
Training complete.
Final evaluation score 248.43±75.75 in 271.82s training time, 316.98s wall-clock time.

[2Kel 00:00:00, ep 0000, ts 000034, ar 10 034.0±000.0, 100 034.0±000.0, ex 100 0.6±0.0, ev 008.0±000.0
[2Kel 00:01:00, ep 0106, ts 005027, ar 10 139.7±088.2, 100 048.8±051.3, ex 100 0.4±0.1, ev 217.5±118.6
[2Kel 00:02:02, ep 0126, ts 009908